/*
 * Copyright © 2018 Valve Corporation
 * Copyright © 2018 Google
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_instruction_selection.h"

#include "aco_builder.h"
#include "aco_interface.h"
#include "aco_ir.h"

#include "common/ac_nir.h"
#include "common/sid.h"

#include "util/fast_idiv_by_const.h"
#include "util/memstream.h"

#include <array>
#include <functional>
#include <map>
#include <numeric>
#include <stack>
#include <utility>
#include <vector>

namespace aco {
namespace {

#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)

static void
_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
          const char* msg)
{
   char* out;
   size_t outsize;
   struct u_memstream mem;
   u_memstream_open(&mem, &out, &outsize);
   FILE* const memf = u_memstream_get(&mem);

   fprintf(memf, "%s: ", msg);
   nir_print_instr(instr, memf);
   u_memstream_close(&mem);

   _aco_err(ctx->program, file, line, out);
   free(out);
}

struct if_context {
   Temp cond;

   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   bool had_divergent_discard_old;
   bool had_divergent_discard_then;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};

struct loop_context {
   Block loop_exit;

   unsigned header_idx_old;
   Block* exit_old;
   bool divergent_cont_old;
   bool divergent_branch_old;
   bool divergent_if_old;
};

static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);

static void
add_logical_edge(unsigned pred_idx, Block* succ)
{
   succ->logical_preds.emplace_back(pred_idx);
}

static void
add_linear_edge(unsigned pred_idx, Block* succ)
{
   succ->linear_preds.emplace_back(pred_idx);
}

static void
add_edge(unsigned pred_idx, Block* succ)
{
   add_logical_edge(pred_idx, succ);
   add_linear_edge(pred_idx, succ);
}

static void
append_logical_start(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
}

static void
append_logical_end(Block* b)
{
   Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
}

Temp
get_ssa_temp(struct isel_context* ctx, nir_def* def)
{
   uint32_t id = ctx->first_temp_id + def->index;
   return Temp(id, ctx->program->temp_rc[id]);
}

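/* Computes the masked bit count: for each lane, the number of bits set in
 * `mask` (all lanes when undefined) among the lanes below it, plus `base`.
 * Wave64 needs two instructions because each v_mbcnt_*_u32_b32 only covers
 * 32 bits of the mask. */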
Temp
emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
{
   Builder bld(ctx->program, ctx->block);
   assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
   assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());

   if (ctx->program->wave_size == 32) {
      Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
      return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
   }

   Operand mask_lo = Operand::c32(-1u);
   Operand mask_hi = Operand::c32(-1u);

   if (mask.isTemp()) {
      RegClass rc = RegClass(mask.regClass().type(), 1);
      Builder::Result mask_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
      mask_lo = Operand(mask_split.def(0).getTemp());
      mask_hi = Operand(mask_split.def(1).getTemp());
   } else if (mask.physReg() == exec) {
      mask_lo = Operand(exec_lo, s1);
      mask_hi = Operand(exec_hi, s1);
   }

   Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);

   if (ctx->program->gfx_level <= GFX7)
      return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
   else
      return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
}

inline void
set_wqm(isel_context* ctx, bool enable_helpers = false)
{
   if (ctx->program->stage == fragment_fs) {
      ctx->wqm_block_idx = ctx->block->index;
      ctx->wqm_instruction_idx = ctx->block->instructions.size();
      if (ctx->shader)
         enable_helpers |= ctx->shader->info.fs.require_full_quads;
      ctx->program->needs_wqm |= enable_helpers;
   }
}

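/* Returns, for each lane, the dword of `data` from the lane selected by
 * `index`. A uniform index becomes a readlane; the divergent paths differ by
 * generation because ds_bpermute_b32 only exists on GFX8+ and, on GFX10+
 * wave64, only shuffles within a 32-lane half. */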
static Temp
emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
    * of multiple binaries, because the VGPR use is not known when choosing
    * which registers to use for the shared VGPRs.
    */
   const bool avoid_shared_vgprs =
      ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
      ctx->program->wave_size == 64 &&
      (ctx->program->info.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
       ctx->stage == raytracing_cs);

   if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
      /* GFX6-7: there is no bpermute instruction */
      Operand index_op(index);
      Operand input_data(data);
      index_op.setLateKill(true);
      input_data.setLateKill(true);

      return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
                        bld.def(bld.lm, vcc), index_op, input_data);
   } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {

      /* GFX10 wave64 mode: emulate full-wave bpermute */
      Temp index_is_lo =
         bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
      Builder::Result index_is_lo_split =
         bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
                                     index_is_lo_split.def(1).getTemp());
      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
                                     index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      Operand input_data(data);

      index_x4.setLateKill(true);
      input_data.setLateKill(true);
      same_half.setLateKill(true);

      if (ctx->options->gfx_level <= GFX10_3) {
         /* We need one pair of shared VGPRs.
          * Note that these have twice the allocation granularity of normal VGPRs. */
         ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;

         return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
                           bld.def(s1, scc), index_x4, input_data, same_half);
      } else {
         return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
                           bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
                           same_half);
      }
   } else {
      /* GFX8-9 or GFX10 wave32: bpermute works normally */
      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }
}

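/* Implements subgroup swizzles via the ds_swizzle_b32 bitmask encoding:
 * bits [4:0] of `mask` are an AND mask, [9:5] an OR mask and [14:10] a XOR
 * mask, so each lane reads lane ((id & and_mask) | or_mask) ^ xor_mask.
 * On GFX8+ the code below first tries to match cheaper DPP/permlane forms. */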
static Temp
emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask, bool allow_fi)
{
   if (ctx->options->gfx_level >= GFX8) {
      unsigned and_mask = mask & 0x1f;
      unsigned or_mask = (mask >> 5) & 0x1f;
      unsigned xor_mask = (mask >> 10) & 0x1f;

      /* Eliminate or_mask. */
      and_mask &= ~or_mask;
      xor_mask ^= or_mask;

      uint16_t dpp_ctrl = 0xffff;

      /* DPP16 before DPP8 before v_permlane(x)16_b32
       * because DPP16 supports modifiers and v_permlane
       * can't be folded into valu instructions.
       */
      if ((and_mask & 0x1c) == 0x1c && xor_mask < 4) {
         unsigned res[4];
         for (unsigned i = 0; i < 4; i++)
            res[i] = ((i & and_mask) ^ xor_mask);
         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      } else if (and_mask == 0x1f && xor_mask == 8) {
         dpp_ctrl = dpp_row_rr(8);
      } else if (and_mask == 0x1f && xor_mask == 0xf) {
         dpp_ctrl = dpp_row_mirror;
      } else if (and_mask == 0x1f && xor_mask == 0x7) {
         dpp_ctrl = dpp_row_half_mirror;
      } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && xor_mask < 0x10) {
         dpp_ctrl = dpp_row_share(xor_mask);
      } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) {
         dpp_ctrl = dpp_row_xmask(xor_mask);
      } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) {
         uint32_t lane_sel = 0;
         for (unsigned i = 0; i < 8; i++)
            lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3);
         return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel, allow_fi);
      } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) {
         uint64_t lane_mask = 0;
         for (unsigned i = 0; i < 16; i++)
            lane_mask |= uint64_t((i & and_mask) ^ (xor_mask & 0xf)) << i * 4;
         aco_opcode opcode =
            xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
         Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
         Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
         Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
         ret->valu().opsel[0] = allow_fi; /* set FETCH_INACTIVE */
         ret->valu().opsel[1] = true;     /* set BOUND_CTRL */
         return ret;
      }

      if (dpp_ctrl != 0xffff)
         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl, 0xf, 0xf, true,
                             allow_fi);
   }

   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
}

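/* Returns `val` in a VGPR, inserting a copy if it currently lives in SGPRs. */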
Temp
as_vgpr(Builder& bld, Temp val)
{
   if (val.type() == RegType::sgpr)
      return bld.copy(bld.def(RegType::vgpr, val.size()), val);
   assert(val.type() == RegType::vgpr);
   return val;
}

Temp
as_vgpr(isel_context* ctx, Temp val)
{
   Builder bld(ctx->program, ctx->block);
   return as_vgpr(bld, val);
}

void
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
}

Temp
emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
{
   /* no need to extract the whole vector */
   if (src.regClass() == dst_rc) {
      assert(idx == 0);
      return src;
   }

   assert(src.bytes() > (idx * dst_rc.bytes()));
   Builder bld(ctx->program, ctx->block);
   auto it = ctx->allocated_vec.find(src.id());
   if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
      if (it->second[idx].regClass() == dst_rc) {
         return it->second[idx];
      } else {
         assert(!dst_rc.is_subdword());
         assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
         return bld.copy(bld.def(dst_rc), it->second[idx]);
      }
   }

   if (dst_rc.is_subdword())
      src = as_vgpr(ctx, src);

   if (src.bytes() == dst_rc.bytes()) {
      assert(idx == 0);
      return bld.copy(bld.def(dst_rc), src);
   } else {
      Temp dst = bld.tmp(dst_rc);
      emit_extract_vector(ctx, src, idx, dst);
      return dst;
   }
}

void
emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
{
   if (num_components == 1)
      return;
   if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
      return;
   RegClass rc;
   if (num_components > vec_src.size()) {
      if (vec_src.type() == RegType::sgpr) {
         /* should still help get_alu_src() */
         emit_split_vector(ctx, vec_src, vec_src.size());
         return;
      }
      /* sub-dword split */
      rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
   } else {
      rc = RegClass(vec_src.type(), vec_src.size() / num_components);
   }
   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
      aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
   split->operands[0] = Operand(vec_src);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   for (unsigned i = 0; i < num_components; i++) {
      elems[i] = ctx->program->allocateTmp(rc);
      split->definitions[i] = Definition(elems[i]);
   }
   ctx->block->instructions.emplace_back(std::move(split));
   ctx->allocated_vec.emplace(vec_src.id(), elems);
}

/* This vector expansion uses a mask to determine which elements in the new vector
 * come from the original vector. The other elements are undefined. */
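/* For example, expanding a two-component source with num_components=4 and
 * mask=0b0101 gives dst = {src.x, undef, src.y, undef}; with zero_padding,
 * the unset slots are backed by a zero temporary instead. */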
void
expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
              bool zero_padding = false)
{
   assert(vec_src.type() == RegType::vgpr);
   Builder bld(ctx->program, ctx->block);

   if (dst.type() == RegType::sgpr && num_components > dst.size()) {
      Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
      expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
      ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
      return;
   }

   emit_split_vector(ctx, vec_src, util_bitcount(mask));

   if (vec_src == dst)
      return;

   if (num_components == 1) {
      if (dst.type() == RegType::sgpr)
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
      else
         bld.copy(Definition(dst), vec_src);
      return;
   }

   unsigned component_bytes = dst.bytes() / num_components;
   RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
   RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
   assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;

   Temp padding = Temp(0, dst_rc);
   if (zero_padding)
      padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));

   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
      aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
   vec->definitions[0] = Definition(dst);
   unsigned k = 0;
   for (unsigned i = 0; i < num_components; i++) {
      if (mask & (1 << i)) {
         Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
         if (dst.type() == RegType::sgpr)
            src = bld.as_uniform(src);
         vec->operands[i] = Operand(src);
         elems[i] = src;
      } else {
         vec->operands[i] = Operand::zero(component_bytes);
         elems[i] = padding;
      }
   }
   ctx->block->instructions.emplace_back(std::move(vec));
   ctx->allocated_vec.emplace(dst.id(), elems);
}

/* adjust misaligned small bit size loads */
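/* The scalar variant shifts the loaded dwords right by 8 * (offset % 4) bits
 * so that the requested bytes start at bit 0 of dst. */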
void
byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand::c32(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
                       Operand::c32(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 3 || vec.size() == 4) {
      Temp lo = bld.tmp(s2), hi;
      if (vec.size() == 3) {
         /* this can happen if we use VMEM for a uniform load */
         hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      } else {
         hi = bld.tmp(s2);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
         hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
      }
      if (select != Temp())
         hi =
            bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}

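/* Vector (VGPR) counterpart of byte_align_scalar: a non-constant offset is
 * handled by funnel-shifting adjacent dwords together with v_alignbyte_b32;
 * a constant offset simply re-extracts the elements starting at it. */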
void
byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
{
   Builder bld(ctx->program, ctx->block);
   if (offset.isTemp()) {
      Temp tmp[4] = {vec, vec, vec, vec};

      if (vec.size() == 4) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), Definition(tmp[3]), vec);
      } else if (vec.size() == 3) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
                    Definition(tmp[2]), vec);
      } else if (vec.size() == 2) {
         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
      }
      for (unsigned i = 0; i < dst.size(); i++)
         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);

      vec = tmp[0];
      if (dst.size() == 2)
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);

      offset = Operand::zero();
   }

   unsigned num_components = vec.bytes() / component_size;
   if (vec.regClass() == dst.regClass()) {
      assert(offset.constantValue() == 0);
      bld.copy(Definition(dst), vec);
      emit_split_vector(ctx, dst, num_components);
      return;
   }

   emit_split_vector(ctx, vec, num_components);
   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();

   assert(offset.constantValue() % component_size == 0);
   unsigned skip = offset.constantValue() / component_size;
   for (unsigned i = skip; i < num_components; i++)
      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);

   if (dst.type() == RegType::vgpr) {
      /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
      num_components = dst.bytes() / component_size;
      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
      for (unsigned i = 0; i < num_components; i++)
         create_vec->operands[i] = Operand(elems[i]);
      create_vec->definitions[0] = Definition(dst);
      bld.insert(std::move(create_vec));

   } else if (skip) {
      /* if dst is sgpr - split the src, but move the original to sgpr. */
      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
      byte_align_scalar(ctx, vec, offset, dst);
   } else {
      assert(dst.size() == vec.size());
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
   }

   ctx->allocated_vec.emplace(dst.id(), elems);
}

Temp
get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
{
   RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
   Temp tmp = get_ssa_temp(ctx, def);
   if (tmp.bytes() != rc.bytes())
      return emit_extract_vector(ctx, tmp, 0, rc);
   else
      return tmp;
}

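/* ACO represents uniform booleans as 0/1 in an s1 register (usually SCC) and
 * divergent booleans as a lane mask (s2 on wave64, s1 on wave32). The two
 * helpers below convert between these representations. */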
Temp
bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(bld.lm);

   assert(val.regClass() == s1);
   assert(dst.regClass() == bld.lm);

   return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
                   bld.scc(val));
}

Temp
bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
{
   Builder bld(ctx->program, ctx->block);
   if (!dst.id())
      dst = bld.tmp(s1);

   assert(val.regClass() == bld.lm);
   assert(dst.regClass() == s1);

   /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
   bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
   return dst;
}

/**
 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
 * src_bits and dst_bits are truncated.
 *
 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
 * bit is indicated by src_bits in this case.
 *
 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
 */
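/* For example, convert_int(ctx, bld, tmp, 8, 32, true) sign-extends the low
 * byte of tmp into a new 32-bit temporary. */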
Temp
convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
            bool sign_extend, Temp dst = Temp())
{
   assert(!(sign_extend && dst_bits < src_bits) &&
          "Shrinking integers is not supported for signed inputs");

   if (!dst.id()) {
      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
      else
         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
   }

   assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
   assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);

   if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
      /* Copy the raw value, leaving an undefined value in the upper bits for
       * the caller to handle appropriately */
      return bld.copy(Definition(dst), src);
   } else if (dst.bytes() < src.bytes()) {
      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
   }

   Temp tmp = dst;
   if (dst_bits == 64)
      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);

   if (tmp == src) {
   } else if (src.regClass() == s1) {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
                 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
   } else {
      assert(src_bits < 32);
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
                 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
   }

   if (dst_bits == 64) {
      if (sign_extend && dst.regClass() == s2) {
         Temp high =
            bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else if (sign_extend && dst.regClass() == v2) {
         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
      } else {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
      }
   }

   return dst;
}

enum sgpr_extract_mode {
   sgpr_extract_sext,
   sgpr_extract_zext,
   sgpr_extract_undef,
};

Temp
extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
{
   Temp vec = get_ssa_temp(ctx, src->src.ssa);
   unsigned src_size = src->src.ssa->bit_size;
   unsigned swizzle = src->swizzle[0];

   if (vec.size() > 1) {
      assert(src_size == 16);
      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
      swizzle = swizzle & 1;
   }

   Builder bld(ctx->program, ctx->block);
   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;

   if (mode == sgpr_extract_undef && swizzle == 0)
      bld.copy(Definition(tmp), vec);
   else
      bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
                 Operand::c32(swizzle), Operand::c32(src_size),
                 Operand::c32((mode == sgpr_extract_sext)));

   if (dst.regClass() == s2)
      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);

   return dst;
}

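/* Fetches a NIR ALU source as a Temp of `size` components, resolving the
 * source swizzle: an identity swizzle is a plain (sub)vector extract, while
 * anything else re-gathers the swizzled components into a new vector. */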
Temp
get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
{
   if (src.src.ssa->num_components == 1 && size == 1)
      return get_ssa_temp(ctx, src.src.ssa);

   Temp vec = get_ssa_temp(ctx, src.src.ssa);
   unsigned elem_size = src.src.ssa->bit_size / 8u;
   bool identity_swizzle = true;

   for (unsigned i = 0; identity_swizzle && i < size; i++) {
      if (src.swizzle[i] != i)
         identity_swizzle = false;
   }
   if (identity_swizzle)
      return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));

   assert(elem_size > 0);
   assert(vec.bytes() % elem_size == 0);

   if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
      assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
      return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
                                           sgpr_extract_undef);
   }

   bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
   if (as_uniform)
      vec = as_vgpr(ctx, vec);

   RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
                                    : RegClass(vec.type(), elem_size / 4);
   if (size == 1) {
      return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
   } else {
      assert(size <= 4);
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
      for (unsigned i = 0; i < size; ++i) {
         elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
         vec_instr->operands[i] = Operand{elems[i]};
      }
      Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
      vec_instr->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec_instr));
      ctx->allocated_vec.emplace(dst.id(), elems);
      return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
   }
}

Temp
get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
{
   /* returns v2b or v1 for vop3p usage.
    * The source expects exactly 2 16bit components
    * which are within the same dword
    */
   assert(src.src.ssa->bit_size == 16);
   assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);

   Temp tmp = get_ssa_temp(ctx, src.src.ssa);
   if (tmp.size() == 1)
      return tmp;

   /* the size is larger than 1 dword: check the swizzle */
   unsigned dword = src.swizzle[0] >> 1;

   /* extract a full dword if possible */
   if (tmp.bytes() >= (dword + 1) * 4) {
      /* if the source is split into components, use p_create_vector */
      auto it = ctx->allocated_vec.find(tmp.id());
      if (it != ctx->allocated_vec.end()) {
         unsigned index = dword << 1;
         Builder bld(ctx->program, ctx->block);
         if (it->second[index].regClass() == v2b)
            return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
                              it->second[index + 1]);
      }
      return emit_extract_vector(ctx, tmp, dword, v1);
   } else {
      /* This must be a swizzled access to %a.zz where %a is v6b */
      assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
      assert(tmp.regClass() == v6b && dword == 1);
      return emit_extract_vector(ctx, tmp, dword * 2, v2b);
   }
}

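/* Queries NIR's range analysis for an upper bound on the unsigned value of
 * an ALU source; used below to mark operands as 16-bit or 24-bit. */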
uint32_t
get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
{
   nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
   return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
}

Temp
convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
{
   if (ptr.size() == 2)
      return ptr;
   Builder bld(ctx->program, ctx->block);
   if (ptr.type() == RegType::vgpr && !non_uniform)
      ptr = bld.as_uniform(ptr);
   return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
                     Operand::c32((unsigned)ctx->options->address32_hi));
}

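/* Emits a SOP2 instruction for a NIR ALU instruction. writes_scc adds the
 * implicit SCC definition; uses_ub is a per-source bitmask enabling the
 * 16/24-bit operand hints derived from get_alu_src_ub(). */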
void
emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                      bool writes_scc, uint8_t uses_ub = 0)
{
   aco_ptr<SOP2_instruction> sop2{
      create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
   sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
   sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
   sop2->definitions[0] = Definition(dst);
   if (instr->no_unsigned_wrap)
      sop2->definitions[0].setNUW(true);
   if (writes_scc)
      sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
         if (src_ub <= 0xffff)
            sop2->operands[i].set16bit(true);
         else if (src_ub <= 0xffffff)
            sop2->operands[i].set24bit(true);
      }
   }

   ctx->block->instructions.emplace_back(std::move(sop2));
}

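/* Emits a VOP2 instruction. The VOP2 encoding accepts an SGPR only as src0,
 * so an SGPR src1 is either swapped into src0 (if the operation is
 * commutative) or copied to a VGPR first. */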
void
emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
                      bool commutative, bool swap_srcs = false, bool flush_denorms = false,
                      bool nuw = false, uint8_t uses_ub = 0)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
   Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
   if (src1.type() == RegType::sgpr) {
      if (commutative && src0.type() == RegType::vgpr) {
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Operand op[2] = {Operand(src0), Operand(src1)};

   for (int i = 0; i < 2; i++) {
      if (uses_ub & (1 << i)) {
         uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
         if (src_ub <= 0xffff)
            op[i].set16bit(true);
         else if (src_ub <= 0xffffff)
            op[i].set24bit(true);
      }
   }

   if (flush_denorms && ctx->program->gfx_level < GFX9) {
      assert(dst.size() == 1);
      Temp tmp = bld.vop2(opc, bld.def(dst.regClass()), op[0], op[1]);
      if (dst.bytes() == 2)
         bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
      else
         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
   } else {
      if (nuw) {
         bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
      } else {
         bld.vop2(opc, Definition(dst), op[0], op[1]);
      }
   }
}

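/* 64-bit bitwise logic has no single VALU instruction, so it is performed as
 * two 32-bit operations on the split halves and recombined. */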
void
emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;

   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   if (src1.type() == RegType::sgpr) {
      assert(src0.type() == RegType::vgpr);
      std::swap(src0, src1);
   }

   Temp src00 = bld.tmp(src0.type(), 1);
   Temp src01 = bld.tmp(src0.type(), 1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
   Temp src10 = bld.tmp(v1);
   Temp src11 = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
}

void
emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
{
   assert(num_sources == 2 || num_sources == 3);
   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
   bool has_sgpr = false;
   for (unsigned i = 0; i < num_sources; i++) {
      src[i] = get_alu_src(ctx, instr->src[(swap_srcs && i < 2) ? 1 - i : i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (flush_denorms && ctx->program->gfx_level < GFX9) {
      Temp tmp;
      if (num_sources == 3)
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
      else
         tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
      if (dst.size() == 1)
         bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
      else
         bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
   } else if (num_sources == 3) {
      bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
   } else {
      bld.vop3(op, Definition(dst), src[0], src[1]);
   }
}

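/* Emits a packed 16-bit (VOP3P) instruction. The NIR swizzles (0 = low half,
 * 1 = high half) are translated into the opsel_lo/opsel_hi selectors. */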
Builder::Result
emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
                       bool swap_srcs = false)
{
   Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
   Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
   if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
      src1 = as_vgpr(ctx, src1);
   assert(instr->def.num_components == 2);

   /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
   unsigned opsel_lo =
      (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
   unsigned opsel_hi =
      (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
   return res;
}

void
emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
                      unsigned neg_lo = 0)
{
   Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
   bool has_sgpr = false;
   for (unsigned i = 0; i < 3; i++) {
      src[i] = get_alu_src(ctx, instr->src[i]);
      if (has_sgpr)
         src[i] = as_vgpr(ctx, src[i]);
      else
         has_sgpr = src[i].type() == RegType::sgpr;
   }

   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   VALU_instruction& vop3p =
      bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
   vop3p.clamp = clamp;
   vop3p.neg_lo = neg_lo;
}

void
emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   if (dst.type() == RegType::sgpr)
      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
   else
      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
}

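/* Emits a VOPC comparison. If an SGPR operand would land in src1, the
 * operands are swapped and the comparison mirrored (lt <-> gt, ge <-> le) so
 * the SGPR can legally sit in src0. */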
void
emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   assert(src0.size() == src1.size());

   aco_ptr<Instruction> vopc;
   if (src1.type() == RegType::sgpr) {
      if (src0.type() == RegType::vgpr) {
         /* to swap the operands, we might also have to change the opcode */
         switch (op) {
         case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
         case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
         case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
         case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
         case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
         case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
         case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
         case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
         case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
         case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
         case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
         case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
         case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
         case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
         case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
         case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
         case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
         case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
         default: /* eq and ne are commutative */ break;
         }
         Temp t = src0;
         src0 = src1;
         src1 = t;
      } else {
         src1 = as_vgpr(ctx, src1);
      }
   }

   Builder bld(ctx->program, ctx->block);
   bld.vopc(op, Definition(dst), src0, src1);
}

void
emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
{
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);
   Builder bld(ctx->program, ctx->block);

   assert(dst.regClass() == bld.lm);
   assert(src0.type() == RegType::sgpr);
   assert(src1.type() == RegType::sgpr);

   /* Emit the SALU comparison instruction */
   Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
   /* Turn the result into a per-lane bool */
   bool_to_vector_condition(ctx, cmp, dst);
}

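/* Selects between the scalar and vector form of a comparison: the SALU
 * opcode is only usable when one exists for the bit size, the result is
 * uniform and both sources live in SGPRs. */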
void
emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
                aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
                aco_opcode s64_op = aco_opcode::num_opcodes)
{
   aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? s32_op
                                                             : aco_opcode::num_opcodes;
   aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
                     : instr->src[0].src.ssa->bit_size == 32 ? v32_op
                                                             : v16_op;
   bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
                   get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
                   get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
   aco_opcode op = use_valu ? v_op : s_op;
   assert(op != aco_opcode::num_opcodes);
   assert(dst.regClass() == ctx->program->lane_mask);

   if (use_valu)
      emit_vopc_instruction(ctx, instr, op, dst);
   else
      emit_sopc_instruction(ctx, instr, op, dst);
}

void
emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
                   Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src0 = get_alu_src(ctx, instr->src[0]);
   Temp src1 = get_alu_src(ctx, instr->src[1]);

   assert(dst.regClass() == bld.lm);
   assert(src0.regClass() == bld.lm);
   assert(src1.regClass() == bld.lm);

   bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
}

void
select_vec2(isel_context* ctx, Temp dst, Temp cond, Temp then, Temp els)
{
   Builder bld(ctx->program, ctx->block);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);

   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
}

void
emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp cond = get_alu_src(ctx, instr->src[0]);
   Temp then = get_alu_src(ctx, instr->src[1]);
   Temp els = get_alu_src(ctx, instr->src[2]);

   assert(cond.regClass() == bld.lm);

   if (dst.type() == RegType::vgpr) {
      aco_ptr<Instruction> bcsel;
      if (dst.size() == 1) {
         then = as_vgpr(ctx, then);
         els = as_vgpr(ctx, els);

         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
      } else if (dst.size() == 2) {
         select_vec2(ctx, dst, cond, then, els);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      return;
   }

   if (instr->def.bit_size == 1) {
      assert(dst.regClass() == bld.lm);
      assert(then.regClass() == bld.lm);
      assert(els.regClass() == bld.lm);
   }

   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
      if (dst.regClass() == s1 || dst.regClass() == s2) {
         assert((then.regClass() == s1 || then.regClass() == s2) &&
                els.regClass() == then.regClass());
         assert(dst.size() == then.size());
         aco_opcode op =
            dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
         bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
      } else {
         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
      }
      return;
   }

   /* divergent boolean bcsel
    * this implements bcsel on bools: dst = s0 ? s1 : s2,
    * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
   assert(instr->def.bit_size == 1);

   if (cond.id() != then.id())
      then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);

   if (cond.id() == els.id())
      bld.copy(Definition(dst), then);
   else
      bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
               bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
}

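/* Helper for f32 opcodes that flush denormal inputs (rcp, rsq, sqrt, log2):
 * denormal inputs are pre-multiplied by 2^24 (0x4b800000), the operation is
 * applied, and the result is rescaled by the opcode-specific `undo` constant.
 * The v_cmp_class mask 1u << 4 together with the neg+abs source modifiers
 * tests -|val| against the negative-denormal class, which catches denormals
 * of either sign. */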
void
emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
               uint32_t undo)
{
   /* multiply by 16777216 to handle denormals */
   Temp is_denormal = bld.tmp(bld.lm);
   VALU_instruction& valu =
      bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal), val, Operand::c32(1u << 4))
         ->valu();
   valu.neg[0] = true;
   valu.abs[0] = true;
   Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
   scaled = bld.vop1(op, bld.def(v1), scaled);
   scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);

   Temp not_scaled = bld.vop1(op, bld.def(v1), val);

   bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
}

void
emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rcp_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
}

void
emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_rsq_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
}

void
emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
}

void
emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->block->fp_mode.denorm32 == 0) {
      bld.vop1(aco_opcode::v_log_f32, dst, val);
      return;
   }

   emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
}

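/* The GFX6 lowering below masks off the fraction bits that lie beneath the
 * unbiased exponent, returns +/-0 for exponents below 0, and passes the
 * value through unchanged once it can have no fractional part (exp > 51). */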
Temp
emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
{
   if (ctx->options->gfx_level >= GFX7)
      return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);

   /* GFX6 doesn't support V_TRUNC_F64, lower it. */
   /* TODO: create more efficient code! */
   if (val.type() == RegType::sgpr)
      val = as_vgpr(ctx, val);

   /* Split the input value. */
   Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);

   /* Extract the exponent and compute the unbiased value. */
   Temp exponent =
      bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
   exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));

   /* Extract the fractional part. */
   Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
                                Operand::c32(0x000fffffu));
   fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);

   Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
              fract_mask);

   Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
   Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
   fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
   tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
   fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);

   /* Get the sign bit. */
   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);

   /* Decide the operation to apply depending on the unbiased exponent. */
   Temp exp_lt0 =
      bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
   Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
                          bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
   Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
   Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
   dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
   dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);

   return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
}

1303 Temp
emit_floor_f64(isel_context * ctx,Builder & bld,Definition dst,Temp val)1304 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1305 {
1306    if (ctx->options->gfx_level >= GFX7)
1307       return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1308 
1309    /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1310     * lowered at NIR level for precision reasons). */
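   /* floor(x) = x - fract(x). The min() with 0x3FEFFFFFFFFFFFFF (the largest
    * double below 1.0) clamps the fract() result so the final subtraction can
    * never overshoot to the next integer, and NaNs are passed through
    * explicitly since v_min_f64 would otherwise drop them. */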
   Temp src0 = as_vgpr(ctx, val);

   Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
                             Operand::c32(0x3fefffffu));

   Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
   Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
   Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);

   Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
   Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
   bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);

   Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
   Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);

   Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);

   Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
   add->valu().neg[1] = true;

   return add->definitions[0].getTemp();
}

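/* Unsigned 32-bit saturating add. From GFX8 the VOP3 clamp bit saturates
 * integer adds directly; older parts emit an add with carry-out and select
 * 0xffffffff on overflow instead. */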
Temp
uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
{
   if (bld.program->gfx_level < GFX8) {
      Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
      return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
                          add.def(1).getTemp());
   }

   Builder::Result add(NULL);
   if (bld.program->gfx_level >= GFX9) {
      add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
   } else {
      add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
   }
   add->valu().clamp = 1;
   return dst.getTemp();
}

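/* Unsigned 32-bit saturating subtract; same selection as uadd32_sat, but
 * saturating to 0 on borrow. */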
Temp
usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
{
   if (bld.program->gfx_level < GFX8) {
      Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
      return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
                          sub.def(1).getTemp());
   }

   Builder::Result sub(NULL);
   if (bld.program->gfx_level >= GFX9) {
      sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
   } else {
      sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
   }
   sub->valu().clamp = 1;
   return dst.getTemp();
}

void
emit_vec2_f2f16(isel_context* ctx, nir_alu_instr* instr, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Temp src = get_ssa_temp(ctx, instr->src[0].src.ssa);
   RegClass rc = RegClass(src.regClass().type(), instr->src[0].src.ssa->bit_size / 32);
   Temp src0 = emit_extract_vector(ctx, src, instr->src[0].swizzle[0], rc);
   Temp src1 = emit_extract_vector(ctx, src, instr->src[0].swizzle[1], rc);

   src1 = as_vgpr(ctx, src1);
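   /* v_cvt_pkrtz_f16_f32 converts both f32 sources to f16 with round-toward-
    * zero and packs the results into the low/high halves of a single VGPR. */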
   if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
      bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src0, src1);
   else
      bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
   emit_split_vector(ctx, dst, 2);
}

void
visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   bld.is_precise = instr->exact;
   Temp dst = get_ssa_temp(ctx, &instr->def);
   switch (instr->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec5:
   case nir_op_vec8:
   case nir_op_vec16: {
      std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
      unsigned num = instr->def.num_components;
      for (unsigned i = 0; i < num; ++i)
         elems[i] = get_alu_src(ctx, instr->src[i]);

      if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
            aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1)};
         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u);
         for (unsigned i = 0; i < num; ++i) {
            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
               elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
            vec->operands[i] = Operand{elems[i]};
         }
         vec->definitions[0] = Definition(dst);
         ctx->block->instructions.emplace_back(std::move(vec));
         ctx->allocated_vec.emplace(dst.id(), elems);
      } else {
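         /* Sub-dword elements in SGPRs: pack the 8/16-bit components into
          * 32-bit scalars with mask/shift/or, folding constant and undef
          * components in directly. On GFX9+, two 16-bit halves can be combined
          * with s_pack_ll_b32_b16 instead. */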
         bool use_s_pack = ctx->program->gfx_level >= GFX9;
         Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));

         std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
         uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
         for (unsigned i = 0; i < num; i++) {
            unsigned packed_size = use_s_pack ? 16 : 32;
            unsigned idx = i * instr->def.bit_size / packed_size;
            unsigned offset = i * instr->def.bit_size % packed_size;
            if (nir_src_is_const(instr->src[i].src)) {
               const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
               continue;
            }
            if (nir_src_is_undef(instr->src[i].src))
               continue;

            if (offset != packed_size - instr->def.bit_size)
               elems[i] =
                  bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);

            if (offset)
               elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
                                   Operand::c32(offset));

            if (packed[idx].id())
               packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
                                      packed[idx]);
            else
               packed[idx] = elems[i];
         }

         if (use_s_pack) {
            for (unsigned i = 0; i < dst.size(); i++) {
               bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();

               if (packed[i * 2].id() && packed[i * 2 + 1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
                                       packed[i * 2 + 1]);
               else if (packed[i * 2 + 1].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
                                       Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
               else if (packed[i * 2].id())
                  packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
                                       Operand::c32(const_vals[i * 2 + 1]));
               else
                  packed[i] = Temp(); /* Both constants, so reset the entry */

               if (same)
                  const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
               else
                  const_vals[i] = 0;
            }
         }

         for (unsigned i = 0; i < dst.size(); i++) {
            if (const_vals[i] && packed[i].id())
               packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                                    Operand::c32(const_vals[i]), packed[i]);
            else if (!packed[i].id())
               packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
         }

         if (dst.size() == 1)
            bld.copy(Definition(dst), packed[0]);
         else {
            aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
               aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
            vec->definitions[0] = Definition(dst);
            for (unsigned i = 0; i < dst.size(); ++i)
               vec->operands[i] = Operand(packed[i]);
            bld.insert(std::move(vec));
         }
      }
      break;
   }
   case nir_op_mov: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
         /* use size() instead of bytes() for 8/16-bit */
         assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
      } else {
         assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
         bld.copy(Definition(dst), src);
      }
      break;
   }
   case nir_op_inot: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
      } else if (dst.regClass() == v2) {
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
      } else if (dst.type() == RegType::sgpr) {
         aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
         bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iabs: {
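      /* iabs(x) = max(x, 0 - x); the packed 16-bit path below does this per
       * half with v_pk_sub_u16/v_pk_max_i16, using opsel to route the
       * swizzled source halves. */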
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Temp src = get_alu_src_vop3p(ctx, instr->src[0]);

         unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
         unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;

         Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
                              src, opsel_lo, opsel_hi);
         bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
         break;
      }
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
      } else if (dst.regClass() == v1) {
         bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
                  bld.vsub32(bld.def(v1), Operand::zero(), src));
      } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         bld.vop3(
            aco_opcode::v_max_i16_e64, Definition(dst), src,
            bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
      } else if (dst.regClass() == v2b) {
         src = as_vgpr(ctx, src);
         bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
                  bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isign: {
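      /* isign(x) = clamp(x, -1, 1): negatives map to -1, zero stays 0,
       * positives map to 1. The 64-bit scalar path instead computes
       * (x >> 63) | (x != 0). */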
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == s1) {
         Temp tmp =
            bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
         bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
      } else if (dst.regClass() == s2) {
         Temp neg =
            bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
         Temp neqz;
         if (ctx->program->gfx_level >= GFX8)
            neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
         else
            neqz =
               bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
                  .def(1)
                  .getTemp();
         /* SCC gets zero-extended to 64 bit */
         bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
      } else if (dst.regClass() == v1) {
         bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
      } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
         bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
      } else if (dst.regClass() == v2b) {
         src = as_vgpr(ctx, src);
         bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
                  bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
      } else if (dst.regClass() == v2) {
         Temp upper = emit_extract_vector(ctx, src, 1, v1);
         Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
         Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
         Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imax: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umax: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imin: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umin: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ior: {
      if (instr->def.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_or, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iand: {
      if (instr->def.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_and, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ixor: {
      if (instr->def.bit_size == 1) {
         emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
      } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
      } else if (dst.regClass() == v2) {
         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ushr: {
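      /* The "rev" shift opcodes take the shift amount as their first source
       * (hence the swapped NIR sources on the 64-bit paths below), which lets
       * the often-uniform amount use the constant/SGPR operand slot. */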
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
         bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
                  get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ishl: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
                               false, 2);
      } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
         bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
                  get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ishr: {
      if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
      } else if (dst.regClass() == v2b) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
      } else if (dst.regClass() == v1) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
      } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
         bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
                  get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v2) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
      } else if (dst.regClass() == s2) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_find_lsb: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1) {
         bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
      } else if (src.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
      } else if (src.regClass() == s2) {
         bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
      } else if (src.regClass() == v2) {
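         /* 64 bits: find the lowest set bit in each half and bias the high
          * result by 32. v_ffbl_b32 returns -1 when no bit is set and the
          * saturating add keeps that at 0xffffffff, so v_min_u32 only returns
          * -1 if both halves are zero. */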
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo);
         hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi);
         hi = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)), hi);
         bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ufind_msb:
   case nir_op_ifind_msb: {
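      /* flbit/ffbh count from the MSB down, but NIR wants the MSB's index
       * counted from the LSB, so compute (bits - 1) - msb_rev. If nothing is
       * set, msb_rev is -1 and the subtraction's borrow selects -1. */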
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1 || src.regClass() == s2) {
         aco_opcode op = src.regClass() == s2
                            ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
                                                             : aco_opcode::s_flbit_i32_i64)
                            : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
                                                             : aco_opcode::s_flbit_i32);
         Temp msb_rev = bld.sop1(op, bld.def(s1), src);

         Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
                                        Operand::c32(src.size() * 32u - 1u), msb_rev);
         Temp msb = sub.def(0).getTemp();
         Temp carry = sub.def(1).getTemp();

         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
                  bld.scc(carry));
      } else if (src.regClass() == v1) {
         aco_opcode op =
            instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
         Temp msb_rev = bld.tmp(v1);
         emit_vop1_instruction(ctx, instr, op, msb_rev);
         Temp msb = bld.tmp(v1);
         Temp carry =
            bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
      } else if (src.regClass() == v2) {
         aco_opcode op =
            instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;

         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);

         lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
                         bld.vop1(op, bld.def(v1), lo));
         hi = bld.vop1(op, bld.def(v1), hi);
         Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);

         Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);

         Temp msb = bld.tmp(v1);
         Temp carry =
            bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_ufind_msb_rev:
   case nir_op_ifind_msb_rev: {
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (src.regClass() == s1) {
         aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
                                                           : aco_opcode::s_flbit_i32;
         bld.sop1(op, Definition(dst), src);
      } else if (src.regClass() == v1) {
         aco_opcode op =
            instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
         emit_vop1_instruction(ctx, instr, op, dst);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_bitfield_reverse: {
      if (dst.regClass() == s1) {
         bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else if (dst.regClass() == v1) {
         bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iadd: {
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
         break;
      } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
         break;
      } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
         break;
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
         if (instr->no_unsigned_wrap)
            bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
         else
            bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
         break;
      }

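      /* 64-bit addition: add the low halves with carry-out and feed the carry
       * into the high halves (SCC on the SALU path, a lane mask on the VALU
       * path). */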
      assert(src0.size() == 2 && src1.size() == 2);
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         Temp dst0 =
            bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
                              bld.scc(carry));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         Temp dst0 = bld.tmp(v1);
         Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
         Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_uadd_sat: {
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
         add_instr->valu().clamp = 1;
         break;
      }
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
                  bld.scc(carry));
         break;
      } else if (dst.regClass() == v2b) {
         Instruction* add_instr;
         if (ctx->program->gfx_level >= GFX10) {
            add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
         } else {
            if (src1.type() == RegType::sgpr)
               std::swap(src0, src1);
            add_instr =
               bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
         }
         add_instr->valu().clamp = 1;
         break;
      } else if (dst.regClass() == v1) {
         uadd32_sat(bld, Definition(dst), src0, src1);
         break;
      }

      assert(src0.size() == 2 && src1.size() == 2);

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(src0.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(src1.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         Temp carry0 = bld.tmp(s1);
         Temp carry1 = bld.tmp(s1);

         Temp no_sat0 =
            bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
         Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
                                 src01, src11, bld.scc(carry0));

         Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);

         bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
                  bld.scc(carry1));
      } else if (dst.regClass() == v2) {
         Temp no_sat0 = bld.tmp(v1);
         Temp dst0 = bld.tmp(v1);
         Temp dst1 = bld.tmp(v1);

         Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
         Temp carry1;

         if (ctx->program->gfx_level >= GFX8) {
            carry1 = bld.tmp(bld.lm);
            bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
                         as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
               ->valu()
               .clamp = 1;
         } else {
            Temp no_sat1 = bld.tmp(v1);
            carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
            bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
                         carry1);
         }

         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
                      carry1);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_iadd_sat: {
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
         add_instr->valu().clamp = 1;
         break;
      }
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
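         /* bound is INT32_MAX, or INT32_MIN when src1 is negative (the +1 from
          * SCC wraps it). s_add_i32 sets SCC on signed overflow, which selects
          * the bound instead of the wrapped sum. */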
         Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
         Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
                               Operand::c32(INT32_MAX), cond);
         Temp overflow = bld.tmp(s1);
         Temp add =
            bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
         break;
      }

      src1 = as_vgpr(ctx, src1);

      if (dst.regClass() == v2b) {
         Instruction* add_instr =
            bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
         add_instr->valu().clamp = 1;
      } else if (dst.regClass() == v1) {
         Instruction* add_instr =
            bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
         add_instr->valu().clamp = 1;
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_uadd_carry: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      }
      if (dst.regClass() == v1) {
         Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
                      carry);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
                          bld.scc(carry))
                    .def(1)
                    .getTemp();
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
      } else if (dst.regClass() == v2) {
         Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
         carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
         carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                              Operand::c32(1u), carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isub: {
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
         break;
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v1) {
         bld.vsub32(Definition(dst), src0, src1);
         break;
      } else if (dst.bytes() <= 2) {
         if (ctx->program->gfx_level >= GFX10)
            bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
         else if (src1.type() == RegType::sgpr)
            bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
         else if (ctx->program->gfx_level >= GFX8)
            bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
         else
            bld.vsub32(Definition(dst), src0, src1);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp borrow = bld.tmp(s1);
         Temp dst0 =
            bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
                              bld.scc(borrow));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         Temp lower = bld.tmp(v1);
         Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
         Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_usub_borrow: {
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      } else if (dst.regClass() == v1) {
         Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
                      borrow);
         break;
      }

      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp borrow = bld.tmp(s1);
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
         borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
                           bld.scc(borrow))
                     .def(1)
                     .getTemp();
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
      } else if (dst.regClass() == v2) {
         Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
         borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
         borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                               Operand::c32(1u), borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_usub_sat: {
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
         sub_instr->valu().clamp = 1;
         break;
      }
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
         break;
      } else if (dst.regClass() == v2b) {
         Instruction* sub_instr;
         if (ctx->program->gfx_level >= GFX10) {
            sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
         } else {
            aco_opcode op = aco_opcode::v_sub_u16;
            if (src1.type() == RegType::sgpr) {
               std::swap(src0, src1);
               op = aco_opcode::v_subrev_u16;
            }
            sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
         }
         sub_instr->valu().clamp = 1;
         break;
      } else if (dst.regClass() == v1) {
         usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
         break;
      }

      assert(src0.size() == 2 && src1.size() == 2);
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(src0.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(src1.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         Temp carry0 = bld.tmp(s1);
         Temp carry1 = bld.tmp(s1);

         Temp no_sat0 =
            bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
         Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
                                 src01, src11, bld.scc(carry0));

         Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);

         bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
                  bld.scc(carry1));
      } else if (dst.regClass() == v2) {
         Temp no_sat0 = bld.tmp(v1);
         Temp dst0 = bld.tmp(v1);
         Temp dst1 = bld.tmp(v1);

         Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
         Temp carry1;

         if (ctx->program->gfx_level >= GFX8) {
            carry1 = bld.tmp(bld.lm);
            bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
                         as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
               ->valu()
               .clamp = 1;
         } else {
            Temp no_sat1 = bld.tmp(v1);
            carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
            bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
                         carry1);
         }

         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
                      carry1);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_isub_sat: {
      if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
         sub_instr->valu().clamp = 1;
         break;
      }
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
         Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
                               Operand::c32(INT32_MAX), cond);
         Temp overflow = bld.tmp(s1);
         Temp sub =
            bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
         break;
      }

      src1 = as_vgpr(ctx, src1);

      if (dst.regClass() == v2b) {
         Instruction* sub_instr =
            bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
         sub_instr->valu().clamp = 1;
      } else if (dst.regClass() == v1) {
         Instruction* sub_instr =
            bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
         sub_instr->valu().clamp = 1;
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_imul: {
      if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
         emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
      } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
      } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
         emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
      } else if (dst.type() == RegType::vgpr) {
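         /* When both operands are known to fit in 24 bits, v_mul_u32_u24 is
          * preferred over the slower v_mul_lo_u32; otherwise a multiply by a
          * known constant goes through v_mul_imm, which may lower to
          * shifts/adds. */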
         uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
         uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);

         if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
            bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
            emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
                                  true /* commutative */, false, false, nuw_16bit);
         } else if (nir_src_is_const(instr->src[0].src)) {
            bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
                          nir_src_as_uint(instr->src[0].src), false);
         } else if (nir_src_is_const(instr->src[1].src)) {
            bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
                          nir_src_as_uint(instr->src[1].src), false);
         } else {
            emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
         }
      } else if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_op_umul_high: {
      if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
      } else if (dst.bytes() == 4) {
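         /* There's no scalar mul-hi before GFX9: compute in a VGPR and, for an
          * SGPR destination, read the (uniform) result back with
          * p_as_uniform. */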
2318          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2319          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2320 
2321          Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
2322          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2323             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2324          } else {
2325             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2326          }
2327 
2328          if (dst.regClass() == s1)
2329             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2330       } else {
2331          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2332       }
2333       break;
2334    }
2335    case nir_op_imul_high: {
2336       if (dst.regClass() == v1) {
2337          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2338       } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2339          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2340       } else if (dst.regClass() == s1) {
2341          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2342                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2343          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2344       } else {
2345          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2346       }
2347       break;
2348    }
2349    case nir_op_fmul: {
2350       if (dst.regClass() == v2b) {
2351          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2352       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2353          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2354       } else if (dst.regClass() == v1) {
2355          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2356       } else if (dst.regClass() == v2) {
2357          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2358       } else {
2359          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2360       }
2361       break;
2362    }
2363    case nir_op_fmulz: {
2364       if (dst.regClass() == v1) {
2365          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2366       } else {
2367          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2368       }
2369       break;
2370    }
2371    case nir_op_fadd: {
2372       if (dst.regClass() == v2b) {
2373          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2374       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2375          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2376       } else if (dst.regClass() == v1) {
2377          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2378       } else if (dst.regClass() == v2) {
2379          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2380       } else {
2381          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2382       }
2383       break;
2384    }
2385    case nir_op_fsub: {
2386       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2387          Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2388          VALU_instruction& sub = add->valu();
2389          sub.neg_lo[1] = true;
2390          sub.neg_hi[1] = true;
2391          break;
2392       }
2393 
2394       Temp src0 = get_alu_src(ctx, instr->src[0]);
2395       Temp src1 = get_alu_src(ctx, instr->src[1]);
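      /* VOP2 only accepts a VGPR in src1. If src0 is the only VGPR operand, the
       * *rev opcode (which computes src1 - src0) is used with swapped sources
       * instead of inserting a copy. */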
2396       if (dst.regClass() == v2b) {
2397          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2398             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2399          else
2400             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2401       } else if (dst.regClass() == v1) {
2402          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2403             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2404          else
2405             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2406       } else if (dst.regClass() == v2) {
2407          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2408                                      as_vgpr(ctx, src1));
2409          add->valu().neg[1] = true;
2410       } else {
2411          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2412       }
2413       break;
2414    }
2415    case nir_op_ffma: {
2416       if (dst.regClass() == v2b) {
2417          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2418       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2419          assert(instr->def.num_components == 2);
2420 
2421          Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2422          Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2423          Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2424 
2425          /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
2426          unsigned opsel_lo = 0, opsel_hi = 0;
2427          for (unsigned i = 0; i < 3; i++) {
2428             opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2429             opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2430          }
2431 
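         /* Bit i of opsel_lo/opsel_hi selects the high 16 bits of source i for the
          * low/high half of the packed result, respectively. */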
2432          bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2433       } else if (dst.regClass() == v1) {
2434          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2435                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2436       } else if (dst.regClass() == v2) {
2437          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2438       } else {
2439          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2440       }
2441       break;
2442    }
2443    case nir_op_ffmaz: {
2444       if (dst.regClass() == v1) {
2445          emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2446                                 ctx->block->fp_mode.must_flush_denorms32, 3);
2447       } else {
2448          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2449       }
2450       break;
2451    }
2452    case nir_op_fmax: {
2453       if (dst.regClass() == v2b) {
2454          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true, false,
2455                                ctx->block->fp_mode.must_flush_denorms16_64);
2456       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2457          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2458       } else if (dst.regClass() == v1) {
2459          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2460                                ctx->block->fp_mode.must_flush_denorms32);
2461       } else if (dst.regClass() == v2) {
2462          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2463                                 ctx->block->fp_mode.must_flush_denorms16_64);
2464       } else {
2465          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2466       }
2467       break;
2468    }
2469    case nir_op_fmin: {
2470       if (dst.regClass() == v2b) {
2471          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true, false,
2472                                ctx->block->fp_mode.must_flush_denorms16_64);
2473       } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2474          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2475       } else if (dst.regClass() == v1) {
2476          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2477                                ctx->block->fp_mode.must_flush_denorms32);
2478       } else if (dst.regClass() == v2) {
2479          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2480                                 ctx->block->fp_mode.must_flush_denorms16_64);
2481       } else {
2482          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2483       }
2484       break;
2485    }
2486    case nir_op_sdot_4x8_iadd: {
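      /* GFX11 removed v_dot4_i32_i8; v_dot4_i32_iu8 takes per-source signedness
       * flags instead, where 0x3 should mark both sources as signed. */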
2487       if (ctx->options->gfx_level >= GFX11)
2488          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2489       else
2490          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2491       break;
2492    }
2493    case nir_op_sdot_4x8_iadd_sat: {
2494       if (ctx->options->gfx_level >= GFX11)
2495          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2496       else
2497          emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2498       break;
2499    }
2500    case nir_op_sudot_4x8_iadd: {
2501       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2502       break;
2503    }
2504    case nir_op_sudot_4x8_iadd_sat: {
2505       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2506       break;
2507    }
2508    case nir_op_udot_4x8_uadd: {
2509       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2510       break;
2511    }
2512    case nir_op_udot_4x8_uadd_sat: {
2513       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2514       break;
2515    }
2516    case nir_op_sdot_2x16_iadd: {
2517       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2518       break;
2519    }
2520    case nir_op_sdot_2x16_iadd_sat: {
2521       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2522       break;
2523    }
2524    case nir_op_udot_2x16_uadd: {
2525       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2526       break;
2527    }
2528    case nir_op_udot_2x16_uadd_sat: {
2529       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2530       break;
2531    }
2532    case nir_op_cube_amd: {
2533       Temp in = get_alu_src(ctx, instr->src[0], 3);
2534       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2535                      emit_extract_vector(ctx, in, 2, v1)};
2536       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2537       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2538       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2539       Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2540       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2541       break;
2542    }
2543    case nir_op_bcsel: {
2544       emit_bcsel(ctx, instr, dst);
2545       break;
2546    }
2547    case nir_op_frsq: {
2548       if (dst.regClass() == v2b) {
2549          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2550       } else if (dst.regClass() == v1) {
2551          Temp src = get_alu_src(ctx, instr->src[0]);
2552          emit_rsq(ctx, bld, Definition(dst), src);
2553       } else if (dst.regClass() == v2) {
2554          /* Lowered at NIR level for precision reasons. */
2555          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2556       } else {
2557          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2558       }
2559       break;
2560    }
2561    case nir_op_fneg: {
2562       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2563          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2564          Instruction* vop3p =
2565             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2566                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2567          vop3p->valu().neg_lo[0] = true;
2568          vop3p->valu().neg_hi[0] = true;
2569          break;
2570       }
2571       Temp src = get_alu_src(ctx, instr->src[0]);
2572       if (dst.regClass() == v2b) {
2573          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2574       } else if (dst.regClass() == v1) {
2575          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2576                   as_vgpr(ctx, src));
2577       } else if (dst.regClass() == v2) {
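         /* A 64-bit negate only has to flip bit 63, so split the vector and XOR the
          * sign bit of the high dword. The preceding multiply by 1.0 flushes
          * denormals when the FP mode requires it. */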
2578          if (ctx->block->fp_mode.must_flush_denorms16_64)
2579             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2580                            as_vgpr(ctx, src));
2581          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2582          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2583          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2584          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2585       } else {
2586          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2587       }
2588       break;
2589    }
2590    case nir_op_fabs: {
2591       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2592          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2593          Instruction* vop3p =
2594             bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2595                       instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2596                .instr;
2597          vop3p->valu().neg_lo[1] = true;
2598          vop3p->valu().neg_hi[1] = true;
2599          break;
2600       }
2601       Temp src = get_alu_src(ctx, instr->src[0]);
2602       if (dst.regClass() == v2b) {
2603          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2604                                          Operand::c16(0x3c00), as_vgpr(ctx, src))
2605                                .instr;
2606          mul->valu().abs[1] = true;
2607       } else if (dst.regClass() == v1) {
2608          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2609                                          Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2610                                .instr;
2611          mul->valu().abs[1] = true;
2612       } else if (dst.regClass() == v2) {
2613          if (ctx->block->fp_mode.must_flush_denorms16_64)
2614             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2615                            as_vgpr(ctx, src));
2616          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2617          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2618          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2619          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2620       } else {
2621          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2622       }
2623       break;
2624    }
2625    case nir_op_fsat: {
2626       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2627          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2628          Instruction* vop3p =
2629             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2630                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2631          vop3p->valu().clamp = true;
2632          break;
2633       }
2634       Temp src = get_alu_src(ctx, instr->src[0]);
2635       if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
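         /* med3 with the constants 0 and 1.0 returns the median of the three
          * operands, i.e. the source clamped to [0.0, 1.0]. */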
2636          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2637                   src);
2638       } else if (dst.regClass() == v2b) {
2639          bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src)
2640             ->valu()
2641             .clamp = true;
2642       } else if (dst.regClass() == v1) {
2643          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2644                   Operand::c32(0x3f800000u), src);
2645          /* apparently, it is not necessary to flush denorms if this instruction is used with
2646           * these operands */
2647          // TODO: confirm that this holds under all circumstances
2648       } else if (dst.regClass() == v2) {
2649          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2650          add->valu().clamp = true;
2651       } else {
2652          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2653       }
2654       break;
2655    }
2656    case nir_op_flog2: {
2657       if (dst.regClass() == v2b) {
2658          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2659       } else if (dst.regClass() == v1) {
2660          Temp src = get_alu_src(ctx, instr->src[0]);
2661          emit_log2(ctx, bld, Definition(dst), src);
2662       } else {
2663          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2664       }
2665       break;
2666    }
2667    case nir_op_frcp: {
2668       if (dst.regClass() == v2b) {
2669          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2670       } else if (dst.regClass() == v1) {
2671          Temp src = get_alu_src(ctx, instr->src[0]);
2672          emit_rcp(ctx, bld, Definition(dst), src);
2673       } else if (dst.regClass() == v2) {
2674          /* Lowered at NIR level for precision reasons. */
2675          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2676       } else {
2677          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2678       }
2679       break;
2680    }
2681    case nir_op_fexp2: {
2682       if (dst.regClass() == v2b) {
2683          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2684       } else if (dst.regClass() == v1) {
2685          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2686       } else {
2687          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2688       }
2689       break;
2690    }
2691    case nir_op_fsqrt: {
2692       if (dst.regClass() == v2b) {
2693          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2694       } else if (dst.regClass() == v1) {
2695          Temp src = get_alu_src(ctx, instr->src[0]);
2696          emit_sqrt(ctx, bld, Definition(dst), src);
2697       } else if (dst.regClass() == v2) {
2698          /* Lowered at NIR level for precision reasons. */
2699          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2700       } else {
2701          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2702       }
2703       break;
2704    }
2705    case nir_op_ffract: {
2706       if (dst.regClass() == v2b) {
2707          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2708       } else if (dst.regClass() == v1) {
2709          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2710       } else if (dst.regClass() == v2) {
2711          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2712       } else {
2713          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2714       }
2715       break;
2716    }
2717    case nir_op_ffloor: {
2718       if (dst.regClass() == v2b) {
2719          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2720       } else if (dst.regClass() == v1) {
2721          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2722       } else if (dst.regClass() == v2) {
2723          Temp src = get_alu_src(ctx, instr->src[0]);
2724          emit_floor_f64(ctx, bld, Definition(dst), src);
2725       } else {
2726          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2727       }
2728       break;
2729    }
2730    case nir_op_fceil: {
2731       if (dst.regClass() == v2b) {
2732          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2733       } else if (dst.regClass() == v1) {
2734          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2735       } else if (dst.regClass() == v2) {
2736          if (ctx->options->gfx_level >= GFX7) {
2737             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2738          } else {
2739             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2740             /* trunc = trunc(src0)
2741              * if (src0 > 0.0 && src0 != trunc)
2742              *    trunc += 1.0
2743              */
2744             Temp src0 = get_alu_src(ctx, instr->src[0]);
2745             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2746             Temp tmp0 =
2747                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2748             Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2749             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
2750             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2751                                 bld.copy(bld.def(v1), Operand::zero()),
2752                                 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2753             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2754                              bld.copy(bld.def(v1), Operand::zero()), add);
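            /* `add` now holds 1.0 (0x3ff00000_00000000) if the condition is set and
             * +0.0 otherwise; adding it to `trunc` yields the ceiling. */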
2755             bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2756          }
2757       } else {
2758          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2759       }
2760       break;
2761    }
2762    case nir_op_ftrunc: {
2763       if (dst.regClass() == v2b) {
2764          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2765       } else if (dst.regClass() == v1) {
2766          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2767       } else if (dst.regClass() == v2) {
2768          Temp src = get_alu_src(ctx, instr->src[0]);
2769          emit_trunc_f64(ctx, bld, Definition(dst), src);
2770       } else {
2771          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2772       }
2773       break;
2774    }
2775    case nir_op_fround_even: {
2776       if (dst.regClass() == v2b) {
2777          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2778       } else if (dst.regClass() == v1) {
2779          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2780       } else if (dst.regClass() == v2) {
2781          if (ctx->options->gfx_level >= GFX7) {
2782             emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2783          } else {
2784             /* GFX6 doesn't support V_RNDNE_F64, lower it. */
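            /* Round-to-nearest-even via the 2^52 trick: v_bfi copies the sign of the
             * source into 0x43300000_00000000 (2^52); adding and then subtracting
             * that constant forces the FPU to round at integer granularity. Values
             * with |x| > 0x432fffff_ffffffff (the largest double below 2^52) are
             * already integral and are passed through unchanged. */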
2785             Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2786             Temp src0 = get_alu_src(ctx, instr->src[0]);
2787             bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2788 
2789             Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2790                                     bld.copy(bld.def(s1), Operand::c32(-2u)));
2791             Temp bfi =
2792                bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2793                         bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2794             Temp tmp =
2795                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2796                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2797             Instruction* sub =
2798                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2799                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2800             sub->valu().neg[1] = true;
2801             tmp = sub->definitions[0].getTemp();
2802 
2803             Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2804                                 Operand::c32(0x432fffffu));
2805             Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2806             vop3->valu().abs[0] = true;
2807             Temp cond = vop3->definitions[0].getTemp();
2808 
2809             Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2810             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2811             Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2812                                      as_vgpr(ctx, src0_lo), cond);
2813             Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2814                                      as_vgpr(ctx, src0_hi), cond);
2815 
2816             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2817          }
2818       } else {
2819          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2820       }
2821       break;
2822    }
2823    case nir_op_fsin_amd:
2824    case nir_op_fcos_amd: {
2825       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
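      /* The *_amd variants expect the angle pre-multiplied by 1/(2*pi); that scale
       * is assumed to have been applied during NIR lowering. */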
2826       aco_ptr<Instruction> norm;
2827       if (dst.regClass() == v2b) {
2828          aco_opcode opcode =
2829             instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2830          bld.vop1(opcode, Definition(dst), src);
2831       } else if (dst.regClass() == v1) {
2832          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2833          if (ctx->options->gfx_level < GFX9)
2834             src = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), src);
2835 
2836          aco_opcode opcode =
2837             instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2838          bld.vop1(opcode, Definition(dst), src);
2839       } else {
2840          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2841       }
2842       break;
2843    }
2844    case nir_op_ldexp: {
2845       if (dst.regClass() == v2b) {
2846          emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2847       } else if (dst.regClass() == v1) {
2848          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2849       } else if (dst.regClass() == v2) {
2850          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2851       } else {
2852          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2853       }
2854       break;
2855    }
2856    case nir_op_frexp_sig: {
2857       if (dst.regClass() == v2b) {
2858          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2859       } else if (dst.regClass() == v1) {
2860          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2861       } else if (dst.regClass() == v2) {
2862          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2863       } else {
2864          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2865       }
2866       break;
2867    }
2868    case nir_op_frexp_exp: {
2869       if (instr->src[0].src.ssa->bit_size == 16) {
2870          Temp src = get_alu_src(ctx, instr->src[0]);
2871          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
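         /* The exponent of a half float always fits in the low byte of the i16
          * result, so keep only that byte and sign-extend it to 32 bits. */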
2872          tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2873          convert_int(ctx, bld, tmp, 8, 32, true, dst);
2874       } else if (instr->src[0].src.ssa->bit_size == 32) {
2875          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2876       } else if (instr->src[0].src.ssa->bit_size == 64) {
2877          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2878       } else {
2879          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2880       }
2881       break;
2882    }
2883    case nir_op_fsign: {
2884       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2885       if (dst.regClass() == v2b) {
2886          /* replace negative zero with positive zero */
2887          src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2888          if (ctx->program->gfx_level >= GFX9) {
2889             src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src,
2890                            Operand::c16(1u));
2891             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2892          } else {
2893             src = convert_int(ctx, bld, src, 16, 32, true);
2894             src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src,
2895                            Operand::c32(1u));
2896             bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2897          }
2898       } else if (dst.regClass() == v1) {
2899          /* A legacy multiply with +Inf turns +-0.0 into +0.0 and every other number into
2900           * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
2901           */
2902          Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
2903          src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, src);
2904          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
2905                   Operand::c32(0xbf800000));
2906       } else if (dst.regClass() == v2) {
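         /* 0.0 and +-1.0 only differ in their high dword, so two selects build that
          * dword and the low dword is hard-wired to zero. */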
2907          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2908          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2909          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2910                                    emit_extract_vector(ctx, src, 1, v1), cond);
2911 
2912          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2913          tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2914          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2915 
2916          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2917       } else {
2918          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2919       }
2920       break;
2921    }
2922    case nir_op_f2f16:
2923    case nir_op_f2f16_rtne: {
2924       assert(instr->src[0].src.ssa->bit_size == 32);
2925       if (instr->def.num_components == 2) {
2926          /* Vectorizing f2f16 is only possible with rtz. */
2927          assert(instr->op != nir_op_f2f16_rtne);
2928          assert(ctx->block->fp_mode.round16_64 == fp_round_tz ||
2929                 !ctx->block->fp_mode.care_about_round16_64);
2930          emit_vec2_f2f16(ctx, instr, dst);
2931          break;
2932       }
2933       Temp src = get_alu_src(ctx, instr->src[0]);
2934       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2935          /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2936           * keep value numbering and the scheduler simpler.
2937           */
2938          bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2939       else
2940          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2941       break;
2942    }
2943    case nir_op_f2f16_rtz: {
2944       assert(instr->src[0].src.ssa->bit_size == 32);
2945       if (instr->def.num_components == 2) {
2946          emit_vec2_f2f16(ctx, instr, dst);
2947          break;
2948       }
2949       Temp src = get_alu_src(ctx, instr->src[0]);
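      /* v_cvt_pkrtz always rounds toward zero regardless of the round-mode register,
       * so it is only needed when the current mode isn't already RTZ. */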
2950       if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2951          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2952       else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
2953          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2954       else
2955          bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2956       break;
2957    }
2958    case nir_op_f2f32: {
2959       if (instr->src[0].src.ssa->bit_size == 16) {
2960          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2961       } else if (instr->src[0].src.ssa->bit_size == 64) {
2962          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2963       } else {
2964          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2965       }
2966       break;
2967    }
2968    case nir_op_f2f64: {
2969       assert(instr->src[0].src.ssa->bit_size == 32);
2970       Temp src = get_alu_src(ctx, instr->src[0]);
2971       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2972       break;
2973    }
2974    case nir_op_i2f16: {
2975       assert(dst.regClass() == v2b);
2976       Temp src = get_alu_src(ctx, instr->src[0]);
2977       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2978       if (input_size <= 16) {
2979          /* Expand the integer to the size expected by the int→float conversion used below */
2980          unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2981          if (input_size != target_size) {
2982             src = convert_int(ctx, bld, src, input_size, target_size, true);
2983          }
2984       }
2985 
2986       if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2987          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2988       } else {
2989          /* Large 32-bit inputs need to return +-inf/FLOAT_MAX.
2990           *
2991           * This is also the fallback path taken on GFX7 and earlier, which
2992           * do not support direct f16⟷i16 conversions.
2993           */
2994          src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2995          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2996       }
2997       break;
2998    }
2999    case nir_op_i2f32: {
3000       assert(dst.size() == 1);
3001       Temp src = get_alu_src(ctx, instr->src[0]);
3002       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3003       if (input_size <= 32) {
3004          if (input_size <= 16) {
3005             /* Sign-extend to 32-bits */
3006             src = convert_int(ctx, bld, src, input_size, 32, true);
3007          }
3008          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
3009       } else {
3010          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3011       }
3012       break;
3013    }
3014    case nir_op_i2f64: {
3015       if (instr->src[0].src.ssa->bit_size <= 32) {
3016          Temp src = get_alu_src(ctx, instr->src[0]);
3017          if (instr->src[0].src.ssa->bit_size <= 16)
3018             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
3019          bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
3020       } else {
3021          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3022       }
3023       break;
3024    }
3025    case nir_op_u2f16: {
3026       assert(dst.regClass() == v2b);
3027       Temp src = get_alu_src(ctx, instr->src[0]);
3028       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3029       if (input_size <= 16) {
3030          /* Expand integer to the size expected by the uint→float converter used below */
3031          unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3032          if (input_size != target_size) {
3033             src = convert_int(ctx, bld, src, input_size, target_size, false);
3034          }
3035       }
3036 
3037       if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
3038          bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
3039       } else {
3040          /* Large 32-bit inputs need to return inf/FLOAT_MAX.
3041           *
3042           * This is also the fallback path taken on GFX7 and earlier, which
3043           * do not support direct f16⟷u16 conversions.
3044           */
3045          src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
3046          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3047       }
3048       break;
3049    }
3050    case nir_op_u2f32: {
3051       assert(dst.size() == 1);
3052       Temp src = get_alu_src(ctx, instr->src[0]);
3053       const unsigned input_size = instr->src[0].src.ssa->bit_size;
3054       if (input_size == 8) {
3055          bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3056       } else if (input_size <= 32) {
3057          if (input_size == 16)
3058             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3059          bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3060       } else {
3061          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3062       }
3063       break;
3064    }
3065    case nir_op_u2f64: {
3066       if (instr->src[0].src.ssa->bit_size <= 32) {
3067          Temp src = get_alu_src(ctx, instr->src[0]);
3068          if (instr->src[0].src.ssa->bit_size <= 16)
3069             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3070          bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3071       } else {
3072          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3073       }
3074       break;
3075    }
3076    case nir_op_f2i8:
3077    case nir_op_f2i16: {
3078       if (instr->src[0].src.ssa->bit_size == 16) {
3079          if (ctx->program->gfx_level >= GFX8) {
3080             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3081          } else {
3082             /* GFX7 and earlier do not support direct f16⟷i16 conversions */
3083             Temp tmp = bld.tmp(v1);
3084             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3085             tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3086             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3087                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3088             if (dst.type() == RegType::sgpr) {
3089                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3090             }
3091          }
3092       } else if (instr->src[0].src.ssa->bit_size == 32) {
3093          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3094       } else {
3095          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3096       }
3097       break;
3098    }
3099    case nir_op_f2u8:
3100    case nir_op_f2u16: {
3101       if (instr->src[0].src.ssa->bit_size == 16) {
3102          if (ctx->program->gfx_level >= GFX8) {
3103             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3104          } else {
3105             /* GFX7 and earlier do not support direct f16⟷u16 conversions */
3106             Temp tmp = bld.tmp(v1);
3107             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3108             tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3109             tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3110                               (dst.type() == RegType::sgpr) ? Temp() : dst);
3111             if (dst.type() == RegType::sgpr) {
3112                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3113             }
3114          }
3115       } else if (instr->src[0].src.ssa->bit_size == 32) {
3116          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3117       } else {
3118          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3119       }
3120       break;
3121    }
3122    case nir_op_f2i32: {
3123       Temp src = get_alu_src(ctx, instr->src[0]);
3124       if (instr->src[0].src.ssa->bit_size == 16) {
3125          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3126          if (dst.type() == RegType::vgpr) {
3127             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3128          } else {
3129             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3130                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3131          }
3132       } else if (instr->src[0].src.ssa->bit_size == 32) {
3133          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3134       } else if (instr->src[0].src.ssa->bit_size == 64) {
3135          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3136       } else {
3137          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3138       }
3139       break;
3140    }
3141    case nir_op_f2u32: {
3142       Temp src = get_alu_src(ctx, instr->src[0]);
3143       if (instr->src[0].src.ssa->bit_size == 16) {
3144          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3145          if (dst.type() == RegType::vgpr) {
3146             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3147          } else {
3148             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3149                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3150          }
3151       } else if (instr->src[0].src.ssa->bit_size == 32) {
3152          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3153       } else if (instr->src[0].src.ssa->bit_size == 64) {
3154          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3155       } else {
3156          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3157       }
3158       break;
3159    }
3160    case nir_op_b2f16: {
3161       Temp src = get_alu_src(ctx, instr->src[0]);
3162       assert(src.regClass() == bld.lm);
3163 
3164       if (dst.regClass() == s1) {
3165          src = bool_to_scalar_condition(ctx, src);
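         /* The scalar condition is 0 or 1, so multiplying it by the bit pattern of
          * 1.0 (0x3c00) directly yields half-float 0.0 or 1.0. */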
3166          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3167       } else if (dst.regClass() == v2b) {
3168          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3169          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3170       } else {
3171          unreachable("Wrong destination register class for nir_op_b2f16.");
3172       }
3173       break;
3174    }
3175    case nir_op_b2f32: {
3176       Temp src = get_alu_src(ctx, instr->src[0]);
3177       assert(src.regClass() == bld.lm);
3178 
3179       if (dst.regClass() == s1) {
3180          src = bool_to_scalar_condition(ctx, src);
3181          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3182       } else if (dst.regClass() == v1) {
3183          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3184                       Operand::c32(0x3f800000u), src);
3185       } else {
3186          unreachable("Wrong destination register class for nir_op_b2f32.");
3187       }
3188       break;
3189    }
3190    case nir_op_b2f64: {
3191       Temp src = get_alu_src(ctx, instr->src[0]);
3192       assert(src.regClass() == bld.lm);
3193 
3194       if (dst.regClass() == s2) {
3195          src = bool_to_scalar_condition(ctx, src);
3196          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3197                   Operand::zero(), bld.scc(src));
3198       } else if (dst.regClass() == v2) {
3199          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3200          Temp upper =
3201             bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3202          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3203       } else {
3204          unreachable("Wrong destination register class for nir_op_b2f64.");
3205       }
3206       break;
3207    }
3208    case nir_op_i2i8:
3209    case nir_op_i2i16:
3210    case nir_op_i2i32: {
3211       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3212          /* no need to do the extract in get_alu_src() */
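         /* Widening requires a real sign-extension; when narrowing, the upper bits
          * may be left undefined, so the extract degenerates to a copy. */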
3213          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3214                                      ? sgpr_extract_sext
3215                                      : sgpr_extract_undef;
3216          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3217       } else {
3218          const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3219          const unsigned output_bitsize = instr->def.bit_size;
3220          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3221                      output_bitsize > input_bitsize, dst);
3222       }
3223       break;
3224    }
3225    case nir_op_u2u8:
3226    case nir_op_u2u16:
3227    case nir_op_u2u32: {
3228       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3229          /* no need to do the extract in get_alu_src() */
3230          sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3231                                      ? sgpr_extract_zext
3232                                      : sgpr_extract_undef;
3233          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3234       } else {
3235          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3236                      instr->def.bit_size, false, dst);
3237       }
3238       break;
3239    }
3240    case nir_op_b2b32:
3241    case nir_op_b2i8:
3242    case nir_op_b2i16:
3243    case nir_op_b2i32: {
3244       Temp src = get_alu_src(ctx, instr->src[0]);
3245       assert(src.regClass() == bld.lm);
3246 
3247       if (dst.regClass() == s1) {
3248          bool_to_scalar_condition(ctx, src, dst);
3249       } else if (dst.type() == RegType::vgpr) {
3250          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3251                       src);
3252       } else {
3253          unreachable("Invalid register class for b2i32");
3254       }
3255       break;
3256    }
3257    case nir_op_b2b1: {
3258       Temp src = get_alu_src(ctx, instr->src[0]);
3259       assert(dst.regClass() == bld.lm);
3260 
3261       if (src.type() == RegType::vgpr) {
3262          assert(src.regClass() == v1 || src.regClass() == v2);
3263          assert(dst.regClass() == bld.lm);
3264          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3265                   Definition(dst), Operand::zero(), src);
3266       } else {
3267          assert(src.regClass() == s1 || src.regClass() == s2);
3268          Temp tmp;
3269          if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
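            /* GFX6/7 lack s_cmp_lg_u64; OR-ing the value with zero sets SCC to
             * "result is non-zero" instead. */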
3270             tmp =
3271                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3272                   .def(1)
3273                   .getTemp();
3274          } else {
3275             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3276                            bld.scc(bld.def(s1)), Operand::zero(), src);
3277          }
3278          bool_to_vector_condition(ctx, tmp, dst);
3279       }
3280       break;
3281    }
3282    case nir_op_unpack_64_2x32:
3283    case nir_op_unpack_32_2x16:
3284    case nir_op_unpack_64_4x16:
3285    case nir_op_unpack_32_4x8:
3286       bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3287       emit_split_vector(
3288          ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3289       break;
3290    case nir_op_pack_64_2x32_split: {
3291       Temp src0 = get_alu_src(ctx, instr->src[0]);
3292       Temp src1 = get_alu_src(ctx, instr->src[1]);
3293 
3294       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3295       break;
3296    }
3297    case nir_op_unpack_64_2x32_split_x:
3298       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3299                  get_alu_src(ctx, instr->src[0]));
3300       break;
3301    case nir_op_unpack_64_2x32_split_y:
3302       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3303                  get_alu_src(ctx, instr->src[0]));
3304       break;
3305    case nir_op_unpack_32_2x16_split_x:
3306       if (dst.type() == RegType::vgpr) {
3307          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3308                     get_alu_src(ctx, instr->src[0]));
3309       } else {
3310          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3311       }
3312       break;
3313    case nir_op_unpack_32_2x16_split_y:
3314       if (dst.type() == RegType::vgpr) {
3315          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3316                     get_alu_src(ctx, instr->src[0]));
3317       } else {
3318          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3319                     get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3320                     Operand::zero());
3321       }
3322       break;
3323    case nir_op_pack_32_2x16_split: {
3324       Temp src0 = get_alu_src(ctx, instr->src[0]);
3325       Temp src1 = get_alu_src(ctx, instr->src[1]);
3326       if (dst.regClass() == v1) {
3327          src0 = emit_extract_vector(ctx, src0, 0, v2b);
3328          src1 = emit_extract_vector(ctx, src1, 0, v2b);
3329          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3330       } else {
3331          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3332                          Operand::c32(0xFFFFu));
3333          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3334                          Operand::c32(16u));
3335          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3336       }
3337       break;
3338    }
3339    case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3340    case nir_op_pack_half_2x16_rtz_split:
3341    case nir_op_pack_half_2x16_split: {
3342       if (dst.regClass() == v1) {
3343          if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3344             emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3345          else
3346             emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3347       } else {
3348          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3349       }
3350       break;
3351    }
3352    case nir_op_pack_unorm_2x16:
3353    case nir_op_pack_snorm_2x16: {
3354       unsigned bit_size = instr->src[0].src.ssa->bit_size;
3355       /* Only 16-bit and 32-bit sources are supported. */
3356       assert(bit_size == 32 || bit_size == 16);
3357 
3358       RegClass src_rc = bit_size == 32 ? v1 : v2b;
3359       Temp src = get_alu_src(ctx, instr->src[0], 2);
3360       Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3361       Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
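      /* The pknorm variants clamp each source to [0,1] (unorm) or [-1,1] (snorm),
       * convert it to a 16-bit normalized integer and pack both halves into one
       * dword; pre-GFX9 lacks the f16 variants, hence the promotion below. */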
3362 
3363       /* Workaround for pre-GFX9 GPUs, which don't have the fp16 pknorm instructions. */
3364       if (bit_size == 16 && ctx->program->gfx_level < GFX9) {
3365          src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3366          src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
3367          bit_size = 32;
3368       }
3369 
3370       aco_opcode opcode;
3371       if (bit_size == 32) {
3372          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3373                                                       : aco_opcode::v_cvt_pknorm_i16_f32;
3374       } else {
3375          opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3376                                                       : aco_opcode::v_cvt_pknorm_i16_f16;
3377       }
3378       bld.vop3(opcode, Definition(dst), src0, src1);
3379       break;
3380    }
3381    case nir_op_pack_uint_2x16:
3382    case nir_op_pack_sint_2x16: {
3383       Temp src = get_alu_src(ctx, instr->src[0], 2);
3384       Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3385       Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3386       aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3387                                                              : aco_opcode::v_cvt_pk_i16_i32;
3388       bld.vop3(opcode, Definition(dst), src0, src1);
3389       break;
3390    }
3391    case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3392    case nir_op_unpack_half_2x16_split_x: {
3393       Temp src = get_alu_src(ctx, instr->src[0]);
3394       if (src.regClass() == v1)
3395          src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3396       if (dst.regClass() == v1) {
3397          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3398                 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3399          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3400       } else {
3401          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3402       }
3403       break;
3404    }
3405    case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3406    case nir_op_unpack_half_2x16_split_y: {
3407       Temp src = get_alu_src(ctx, instr->src[0]);
3408       if (src.regClass() == s1)
3409          src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3410                           Operand::c32(1u), Operand::c32(16u), Operand::zero());
3411       else
3412          src =
3413             bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3414       if (dst.regClass() == v1) {
3415          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3416                 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3417          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3418       } else {
3419          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3420       }
3421       break;
3422    }
3423    case nir_op_msad_4x8: {
3424       assert(dst.regClass() == v1);
3425       emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true);
3426       break;
3427    }
3428    case nir_op_fquantize2f16: {
3429       Temp src = get_alu_src(ctx, instr->src[0]);
3430       Temp f16;
3431       if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3432          f16 = bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, bld.def(v2b), src);
3433       else
3434          f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
3435       Temp f32, cmp_res;
3436 
3437       if (ctx->program->gfx_level >= GFX8) {
3438          Temp mask = bld.copy(
3439             bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
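         /* 0x36F is all v_cmp_class bits (0x3FF) minus +-denormal (bits 4 and 7), so
          * the compare is true unless the f16 value is a denormal, which the final
          * cndmask then flushes to zero. */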
3440          cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, mask);
3441          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3442       } else {
3443          /* 0x38800000 is the smallest normal half-float value (2^-14) as a 32-bit float,
3444           * so compare the result and flush it to 0 if it's smaller.
3445           */
3446          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3447          Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3448          Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3449          tmp0->valu().abs[0] = true;
3450          Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f32, bld.def(bld.lm), Operand::zero(), f32);
3451          cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3452                             tmp0->definitions[0].getTemp(), tmp1);
3453       }
3454 
3455       if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3456          Temp copysign_0 =
3457             bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
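         /* For finite inputs, 0.0 * src yields a zero carrying the sign of src,
          * which preserves signed zeros across the flush. */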
3458          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3459       } else {
3460          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3461       }
3462       break;
3463    }
3464    case nir_op_bfm: {
3465       Temp bits = get_alu_src(ctx, instr->src[0]);
3466       Temp offset = get_alu_src(ctx, instr->src[1]);
3467 
3468       if (dst.regClass() == s1) {
3469          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3470       } else if (dst.regClass() == v1) {
3471          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3472       } else {
3473          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3474       }
3475       break;
3476    }
3477    case nir_op_bitfield_select: {
3478 
3479       /* dst = (insert & bitmask) | (base & ~bitmask) */
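      /* On the SGPR side, either half is folded to a constant when its operands are
       * known; s_andn2 computes base & ~bitmask in a single instruction. */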
3480       if (dst.regClass() == s1) {
3481          Temp bitmask = get_alu_src(ctx, instr->src[0]);
3482          Temp insert = get_alu_src(ctx, instr->src[1]);
3483          Temp base = get_alu_src(ctx, instr->src[2]);
3484          aco_ptr<Instruction> sop2;
3485          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3486          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3487          Operand lhs;
3488          if (const_insert && const_bitmask) {
3489             lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3490          } else {
3491             insert =
3492                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3493             lhs = Operand(insert);
3494          }
3495 
3496          Operand rhs;
3497          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3498          if (const_base && const_bitmask) {
3499             rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3500          } else {
3501             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3502             rhs = Operand(base);
3503          }
3504 
3505          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3506 
3507       } else if (dst.regClass() == v1) {
3508          emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3509       } else {
3510          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3511       }
3512       break;
3513    }
3514    case nir_op_ubfe:
3515    case nir_op_ibfe: {
3516       if (dst.bytes() != 4)
3517          unreachable("Unsupported BFE bit size");
3518 
3519       if (dst.type() == RegType::sgpr) {
3520          Temp base = get_alu_src(ctx, instr->src[0]);
3521 
3522          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3523          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3524          aco_opcode opcode =
3525             instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
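         /* s_bfe takes its field descriptor packed into one operand: the offset in
          * the low bits and the width in bits [22:16]. The paths below only differ
          * in how they assemble that descriptor. */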
3526          if (const_offset && const_bits) {
3527             uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3528             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3529             break;
3530          }
3531 
3532          Temp offset = get_alu_src(ctx, instr->src[1]);
3533          Temp bits = get_alu_src(ctx, instr->src[2]);
3534 
3535          if (ctx->program->gfx_level >= GFX9) {
3536             Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3537                                          : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3538                                                     bld.def(s1, scc), bits, Operand::c32(0x1fu));
3539             Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3540             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3541          } else if (instr->op == nir_op_ubfe) {
3542             Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3543             Temp masked =
3544                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3545             bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3546          } else {
3547             Operand bits_op = const_bits
3548                                  ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3549                                  : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3550                                             bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3551                                                      bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3552                                             Operand::c32(16u));
3553             Operand offset_op = const_offset
3554                                    ? Operand::c32(const_offset->u32 & 0x1fu)
3555                                    : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3556                                               offset, Operand::c32(0x1fu));
3557 
3558             Temp extract =
3559                bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3560             bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3561          }
3562 
3563       } else {
3564          aco_opcode opcode =
3565             instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3566          emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3567       }
3568       break;
3569    }
3570    case nir_op_extract_u8:
3571    case nir_op_extract_i8:
3572    case nir_op_extract_u16:
3573    case nir_op_extract_i16: {
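      /* p_extract(vec, index, bits, is_signed) selects the index-th bits-wide
       * element of its source and sign- or zero-extends it into the definition. */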
3574       bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3575       unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3576       uint32_t bits = comp == 4 ? 8 : 16;
3577       unsigned index = nir_src_as_uint(instr->src[1].src);
3578       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3579          assert(index == 0);
3580          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3581       } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3582          Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3583          unsigned swizzle = instr->src[0].swizzle[0];
3584          if (vec.size() > 1) {
3585             vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3586             swizzle = swizzle & 1;
3587          }
3588          index += swizzle * instr->def.bit_size / bits;
3589          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3590                     Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3591       } else {
3592          Temp src = get_alu_src(ctx, instr->src[0]);
3593          Definition def(dst);
3594          if (dst.bytes() == 8) {
3595             src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3596             index %= comp;
3597             def = bld.def(src.type(), 1);
3598          }
3599          assert(def.bytes() <= 4);
3600          if (def.regClass() == s1) {
3601             bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3602                        Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3603          } else {
3604             src = emit_extract_vector(ctx, src, 0, def.regClass());
3605             bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3606                        Operand::c32(bits), Operand::c32(is_signed));
3607          }
3608          if (dst.size() == 2)
3609             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3610                        Operand::zero());
3611       }
3612       break;
3613    }
3614    case nir_op_insert_u8:
3615    case nir_op_insert_u16: {
3616       unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3617       uint32_t bits = comp == 4 ? 8 : 16;
3618       unsigned index = nir_src_as_uint(instr->src[1].src);
3619       if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3620          assert(index == 0);
3621          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3622       } else {
3623          Temp src = get_alu_src(ctx, instr->src[0]);
3624          Definition def(dst);
3625          bool swap = false;
3626          if (dst.bytes() == 8) {
3627             src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3628             swap = index >= comp;
3629             index %= comp;
3630             def = bld.def(src.type(), 1);
3631          }
3632          if (def.regClass() == s1) {
3633             bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3634                        Operand::c32(index), Operand::c32(bits));
3635          } else {
3636             src = emit_extract_vector(ctx, src, 0, def.regClass());
3637             bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3638                        Operand::c32(bits));
3639          }
3640          if (dst.size() == 2 && swap)
3641             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3642                        def.getTemp());
3643          else if (dst.size() == 2)
3644             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3645                        Operand::zero());
3646       }
3647       break;
3648    }
3649    case nir_op_bit_count: {
3650       Temp src = get_alu_src(ctx, instr->src[0]);
3651       if (src.regClass() == s1) {
3652          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3653       } else if (src.regClass() == v1) {
3654          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3655       } else if (src.regClass() == v2) {
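         /* v_bcnt_u32_b32 returns popcount(src0) + src1, so chaining two of them
          * sums the bit counts of both dwords. */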
3656          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3657                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3658                            emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3659       } else if (src.regClass() == s2) {
3660          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3661       } else {
3662          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3663       }
3664       break;
3665    }
3666    case nir_op_flt: {
3667       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3668                       aco_opcode::v_cmp_lt_f64);
3669       break;
3670    }
3671    case nir_op_fge: {
3672       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3673                       aco_opcode::v_cmp_ge_f64);
3674       break;
3675    }
3676    case nir_op_feq: {
3677       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3678                       aco_opcode::v_cmp_eq_f64);
3679       break;
3680    }
3681    case nir_op_fneu: {
3682       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3683                       aco_opcode::v_cmp_neq_f64);
3684       break;
3685    }
3686    case nir_op_ilt: {
3687       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3688                       aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3689       break;
3690    }
3691    case nir_op_ige: {
3692       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3693                       aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3694       break;
3695    }
3696    case nir_op_ieq: {
3697       if (instr->src[0].src.ssa->bit_size == 1)
3698          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3699       else
3700          emit_comparison(
3701             ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3702             aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3703             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3704       break;
3705    }
3706    case nir_op_ine: {
3707       if (instr->src[0].src.ssa->bit_size == 1)
3708          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3709       else
3710          emit_comparison(
3711             ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3712             aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3713             ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3714       break;
3715    }
3716    case nir_op_ult: {
3717       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3718                       aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3719       break;
3720    }
3721    case nir_op_uge: {
3722       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3723                       aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3724       break;
3725    }
3726    case nir_op_bitz:
3727    case nir_op_bitnz: {
3728       assert(instr->src[0].src.ssa->bit_size != 1);
3729       bool test0 = instr->op == nir_op_bitz;
3730       Temp src0 = get_alu_src(ctx, instr->src[0]);
3731       Temp src1 = get_alu_src(ctx, instr->src[1]);
3732       bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
3733       if (!use_valu) {
3734          aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
3735                                                                : aco_opcode::s_bitcmp1_b32;
3736          if (test0)
3737             op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
3738                                                        : aco_opcode::s_bitcmp0_b32;
3739          emit_sopc_instruction(ctx, instr, op, dst);
3740          break;
3741       }
3742 
3743       /* We do not have a VALU version of s_bitcmp.
3744        * But if the second source is constant, we can use
3745        * v_cmp_class_f32's LUT to check the bit.
3746        * The LUT only has 10 entries, so extract a higher byte if we have to.
3747        * For sign bits, comparison with 0 is better because v_cmp_class
3748        * can't be inverted.
3749        */
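      /* Note the operand roles below: the LUT constant is the value being
       * classified and src0 acts as the class mask, so the compare is true
       * exactly when the tested bit of src0 is set. */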
3750       if (nir_src_is_const(instr->src[1].src)) {
3751          uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
3752          bit &= instr->src[0].src.ssa->bit_size - 1;
3753          src0 = as_vgpr(ctx, src0);
3754 
3755          if (src0.regClass() == v2) {
3756             src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
3757             bit &= 31;
3758          }
3759 
3760          if (bit == 31) {
3761             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3762                      Operand::c32(0), src0);
3763             break;
3764          }
3765 
3766          if (bit == 15 && ctx->program->gfx_level >= GFX8) {
3767             bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
3768                      Operand::c32(0), src0);
3769             break;
3770          }
3771 
3772          /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
3773          const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
3774          const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
3775          const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
3776          if (use_opsel) {
3777             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
3778                               Operand::c32(16), Operand::c32(0));
3779             bit &= 0xf;
3780          }
3781 
3782          /* If we can use sdwa, the extract is free, while test0's s_not is not. */
3783          if (bit == 7 && test0 && can_sdwa) {
3784             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3785                               Operand::c32(8), Operand::c32(1));
3786             bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3787                      Operand::c32(0), src0);
3788             break;
3789          }
3790 
3791          if (bit > max_bit) {
3792             src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3793                               Operand::c32(8), Operand::c32(0));
3794             bit &= 0x7;
3795          }
3796 
3797          /* denorm and snan/qnan inputs are preserved under all float control modes. */
3798          static const struct {
3799             uint32_t fp32;
3800             uint32_t fp16;
3801             bool negate;
3802          } float_lut[10] = {
3803             {0x7f800001, 0x7c01, false}, /* snan */
3804             {~0u, ~0u, false},           /* qnan */
3805             {0xff800000, 0xfc00, false}, /* -inf */
3806             {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
3807             {1, 1, true},                /* -denormal */
3808             {0, 0, true},                /* -0.0 */
3809             {0, 0, false},               /* +0.0 */
3810             {1, 1, false},               /* +denormal */
3811             {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
3812             {0x7f800000, 0x7c00, false}, /* +inf */
3813          };
3814 
3815          Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
3816          /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */
3817          const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
3818                                (ctx->program->gfx_level >= GFX11 && use_opsel);
3819          const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
3820          const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
3821 
3822          VALU_instruction& res =
3823             bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
3824          if (float_lut[bit].negate) {
3825             res.format = asVOP3(res.format);
3826             res.neg[0] = true;
3827          }
3828 
3829          if (test0)
3830             bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
3831 
3832          break;
3833       }
3834 
3835       Temp res;
3836       aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
3837       if (instr->src[0].src.ssa->bit_size == 16) {
3838          op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
3839          if (ctx->program->gfx_level < GFX10)
3840             res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
3841          else
3842             res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
3843 
3844          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
3845       } else if (instr->src[0].src.ssa->bit_size == 32) {
3846          res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
3847       } else if (instr->src[0].src.ssa->bit_size == 64) {
3848          if (ctx->program->gfx_level < GFX8)
3849             res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
3850          else
3851             res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
3852 
3853          res = emit_extract_vector(ctx, res, 0, v1);
3854          res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
3855       } else {
3856          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3857       }
3858       bld.vopc(op, Definition(dst), Operand::c32(0), res);
3859       break;
3860    }
3861    case nir_op_fddx:
3862    case nir_op_fddy:
3863    case nir_op_fddx_fine:
3864    case nir_op_fddy_fine:
3865    case nir_op_fddx_coarse:
3866    case nir_op_fddy_coarse: {
3867       if (!nir_src_is_divergent(instr->src[0].src)) {
3868          /* Source is the same in all lanes, so the derivative is zero.
3869           * This also avoids emitting invalid IR.
3870           */
3871          bld.copy(Definition(dst), Operand::zero(dst.bytes()));
3872          break;
3873       }
3874 
3875       uint16_t dpp_ctrl1, dpp_ctrl2;
3876       if (instr->op == nir_op_fddx_fine) {
3877          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3878          dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3879       } else if (instr->op == nir_op_fddy_fine) {
3880          dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3881          dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3882       } else {
3883          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3884          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3885             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3886          else
3887             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3888       }
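      /* Each dpp_quad_perm broadcasts one pixel of the 2x2 quad, and the result
       * is dpp_ctrl2(src) - dpp_ctrl1(src): right minus left for ddx and bottom
       * minus top for ddy. */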
3889 
3890       if (dst.regClass() == v1 && instr->def.bit_size == 16) {
3891          assert(instr->def.num_components == 2);
3892 
3893          Temp src = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
3894 
3895          /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
3896          unsigned opsel_lo = instr->src[0].swizzle[0] & 1;
3897          unsigned opsel_hi = instr->src[0].swizzle[1] & 1;
3898          opsel_lo |= opsel_lo << 1;
3899          opsel_hi |= opsel_hi << 1;
3900 
3901          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3902          Temp tr = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl2);
3903 
3904          VALU_instruction& sub =
3905             bld.vop3p(aco_opcode::v_pk_add_f16, Definition(dst), tr, tl, opsel_lo, opsel_hi)
3906                .instr->valu();
3907          sub.neg_lo[1] = true;
3908          sub.neg_hi[1] = true;
3909       } else {
3910          Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3911 
3912          if (ctx->program->gfx_level >= GFX8) {
3913             aco_opcode sub =
3914                instr->def.bit_size == 16 ? aco_opcode::v_sub_f16 : aco_opcode::v_sub_f32;
3915             Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3916             bld.vop2_dpp(sub, Definition(dst), src, tl, dpp_ctrl2);
3917          } else {
3918             Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3919             Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3920             bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl);
3921          }
3922       }
3923       set_wqm(ctx, true);
3924       break;
3925    }
3926    default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3927    }
3928 }
3929 
3930 void
3931 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3932 {
3933    Temp dst = get_ssa_temp(ctx, &instr->def);
3934 
3935    // TODO: we really want to have the resulting type, as this would allow for 64-bit literals,
3936    // which get truncated to the lsb if double and the msb if int.
3937    // For now, we only use s_mov_b64 with 64-bit inline constants.
3938    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3939    assert(dst.type() == RegType::sgpr);
3940 
3941    Builder bld(ctx->program, ctx->block);
3942 
3943    if (instr->def.bit_size == 1) {
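      /* 1-bit NIR values are lane masks in ACO (s1 on wave32, s2 on wave64),
       * with the bit set for every lane when the value is true. */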
3944       assert(dst.regClass() == bld.lm);
3945       int val = instr->value[0].b ? -1 : 0;
3946       Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3947       bld.copy(Definition(dst), op);
3948    } else if (instr->def.bit_size == 8) {
3949       bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3950    } else if (instr->def.bit_size == 16) {
3951       /* sign-extend to use s_movk_i32 instead of a literal */
3952       bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3953    } else if (dst.size() == 1) {
3954       bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3955    } else {
3956       assert(dst.size() != 1);
3957       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3958          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3959       if (instr->def.bit_size == 64)
3960          for (unsigned i = 0; i < dst.size(); i++)
3961             vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3962       else {
3963          for (unsigned i = 0; i < dst.size(); i++)
3964             vec->operands[i] = Operand::c32(instr->value[i].u32);
3965       }
3966       vec->definitions[0] = Definition(dst);
3967       ctx->block->instructions.emplace_back(std::move(vec));
3968    }
3969 }
3970 
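/* Copies src into the SGPR dst using the value of the first active lane:
 * SGPR sources are simply copied, while multi-dword VGPR sources are split
 * into dwords, read back with v_readfirstlane_b32 and recombined.
 */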
3971 Temp
3972 emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
3973 {
3974    Builder bld(ctx->program, ctx->block);
3975 
3976    if (src.regClass().type() == RegType::sgpr) {
3977       bld.copy(Definition(dst), src);
3978    } else if (src.size() == 1) {
3979       bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
3980    } else {
3981       aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3982          aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())};
3983       split->operands[0] = Operand(src);
3984 
3985       for (unsigned i = 0; i < src.size(); i++) {
3986          split->definitions[i] =
3987             bld.def(RegClass::get(RegType::vgpr, MIN2(src.bytes() - i * 4, 4)));
3988       }
3989 
3990       Instruction* split_raw = split.get();
3991       ctx->block->instructions.emplace_back(std::move(split));
3992 
3993       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3994          aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)};
3995       vec->definitions[0] = Definition(dst);
3996       for (unsigned i = 0; i < src.size(); i++) {
3997          vec->operands[i] = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1),
3998                                      split_raw->definitions[i].getTemp());
3999       }
4000 
4001       ctx->block->instructions.emplace_back(std::move(vec));
4002       if (src.bytes() % 4 == 0)
4003          emit_split_vector(ctx, dst, src.size());
4004    }
4005 
4006    return dst;
4007 }
4008 
4009 bool
4010 can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size,
4011                                    unsigned align_, bool support_12_byte)
4012 {
4013    /* Only use byte-align for 8/16-bit loads if we won't have to increase its size and won't have
4014     * to use unsupported load sizes.
4015     */
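   /* For example, two 16-bit components at align_ == 2: new_size =
    * align(4 + 2, 4) == 8 while align(4, 4) == 4, so the load would have to
    * grow and byte-align is rejected. */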
4016    assert(util_is_power_of_two_nonzero(align_));
4017    if (align_ < 4) {
4018       assert(component_size < 4);
4019       unsigned load_size = num_components * component_size;
4020       uint32_t new_size = align(load_size + (4 - align_), 4);
4021       return new_size == align(load_size, 4) && (new_size != 12 || support_12_byte);
4022    }
4023    return true;
4024 }
4025 
4026 struct LoadEmitInfo {
4027    Operand offset;
4028    Temp dst;
4029    unsigned num_components;
4030    unsigned component_size;
4031    Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
4032    Temp idx = Temp(0, v1);      /* buffer index */
4033    unsigned component_stride = 0;
4034    unsigned const_offset = 0;
4035    unsigned align_mul = 0;
4036    unsigned align_offset = 0;
4037    pipe_format format;
4038 
4039    bool glc = false;
4040    bool slc = false;
4041    bool split_by_component_stride = true;
4042    bool readfirstlane_for_uniform = false;
4043    unsigned swizzle_component_size = 0;
4044    memory_sync_info sync;
4045    Temp soffset = Temp(0, s1);
4046 };
4047 
4048 struct EmitLoadParameters {
4049    using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
4050                              unsigned bytes_needed, unsigned align, unsigned const_offset,
4051                              Temp dst_hint);
4052 
4053    Callback callback;
4054    bool byte_align_loads;
4055    bool supports_8bit_16bit_loads;
4056    unsigned max_const_offset_plus_one;
4057 };
4058 
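/* Generic load emission: splits a (possibly unaligned) load into chunks that
 * the target-specific callback in params can emit, then recombines the pieces
 * into info.dst, going through p_as_uniform or v_readfirstlane when an SGPR
 * result is assembled from VGPR loads.
 */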
4059 void
4060 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
4061           const EmitLoadParameters& params)
4062 {
4063    unsigned load_size = info.num_components * info.component_size;
4064    unsigned component_size = info.component_size;
4065 
4066    unsigned num_vals = 0;
4067    Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
4068 
4069    unsigned const_offset = info.const_offset;
4070 
4071    const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
4072    unsigned align_offset = info.align_offset % align_mul;
4073 
4074    unsigned bytes_read = 0;
4075    while (bytes_read < load_size) {
4076       unsigned bytes_needed = load_size - bytes_read;
4077 
4078       /* add buffer for unaligned loads */
4079       int byte_align = 0;
4080       if (params.byte_align_loads) {
4081          byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
4082       }
4083 
4084       if (byte_align) {
4085          if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
4086              !params.supports_8bit_16bit_loads) {
4087             if (info.component_stride) {
4088                assert(params.supports_8bit_16bit_loads && "unimplemented");
4089                bytes_needed = 2;
4090                byte_align = 0;
4091             } else {
4092                bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
4093                bytes_needed = align(bytes_needed, 4);
4094             }
4095          } else {
4096             byte_align = 0;
4097          }
4098       }
4099 
4100       if (info.split_by_component_stride) {
4101          if (info.swizzle_component_size)
4102             bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
4103          if (info.component_stride)
4104             bytes_needed = MIN2(bytes_needed, info.component_size);
4105       }
4106 
4107       bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
4108 
4109       /* reduce constant offset */
4110       Operand offset = info.offset;
4111       unsigned reduced_const_offset = const_offset;
4112       bool remove_const_offset_completely = need_to_align_offset;
4113       if (const_offset &&
4114           (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
4115          unsigned to_add = const_offset;
4116          if (remove_const_offset_completely) {
4117             reduced_const_offset = 0;
4118          } else {
4119             to_add =
4120                const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4121             reduced_const_offset %= params.max_const_offset_plus_one;
4122          }
4123          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4124          if (offset.isConstant()) {
4125             offset = Operand::c32(offset.constantValue() + to_add);
4126          } else if (offset.isUndefined()) {
4127             offset = Operand::c32(to_add);
4128          } else if (offset_tmp.regClass() == s1) {
4129             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4130                               Operand::c32(to_add));
4131          } else if (offset_tmp.regClass() == v1) {
4132             offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4133          } else {
4134             Temp lo = bld.tmp(offset_tmp.type(), 1);
4135             Temp hi = bld.tmp(offset_tmp.type(), 1);
4136             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4137 
4138             if (offset_tmp.regClass() == s2) {
4139                Temp carry = bld.tmp(s1);
4140                lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4141                              Operand::c32(to_add));
4142                hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4143                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4144             } else {
4145                Temp new_lo = bld.tmp(v1);
4146                Temp carry =
4147                   bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4148                hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4149                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4150             }
4151          }
4152       }
4153 
4154       /* align offset down if needed */
4155       Operand aligned_offset = offset;
4156       unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4157       if (need_to_align_offset) {
4158          align = 4;
4159          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4160          if (offset.isConstant()) {
4161             aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
4162          } else if (offset.isUndefined()) {
4163             aligned_offset = Operand::zero();
4164          } else if (offset_tmp.regClass() == s1) {
4165             aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
4166                                       Operand::c32(0xfffffffcu), offset_tmp);
4167          } else if (offset_tmp.regClass() == s2) {
4168             aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
4169                                       Operand::c64(0xfffffffffffffffcllu), offset_tmp);
4170          } else if (offset_tmp.regClass() == v1) {
4171             aligned_offset =
4172                bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
4173          } else if (offset_tmp.regClass() == v2) {
4174             Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
4175             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4176             lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
4177             aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
4178          }
4179       }
4180       Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp()
4181                                 : aligned_offset.isConstant()
4182                                    ? bld.copy(bld.def(s1), aligned_offset)
4183                                    : Temp(0, s1);
4184 
4185       Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
4186                                  reduced_const_offset, byte_align ? Temp() : info.dst);
4187 
4188       /* the callback wrote directly to dst */
4189       if (val == info.dst) {
4190          assert(num_vals == 0);
4191          emit_split_vector(ctx, info.dst, info.num_components);
4192          return;
4193       }
4194 
4195       /* shift result right if needed */
4196       if (params.byte_align_loads && info.component_size < 4) {
4197          Operand byte_align_off = Operand::c32(byte_align);
4198          if (byte_align == -1) {
4199             if (offset.isConstant())
4200                byte_align_off = Operand::c32(offset.constantValue() % 4u);
4201             else if (offset.isUndefined())
4202                byte_align_off = Operand::zero();
4203             else if (offset.size() == 2)
4204                byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
4205                                                             RegClass(offset.getTemp().type(), 1)));
4206             else
4207                byte_align_off = offset;
4208          }
4209 
4210          assert(val.bytes() >= load_size && "unimplemented");
4211          if (val.type() == RegType::sgpr)
4212             byte_align_scalar(ctx, val, byte_align_off, info.dst);
4213          else
4214             byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
4215          return;
4216       }
4217 
4218       /* add result to list and advance */
4219       if (info.component_stride) {
4220          assert(val.bytes() % info.component_size == 0);
4221          unsigned num_loaded_components = val.bytes() / info.component_size;
4222          unsigned advance_bytes = info.component_stride * num_loaded_components;
4223          const_offset += advance_bytes;
4224          align_offset = (align_offset + advance_bytes) % align_mul;
4225       } else {
4226          const_offset += val.bytes();
4227          align_offset = (align_offset + val.bytes()) % align_mul;
4228       }
4229       bytes_read += val.bytes();
4230       vals[num_vals++] = val;
4231    }
4232 
4233    /* create array of components */
4234    unsigned components_split = 0;
4235    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4236    bool has_vgprs = false;
4237    for (unsigned i = 0; i < num_vals;) {
4238       Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4239       unsigned num_tmps = 0;
4240       unsigned tmp_size = 0;
4241       RegType reg_type = RegType::sgpr;
4242       while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4243          if (vals[i].type() == RegType::vgpr)
4244             reg_type = RegType::vgpr;
4245          tmp_size += vals[i].bytes();
4246          tmp[num_tmps++] = vals[i++];
4247       }
4248       if (num_tmps > 1) {
4249          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4250             aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4251          for (unsigned j = 0; j < num_tmps; j++)
4252             vec->operands[j] = Operand(tmp[j]);
4253          tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4254          vec->definitions[0] = Definition(tmp[0]);
4255          bld.insert(std::move(vec));
4256       }
4257 
4258       if (tmp[0].bytes() % component_size) {
4259          /* trim tmp[0] */
4260          assert(i == num_vals);
4261          RegClass new_rc =
4262             RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4263          tmp[0] =
4264             bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4265       }
4266 
4267       RegClass elem_rc = RegClass::get(reg_type, component_size);
4268 
4269       unsigned start = components_split;
4270 
4271       if (tmp_size == elem_rc.bytes()) {
4272          allocated_vec[components_split++] = tmp[0];
4273       } else {
4274          assert(tmp_size % elem_rc.bytes() == 0);
4275          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
4276             aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
4277          for (auto& def : split->definitions) {
4278             Temp component = bld.tmp(elem_rc);
4279             allocated_vec[components_split++] = component;
4280             def = Definition(component);
4281          }
4282          split->operands[0] = Operand(tmp[0]);
4283          bld.insert(std::move(split));
4284       }
4285 
4286       /* try to p_as_uniform early so we can create more optimizable code and
4287        * also update allocated_vec */
4288       for (unsigned j = start; j < components_split; j++) {
4289          if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) {
4290             if (info.readfirstlane_for_uniform) {
4291                allocated_vec[j] = emit_readfirstlane(
4292                   ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size())));
4293             } else {
4294                allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4295             }
4296          }
4297          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4298       }
4299    }
4300 
4301    /* concatenate components and p_as_uniform() result if needed */
4302    if (info.dst.type() == RegType::vgpr || !has_vgprs)
4303       ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4304 
4305    int padding_bytes =
4306       MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4307 
4308    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4309       aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
4310    for (unsigned i = 0; i < info.num_components; i++)
4311       vec->operands[i] = Operand(allocated_vec[i]);
4312    if (padding_bytes)
4313       vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4314    if (info.dst.type() == RegType::sgpr && has_vgprs) {
4315       Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4316       vec->definitions[0] = Definition(tmp);
4317       bld.insert(std::move(vec));
4318       if (info.readfirstlane_for_uniform)
4319          emit_readfirstlane(ctx, tmp, info.dst);
4320       else
4321          bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4322    } else {
4323       vec->definitions[0] = Definition(info.dst);
4324       bld.insert(std::move(vec));
4325    }
4326 }
4327 
4328 Operand
4329 load_lds_size_m0(Builder& bld)
4330 {
4331    /* m0 does not need to be initialized on GFX9+ */
4332    if (bld.program->gfx_level >= GFX9)
4333       return Operand(s1);
4334 
4335    return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4336 }
4337 
4338 Temp
4339 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4340                   unsigned align, unsigned const_offset, Temp dst_hint)
4341 {
4342    offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4343 
4344    Operand m = load_lds_size_m0(bld);
4345 
4346    bool large_ds_read = bld.program->gfx_level >= GFX7;
4347    bool usable_read2 = bld.program->gfx_level >= GFX7;
4348 
4349    bool read2 = false;
4350    unsigned size = 0;
4351    aco_opcode op;
4352    if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4353       size = 16;
4354       op = aco_opcode::ds_read_b128;
4355    } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4356       size = 16;
4357       read2 = true;
4358       op = aco_opcode::ds_read2_b64;
4359    } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4360       size = 12;
4361       op = aco_opcode::ds_read_b96;
4362    } else if (bytes_needed >= 8 && align % 8 == 0) {
4363       size = 8;
4364       op = aco_opcode::ds_read_b64;
4365    } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4366       size = 8;
4367       read2 = true;
4368       op = aco_opcode::ds_read2_b32;
4369    } else if (bytes_needed >= 4 && align % 4 == 0) {
4370       size = 4;
4371       op = aco_opcode::ds_read_b32;
4372    } else if (bytes_needed >= 2 && align % 2 == 0) {
4373       size = 2;
4374       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4375    } else {
4376       size = 1;
4377       op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4378    }
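   /* ds_read2 encodes two 8-bit offsets in units of one element (size / 2
    * bytes), so the addressable immediate range shrinks to 255 such units. */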
4379 
4380    unsigned const_offset_unit = read2 ? size / 2u : 1u;
4381    unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
4382 
4383    if (const_offset > (const_offset_range - const_offset_unit)) {
4384       unsigned excess = const_offset - (const_offset % const_offset_range);
4385       offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4386       const_offset -= excess;
4387    }
4388 
4389    const_offset /= const_offset_unit;
4390 
4391    RegClass rc = RegClass::get(RegType::vgpr, size);
4392    Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4393    Instruction* instr;
4394    if (read2)
4395       instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4396    else
4397       instr = bld.ds(op, Definition(val), offset, m, const_offset);
4398    instr->ds().sync = info.sync;
4399 
4400    if (m.isUndefined())
4401       instr->operands.pop_back();
4402 
4403    return val;
4404 }
4405 
4406 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
4407 
4408 Temp
4409 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4410                    unsigned align, unsigned const_offset, Temp dst_hint)
4411 {
4412    assert(align >= 4u);
4413 
4414    bld.program->has_smem_buffer_or_global_loads = true;
4415 
4416    bool buffer = info.resource.id() && info.resource.bytes() == 16;
4417    Temp addr = info.resource;
4418    if (!buffer && !addr.id()) {
4419       addr = offset;
4420       offset = Temp();
4421    }
4422 
4423    bytes_needed = MIN2(bytes_needed, 64);
4424    unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4425    unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4426    /* Only round up global loads if the access is aligned so that it won't cross pages */
4427    bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
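   /* For example, bytes_needed == 12 rounds up to 16 (s_load_dwordx4) when the
    * access is buffer-based or 16-byte aligned, and otherwise down to 8
    * (s_load_dwordx2) so that the over-read cannot cross a page boundary. */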
4428 
4429    aco_opcode op;
4430    if (bytes_needed <= 4) {
4431       op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4432    } else if (bytes_needed <= 8) {
4433       op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4434    } else if (bytes_needed <= 16) {
4435       op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4436    } else if (bytes_needed <= 32) {
4437       op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4438    } else {
4439       assert(bytes_needed == 64);
4440       op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4441    }
4442 
4443    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4444    if (buffer) {
4445       if (const_offset)
4446          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4447                            Operand::c32(const_offset));
4448       load->operands[0] = Operand(info.resource);
4449       load->operands[1] = Operand(offset);
4450    } else {
4451       load->operands[0] = Operand(addr);
4452       if (offset.id() && const_offset)
4453          load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4454                                       Operand::c32(const_offset));
4455       else if (offset.id())
4456          load->operands[1] = Operand(offset);
4457       else
4458          load->operands[1] = Operand::c32(const_offset);
4459    }
4460    RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4461    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4462    load->definitions[0] = Definition(val);
4463    load->glc = info.glc;
4464    load->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4465    load->sync = info.sync;
4466    bld.insert(std::move(load));
4467    return val;
4468 }
4469 
4470 const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4471 
4472 Temp
4473 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4474                     unsigned align_, unsigned const_offset, Temp dst_hint)
4475 {
4476    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4477    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4478 
4479    if (info.soffset.id()) {
4480       if (soffset.isTemp())
4481          vaddr = bld.copy(bld.def(v1), soffset);
4482       soffset = Operand(info.soffset);
4483    }
4484 
4485    if (soffset.isUndefined())
4486       soffset = Operand::zero();
4487 
4488    bool offen = !vaddr.isUndefined();
4489    bool idxen = info.idx.id();
4490 
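   /* MUBUF addressing: with idxen the VGPR address holds a buffer index, with
    * offen a byte offset, and with both a {index, offset} vector pair. */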
4491    if (offen && idxen)
4492       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4493    else if (idxen)
4494       vaddr = Operand(info.idx);
4495 
4496    unsigned bytes_size = 0;
4497    aco_opcode op;
4498    if (bytes_needed == 1 || align_ % 2) {
4499       bytes_size = 1;
4500       op = aco_opcode::buffer_load_ubyte;
4501    } else if (bytes_needed == 2 || align_ % 4) {
4502       bytes_size = 2;
4503       op = aco_opcode::buffer_load_ushort;
4504    } else if (bytes_needed <= 4) {
4505       bytes_size = 4;
4506       op = aco_opcode::buffer_load_dword;
4507    } else if (bytes_needed <= 8) {
4508       bytes_size = 8;
4509       op = aco_opcode::buffer_load_dwordx2;
4510    } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4511       bytes_size = 12;
4512       op = aco_opcode::buffer_load_dwordx3;
4513    } else {
4514       bytes_size = 16;
4515       op = aco_opcode::buffer_load_dwordx4;
4516    }
4517    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4518    mubuf->operands[0] = Operand(info.resource);
4519    mubuf->operands[1] = vaddr;
4520    mubuf->operands[2] = soffset;
4521    mubuf->offen = offen;
4522    mubuf->idxen = idxen;
4523    mubuf->glc = info.glc;
4524    mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4525    mubuf->slc = info.slc;
4526    mubuf->sync = info.sync;
4527    mubuf->offset = const_offset;
4528    mubuf->swizzled = info.swizzle_component_size != 0;
4529    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4530    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4531    mubuf->definitions[0] = Definition(val);
4532    bld.insert(std::move(mubuf));
4533 
4534    return val;
4535 }
4536 
4537 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4538 
4539 Temp
4540 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4541                            unsigned bytes_needed, unsigned align_, unsigned const_offset,
4542                            Temp dst_hint)
4543 {
4544    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4545    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4546 
4547    if (info.soffset.id()) {
4548       if (soffset.isTemp())
4549          vaddr = bld.copy(bld.def(v1), soffset);
4550       soffset = Operand(info.soffset);
4551    }
4552 
4553    if (soffset.isUndefined())
4554       soffset = Operand::zero();
4555 
4556    bool offen = !vaddr.isUndefined();
4557    bool idxen = info.idx.id();
4558 
4559    if (offen && idxen)
4560       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4561    else if (idxen)
4562       vaddr = Operand(info.idx);
4563 
4564    aco_opcode op = aco_opcode::num_opcodes;
4565    if (info.component_size == 2) {
4566       switch (bytes_needed) {
4567       case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4568       case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4569       case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4570       case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4571       default: unreachable("invalid buffer load format size"); break;
4572       }
4573    } else {
4574       assert(info.component_size == 4);
4575       switch (bytes_needed) {
4576       case 4: op = aco_opcode::buffer_load_format_x; break;
4577       case 8: op = aco_opcode::buffer_load_format_xy; break;
4578       case 12: op = aco_opcode::buffer_load_format_xyz; break;
4579       case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4580       default: unreachable("invalid buffer load format size"); break;
4581       }
4582    }
4583 
4584    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4585    mubuf->operands[0] = Operand(info.resource);
4586    mubuf->operands[1] = vaddr;
4587    mubuf->operands[2] = soffset;
4588    mubuf->offen = offen;
4589    mubuf->idxen = idxen;
4590    mubuf->glc = info.glc;
4591    mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4592    mubuf->slc = info.slc;
4593    mubuf->sync = info.sync;
4594    mubuf->offset = const_offset;
4595    RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4596    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4597    mubuf->definitions[0] = Definition(val);
4598    bld.insert(std::move(mubuf));
4599 
4600    return val;
4601 }
4602 
4603 const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, false, true, 4096};
4604 
4605 Temp
4606 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4607                       unsigned align_, unsigned const_offset, Temp dst_hint)
4608 {
4609    unsigned bytes_size = 0;
4610    aco_opcode op;
4611    if (bytes_needed == 1 || align_ % 2u) {
4612       bytes_size = 1;
4613       op = aco_opcode::scratch_load_ubyte;
4614    } else if (bytes_needed == 2 || align_ % 4u) {
4615       bytes_size = 2;
4616       op = aco_opcode::scratch_load_ushort;
4617    } else if (bytes_needed <= 4) {
4618       bytes_size = 4;
4619       op = aco_opcode::scratch_load_dword;
4620    } else if (bytes_needed <= 8) {
4621       bytes_size = 8;
4622       op = aco_opcode::scratch_load_dwordx2;
4623    } else if (bytes_needed <= 12) {
4624       bytes_size = 12;
4625       op = aco_opcode::scratch_load_dwordx3;
4626    } else {
4627       bytes_size = 16;
4628       op = aco_opcode::scratch_load_dwordx4;
4629    }
4630    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4631    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4632    aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)};
4633    flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4634    flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4635    flat->sync = info.sync;
4636    flat->offset = const_offset;
4637    flat->definitions[0] = Definition(val);
4638    bld.insert(std::move(flat));
4639 
4640    return val;
4641 }
4642 
4643 const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096};
4644 const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048};
4645 
4646 Temp
4647 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4648 {
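   /* Buffer resource for MUBUF-based global access on GFX6: dwords 0-1 hold
    * the 64-bit base address (zero when the address lives in a VGPR and addr64
    * is used instead), and num_records = -1 effectively disables range
    * checking. */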
4649    uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4650                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4651 
4652    if (addr.type() == RegType::vgpr)
4653       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4654                         Operand::c32(-1u), Operand::c32(rsrc_conf));
4655    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
4656                      Operand::c32(rsrc_conf));
4657 }
4658 
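/* Adds a 32-bit value to a 64-bit address. There is no full 64-bit integer
 * add, so the carry out of the low dword is propagated into the high dword
 * manually.
 */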
4659 Temp
4660 add64_32(Builder& bld, Temp src0, Temp src1)
4661 {
4662    Temp src00 = bld.tmp(src0.type(), 1);
4663    Temp src01 = bld.tmp(src0.type(), 1);
4664    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4665 
4666    if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4667       Temp dst0 = bld.tmp(v1);
4668       Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4669       Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4670       return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4671    } else {
4672       Temp carry = bld.tmp(s1);
4673       Temp dst0 =
4674          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4675       Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4676       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4677    }
4678 }
4679 
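/* Lowers address + offset + const_offset into the combination of base
 * address, register offset and immediate offset that the target's global
 * addressing mode actually supports, folding any excess immediate into the
 * address or the register offset.
 */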
4680 void
4681 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4682                      uint32_t* const_offset_inout, Temp* offset_inout)
4683 {
4684    Temp address = *address_inout;
4685    uint64_t const_offset = *const_offset_inout + offset_in;
4686    Temp offset = *offset_inout;
4687 
4688    uint64_t max_const_offset_plus_one =
4689       1; /* GFX7/8: FLAT loads do not support constant offsets */
4690    if (bld.program->gfx_level >= GFX9)
4691       max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4692    else if (bld.program->gfx_level == GFX6)
4693       max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4694    uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4695    const_offset %= max_const_offset_plus_one;
4696 
4697    if (!offset.id()) {
4698       while (unlikely(excess_offset > UINT32_MAX)) {
4699          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4700          excess_offset -= UINT32_MAX;
4701       }
4702       if (excess_offset)
4703          offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4704    } else {
4705       /* If we added to "offset", we would transform the intended
4706        * "address + u2u64(offset) + u2u64(const_offset)" into
4707        * "address + u2u64(offset + const_offset)", so add to the address instead.
4708        * This could be more efficient when excess_offset > UINT32_MAX by doing a
4709        * full 64-bit addition, but that case should be really rare.
4710        */
4711       while (excess_offset) {
4712          uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4713          address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4714          excess_offset -= src2;
4715       }
4716    }
4717 
4718    if (bld.program->gfx_level == GFX6) {
4719       /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4720       if (offset.type() != RegType::sgpr) {
4721          address = add64_32(bld, address, offset);
4722          offset = Temp();
4723       }
4724       offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4725    } else if (bld.program->gfx_level <= GFX8) {
4726       /* GFX7,8 (FLAT): VGPR address */
4727       if (offset.id()) {
4728          address = add64_32(bld, address, offset);
4729          offset = Temp();
4730       }
4731       address = as_vgpr(bld, address);
4732    } else {
4733       /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4734       if (address.type() == RegType::vgpr && offset.id()) {
4735          address = add64_32(bld, address, offset);
4736          offset = Temp();
4737       } else if (address.type() == RegType::sgpr && offset.id()) {
4738          offset = as_vgpr(bld, offset);
4739       }
4740       if (address.type() == RegType::sgpr && !offset.id())
4741          offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4742    }
4743 
4744    *address_inout = address;
4745    *const_offset_inout = const_offset;
4746    *offset_inout = offset;
4747 }
4748 
4749 Temp
4750 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4751                      unsigned align_, unsigned const_offset, Temp dst_hint)
4752 {
4753    Temp addr = info.resource;
4754    if (!addr.id()) {
4755       addr = offset;
4756       offset = Temp();
4757    }
4758    lower_global_address(bld, 0, &addr, &const_offset, &offset);
4759 
4760    unsigned bytes_size = 0;
4761    bool use_mubuf = bld.program->gfx_level == GFX6;
4762    bool global = bld.program->gfx_level >= GFX9;
4763    aco_opcode op;
4764    if (bytes_needed == 1 || align_ % 2u) {
4765       bytes_size = 1;
4766       op = use_mubuf ? aco_opcode::buffer_load_ubyte
4767            : global  ? aco_opcode::global_load_ubyte
4768                      : aco_opcode::flat_load_ubyte;
4769    } else if (bytes_needed == 2 || align_ % 4u) {
4770       bytes_size = 2;
4771       op = use_mubuf ? aco_opcode::buffer_load_ushort
4772            : global  ? aco_opcode::global_load_ushort
4773                      : aco_opcode::flat_load_ushort;
4774    } else if (bytes_needed <= 4) {
4775       bytes_size = 4;
4776       op = use_mubuf ? aco_opcode::buffer_load_dword
4777            : global  ? aco_opcode::global_load_dword
4778                      : aco_opcode::flat_load_dword;
4779    } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
4780       bytes_size = 8;
4781       op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4782            : global  ? aco_opcode::global_load_dwordx2
4783                      : aco_opcode::flat_load_dwordx2;
4784    } else if (bytes_needed <= 12 && !use_mubuf) {
4785       bytes_size = 12;
4786       op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4787    } else {
4788       bytes_size = 16;
4789       op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4790            : global  ? aco_opcode::global_load_dwordx4
4791                      : aco_opcode::flat_load_dwordx4;
4792    }
4793    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4794    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4795    if (use_mubuf) {
4796       aco_ptr<MUBUF_instruction> mubuf{
4797          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4798       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
4799       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
4800       mubuf->operands[2] = Operand(offset);
4801       mubuf->glc = info.glc;
4802       mubuf->dlc = false;
4803       mubuf->offset = const_offset;
4804       mubuf->addr64 = addr.type() == RegType::vgpr;
4805       mubuf->disable_wqm = false;
4806       mubuf->sync = info.sync;
4807       mubuf->definitions[0] = Definition(val);
4808       bld.insert(std::move(mubuf));
4809    } else {
4810       aco_ptr<FLAT_instruction> flat{
4811          create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4812       if (addr.regClass() == s2) {
4813          assert(global && offset.id() && offset.type() == RegType::vgpr);
4814          flat->operands[0] = Operand(offset);
4815          flat->operands[1] = Operand(addr);
4816       } else {
4817          assert(addr.type() == RegType::vgpr && !offset.id());
4818          flat->operands[0] = Operand(addr);
4819          flat->operands[1] = Operand(s1);
4820       }
4821       flat->glc = info.glc;
4822       flat->dlc =
4823          info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4824       flat->sync = info.sync;
4825       assert(global || !const_offset);
4826       flat->offset = const_offset;
4827       flat->definitions[0] = Definition(val);
4828       bld.insert(std::move(flat));
4829    }
4830 
4831    return val;
4832 }
4833 
4834 const EmitLoadParameters global_load_params{global_load_callback, true, true, UINT32_MAX};
4835 
4836 Temp
4837 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4838          Temp address, unsigned base_offset, unsigned align)
4839 {
4840    assert(util_is_power_of_two_nonzero(align));
4841 
4842    Builder bld(ctx->program, ctx->block);
4843 
4844    LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4845    info.align_mul = align;
4846    info.align_offset = 0;
4847    info.sync = memory_sync_info(storage_shared);
4848    info.const_offset = base_offset;
4849    /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses,
4850     * if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order
4851     * to avoid copy-propagation.
4852     */
4853    info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
4854                                     ctx->program->wave_size == 64 &&
4855                                     ctx->program->workgroup_size > 64;
4856    emit_load(ctx, bld, info, lds_load_params);
4857 
4858    return dst;
4859 }
4860 
4861 void
4862 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4863                  Temp src)
4864 {
4865    if (!count)
4866       return;
4867 
4868    Builder bld(ctx->program, ctx->block);
4869 
4870    /* count == 1 fast path */
4871    if (count == 1) {
4872       if (dst_type == RegType::sgpr)
4873          dst[0] = bld.as_uniform(src);
4874       else
4875          dst[0] = as_vgpr(ctx, src);
4876       return;
4877    }
4878 
4879    /* elem_size_bytes is the greatest common divisor which is a power of 2 */
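   /* Illustrative example: sizes {4, 12} together with the seed of 8 give
    * (4 | 12 | 8) == 0b1100; its lowest set bit, 4, is the largest power of
    * two dividing every size (the seed caps the result at 8 bytes). */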
4880    unsigned elem_size_bytes =
4881       1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4882 
4883    ASSERTED bool is_subdword = elem_size_bytes < 4;
4884    assert(!is_subdword || dst_type == RegType::vgpr);
4885 
4886    for (unsigned i = 0; i < count; i++)
4887       dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4888 
4889    std::vector<Temp> temps;
4890    /* use allocated_vec if possible */
4891    auto it = ctx->allocated_vec.find(src.id());
4892    if (it != ctx->allocated_vec.end()) {
4893       if (!it->second[0].id())
4894          goto split;
4895       unsigned elem_size = it->second[0].bytes();
4896       assert(src.bytes() % elem_size == 0);
4897 
4898       for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4899          if (!it->second[i].id())
4900             goto split;
4901       }
4902       if (elem_size_bytes % elem_size)
4903          goto split;
4904 
4905       temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4906       elem_size_bytes = elem_size;
4907    }
4908 
4909 split:
4910    /* split src if necessary */
4911    if (temps.empty()) {
4912       if (is_subdword && src.type() == RegType::sgpr)
4913          src = as_vgpr(ctx, src);
4914       if (dst_type == RegType::sgpr)
4915          src = bld.as_uniform(src);
4916 
4917       unsigned num_elems = src.bytes() / elem_size_bytes;
4918       aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4919          aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4920       split->operands[0] = Operand(src);
4921       for (unsigned i = 0; i < num_elems; i++) {
4922          temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4923          split->definitions[i] = Definition(temps.back());
4924       }
4925       bld.insert(std::move(split));
4926    }
4927 
4928    unsigned idx = 0;
4929    for (unsigned i = 0; i < count; i++) {
4930       unsigned op_count = dst[i].bytes() / elem_size_bytes;
4931       if (op_count == 1) {
4932          if (dst_type == RegType::sgpr)
4933             dst[i] = bld.as_uniform(temps[idx++]);
4934          else
4935             dst[i] = as_vgpr(ctx, temps[idx++]);
4936          continue;
4937       }
4938 
4939       aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4940                                                                       Format::PSEUDO, op_count, 1)};
4941       for (unsigned j = 0; j < op_count; j++) {
4942          Temp tmp = temps[idx++];
4943          if (dst_type == RegType::sgpr)
4944             tmp = bld.as_uniform(tmp);
4945          vec->operands[j] = Operand(tmp);
4946       }
4947       vec->definitions[0] = Definition(dst[i]);
4948       bld.insert(std::move(vec));
4949    }
4950    return;
4951 }
4952 
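/* Scan the consecutive byte range of "todo_mask" that starts at its lowest set
 * bit. Returns true if those bytes are set in "mask" (i.e. they should be
 * written) and false if they form a gap to be skipped; "start" and "count"
 * receive the range either way.
 */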
4953 bool
4954 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4955 {
4956    unsigned start_elem = ffs(todo_mask) - 1;
4957    bool skip = !(mask & (1 << start_elem));
4958    if (skip)
4959       mask = ~mask & todo_mask;
4960 
4961    mask &= todo_mask;
4962 
4963    u_bit_scan_consecutive_range(&mask, start, count);
4964 
4965    return !skip;
4966 }
4967 
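/* Clear the consumed range from "todo_mask". Due to operator precedence this
 * clears bits [0, start + count), which is fine because scan_write_mask()
 * always starts at the lowest set bit of "todo_mask".
 */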
4968 void
4969 advance_write_mask(uint32_t* todo_mask, int start, int count)
4970 {
4971    *todo_mask &= ~u_bit_consecutive(0, count) << start;
4972 }
4973 
4974 void
4975 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4976           unsigned base_offset, unsigned align)
4977 {
4978    assert(util_is_power_of_two_nonzero(align));
4979    assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4980 
4981    Builder bld(ctx->program, ctx->block);
4982    bool large_ds_write = ctx->options->gfx_level >= GFX7;
4983    bool usable_write2 = ctx->options->gfx_level >= GFX7;
4984 
4985    unsigned write_count = 0;
4986    Temp write_datas[32];
4987    unsigned offsets[32];
4988    unsigned bytes[32];
4989    aco_opcode opcodes[32];
4990 
4991    wrmask = util_widen_mask(wrmask, elem_size_bytes);
4992 
4993    const unsigned wrmask_bitcnt = util_bitcount(wrmask);
4994    uint32_t todo = u_bit_consecutive(0, data.bytes());
4995 
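   /* If the writemask is one contiguous range starting at byte 0, trim "todo"
    * so the trailing unwritten bytes are never visited, not even as skips.
    */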
4996    if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
4997       todo = MIN2(todo, wrmask);
4998 
4999    while (todo) {
5000       int offset, byte;
5001       if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
5002          offsets[write_count] = offset;
5003          bytes[write_count] = byte;
5004          opcodes[write_count] = aco_opcode::num_opcodes;
5005          write_count++;
5006          advance_write_mask(&todo, offset, byte);
5007          continue;
5008       }
5009 
5010       bool aligned2 = offset % 2 == 0 && align % 2 == 0;
5011       bool aligned4 = offset % 4 == 0 && align % 4 == 0;
5012       bool aligned8 = offset % 8 == 0 && align % 8 == 0;
5013       bool aligned16 = offset % 16 == 0 && align % 16 == 0;
5014 
5015       // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
5016       aco_opcode op = aco_opcode::num_opcodes;
5017       if (byte >= 16 && aligned16 && large_ds_write) {
5018          op = aco_opcode::ds_write_b128;
5019          byte = 16;
5020       } else if (byte >= 12 && aligned16 && large_ds_write) {
5021          op = aco_opcode::ds_write_b96;
5022          byte = 12;
5023       } else if (byte >= 8 && aligned8) {
5024          op = aco_opcode::ds_write_b64;
5025          byte = 8;
5026       } else if (byte >= 4 && aligned4) {
5027          op = aco_opcode::ds_write_b32;
5028          byte = 4;
5029       } else if (byte >= 2 && aligned2) {
5030          op = aco_opcode::ds_write_b16;
5031          byte = 2;
5032       } else if (byte >= 1) {
5033          op = aco_opcode::ds_write_b8;
5034          byte = 1;
5035       } else {
5036          assert(false);
5037       }
5038 
5039       offsets[write_count] = offset;
5040       bytes[write_count] = byte;
5041       opcodes[write_count] = op;
5042       write_count++;
5043       advance_write_mask(&todo, offset, byte);
5044    }
5045 
5046    Operand m = load_lds_size_m0(bld);
5047 
5048    split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
5049 
5050    for (unsigned i = 0; i < write_count; i++) {
5051       aco_opcode op = opcodes[i];
5052       if (op == aco_opcode::num_opcodes)
5053          continue;
5054 
5055       Temp split_data = write_datas[i];
5056 
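      /* Try to merge this write with a later one of the same size into a single
       * ds_write2_b32/b64, which encodes two 8-bit offsets in units of the
       * element size.
       */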
5057       unsigned second = write_count;
5058       if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
5059          for (second = i + 1; second < write_count; second++) {
5060             if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
5061                op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
5062                opcodes[second] = aco_opcode::num_opcodes;
5063                break;
5064             }
5065          }
5066       }
5067 
5068       bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
5069       unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
5070 
5071       unsigned inline_offset = base_offset + offsets[i];
5072       unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
5073       Temp address_offset = address;
5074       if (inline_offset > max_offset) {
5075          address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
5076          inline_offset = offsets[i];
5077       }
5078 
5079       /* offsets[i] shouldn't be large enough for this to happen */
5080       assert(inline_offset <= max_offset);
5081 
5082       Instruction* instr;
5083       if (write2) {
5084          Temp second_data = write_datas[second];
5085          inline_offset /= split_data.bytes();
5086          instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
5087                         inline_offset + write2_off);
5088       } else {
5089          instr = bld.ds(op, address_offset, split_data, m, inline_offset);
5090       }
5091       instr->ds().sync = memory_sync_info(storage_shared);
5092 
5093       if (m.isUndefined())
5094          instr->operands.pop_back();
5095    }
5096 }
5097 
5098 aco_opcode
5099 get_buffer_store_op(unsigned bytes)
5100 {
5101    switch (bytes) {
5102    case 1: return aco_opcode::buffer_store_byte;
5103    case 2: return aco_opcode::buffer_store_short;
5104    case 4: return aco_opcode::buffer_store_dword;
5105    case 8: return aco_opcode::buffer_store_dwordx2;
5106    case 12: return aco_opcode::buffer_store_dwordx3;
5107    case 16: return aco_opcode::buffer_store_dwordx4;
5108    }
5109    unreachable("Unexpected store size");
5110    return aco_opcode::num_opcodes;
5111 }
5112 
5113 void
5114 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
5115                    Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
5116                    Temp* write_datas, unsigned* offsets)
5117 {
5118    unsigned write_count_with_skips = 0;
5119    bool skips[16];
5120    unsigned bytes[16];
5121 
5122    /* determine how to split the data */
5123    unsigned todo = u_bit_consecutive(0, data.bytes());
5124    while (todo) {
5125       int offset, byte;
5126       skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
5127       offsets[write_count_with_skips] = offset;
5128       if (skips[write_count_with_skips]) {
5129          bytes[write_count_with_skips] = byte;
5130          advance_write_mask(&todo, offset, byte);
5131          write_count_with_skips++;
5132          continue;
5133       }
5134 
5135       /* The only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, and they
5136        * can't be larger than swizzle_element_size. */
5137       byte = MIN2(byte, swizzle_element_size);
5138       if (byte % 4)
5139          byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
5140 
5141       /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5142       if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5143          byte = 8;
5144 
5145       /* dword or larger stores have to be dword-aligned */
5146       unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5147       unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5148       bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5149       if (!dword_aligned)
5150          byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5151 
5152       bytes[write_count_with_skips] = byte;
5153       advance_write_mask(&todo, offset, byte);
5154       write_count_with_skips++;
5155    }
5156 
5157    /* actually split data */
5158    split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5159 
5160    /* remove skips */
5161    for (unsigned i = 0; i < write_count_with_skips; i++) {
5162       if (skips[i])
5163          continue;
5164       write_datas[*write_count] = write_datas[i];
5165       offsets[*write_count] = offsets[i];
5166       (*write_count)++;
5167    }
5168 }
5169 
5170 Temp
5171 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5172                       unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5173 {
5174    Builder bld(ctx->program, ctx->block);
5175    unsigned dword_size = elem_size_bytes / 4;
5176 
5177    if (!dst.id())
5178       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5179 
5180    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5181    aco_ptr<Pseudo_instruction> instr{
5182       create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5183    instr->definitions[0] = Definition(dst);
5184 
5185    for (unsigned i = 0; i < cnt; ++i) {
5186       if (arr[i].id()) {
5187          assert(arr[i].size() == dword_size);
5188          allocated_vec[i] = arr[i];
5189          instr->operands[i] = Operand(arr[i]);
5190       } else {
5191          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5192                               Operand::zero(dword_size == 2 ? 8 : 4));
5193          allocated_vec[i] = zero;
5194          instr->operands[i] = Operand(zero);
5195       }
5196    }
5197 
5198    bld.insert(std::move(instr));
5199 
5200    if (split_cnt)
5201       emit_split_vector(ctx, dst, split_cnt);
5202    else
5203       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5204 
5205    return dst;
5206 }
5207 
5208 inline unsigned
5209 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5210 {
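   /* MUBUF's immediate offset field is a 12-bit unsigned value (0..4095), so
    * fold the multiple-of-4096 part of the constant offset into voffset and
    * keep only the remainder.
    */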
5211    if (const_offset >= 4096) {
5212       unsigned excess_const_offset = const_offset / 4096u * 4096u;
5213       const_offset %= 4096u;
5214 
5215       if (!voffset.id())
5216          voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5217       else if (unlikely(voffset.regClass() == s1))
5218          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5219                             Operand::c32(excess_const_offset), Operand(voffset));
5220       else if (likely(voffset.regClass() == v1))
5221          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5222       else
5223          unreachable("Unsupported register class of voffset");
5224    }
5225 
5226    return const_offset;
5227 }
5228 
5229 void
5230 emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5231                         Temp vdata, unsigned const_offset, memory_sync_info sync, bool glc,
5232                         bool slc, bool swizzled)
5233 {
5234    assert(vdata.id());
5235    assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6);
5236    assert(vdata.size() >= 1 && vdata.size() <= 4);
5237 
5238    Builder bld(ctx->program, ctx->block);
5239    aco_opcode op = get_buffer_store_op(vdata.bytes());
5240    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
5241 
5242    bool offen = voffset.id();
5243    bool idxen = idx.id();
5244 
5245    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
5246    glc &= ctx->program->gfx_level < GFX11;
5247 
5248    Operand vaddr_op(v1);
5249    if (offen && idxen)
5250       vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, voffset);
5251    else if (offen)
5252       vaddr_op = Operand(voffset);
5253    else if (idxen)
5254       vaddr_op = Operand(idx);
5255 
5256    Builder::Result r =
5257       bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset, offen,
5258                 swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc,
5259                 /* dlc*/ false, slc);
5260 
5261    r->mubuf().sync = sync;
5262 }
5263 
5264 void
5265 store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5266                  unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
5267                  bool swizzled, memory_sync_info sync, bool glc, bool slc)
5268 {
5269    Builder bld(ctx->program, ctx->block);
5270    assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
5271           elem_size_bytes == 8);
5272    assert(write_mask);
5273    write_mask = util_widen_mask(write_mask, elem_size_bytes);
5274 
5275    unsigned write_count = 0;
5276    Temp write_datas[32];
5277    unsigned offsets[32];
5278    split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
5279                       swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
5280                       write_datas, offsets);
5281 
5282    for (unsigned i = 0; i < write_count; i++) {
5283       unsigned const_offset = offsets[i] + base_const_offset;
5284       emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset,
5285                               sync, glc, slc, swizzled);
5286    }
5287 }
5288 
5289 Temp
5290 wave_id_in_threadgroup(isel_context* ctx)
5291 {
5292    Builder bld(ctx->program, ctx->block);
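   /* s_bfe_u32's second source encodes offset | (width << 16), so
    * 24 | (4 << 16) extracts bits [24:27] of merged_wave_info, i.e. the wave
    * id within the threadgroup.
    */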
5293    return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
5294                    get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(24u | (4u << 16)));
5295 }
5296 
5297 Temp
5298 thread_id_in_threadgroup(isel_context* ctx)
5299 {
5300    /* tid_in_tg = wave_id * wave_size + tid_in_wave */
5301 
5302    Builder bld(ctx->program, ctx->block);
5303    Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
5304 
5305    if (ctx->program->workgroup_size <= ctx->program->wave_size)
5306       return tid_in_wave;
5307 
5308    Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
5309    Temp num_pre_threads =
5310       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
5311                Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
5312    return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
5313 }
5314 
5315 bool
5316 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5317 {
5318    unsigned write_mask = nir_intrinsic_write_mask(instr);
5319    unsigned component = nir_intrinsic_component(instr);
5320    nir_src offset = *nir_get_io_offset_src(instr);
5321 
5322    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5323       return false;
5324 
5325    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5326 
5327    if (instr->src[0].ssa->bit_size == 64)
5328       write_mask = util_widen_mask(write_mask, 2);
5329 
5330    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5331 
5332    /* Use the semantic location as the index. radv already uses it as the
5333     * intrinsic base, but radeonsi does not. LS output and TCS input indices
5334     * need to match each other, so use the semantic location explicitly. The
5335     * TCS epilog also indexes tess factor temps by semantic location directly.
5336     */
5337    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5338    unsigned base = sem.location;
5339    if (ctx->stage == fragment_fs) {
5340       /* The color result is a legacy slot which never appears together with
5341        * the data result. Here we just use the data slot for it to simplify
5342        * code handling for both of them.
5343        */
5344       if (base == FRAG_RESULT_COLOR)
5345          base = FRAG_RESULT_DATA0;
5346 
5347       /* The second output of dual-source blending just uses the data1 slot,
5348        * because dual-source blending does not support multiple render targets.
5349        */
5350       base += sem.dual_source_blend_index;
5351    }
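   /* Each semantic location occupies four component slots in ctx->outputs. */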
5352    unsigned idx = base * 4u + component;
5353 
5354    for (unsigned i = 0; i < 8; ++i) {
5355       if (write_mask & (1 << i)) {
5356          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5357          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5358       }
5359       idx++;
5360    }
5361 
5362    if (ctx->stage == fragment_fs && ctx->program->info.has_epilog && base >= FRAG_RESULT_DATA0) {
5363       unsigned index = base - FRAG_RESULT_DATA0;
5364 
5365       if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5366          ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5367       } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5368          ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5369       } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5370          ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5371       }
5372    }
5373 
5374    return true;
5375 }
5376 
5377 bool
5378 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5379 {
5380    /* Only TCS per-vertex inputs are supported by this function.
5381     * Per-vertex inputs only match between the VS and TCS invocation ids when
5382     * the number of invocations is the same.
5383     */
5384    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5385       return false;
5386 
5387    nir_src* off_src = nir_get_io_offset_src(instr);
5388    nir_src* vertex_index_src = nir_get_io_arrayed_index_src(instr);
5389    nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
5390    bool can_use_temps =
5391       nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
5392       nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
5393 
5394    if (!can_use_temps)
5395       return false;
5396 
5397    nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5398 
5399    unsigned idx =
5400       sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5401    Temp* src = &ctx->inputs.temps[idx];
5402    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5403 
5404    return true;
5405 }
5406 
5407 void
5408 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5409 {
5410    /* LS passes outputs to TCS through temps if they have the same in/out patch size. */
5411    bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5412                          ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5413 
5414    bool tcs_need_output = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL &&
5415                           ctx->program->info.has_epilog &&
5416                           ctx->program->info.tcs.pass_tessfactors_by_reg;
5417 
5418    bool ps_need_output = ctx->stage == fragment_fs;
5419 
5420    if (ls_need_output || tcs_need_output || ps_need_output) {
5421       bool stored_to_temps = store_output_to_temps(ctx, instr);
5422       if (!stored_to_temps) {
5423          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5424          abort();
5425       }
5426    } else {
5427       unreachable("Shader stage not implemented");
5428    }
5429 }
5430 
5431 bool
5432 in_exec_divergent_or_in_loop(isel_context* ctx)
5433 {
5434    return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5435           ctx->cf_info.had_divergent_discard;
5436 }
5437 
5438 void
5439 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5440                         Temp prim_mask)
5441 {
5442    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5443    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5444 
5445    Builder bld(ctx->program, ctx->block);
5446 
5447    if (in_exec_divergent_or_in_loop(ctx)) {
5448       Operand prim_mask_op = bld.m0(prim_mask);
5449       prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5450       Operand coord2_op(coord2);
5451       coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
5452       bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5453                  Operand::c32(idx), Operand::c32(component), coord1, coord2_op, prim_mask_op);
5454       return;
5455    }
5456 
5457    Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5458 
5459    Temp res;
5460    if (dst.regClass() == v2b) {
5461       Temp p10 =
5462          bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p);
5463       res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10);
5464       emit_extract_vector(ctx, res, 0, dst);
5465    } else {
5466       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5467       bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
5468    }
5469    /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5470    set_wqm(ctx, true);
5471 }
5472 
5473 void
5474 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5475                   Temp prim_mask)
5476 {
5477    if (ctx->options->gfx_level >= GFX11) {
5478       emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask);
5479       return;
5480    }
5481 
5482    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5483    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5484 
5485    Builder bld(ctx->program, ctx->block);
5486 
5487    if (dst.regClass() == v2b) {
5488       if (ctx->program->dev.has_16bank_lds) {
5489          assert(ctx->options->gfx_level <= GFX8);
5490          Builder::Result interp_p1 =
5491             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5492                        bld.m0(prim_mask), idx, component);
5493          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
5494                                 bld.m0(prim_mask), interp_p1, idx, component);
5495          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5496                     interp_p1, idx, component);
5497       } else {
5498          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5499 
5500          if (ctx->options->gfx_level == GFX8)
5501             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5502 
5503          Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5504                                                 bld.m0(prim_mask), idx, component);
5505          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5506                     component);
5507       }
5508    } else {
5509       Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5510                                              bld.m0(prim_mask), idx, component);
5511 
5512       if (ctx->program->dev.has_16bank_lds)
5513          interp_p1->operands[0].setLateKill(true);
5514 
5515       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5516                  idx, component);
5517    }
5518 }
5519 
5520 void
5521 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5522                       Temp dst, Temp prim_mask)
5523 {
5524    Builder bld(ctx->program, ctx->block);
5525    if (ctx->options->gfx_level >= GFX11) {
5526       uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5527       if (in_exec_divergent_or_in_loop(ctx)) {
5528          Operand prim_mask_op = bld.m0(prim_mask);
5529          prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5530          bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5531                     Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5532                     prim_mask_op);
5533       } else {
5534          Temp p =
5535             bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5536          if (dst.regClass() == v2b) {
5537             Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
5538             emit_extract_vector(ctx, res, 0, dst);
5539          } else {
5540             bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
5541          }
5542          /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5543          set_wqm(ctx, true);
5544       }
5545    } else {
5546       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32((vertex_id + 2) % 3),
5547                  bld.m0(prim_mask), idx, component);
5548    }
5549 }
5550 
5551 void
5552 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
5553 {
5554    Builder bld(ctx->program, ctx->block);
5555 
5556    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5557       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
5558    for (unsigned i = 0; i < num_components; i++) {
5559       if (ctx->args->frag_pos[i].used)
5560          vec->operands[i] = Operand(get_arg(ctx, ctx->args->frag_pos[i]));
5561       else
5562          vec->operands[i] = Operand(v1);
5563    }
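   /* gl_FragCoord.w is defined as 1/W, so take the reciprocal of the
    * per-pixel W the hardware provides in frag_pos[3].
    */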
5564    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
5565       assert(num_components == 4);
5566       vec->operands[3] =
5567          bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->frag_pos[3]));
5568    }
5569 
5570    for (Operand& op : vec->operands)
5571       op = op.isUndefined() ? Operand::zero() : op;
5572 
5573    vec->definitions[0] = Definition(dst);
5574    ctx->block->instructions.emplace_back(std::move(vec));
5575    emit_split_vector(ctx, dst, num_components);
5576    return;
5577 }
5578 
5579 void
5580 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
5581 {
5582    Builder bld(ctx->program, ctx->block);
5583    Temp cond;
5584 
5585    /* VRS Rate X = Ancillary[2:3]
5586     * VRS Rate Y = Ancillary[4:5]
5587     */
5588    Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5589                           Operand::c32(2u), Operand::c32(2u));
5590    Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5591                           Operand::c32(4u), Operand::c32(2u));
5592 
5593    /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
5594    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
5595    x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5596                      bld.copy(bld.def(v1), Operand::c32(4u)), cond);
5597 
5598    /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
5599    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
5600    y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5601                      bld.copy(bld.def(v1), Operand::c32(1u)), cond);
5602 
5603    bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
5604 }
5605 
5606 void
5607 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5608 {
5609    Temp dst = get_ssa_temp(ctx, &instr->def);
5610    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5611    unsigned idx = nir_intrinsic_base(instr);
5612    unsigned component = nir_intrinsic_component(instr);
5613    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5614 
5615    assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5616 
5617    if (instr->def.num_components == 1) {
5618       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
5619    } else {
5620       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5621          aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1));
5622       for (unsigned i = 0; i < instr->def.num_components; i++) {
5623          Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5624          emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
5625          vec->operands[i] = Operand(tmp);
5626       }
5627       vec->definitions[0] = Definition(dst);
5628       ctx->block->instructions.emplace_back(std::move(vec));
5629    }
5630 }
5631 
5632 Temp
5633 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5634                     unsigned alignment, unsigned const_offset, Temp dst_hint)
5635 {
5636    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5637    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5638 
5639    if (info.soffset.id()) {
5640       if (soffset.isTemp())
5641          vaddr = bld.copy(bld.def(v1), soffset);
5642       soffset = Operand(info.soffset);
5643    }
5644 
5645    if (soffset.isUndefined())
5646       soffset = Operand::zero();
5647 
5648    const bool offen = !vaddr.isUndefined();
5649    const bool idxen = info.idx.id();
5650 
5651    if (offen && idxen)
5652       vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5653    else if (idxen)
5654       vaddr = Operand(info.idx);
5655 
5656    /* Determine number of fetched components.
5657     * Note: ACO IR works with GFX6-8 nfmt + dfmt fields; these are later converted for GFX10+.
5658     */
5659    const struct ac_vtx_format_info* vtx_info =
5660       ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5661    /* The number of channels in the format determines the memory range. */
5662    const unsigned max_components = vtx_info->num_channels;
5663    /* Calculate maximum number of components loaded according to alignment. */
5664    unsigned max_fetched_components = bytes_needed / info.component_size;
5665    max_fetched_components =
5666       ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5667                              alignment, max_fetched_components);
5668    const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5669    /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5670     * If a larger format is selected, it's still OK to load a smaller amount from it.
5671     */
5672    bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5673    unsigned bytes_size = 0;
5674    const unsigned bit_size = info.component_size * 8;
5675    aco_opcode op = aco_opcode::num_opcodes;
5676 
5677    if (bytes_needed == 2) {
5678       bytes_size = 2;
5679       op = aco_opcode::tbuffer_load_format_d16_x;
5680    } else if (bytes_needed <= 4) {
5681       bytes_size = 4;
5682       if (bit_size == 16)
5683          op = aco_opcode::tbuffer_load_format_d16_xy;
5684       else
5685          op = aco_opcode::tbuffer_load_format_x;
5686    } else if (bytes_needed <= 6) {
5687       bytes_size = 6;
5688       if (bit_size == 16)
5689          op = aco_opcode::tbuffer_load_format_d16_xyz;
5690       else
5691          op = aco_opcode::tbuffer_load_format_xy;
5692    } else if (bytes_needed <= 8) {
5693       bytes_size = 8;
5694       if (bit_size == 16)
5695          op = aco_opcode::tbuffer_load_format_d16_xyzw;
5696       else
5697          op = aco_opcode::tbuffer_load_format_xy;
5698    } else if (bytes_needed <= 12) {
5699       bytes_size = 12;
5700       op = aco_opcode::tbuffer_load_format_xyz;
5701    } else {
5702       bytes_size = 16;
5703       op = aco_opcode::tbuffer_load_format_xyzw;
5704    }
5705 
5706    /* Abort when no suitable opcode was found so we don't compile buggy shaders. */
5707    if (op == aco_opcode::num_opcodes) {
5708       aco_err(bld.program, "unsupported bit size for typed buffer load");
5709       abort();
5710    }
5711 
5712    aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(op, Format::MTBUF, 3, 1)};
5713    mtbuf->operands[0] = Operand(info.resource);
5714    mtbuf->operands[1] = vaddr;
5715    mtbuf->operands[2] = soffset;
5716    mtbuf->offen = offen;
5717    mtbuf->idxen = idxen;
5718    mtbuf->glc = info.glc;
5719    mtbuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
5720    mtbuf->slc = info.slc;
5721    mtbuf->sync = info.sync;
5722    mtbuf->offset = const_offset;
5723    mtbuf->dfmt = fetch_fmt & 0xf;
5724    mtbuf->nfmt = fetch_fmt >> 4;
5725    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5726    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5727    mtbuf->definitions[0] = Definition(val);
5728    bld.insert(std::move(mtbuf));
5729 
5730    return val;
5731 }
5732 
5733 const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, false, true, 4096};
5734 
5735 void
5736 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5737 {
5738    Builder bld(ctx->program, ctx->block);
5739    Temp dst = get_ssa_temp(ctx, &instr->def);
5740    nir_src offset = *nir_get_io_offset_src(instr);
5741 
5742    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5743       isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5744 
5745    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5746 
5747    unsigned idx = nir_intrinsic_base(instr);
5748    unsigned component = nir_intrinsic_component(instr);
5749    unsigned vertex_id = 0; /* P0 */
5750 
5751    if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5752       vertex_id = nir_src_as_uint(instr->src[0]);
5753 
5754    if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5755       emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask);
5756    } else {
5757       unsigned num_components = instr->def.num_components;
5758       if (instr->def.bit_size == 64)
5759          num_components *= 2;
5760       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5761          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5762       for (unsigned i = 0; i < num_components; i++) {
5763          unsigned chan_component = (component + i) % 4;
5764          unsigned chan_idx = idx + (component + i) / 4;
5765          vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5766          emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5767                                prim_mask);
5768       }
5769       vec->definitions[0] = Definition(dst);
5770       bld.insert(std::move(vec));
5771    }
5772 }
5773 
5774 void
5775 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5776 {
5777    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5778 
5779    Builder bld(ctx->program, ctx->block);
5780    Temp dst = get_ssa_temp(ctx, &instr->def);
5781 
5782    if (load_input_from_temps(ctx, instr, dst))
5783       return;
5784 
5785    unreachable("LDS-based TCS input should have been lowered in NIR.");
5786 }
5787 
5788 void
5789 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5790 {
5791    switch (ctx->shader->info.stage) {
5792    case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5793    default: unreachable("Unimplemented shader stage");
5794    }
5795 }
5796 
5797 void
5798 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5799 {
5800    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5801 
5802    Builder bld(ctx->program, ctx->block);
5803    Temp dst = get_ssa_temp(ctx, &instr->def);
5804 
5805    Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5806    Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5807    Operand tes_w = Operand::zero();
5808 
5809    if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
5810       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5811       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5812       tes_w = Operand(tmp);
5813    }
5814 
5815    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5816    emit_split_vector(ctx, tess_coord, 3);
5817 }
5818 
5819 void
5820 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5821             Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5822             bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5823 {
5824    Builder bld(ctx->program, ctx->block);
5825 
5826    bool use_smem =
5827       dst.type() != RegType::vgpr && (!glc || ctx->options->gfx_level >= GFX8) && allow_smem;
5828    if (use_smem)
5829       offset = bld.as_uniform(offset);
5830    else {
5831       /* GFX6-7 are affected by a hw bug that prevents address clamping from
5832        * working correctly when the SGPR offset is used.
5833        */
5834       if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5835          offset = as_vgpr(ctx, offset);
5836    }
5837 
5838    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5839    info.glc = glc;
5840    info.sync = sync;
5841    info.align_mul = align_mul;
5842    info.align_offset = align_offset;
5843    if (use_smem)
5844       emit_load(ctx, bld, info, smem_load_params);
5845    else
5846       emit_load(ctx, bld, info, mubuf_load_params);
5847 }
5848 
5849 void
5850 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5851 {
5852    Temp dst = get_ssa_temp(ctx, &instr->def);
5853    Builder bld(ctx->program, ctx->block);
5854    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5855 
5856    unsigned size = instr->def.bit_size / 8;
5857    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5858                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5859 }
5860 
5861 void
5862 visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5863 {
5864    Builder bld(ctx->program, ctx->block);
5865    Temp dst = get_ssa_temp(ctx, &instr->def);
5866    unsigned offset = nir_intrinsic_base(instr);
5867    unsigned count = instr->def.num_components;
5868    nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5869 
5870    if (instr->def.bit_size == 64)
5871       count *= 2;
5872 
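   /* Fast path: if the whole range is covered by inline push constants that
    * were preloaded into SGPR arguments, assemble the vector directly from
    * those arguments instead of loading from memory.
    */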
5873    if (index_cv && instr->def.bit_size >= 32) {
5874       unsigned start = (offset + index_cv->u32) / 4u;
5875       uint64_t mask = BITFIELD64_MASK(count) << start;
5876       if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask &&
5877           start + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) {
5878          std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5879          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5880             aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5881          unsigned arg_index =
5882             util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(start));
5883          for (unsigned i = 0; i < count; ++i) {
5884             elems[i] = get_arg(ctx, ctx->args->inline_push_consts[arg_index++]);
5885             vec->operands[i] = Operand{elems[i]};
5886          }
5887          vec->definitions[0] = Definition(dst);
5888          ctx->block->instructions.emplace_back(std::move(vec));
5889          ctx->allocated_vec.emplace(dst.id(), elems);
5890          return;
5891       }
5892    }
5893 
5894    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5895    if (offset != 0) // TODO check if index != 0 as well
5896       index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5897                              Operand::c32(offset), index);
5898    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->push_constants));
5899    Temp vec = dst;
5900    bool trim = false;
5901    bool aligned = true;
5902 
5903    if (instr->def.bit_size == 8) {
5904       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5905       bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5906       if (!aligned)
5907          vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5908    } else if (instr->def.bit_size == 16) {
5909       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5910       if (!aligned)
5911          vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5912    }
5913 
5914    aco_opcode op;
5915 
5916    switch (vec.size()) {
5917    case 1: op = aco_opcode::s_load_dword; break;
5918    case 2: op = aco_opcode::s_load_dwordx2; break;
5919    case 3:
5920       vec = bld.tmp(s4);
5921       trim = true;
5922       FALLTHROUGH;
5923    case 4: op = aco_opcode::s_load_dwordx4; break;
5924    case 6:
5925       vec = bld.tmp(s8);
5926       trim = true;
5927       FALLTHROUGH;
5928    case 8: op = aco_opcode::s_load_dwordx8; break;
5929    default: unreachable("unimplemented or forbidden load_push_constant.");
5930    }
5931 
5932    bld.smem(op, Definition(vec), ptr, index);
5933 
5934    if (!aligned) {
5935       Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5936       byte_align_scalar(ctx, vec, byte_offset, dst);
5937       return;
5938    }
5939 
5940    if (trim) {
5941       emit_split_vector(ctx, vec, 4);
5942       RegClass rc = dst.size() == 3 ? s1 : s2;
5943       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5944                  emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5945    }
5946    emit_split_vector(ctx, dst, instr->def.num_components);
5947 }
5948 
5949 void
5950 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5951 {
5952    Temp dst = get_ssa_temp(ctx, &instr->def);
5953 
5954    Builder bld(ctx->program, ctx->block);
5955 
5956    uint32_t desc_type =
5957       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5958       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5959    if (ctx->options->gfx_level >= GFX10) {
5960       desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5961                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5962                    S_008F0C_RESOURCE_LEVEL(ctx->options->gfx_level < GFX11);
5963    } else {
5964       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5965                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5966    }
5967 
5968    unsigned base = nir_intrinsic_base(instr);
5969    unsigned range = nir_intrinsic_range(instr);
5970 
5971    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5972    if (base && offset.type() == RegType::sgpr)
5973       offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5974                               Operand::c32(base));
5975    else if (base && offset.type() == RegType::vgpr)
5976       offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5977 
5978    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5979                           bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5980                                      Operand::c32(ctx->constant_data_offset)),
5981                           Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5982                           Operand::c32(desc_type));
5983    unsigned size = instr->def.bit_size / 8;
5984    // TODO: get alignment information for subdword constants
5985    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5986 }
5987 
5988 /* Packs multiple Temps of different sizes into a vector of v1 Temps.
5989  * The byte count of each input Temp must be a multiple of 2.
5990  */
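/* Illustrative example: inputs of 2, 4 and 2 bytes pack into two dwords,
 * {in0, in1[15:0]} and {in1[31:16], in2}.
 */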
5991 static std::vector<Temp>
5992 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
5993 {
5994    Builder bld(ctx->program, ctx->block);
5995    std::vector<Temp> packed;
5996    Temp low = Temp();
5997    for (Temp tmp : unpacked) {
5998       assert(tmp.bytes() % 2 == 0);
5999       unsigned byte_idx = 0;
6000       while (byte_idx < tmp.bytes()) {
6001          if (low != Temp()) {
6002             Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
6003             Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
6004             low = Temp();
6005             packed.push_back(dword);
6006             byte_idx += 2;
6007          } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
6008             packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
6009             byte_idx += 4;
6010          } else {
6011             low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
6012             byte_idx += 2;
6013          }
6014       }
6015    }
6016    if (low != Temp()) {
6017       Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
6018       packed.push_back(dword);
6019    }
6020    return packed;
6021 }
6022 
6023 static bool
6024 should_declare_array(ac_image_dim dim)
6025 {
6026    return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
6027           dim == ac_image_2darraymsaa;
6028 }
6029 
6030 static int
6031 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
6032 {
6033    switch (dim) {
6034    case GLSL_SAMPLER_DIM_BUF: return 1;
6035    case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
6036    case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
6037    case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
6038    case GLSL_SAMPLER_DIM_3D:
6039    case GLSL_SAMPLER_DIM_CUBE: return 3;
6040    case GLSL_SAMPLER_DIM_RECT:
6041    case GLSL_SAMPLER_DIM_SUBPASS: return 2;
6042    case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
6043    default: break;
6044    }
6045    return 0;
6046 }
6047 
6048 static MIMG_instruction*
6049 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
6050           Operand vdata = Operand(v1))
6051 {
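   /* NSA form: up to max_nsa_vgprs coordinates may live in non-sequential
    * VGPRs. GFX11+ also has a partial-NSA encoding where the remaining
    * coordinates are merged into one contiguous vector in the last operand;
    * before GFX11, coordinates that don't fit disable NSA entirely
    * (nsa_size = 0) and everything is merged below.
    */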
6052    size_t nsa_size = bld.program->dev.max_nsa_vgprs;
6053    nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
6054 
6055    const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
6056    if (strict_wqm)
6057       nsa_size = coords.size();
6058 
6059    for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
6060       if (!coords[i].id())
6061          continue;
6062 
6063       coords[i] = as_vgpr(bld, coords[i]);
6064    }
6065 
6066    if (nsa_size < coords.size()) {
6067       Temp coord = coords[nsa_size];
6068       if (coords.size() - nsa_size > 1) {
6069          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6070             aco_opcode::p_create_vector, Format::PSEUDO, coords.size() - nsa_size, 1)};
6071 
6072          unsigned coord_size = 0;
6073          for (unsigned i = nsa_size; i < coords.size(); i++) {
6074             vec->operands[i - nsa_size] = Operand(coords[i]);
6075             coord_size += coords[i].size();
6076          }
6077 
6078          coord = bld.tmp(RegType::vgpr, coord_size);
6079          vec->definitions[0] = Definition(coord);
6080          bld.insert(std::move(vec));
6081       } else {
6082          coord = as_vgpr(bld, coord);
6083       }
6084 
6085       coords[nsa_size] = coord;
6086       coords.resize(nsa_size + 1);
6087    }
6088 
6089    bool has_dst = dst.id() != 0;
6090 
6091    aco_ptr<MIMG_instruction> mimg{
6092       create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
6093    if (has_dst)
6094       mimg->definitions[0] = Definition(dst);
6095    mimg->operands[0] = Operand(rsrc);
6096    mimg->operands[1] = samp;
6097    mimg->operands[2] = vdata;
6098    for (unsigned i = 0; i < coords.size(); i++)
6099       mimg->operands[3 + i] = Operand(coords[i]);
6100    mimg->strict_wqm = strict_wqm;
6101 
6102    MIMG_instruction* res = mimg.get();
6103    bld.insert(std::move(mimg));
6104    return res;
6105 }
6106 
6107 void
6108 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6109 {
6110    Builder bld(ctx->program, ctx->block);
6111    Temp dst = get_ssa_temp(ctx, &instr->def);
6112    Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6113    Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6114    Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6115    Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6116    Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6117    Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6118 
6119    /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
6120     * There are five smaller vector groups:
6121     * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
6122     * These directly match the NIR intrinsic sources.
6123     */
6124    std::vector<Temp> args = {
6125       node, tmax, origin, dir, inv_dir,
6126    };
6127 
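   /* The grouped-vector vaddr layout above is GFX11+ only; on GFX10.3 each
    * 32-bit component is its own NSA address, so split the arguments into
    * v1 temps.
    */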
6128    if (bld.program->gfx_level == GFX10_3) {
6129       std::vector<Temp> scalar_args;
6130       for (Temp tmp : args) {
6131          for (unsigned i = 0; i < tmp.size(); i++)
6132             scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
6133       }
6134       args = std::move(scalar_args);
6135    }
6136 
6137    MIMG_instruction* mimg =
6138       emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
6139    mimg->dim = ac_image_1d;
6140    mimg->dmask = 0xf;
6141    mimg->unrm = true;
6142    mimg->r128 = true;
6143 
6144    emit_split_vector(ctx, dst, instr->def.num_components);
6145 }
6146 
6147 static std::vector<Temp>
6148 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6149 {
6150 
6151    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6152    bool a16 = instr->src[1].ssa->bit_size == 16;
6153    RegClass rc = a16 ? v2b : v1;
6154    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6155    bool is_array = nir_intrinsic_image_array(instr);
6156    ASSERTED bool add_frag_pos =
6157       (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6158    assert(!add_frag_pos && "Input attachments should be lowered.");
6159    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6160    bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6161    int count = image_type_to_components_count(dim, is_array);
6162    std::vector<Temp> coords;
6163    Builder bld(ctx->program, ctx->block);
6164 
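   /* On GFX9, 1D images are stored as 2D images internally, so a zero
    * y-coordinate has to be inserted (with the array layer staying last).
    */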
6165    if (gfx9_1d) {
6166       coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
6167       coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
6168       if (is_array)
6169          coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
6170    } else {
6171       for (int i = 0; i < count; i++)
6172          coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
6173    }
6174 
6175    bool has_lod = false;
6176    Temp lod;
6177 
6178    if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
6179        instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
6180        instr->intrinsic == nir_intrinsic_bindless_image_store) {
6181       int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
6182       assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
6183       has_lod =
6184          !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;
6185 
6186       if (has_lod)
6187          lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
6188    }
6189 
6190    if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
6191       /* The hw can't bind a slice of a 3D image as a 2D image, because it
6192        * ignores BASE_ARRAY if the target is 3D. The workaround is to read
6193        * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
6194        */
6195       assert(ctx->options->gfx_level == GFX9);
6196       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6197       Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
6198       /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
6199       Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
6200                                   Operand::c32(13u));
6201 
6202       if (has_lod) {
6203          /* If there's a lod parameter, it matters whether the image is 3d or
6204           * 2d, because the hw reads the lod from either the fourth or the third
6205           * component. So detect 3d images and place the lod in the third
6206           * component otherwise. For non-3d descriptors we effectively add the
6207           * lod twice to the coords, but the hw only reads the first one; the
6208           * second is ignored. */
6209          Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
6210          Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
6211                               Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
6212          Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
6213                                    Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
6214          first_layer =
6215             bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
6216       }
6217 
6218       if (a16)
6219          coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
6220       else
6221          coords.emplace_back(first_layer);
6222    }
6223 
6224    if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6225       assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
6226       coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
6227    }
6228 
6229    if (has_lod)
6230       coords.emplace_back(lod);
6231 
6232    return emit_pack_v1(ctx, coords);
6233 }
6234 
6235 memory_sync_info
6236 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6237 {
6238    /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6239    if (semantics & semantic_atomicrmw)
6240       return memory_sync_info(storage, semantics);
6241 
6242    unsigned access = nir_intrinsic_access(instr);
6243 
6244    if (access & ACCESS_VOLATILE)
6245       semantics |= semantic_volatile;
6246    if (access & ACCESS_CAN_REORDER)
6247       semantics |= semantic_can_reorder | semantic_private;
6248 
6249    return memory_sync_info(storage, semantics);
6250 }
6251 
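/* With TFE, the hw may leave components of the destination unwritten (e.g. on
 * a failed fetch), so the destination is tied to an extra operand that we
 * zero-initialize here.
 */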
6252 Operand
6253 emit_tfe_init(Builder& bld, Temp dst)
6254 {
6255    Temp tmp = bld.tmp(dst.regClass());
6256 
6257    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6258       aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6259    for (unsigned i = 0; i < dst.size(); i++)
6260       vec->operands[i] = Operand::zero();
6261    vec->definitions[0] = Definition(tmp);
6262    /* Since this is fixed to an instruction's definition register, any CSE will
6263     * just create copies. Copying costs about the same as zero-initialization,
6264     * but these copies can break up clauses.
6265     */
6266    vec->definitions[0].setNoCSE(true);
6267    bld.insert(std::move(vec));
6268 
6269    return Operand(tmp);
6270 }
6271 
6272 void
6273 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6274 {
6275    Builder bld(ctx->program, ctx->block);
6276    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6277    bool is_array = nir_intrinsic_image_array(instr);
6278    bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
6279    Temp dst = get_ssa_temp(ctx, &instr->def);
6280 
6281    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6282    unsigned access = nir_intrinsic_access(instr);
6283 
6284    unsigned result_size = instr->def.num_components - is_sparse;
6285    unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
6286    expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6287    if (dim == GLSL_SAMPLER_DIM_BUF)
6288       expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6289    unsigned dmask = expand_mask;
6290    if (instr->def.bit_size == 64) {
6291       expand_mask &= 0x9;
6292       /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6293       dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6294    }
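   /* e.g. for a 64-bit load where only .x and .w are read: expand_mask = 0x9
    * and dmask = 0x3 | 0xc = 0xf, i.e. four 32-bit channels are fetched. */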
6295    if (is_sparse)
6296       expand_mask |= 1 << result_size;
6297 
6298    bool d16 = instr->def.bit_size == 16;
6299    assert(!d16 || !is_sparse);
6300 
6301    unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;
6302 
6303    Temp tmp;
6304    if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
6305       tmp = dst;
6306    else
6307       tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));
6308 
6309    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6310 
6311    if (dim == GLSL_SAMPLER_DIM_BUF) {
6312       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6313 
6314       aco_opcode opcode;
6315       if (!d16) {
6316          switch (util_bitcount(dmask)) {
6317          case 1: opcode = aco_opcode::buffer_load_format_x; break;
6318          case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6319          case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6320          case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6321          default: unreachable(">4 channel buffer image load");
6322          }
6323       } else {
6324          switch (util_bitcount(dmask)) {
6325          case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
6326          case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
6327          case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
6328          case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
6329          default: unreachable(">4 channel buffer image load");
6330          }
6331       }
6332       aco_ptr<MUBUF_instruction> load{
6333          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6334       load->operands[0] = Operand(resource);
6335       load->operands[1] = Operand(vindex);
6336       load->operands[2] = Operand::c32(0);
6337       load->definitions[0] = Definition(tmp);
6338       load->idxen = true;
6339       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6340       load->dlc =
6341          load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
6342       load->sync = sync;
6343       load->tfe = is_sparse;
6344       if (load->tfe)
6345          load->operands[3] = emit_tfe_init(bld, tmp);
6346       ctx->block->instructions.emplace_back(std::move(load));
6347    } else {
6348       std::vector<Temp> coords = get_image_coords(ctx, instr);
6349 
6350       aco_opcode opcode;
6351       if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6352          opcode = aco_opcode::image_load;
6353       } else {
6354          bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6355          opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6356       }
6357 
6358       Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6359       MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
6360       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6361       load->dlc =
6362          load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
6363       load->a16 = instr->src[1].ssa->bit_size == 16;
6364       load->d16 = d16;
6365       load->dmask = dmask;
6366       load->unrm = true;
6367       load->tfe = is_sparse;
6368 
6369       if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6370          load->dim = is_array ? ac_image_2darray : ac_image_2d;
6371          load->da = is_array;
6372          load->sync = memory_sync_info();
6373       } else {
6374          ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6375          load->dim = sdim;
6376          load->da = should_declare_array(sdim);
6377          load->sync = sync;
6378       }
6379    }
6380 
6381    if (is_sparse && instr->def.bit_size == 64) {
6382       /* The result components are 64-bit but the sparse residency code is
6383        * 32-bit. So add a zero to the end so expand_vector() works correctly.
6384        */
6385       tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6386                        Operand::zero());
6387    }
6388 
6389    expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
6390 }
6391 
6392 void
6393 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6394 {
6395    Builder bld(ctx->program, ctx->block);
6396    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6397    bool is_array = nir_intrinsic_image_array(instr);
6398    Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6399    bool d16 = instr->src[3].ssa->bit_size == 16;
6400 
6401    /* only R64_UINT and R64_SINT supported */
6402    if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6403       data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6404    data = as_vgpr(ctx, data);
6405 
6406    uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6407 
6408    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6409    unsigned access = nir_intrinsic_access(instr);
6410    bool glc = ctx->options->gfx_level == GFX6 ||
6411               ((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11);
6412 
6413    uint32_t dmask = BITFIELD_MASK(num_components);
6414    if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6415       for (uint32_t i = 0; i < instr->num_components; i++) {
6416          /* components not in dmask receive:
6417           * GFX6-11.5:  zero
6418           * GFX12+: first component in dmask
6419           */
6420          nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6421          if (nir_scalar_is_undef(comp)) {
6422             dmask &= ~BITFIELD_BIT(i);
6423          } else if (ctx->options->gfx_level <= GFX11_5) {
6424             if (nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0)
6425                dmask &= ~BITFIELD_BIT(i);
6426          } else {
6427             unsigned first = dim == GLSL_SAMPLER_DIM_BUF ? 0 : ffs(dmask) - 1;
6428             if (i != first && nir_scalar_equal(nir_scalar_resolved(instr->src[3].ssa, first), comp))
6429                dmask &= ~BITFIELD_BIT(i);
6430          }
6431       }
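      /* e.g. on GFX12+, storing the same value in all four components shrinks
       * dmask to 0x1 since the hw replicates the first enabled component; on
       * GFX6-11.5 it is constant-zero components that get dropped instead.
       */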
6432 
6433       /* dmask cannot be 0, at least one vgpr is always read */
6434       if (dmask == 0)
6435          dmask = 1;
6436       /* buffer store only supports consecutive components. */
6437       if (dim == GLSL_SAMPLER_DIM_BUF)
6438          dmask = BITFIELD_MASK(util_last_bit(dmask));
6439 
6440       if (dmask != BITFIELD_MASK(num_components)) {
6441          uint32_t dmask_count = util_bitcount(dmask);
6442          RegClass rc = d16 ? v2b : v1;
6443          if (dmask_count == 1) {
6444             data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6445          } else {
6446             aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6447                aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6448             uint32_t index = 0;
6449             u_foreach_bit (bit, dmask) {
6450                vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6451             }
6452             data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6453             vec->definitions[0] = Definition(data);
6454             bld.insert(std::move(vec));
6455          }
6456       }
6457    }
6458 
6459    if (dim == GLSL_SAMPLER_DIM_BUF) {
6460       Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6461       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6462       aco_opcode opcode;
6463       if (!d16) {
6464          switch (dmask) {
6465          case 0x1: opcode = aco_opcode::buffer_store_format_x; break;
6466          case 0x3: opcode = aco_opcode::buffer_store_format_xy; break;
6467          case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break;
6468          case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break;
6469          default: unreachable(">4 channel buffer image store");
6470          }
6471       } else {
6472          switch (dmask) {
6473          case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6474          case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6475          case 0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6476          case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6477          default: unreachable(">4 channel buffer image store");
6478          }
6479       }
6480       aco_ptr<MUBUF_instruction> store{
6481          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6482       store->operands[0] = Operand(rsrc);
6483       store->operands[1] = Operand(vindex);
6484       store->operands[2] = Operand::c32(0);
6485       store->operands[3] = Operand(data);
6486       store->idxen = true;
6487       store->glc = glc;
6488       store->dlc = false;
6489       store->disable_wqm = true;
6490       store->sync = sync;
6491       ctx->program->needs_exact = true;
6492       ctx->block->instructions.emplace_back(std::move(store));
6493       return;
6494    }
6495 
6496    assert(data.type() == RegType::vgpr);
6497    std::vector<Temp> coords = get_image_coords(ctx, instr);
6498    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6499 
6500    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6501    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6502 
6503    MIMG_instruction* store =
6504       emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
6505    store->glc = glc;
6506    store->dlc = false;
6507    store->a16 = instr->src[1].ssa->bit_size == 16;
6508    store->d16 = d16;
6509    store->dmask = dmask;
6510    store->unrm = true;
6511    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6512    store->dim = sdim;
6513    store->da = should_declare_array(sdim);
6514    store->disable_wqm = true;
6515    store->sync = sync;
6516    ctx->program->needs_exact = true;
6517    return;
6518 }
6519 
6520 void
6521 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6522                                  aco_opcode* image_op)
6523 {
6524    switch (op) {
6525    case nir_atomic_op_iadd:
6526       *buf_op = aco_opcode::buffer_atomic_add;
6527       *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6528       *image_op = aco_opcode::image_atomic_add;
6529       break;
6530    case nir_atomic_op_umin:
6531       *buf_op = aco_opcode::buffer_atomic_umin;
6532       *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6533       *image_op = aco_opcode::image_atomic_umin;
6534       break;
6535    case nir_atomic_op_imin:
6536       *buf_op = aco_opcode::buffer_atomic_smin;
6537       *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6538       *image_op = aco_opcode::image_atomic_smin;
6539       break;
6540    case nir_atomic_op_umax:
6541       *buf_op = aco_opcode::buffer_atomic_umax;
6542       *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6543       *image_op = aco_opcode::image_atomic_umax;
6544       break;
6545    case nir_atomic_op_imax:
6546       *buf_op = aco_opcode::buffer_atomic_smax;
6547       *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6548       *image_op = aco_opcode::image_atomic_smax;
6549       break;
6550    case nir_atomic_op_iand:
6551       *buf_op = aco_opcode::buffer_atomic_and;
6552       *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6553       *image_op = aco_opcode::image_atomic_and;
6554       break;
6555    case nir_atomic_op_ior:
6556       *buf_op = aco_opcode::buffer_atomic_or;
6557       *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6558       *image_op = aco_opcode::image_atomic_or;
6559       break;
6560    case nir_atomic_op_ixor:
6561       *buf_op = aco_opcode::buffer_atomic_xor;
6562       *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6563       *image_op = aco_opcode::image_atomic_xor;
6564       break;
6565    case nir_atomic_op_xchg:
6566       *buf_op = aco_opcode::buffer_atomic_swap;
6567       *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6568       *image_op = aco_opcode::image_atomic_swap;
6569       break;
6570    case nir_atomic_op_cmpxchg:
6571       *buf_op = aco_opcode::buffer_atomic_cmpswap;
6572       *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6573       *image_op = aco_opcode::image_atomic_cmpswap;
6574       break;
6575    case nir_atomic_op_inc_wrap:
6576       *buf_op = aco_opcode::buffer_atomic_inc;
6577       *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6578       *image_op = aco_opcode::image_atomic_inc;
6579       break;
6580    case nir_atomic_op_dec_wrap:
6581       *buf_op = aco_opcode::buffer_atomic_dec;
6582       *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6583       *image_op = aco_opcode::image_atomic_dec;
6584       break;
6585    case nir_atomic_op_fadd:
6586       *buf_op = aco_opcode::buffer_atomic_add_f32;
6587       *buf_op64 = aco_opcode::num_opcodes;
6588       *image_op = aco_opcode::num_opcodes;
6589       break;
6590    case nir_atomic_op_fmin:
6591       *buf_op = aco_opcode::buffer_atomic_fmin;
6592       *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6593       *image_op = aco_opcode::image_atomic_fmin;
6594       break;
6595    case nir_atomic_op_fmax:
6596       *buf_op = aco_opcode::buffer_atomic_fmax;
6597       *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6598       *image_op = aco_opcode::image_atomic_fmax;
6599       break;
6600    default: unreachable("unsupported atomic operation");
6601    }
6602 }
6603 
6604 void
6605 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6606 {
6607    bool return_previous = !nir_def_is_unused(&instr->def);
6608    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6609    bool is_array = nir_intrinsic_image_array(instr);
6610    Builder bld(ctx->program, ctx->block);
6611 
6612    const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6613    const bool cmpswap = op == nir_atomic_op_cmpxchg;
6614 
6615    aco_opcode buf_op, buf_op64, image_op;
6616    translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6617 
6618    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6619    bool is_64bit = data.bytes() == 8;
6620    assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6621 
6622    if (cmpswap)
6623       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6624                         get_ssa_temp(ctx, instr->src[4].ssa), data);
6625 
6626    Temp dst = get_ssa_temp(ctx, &instr->def);
6627    memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6628 
6629    if (dim == GLSL_SAMPLER_DIM_BUF) {
6630       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6631       Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6632       // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6633       // implemented.");
6634       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6635          is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6636       mubuf->operands[0] = Operand(resource);
6637       mubuf->operands[1] = Operand(vindex);
6638       mubuf->operands[2] = Operand::c32(0);
6639       mubuf->operands[3] = Operand(data);
6640       Definition def =
6641          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6642       if (return_previous)
6643          mubuf->definitions[0] = def;
6644       mubuf->offset = 0;
6645       mubuf->idxen = true;
6646       mubuf->glc = return_previous;
6647       mubuf->dlc = false; /* Not needed for atomics */
6648       mubuf->disable_wqm = true;
6649       mubuf->sync = sync;
6650       ctx->program->needs_exact = true;
6651       ctx->block->instructions.emplace_back(std::move(mubuf));
6652       if (return_previous && cmpswap)
6653          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6654       return;
6655    }
6656 
6657    std::vector<Temp> coords = get_image_coords(ctx, instr);
6658    Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6659    Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6660    MIMG_instruction* mimg =
6661       emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
6662    mimg->glc = return_previous;
6663    mimg->dlc = false; /* Not needed for atomics */
6664    mimg->dmask = (1 << data.size()) - 1;
6665    mimg->a16 = instr->src[1].ssa->bit_size == 16;
6666    mimg->unrm = true;
6667    ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6668    mimg->dim = sdim;
6669    mimg->da = should_declare_array(sdim);
6670    mimg->disable_wqm = true;
6671    mimg->sync = sync;
6672    ctx->program->needs_exact = true;
6673    if (return_previous && cmpswap)
6674       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6675    return;
6676 }
6677 
6678 void
6679 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6680 {
6681    Builder bld(ctx->program, ctx->block);
6682    unsigned num_components = instr->num_components;
6683 
6684    Temp dst = get_ssa_temp(ctx, &instr->def);
6685    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6686 
6687    unsigned access = nir_intrinsic_access(instr);
6688    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6689    unsigned size = instr->def.bit_size / 8;
6690 
6691    bool allow_smem = access & ACCESS_CAN_REORDER;
6692 
6693    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6694                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6695                get_memory_sync_info(instr, storage_buffer, 0));
6696 }
6697 
6698 void
6699 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6700 {
6701    Builder bld(ctx->program, ctx->block);
6702    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6703    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6704    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6705    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6706 
6707    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6708 
6709    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6710    bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6711               ctx->program->gfx_level < GFX11;
6712 
6713    unsigned write_count = 0;
6714    Temp write_datas[32];
6715    unsigned offsets[32];
6716    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6717                       write_datas, offsets);
6718 
6719    /* GFX6-7 are affected by a hw bug that prevents address clamping from
6720     * working correctly when the SGPR offset is used.
6721     */
6722    if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6723       offset = as_vgpr(ctx, offset);
6724 
6725    for (unsigned i = 0; i < write_count; i++) {
6726       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6727 
6728       aco_ptr<MUBUF_instruction> store{
6729          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6730       store->operands[0] = Operand(rsrc);
6731       store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6732       store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6733       store->operands[3] = Operand(write_datas[i]);
6734       store->offset = offsets[i];
6735       store->offen = (offset.type() == RegType::vgpr);
6736       store->glc = glc;
6737       store->dlc = false;
6738       store->disable_wqm = true;
6739       store->sync = sync;
6740       ctx->program->needs_exact = true;
6741       ctx->block->instructions.emplace_back(std::move(store));
6742    }
6743 }
6744 
6745 void
6746 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6747 {
6748    Builder bld(ctx->program, ctx->block);
6749    bool return_previous = !nir_def_is_unused(&instr->def);
6750    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6751 
6752    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6753    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6754 
6755    aco_opcode op32, op64, image_op;
6756    translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6757 
6758    if (cmpswap)
6759       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6760                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6761 
6762    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6763    Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6764    Temp dst = get_ssa_temp(ctx, &instr->def);
6765 
6766    aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6767    aco_ptr<MUBUF_instruction> mubuf{
6768       create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6769    mubuf->operands[0] = Operand(rsrc);
6770    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6771    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6772    mubuf->operands[3] = Operand(data);
6773    Definition def =
6774       return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6775    if (return_previous)
6776       mubuf->definitions[0] = def;
6777    mubuf->offset = 0;
6778    mubuf->offen = (offset.type() == RegType::vgpr);
6779    mubuf->glc = return_previous;
6780    mubuf->dlc = false; /* Not needed for atomics */
6781    mubuf->disable_wqm = true;
6782    mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6783    ctx->program->needs_exact = true;
6784    ctx->block->instructions.emplace_back(std::move(mubuf));
6785    if (return_previous && cmpswap)
6786       bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6787 }
6788 
6789 void
6790 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6791              Temp* offset)
6792 {
6793    bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6794    *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6795 
6796    *const_offset = nir_intrinsic_base(intrin);
6797 
6798    unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6799    nir_src offset_src = intrin->src[num_src - 1];
6800    if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6801       *offset = get_ssa_temp(ctx, offset_src.ssa);
6802    else
6803       *offset = Temp();
6804 }
6805 
6806 void
6807 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6808 {
6809    Builder bld(ctx->program, ctx->block);
6810    unsigned num_components = instr->num_components;
6811    unsigned component_size = instr->def.bit_size / 8;
6812 
6813    Temp addr, offset;
6814    uint32_t const_offset;
6815    parse_global(ctx, instr, &addr, &const_offset, &offset);
6816 
6817    LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6818                         component_size};
6819    if (offset.id()) {
6820       info.resource = addr;
6821       info.offset = Operand(offset);
6822    }
6823    info.const_offset = const_offset;
6824    info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6825    info.align_mul = nir_intrinsic_align_mul(instr);
6826    info.align_offset = nir_intrinsic_align_offset(instr);
6827    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6828 
6829    /* Don't expand global loads when they use MUBUF or SMEM.
6830     * Global loads lack the bounds checking that makes this safe for buffer
6831     * loads.
6832     */
6833    unsigned align = nir_intrinsic_align(instr);
6834    bool byte_align_for_smem_mubuf =
6835       can_use_byte_align_for_global_load(num_components, component_size, align, false);
6836 
6837    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6838     * it's safe to use SMEM */
6839    bool can_use_smem =
6840       (nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && byte_align_for_smem_mubuf;
6841    if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->gfx_level < GFX8) ||
6842        !can_use_smem) {
6843       EmitLoadParameters params = global_load_params;
6844       params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf;
6845       emit_load(ctx, bld, info, params);
6846    } else {
6847       if (info.resource.id())
6848          info.resource = bld.as_uniform(info.resource);
6849       info.offset = Operand(bld.as_uniform(info.offset));
6850       emit_load(ctx, bld, info, smem_load_params);
6851    }
6852 }
6853 
6854 void
6855 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6856 {
6857    Builder bld(ctx->program, ctx->block);
6858    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6859    unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6860 
6861    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6862    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6863    bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6864               ctx->program->gfx_level < GFX11;
6865 
6866    unsigned write_count = 0;
6867    Temp write_datas[32];
6868    unsigned offsets[32];
6869    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6870                       write_datas, offsets);
6871 
6872    Temp addr, offset;
6873    uint32_t const_offset;
6874    parse_global(ctx, instr, &addr, &const_offset, &offset);
6875 
6876    for (unsigned i = 0; i < write_count; i++) {
6877       Temp write_address = addr;
6878       uint32_t write_const_offset = const_offset;
6879       Temp write_offset = offset;
6880       lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6881 
6882       if (ctx->options->gfx_level >= GFX7) {
6883          bool global = ctx->options->gfx_level >= GFX9;
6884          aco_opcode op;
6885          switch (write_datas[i].bytes()) {
6886          case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6887          case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6888          case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6889          case 8:
6890             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6891             break;
6892          case 12:
6893             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6894             break;
6895          case 16:
6896             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6897             break;
6898          default: unreachable("store_global not implemented for this size.");
6899          }
6900 
6901          aco_ptr<FLAT_instruction> flat{
6902             create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
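         /* The SADDR form (64-bit SGPR base + 32-bit VGPR offset) takes
          * operands (voffset, saddr); the plain 64-bit VGPR address form
          * takes (vaddr, undef sgpr).
          */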
6903          if (write_address.regClass() == s2) {
6904             assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6905             flat->operands[0] = Operand(write_offset);
6906             flat->operands[1] = Operand(write_address);
6907          } else {
6908             assert(write_address.type() == RegType::vgpr && !write_offset.id());
6909             flat->operands[0] = Operand(write_address);
6910             flat->operands[1] = Operand(s1);
6911          }
6912          flat->operands[2] = Operand(write_datas[i]);
6913          flat->glc = glc;
6914          flat->dlc = false;
6915          assert(global || !write_const_offset);
6916          flat->offset = write_const_offset;
6917          flat->disable_wqm = true;
6918          flat->sync = sync;
6919          ctx->program->needs_exact = true;
6920          ctx->block->instructions.emplace_back(std::move(flat));
6921       } else {
6922          assert(ctx->options->gfx_level == GFX6);
6923 
6924          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6925 
6926          Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6927 
6928          aco_ptr<MUBUF_instruction> mubuf{
6929             create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6930          mubuf->operands[0] = Operand(rsrc);
6931          mubuf->operands[1] =
6932             write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6933          mubuf->operands[2] = Operand(write_offset);
6934          mubuf->operands[3] = Operand(write_datas[i]);
6935          mubuf->glc = glc;
6936          mubuf->dlc = false;
6937          mubuf->offset = write_const_offset;
6938          mubuf->addr64 = write_address.type() == RegType::vgpr;
6939          mubuf->disable_wqm = true;
6940          mubuf->sync = sync;
6941          ctx->program->needs_exact = true;
6942          ctx->block->instructions.emplace_back(std::move(mubuf));
6943       }
6944    }
6945 }
6946 
6947 void
6948 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6949 {
6950    Builder bld(ctx->program, ctx->block);
6951    bool return_previous = !nir_def_is_unused(&instr->def);
6952    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6953 
6954    const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6955    const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6956 
6957    if (cmpswap)
6958       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6959                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6960 
6961    Temp dst = get_ssa_temp(ctx, &instr->def);
6962 
6963    aco_opcode op32, op64;
6964 
6965    Temp addr, offset;
6966    uint32_t const_offset;
6967    parse_global(ctx, instr, &addr, &const_offset, &offset);
6968    lower_global_address(bld, 0, &addr, &const_offset, &offset);
6969 
6970    if (ctx->options->gfx_level >= GFX7) {
6971       bool global = ctx->options->gfx_level >= GFX9;
6972       switch (nir_op) {
6973       case nir_atomic_op_iadd:
6974          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6975          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6976          break;
6977       case nir_atomic_op_imin:
6978          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6979          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6980          break;
6981       case nir_atomic_op_umin:
6982          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6983          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6984          break;
6985       case nir_atomic_op_imax:
6986          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6987          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6988          break;
6989       case nir_atomic_op_umax:
6990          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6991          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6992          break;
6993       case nir_atomic_op_iand:
6994          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6995          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6996          break;
6997       case nir_atomic_op_ior:
6998          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6999          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
7000          break;
7001       case nir_atomic_op_ixor:
7002          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
7003          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
7004          break;
7005       case nir_atomic_op_xchg:
7006          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
7007          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
7008          break;
7009       case nir_atomic_op_cmpxchg:
7010          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
7011          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
7012          break;
7013       case nir_atomic_op_fadd:
7014          op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
7015          op64 = aco_opcode::num_opcodes;
7016          break;
7017       case nir_atomic_op_fmin:
7018          op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
7019          op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
7020          break;
7021       case nir_atomic_op_fmax:
7022          op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
7023          op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
7024          break;
7025       default: unreachable("unsupported atomic operation");
7026       }
7027 
7028       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
7029       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
7030          op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
7031       if (addr.regClass() == s2) {
7032          assert(global && offset.id() && offset.type() == RegType::vgpr);
7033          flat->operands[0] = Operand(offset);
7034          flat->operands[1] = Operand(addr);
7035       } else {
7036          assert(addr.type() == RegType::vgpr && !offset.id());
7037          flat->operands[0] = Operand(addr);
7038          flat->operands[1] = Operand(s1);
7039       }
7040       flat->operands[2] = Operand(data);
7041       if (return_previous)
7042          flat->definitions[0] = Definition(dst);
7043       flat->glc = return_previous;
7044       flat->dlc = false; /* Not needed for atomics */
7045       assert(global || !const_offset);
7046       flat->offset = const_offset;
7047       flat->disable_wqm = true;
7048       flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
7049       ctx->program->needs_exact = true;
7050       ctx->block->instructions.emplace_back(std::move(flat));
7051    } else {
7052       assert(ctx->options->gfx_level == GFX6);
7053 
7054       UNUSED aco_opcode image_op;
7055       translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
7056 
7057       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
7058 
7059       aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
7060 
7061       aco_ptr<MUBUF_instruction> mubuf{
7062          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
7063       mubuf->operands[0] = Operand(rsrc);
7064       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
7065       mubuf->operands[2] = Operand(offset);
7066       mubuf->operands[3] = Operand(data);
7067       Definition def =
7068          return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
7069       if (return_previous)
7070          mubuf->definitions[0] = def;
7071       mubuf->glc = return_previous;
7072       mubuf->dlc = false;
7073       mubuf->offset = const_offset;
7074       mubuf->addr64 = addr.type() == RegType::vgpr;
7075       mubuf->disable_wqm = true;
7076       mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
7077       ctx->program->needs_exact = true;
7078       ctx->block->instructions.emplace_back(std::move(mubuf));
7079       if (return_previous && cmpswap)
7080          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
7081    }
7082 }
7083 
7084 unsigned
7085 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
7086 {
7087    unsigned storage = storage_none;
7088 
7089    if (mem_mode & nir_var_shader_out)
7090       storage |= storage_vmem_output;
7091    if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
7092       storage |= storage_buffer;
7093    if (mem_mode & nir_var_mem_task_payload)
7094       storage |= storage_task_payload;
7095    if (mem_mode & nir_var_mem_shared)
7096       storage |= storage_shared;
7097    if (mem_mode & nir_var_image)
7098       storage |= storage_image;
7099 
7100    return storage;
7101 }
7102 
7103 void
7104 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7105 {
7106    Builder bld(ctx->program, ctx->block);
7107 
7108    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7109    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7110    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7111                 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
7112    bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
7113    bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7114 
7115    Temp dst = get_ssa_temp(ctx, &intrin->def);
7116    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7117    Temp v_offset =
7118       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7119    Temp s_offset =
7120       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7121    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
7122 
7123    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
7124    bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
7125 
7126    unsigned const_offset = nir_intrinsic_base(intrin);
7127    unsigned elem_size_bytes = intrin->def.bit_size / 8u;
7128    unsigned num_components = intrin->def.num_components;
7129 
7130    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7131    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
7132 
7133    LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
7134    info.idx = idx;
7135    info.glc = glc;
7136    info.slc = slc;
7137    info.soffset = s_offset;
7138    info.const_offset = const_offset;
7139    info.sync = sync;
7140 
7141    if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
7142       const pipe_format format = nir_intrinsic_format(intrin);
7143       const struct ac_vtx_format_info* vtx_info =
7144          ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
7145       const struct util_format_description* f = util_format_description(format);
7146       const unsigned align_mul = nir_intrinsic_align_mul(intrin);
7147       const unsigned align_offset = nir_intrinsic_align_offset(intrin);
7148 
7149       /* Avoid splitting:
7150        * - non-array formats, because that would result in incorrect code
7151        * - when the element size equals the component size (to reduce instruction count)
7152        */
7153       const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
7154 
7155       info.align_mul = align_mul;
7156       info.align_offset = align_offset;
7157       info.format = format;
7158       info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
7159       info.split_by_component_stride = false;
7160 
7161       emit_load(ctx, bld, info, mtbuf_load_params);
7162    } else {
7163       assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
7164 
7165       if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
7166          assert(!swizzled);
7167 
7168          emit_load(ctx, bld, info, mubuf_load_format_params);
7169       } else {
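         /* A swizzled buffer interleaves its data per element, so split the
          * load so that each piece stays within one swizzle element: 4-byte
          * elements on GFX8 and earlier, up to 16 bytes on newer chips.
          */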
7170          const unsigned swizzle_element_size =
7171             swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
7172 
7173          info.component_stride = swizzle_element_size;
7174          info.swizzle_component_size = swizzle_element_size ? 4 : 0;
7175          info.align_mul = MIN2(elem_size_bytes, 4);
7176          info.align_offset = 0;
7177 
7178          emit_load(ctx, bld, info, mubuf_load_params);
7179       }
7180    }
7181 }
7182 
7183 void
7184 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7185 {
7186    Builder bld(ctx->program, ctx->block);
7187 
7188    /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7189    bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7190    bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7191                 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
7192    bool v_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7193    bool s_offset_zero = nir_src_is_const(intrin->src[3]) && !nir_src_as_uint(intrin->src[3]);
7194 
7195    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7196    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
7197    Temp v_offset =
7198       v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
7199    Temp s_offset =
7200       s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
7201    Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
7202 
7203    bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
7204    bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
7205 
7206    unsigned const_offset = nir_intrinsic_base(intrin);
7207    unsigned write_mask = nir_intrinsic_write_mask(intrin);
7208    unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7209 
7210    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7211    /* GS outputs are only written once. */
7212    const bool written_once =
7213       mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
7214    memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
7215                          written_once ? semantic_can_reorder : semantic_none);
7216 
7217    store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset,
7218                     elem_size_bytes, write_mask, swizzled, sync, glc, slc);
7219 }

void
visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp dst = get_ssa_temp(ctx, &instr->def);
   Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
   Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));

   /* If the base address is 32-bit, extend it to 64-bit by appending the known high 32 bits. */
   if (base.bytes() == 4) {
      base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
                        Operand::c32(ctx->options->address32_hi));
   }

   aco_opcode opcode = aco_opcode::s_load_dword;
   unsigned size = 1;

   assert(dst.bytes() <= 64);

   if (dst.bytes() > 32) {
      opcode = aco_opcode::s_load_dwordx16;
      size = 16;
   } else if (dst.bytes() > 16) {
      opcode = aco_opcode::s_load_dwordx8;
      size = 8;
   } else if (dst.bytes() > 8) {
      opcode = aco_opcode::s_load_dwordx4;
      size = 4;
   } else if (dst.bytes() > 4) {
      opcode = aco_opcode::s_load_dwordx2;
      size = 2;
   }

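   /* SMEM loads only come in power-of-two dword sizes, so an odd-sized result
    * (e.g. 3 dwords) is loaded with the next larger opcode and the low dwords
    * are extracted from the wider temporary.
    */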
   if (dst.size() != size) {
      bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
                 bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
   } else {
      bld.smem(opcode, Definition(dst), base, offset);
   }
   emit_split_vector(ctx, dst, instr->def.num_components);
}

sync_scope
translate_nir_scope(mesa_scope scope)
{
   switch (scope) {
   case SCOPE_NONE:
   case SCOPE_INVOCATION: return scope_invocation;
   case SCOPE_SUBGROUP: return scope_subgroup;
   case SCOPE_WORKGROUP: return scope_workgroup;
   case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
   case SCOPE_DEVICE: return scope_device;
   case SCOPE_SHADER_CALL: return scope_invocation;
   }
   unreachable("invalid scope");
}

void
emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);

   unsigned storage_allowed = storage_buffer | storage_image;
   unsigned semantics = 0;
   sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
   sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));

   /* We use shared storage for the following:
    * - compute shaders expose it in their API
    * - when tessellation is used, TCS and VS I/O is lowered to shared memory
    * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
    * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
    */
   bool shared_storage_used =
      ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
      ctx->stage.hw == AC_HW_HULL_SHADER ||
      (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
      ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;

   if (shared_storage_used)
      storage_allowed |= storage_shared;

   /* Task payload: Task Shader output, Mesh Shader input */
   if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
      storage_allowed |= storage_task_payload;

   /* Allow VMEM output for all stages that can have outputs. */
   if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
       ctx->stage.has(SWStage::TS))
      storage_allowed |= storage_vmem_output;

   /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
    * They are allowed in CS, TCS, and in any NGG shader.
    */
   ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
                                           ctx->stage.hw == AC_HW_HULL_SHADER ||
                                           ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;

   unsigned nir_storage = nir_intrinsic_memory_modes(instr);
   unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
   storage &= storage_allowed;

   unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
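   /* ACO does not track acquire and release separately here, so either NIR
    * semantic is conservatively widened to a full acquire+release barrier.
    */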
   if (nir_semantics & NIR_MEMORY_ACQUIRE)
      semantics |= semantic_acquire | semantic_release;
   if (nir_semantics & NIR_MEMORY_RELEASE)
      semantics |= semantic_acquire | semantic_release;

   assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
   assert(exec_scope != scope_workgroup || workgroup_scope_allowed);

   bld.barrier(aco_opcode::p_barrier,
               memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
               exec_scope);
}

void
visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
{
   // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
   Temp dst = get_ssa_temp(ctx, &instr->def);
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Builder bld(ctx->program, ctx->block);

   unsigned elem_size_bytes = instr->def.bit_size / 8;
   unsigned num_components = instr->def.num_components;
   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
   load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
}

void
visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
{
   unsigned writemask = nir_intrinsic_write_mask(instr);
   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;

   unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
   store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
}

void
visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
{
   unsigned offset = nir_intrinsic_base(instr);
   Builder bld(ctx->program, ctx->block);
   Operand m = load_lds_size_m0(bld);
   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));

   unsigned num_operands = 3;
   aco_opcode op32, op64, op32_rtn, op64_rtn;
   switch (nir_intrinsic_atomic_op(instr)) {
   case nir_atomic_op_iadd:
      op32 = aco_opcode::ds_add_u32;
      op64 = aco_opcode::ds_add_u64;
      op32_rtn = aco_opcode::ds_add_rtn_u32;
      op64_rtn = aco_opcode::ds_add_rtn_u64;
      break;
   case nir_atomic_op_imin:
      op32 = aco_opcode::ds_min_i32;
      op64 = aco_opcode::ds_min_i64;
      op32_rtn = aco_opcode::ds_min_rtn_i32;
      op64_rtn = aco_opcode::ds_min_rtn_i64;
      break;
   case nir_atomic_op_umin:
      op32 = aco_opcode::ds_min_u32;
      op64 = aco_opcode::ds_min_u64;
      op32_rtn = aco_opcode::ds_min_rtn_u32;
      op64_rtn = aco_opcode::ds_min_rtn_u64;
      break;
   case nir_atomic_op_imax:
      op32 = aco_opcode::ds_max_i32;
      op64 = aco_opcode::ds_max_i64;
      op32_rtn = aco_opcode::ds_max_rtn_i32;
      op64_rtn = aco_opcode::ds_max_rtn_i64;
      break;
   case nir_atomic_op_umax:
      op32 = aco_opcode::ds_max_u32;
      op64 = aco_opcode::ds_max_u64;
      op32_rtn = aco_opcode::ds_max_rtn_u32;
      op64_rtn = aco_opcode::ds_max_rtn_u64;
      break;
   case nir_atomic_op_iand:
      op32 = aco_opcode::ds_and_b32;
      op64 = aco_opcode::ds_and_b64;
      op32_rtn = aco_opcode::ds_and_rtn_b32;
      op64_rtn = aco_opcode::ds_and_rtn_b64;
      break;
   case nir_atomic_op_ior:
      op32 = aco_opcode::ds_or_b32;
      op64 = aco_opcode::ds_or_b64;
      op32_rtn = aco_opcode::ds_or_rtn_b32;
      op64_rtn = aco_opcode::ds_or_rtn_b64;
      break;
   case nir_atomic_op_ixor:
      op32 = aco_opcode::ds_xor_b32;
      op64 = aco_opcode::ds_xor_b64;
      op32_rtn = aco_opcode::ds_xor_rtn_b32;
      op64_rtn = aco_opcode::ds_xor_rtn_b64;
      break;
   case nir_atomic_op_xchg:
      op32 = aco_opcode::ds_write_b32;
      op64 = aco_opcode::ds_write_b64;
      op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
      op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
      break;
   case nir_atomic_op_cmpxchg:
      op32 = aco_opcode::ds_cmpst_b32;
      op64 = aco_opcode::ds_cmpst_b64;
      op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
      op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
      num_operands = 4;
      break;
   case nir_atomic_op_fadd:
      op32 = aco_opcode::ds_add_f32;
      op32_rtn = aco_opcode::ds_add_rtn_f32;
      op64 = aco_opcode::num_opcodes;
      op64_rtn = aco_opcode::num_opcodes;
      break;
   case nir_atomic_op_fmin:
      op32 = aco_opcode::ds_min_f32;
      op32_rtn = aco_opcode::ds_min_rtn_f32;
      op64 = aco_opcode::ds_min_f64;
      op64_rtn = aco_opcode::ds_min_rtn_f64;
      break;
   case nir_atomic_op_fmax:
      op32 = aco_opcode::ds_max_f32;
      op32_rtn = aco_opcode::ds_max_rtn_f32;
      op64 = aco_opcode::ds_max_f64;
      op64_rtn = aco_opcode::ds_max_rtn_f64;
      break;
   default: unreachable("Unhandled shared atomic intrinsic");
   }

   bool return_previous = !nir_def_is_unused(&instr->def);

   aco_opcode op;
   if (data.size() == 1) {
      assert(instr->def.bit_size == 32);
      op = return_previous ? op32_rtn : op32;
   } else {
      assert(instr->def.bit_size == 64);
      op = return_previous ? op64_rtn : op64;
   }

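   /* The DS instruction offset field is only 16 bits, so larger constant
    * offsets are folded into the address computation instead.
    */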
   if (offset > 65535) {
      address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
      offset = 0;
   }

   aco_ptr<DS_instruction> ds;
   ds.reset(
      create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
   ds->operands[0] = Operand(address);
   ds->operands[1] = Operand(data);
   if (num_operands == 4) {
      Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
      ds->operands[2] = Operand(data2);
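      /* GFX11 expects the compare and swap values in the opposite operand
       * order (presumably tied to the ds_cmpst -> ds_cmpstore rename), so
       * swap them there.
       */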
      if (bld.program->gfx_level >= GFX11)
         std::swap(ds->operands[1], ds->operands[2]);
   }
   ds->operands[num_operands - 1] = m;
   ds->offset0 = offset;
   if (return_previous)
      ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
   ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);

   if (m.isUndefined())
      ds->operands.pop_back();

   ctx->block->instructions.emplace_back(std::move(ds));
}

void
visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
{
   bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
   Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
   Builder bld(ctx->program, ctx->block);

   assert(bld.program->gfx_level >= GFX7);

   bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
   uint8_t offset0 = nir_intrinsic_offset0(instr);
   uint8_t offset1 = nir_intrinsic_offset1(instr);
   bool st64 = nir_intrinsic_st64(instr);

   Operand m = load_lds_size_m0(bld);
   Instruction* ds;
   if (is_store) {
      aco_opcode op = st64
                         ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
                         : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
      Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
      RegClass comp_rc = is64bit ? v2 : v1;
      Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
      Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
      ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
   } else {
      Temp dst = get_ssa_temp(ctx, &instr->def);
      Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
      aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
                           : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
      ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
   }
   ds->ds().sync = memory_sync_info(storage_shared);
   if (m.isUndefined())
      ds->operands.pop_back();

   if (!is_store) {
      Temp dst = get_ssa_temp(ctx, &instr->def);
      if (dst.type() == RegType::sgpr) {
         emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
         Temp comp[4];
         /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
         for (unsigned i = 0; i < dst.size(); i++)
            comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
         if (is64bit) {
            Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
            Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
            ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
            ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
            ctx->allocated_vec[dst.id()] = {comp0, comp1};
         } else {
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
         }
      }

      emit_split_vector(ctx, dst, 2);
   }
}

Temp
get_scratch_resource(isel_context* ctx)
{
   Builder bld(ctx->program, ctx->block);
   Temp scratch_addr = ctx->program->private_segment_buffer;
   if (!scratch_addr.bytes()) {
      Temp addr_lo =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
      Temp addr_hi =
         bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
      scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
   } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
      scratch_addr =
         bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
   }

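   /* With ADD_TID_ENABLE the hardware adds the lane index (scaled by the index
    * stride) to the address; the stride selects 64 slots for wave64 and 32 for
    * wave32 so that every lane gets a private scratch slot.
    */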
   uint32_t rsrc_conf =
      S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);

   if (ctx->program->gfx_level >= GFX10) {
      rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
                   S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
                   S_008F0C_RESOURCE_LEVEL(ctx->program->gfx_level < GFX11);
   } else if (ctx->program->gfx_level <= GFX7) {
      /* Skipped on GFX8/GFX9 because dfmt modifies the stride there when ADD_TID_EN=1. */
      rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
                   S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
   }

   /* Older generations need an element size of 4 bytes; the field was removed in GFX9. */
   if (ctx->program->gfx_level <= GFX8)
      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);

   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
                     Operand::c32(rsrc_conf));
}

void
visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp dst = get_ssa_temp(ctx, &instr->def);

   LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
   info.align_mul = nir_intrinsic_align_mul(instr);
   info.align_offset = nir_intrinsic_align_offset(instr);
   info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
   info.sync = memory_sync_info(storage_scratch, semantic_private);
   if (ctx->program->gfx_level >= GFX9) {
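      /* scratch_* instructions only encode a limited constant offset, so a
       * large constant address is split into an aligned SGPR base plus a small
       * immediate: e.g. with max == 4096, an address of 5000 becomes a base of
       * 4096 and a const_offset of 904.
       */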
      if (nir_src_is_const(instr->src[0])) {
         uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
         info.offset =
            bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
         info.const_offset = nir_src_as_uint(instr->src[0]) % max;
      } else {
         info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
      }
      EmitLoadParameters params = scratch_flat_load_params;
      params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
      emit_load(ctx, bld, info, params);
   } else {
      info.resource = get_scratch_resource(ctx);
      info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
      info.soffset = ctx->program->scratch_offset;
      emit_load(ctx, bld, info, scratch_mubuf_load_params);
   }
}

void
visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
   Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);

   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
   unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);

   unsigned write_count = 0;
   Temp write_datas[32];
   unsigned offsets[32];
   unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
                      &write_count, write_datas, offsets);

   if (ctx->program->gfx_level >= GFX9) {
      uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
      offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
      uint32_t base_const_offset =
         nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;

      for (unsigned i = 0; i < write_count; i++) {
         aco_opcode op;
         switch (write_datas[i].bytes()) {
         case 1: op = aco_opcode::scratch_store_byte; break;
         case 2: op = aco_opcode::scratch_store_short; break;
         case 4: op = aco_opcode::scratch_store_dword; break;
         case 8: op = aco_opcode::scratch_store_dwordx2; break;
         case 12: op = aco_opcode::scratch_store_dwordx3; break;
         case 16: op = aco_opcode::scratch_store_dwordx4; break;
         default: unreachable("Unexpected store size");
         }

         uint32_t const_offset = base_const_offset + offsets[i];
         assert(const_offset < max || offset.id() == 0);

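         /* scratch_* takes either a VGPR address or an SGPR saddr; the unused
          * form must be passed as an undefined operand.
          */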
         Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
         Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
         if (offset.id() == 0)
            saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));

         bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
                     memory_sync_info(storage_scratch, semantic_private));
      }
   } else {
      Temp rsrc = get_scratch_resource(ctx);
      offset = as_vgpr(ctx, offset);
      for (unsigned i = 0; i < write_count; i++) {
         aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
         Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
                                        write_datas[i], offsets[i], true, true);
         mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
      }
   }
}

ReduceOp
get_reduce_op(nir_op op, unsigned bit_size)
{
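   /* Map a NIR reduction op and bit size to the matching ReduceOp,
    * e.g. (nir_op_iadd, 32) -> iadd32 and (nir_op_fmax, 16) -> fmax16.
    */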
   switch (op) {
#define CASEI(name)                                                                                \
   case nir_op_##name:                                                                             \
      return (bit_size == 32)   ? name##32                                                         \
             : (bit_size == 16) ? name##16                                                         \
             : (bit_size == 8)  ? name##8                                                          \
                                : name##64;
#define CASEF(name)                                                                                \
   case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
      CASEI(iadd)
      CASEI(imul)
      CASEI(imin)
      CASEI(umin)
      CASEI(imax)
      CASEI(umax)
      CASEI(iand)
      CASEI(ior)
      CASEI(ixor)
      CASEF(fadd)
      CASEF(fmul)
      CASEF(fmin)
      CASEF(fmax)
   default: unreachable("unknown reduction op");
#undef CASEI
#undef CASEF
   }
}

void
emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
{
   Builder bld(ctx->program, ctx->block);
   Definition dst(get_ssa_temp(ctx, &instr->def));
   assert(dst.regClass().type() != RegType::vgpr);
   if (src.regClass().type() == RegType::vgpr)
      bld.pseudo(aco_opcode::p_as_uniform, dst, src);
   else
      bld.copy(dst, src);
}

void
emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
{
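   /* For a subgroup-uniform source the reduction folds into a multiply:
    * adding a value over N active lanes yields N * value, and ixor yields
    * value * (N & 1) because an even number of xor'd copies cancels out.
    */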
   Builder bld(ctx->program, ctx->block);
   Temp src_tmp = get_ssa_temp(ctx, src.ssa);

   if (op == nir_op_fadd) {
      src_tmp = as_vgpr(ctx, src_tmp);
      Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
                                      : dst.getTemp();

      if (src.ssa->bit_size == 16) {
         count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
         bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
      } else {
         assert(src.ssa->bit_size == 32);
         count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
         bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
      }

      if (tmp != dst.getTemp())
         bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);

      return;
   }

   if (dst.regClass() == s1)
      src_tmp = bld.as_uniform(src_tmp);

   if (op == nir_op_ixor && count.type() == RegType::sgpr)
      count =
         bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
   else if (op == nir_op_ixor)
      count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);

   assert(dst.getTemp().type() == count.type());

   if (nir_src_is_const(src)) {
      if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
         bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
      else if (nir_src_as_uint(src) == 1)
         bld.copy(dst, count);
      else if (nir_src_as_uint(src) == 0)
         bld.copy(dst, Operand::zero(dst.bytes()));
      else if (count.type() == RegType::vgpr)
         bld.v_mul_imm(dst, count, nir_src_as_uint(src));
      else
         bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
   } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
      bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
   } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
      bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
   } else if (dst.getTemp().type() == RegType::vgpr) {
      bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
   } else {
      bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
   }
}

bool
emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
{
   nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
   if (op == nir_op_imul || op == nir_op_fmul)
      return false;

   if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
      Builder bld(ctx->program, ctx->block);
      Definition dst(get_ssa_temp(ctx, &instr->def));
      unsigned bit_size = instr->src[0].ssa->bit_size;
      if (bit_size > 32)
         return false;

      Temp thread_count =
         bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
      set_wqm(ctx);

      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
   } else {
      emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
   }

   return true;
}

bool
emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   Definition dst(get_ssa_temp(ctx, &instr->def));
   nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
   bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;

   if (op == nir_op_imul || op == nir_op_fmul)
      return false;

   if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
      if (instr->src[0].ssa->bit_size > 32)
         return false;

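      /* A scan of a uniform value is that value times the number of active
       * lanes below the current one (plus one for an inclusive scan), which
       * mbcnt computes directly.
       */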
      Temp packed_tid;
      if (inc)
         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
      else
         packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
      set_wqm(ctx);

      emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
      return true;
   }

   assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
          op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);

   if (inc) {
      emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
      return true;
   }

   /* Copy the source and write the reduction operation identity to the first lane. */
   Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
   ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
   if (dst.bytes() == 8) {
      Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
      uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
      uint32_t identity_hi = get_reduction_identity(reduce_op, 1);

      lo =
         bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
      hi =
         bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
      bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
   } else {
      uint32_t identity = get_reduction_identity(reduce_op, 0);
      bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
                    as_vgpr(ctx, src));
   }

   set_wqm(ctx);
   return true;
}

Temp
emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
                     Definition dst, Temp src)
{
   assert(src.bytes() <= 8);
   assert(src.type() == RegType::vgpr);

   Builder bld(ctx->program, ctx->block);

   unsigned num_defs = 0;
   Definition defs[5];
   defs[num_defs++] = dst;
   defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */

   /* scalar identity temporary */
   bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
                     aco_op != aco_opcode::p_reduce;
   if (aco_op == aco_opcode::p_exclusive_scan) {
      need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
                     op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
                     op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
                     op == fmul64);
   }
   if (need_sitmp)
      defs[num_defs++] = bld.def(RegType::sgpr, dst.size());

   /* scc clobber */
   defs[num_defs++] = bld.def(s1, scc);

   /* vcc clobber */
   bool clobber_vcc = false;
   if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
      clobber_vcc = true;
   if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
      clobber_vcc = true;
   if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
      clobber_vcc = true;

   if (clobber_vcc)
      defs[num_defs++] = bld.def(bld.lm, vcc);

   Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
      aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
   reduce->operands[0] = Operand(src);
   /* setup_reduce_temp will update these undef operands if needed */
   reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
   reduce->operands[2] = Operand(v1.as_linear());
   std::copy(defs, defs + num_defs, reduce->definitions.begin());

   reduce->reduce_op = op;
   reduce->cluster_size = cluster_size;
   bld.insert(std::move(reduce));

   return dst.getTemp();
}

Temp
inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Definition dst, Temp src)
{
   Builder bld(ctx->program, ctx->block);

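   /* Compute the inclusive scan, then strip the current lane's own
    * contribution (subtract for iadd, xor for ixor) to get the exclusive scan.
    */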
   Temp scan = emit_reduction_instr(ctx, aco_opcode::p_inclusive_scan, op, ctx->program->wave_size,
                                    bld.def(dst.regClass()), src);

   switch (op) {
   case iadd8:
   case iadd16:
   case iadd32: return bld.vsub32(dst, scan, src);
   case ixor64:
   case iadd64: {
      Temp src00 = bld.tmp(v1);
      Temp src01 = bld.tmp(v1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
      Temp src10 = bld.tmp(v1);
      Temp src11 = bld.tmp(v1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);

      Temp lower = bld.tmp(v1);
      Temp upper = bld.tmp(v1);
      if (op == iadd64) {
         Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
         bld.vsub32(Definition(upper), src01, src11, false, borrow);
      } else {
         bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
         bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
      }
      return bld.pseudo(aco_opcode::p_create_vector, dst, lower, upper);
   }
   case ixor8:
   case ixor16:
   case ixor32: return bld.vop2(aco_opcode::v_xor_b32, dst, scan, src);
   default: unreachable("Unsupported op");
   }
}

bool
emit_rotate_by_constant(isel_context* ctx, Temp& dst, Temp src, unsigned cluster_size,
                        uint64_t delta)
{
   Builder bld(ctx->program, ctx->block);
   RegClass rc = src.regClass();
   dst = Temp(0, rc);
   delta %= cluster_size;

   if (delta == 0) {
      dst = bld.copy(bld.def(rc), src);
   } else if (delta * 2 == cluster_size && cluster_size <= 32) {
      dst = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, delta), true);
   } else if (cluster_size == 4) {
      unsigned res[4];
      for (unsigned i = 0; i < 4; i++)
         res[i] = (i + delta) & 0x3;
      uint32_t dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
      if (ctx->program->gfx_level >= GFX8)
         dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_ctrl);
      else
         dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl);
   } else if (cluster_size == 8 && ctx->program->gfx_level >= GFX10) {
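      /* DPP8 packs eight 3-bit source-lane selectors: lane i reads lane
       * (i + delta) & 7 within its group of eight.
       */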
      uint32_t lane_sel = 0;
      for (unsigned i = 0; i < 8; i++)
         lane_sel |= ((i + delta) & 0x7) << (i * 3);
      dst = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(rc), src, lane_sel);
   } else if (cluster_size == 16 && ctx->program->gfx_level >= GFX8) {
      dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_row_rr(16 - delta));
   } else if (cluster_size <= 32 && ctx->program->gfx_level >= GFX9) {
      uint32_t ctrl = ds_pattern_rotate(delta, ~(cluster_size - 1) & 0x1f);
      dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, ctrl);
   } else if (cluster_size == 64) {
      bool has_wf_dpp = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX10;
      if (delta == 32 && ctx->program->gfx_level >= GFX11) {
         dst = bld.vop1(aco_opcode::v_permlane64_b32, bld.def(rc), src);
      } else if (delta == 1 && has_wf_dpp) {
         dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rl1);
      } else if (delta == 63 && has_wf_dpp) {
         dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rr1);
      }
   }

   return dst.id() != 0;
}

void
emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
{
   Builder bld(ctx->program, ctx->block);
   Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
   Temp p2 = emit_extract_vector(ctx, bary, 1, v1);

   Temp ddx_1, ddx_2, ddy_1, ddy_2;
   uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
   uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
   uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);

   /* Build DD X/Y */
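   /* Each quad_perm broadcast reads a fixed lane of the 2x2 pixel quad: tl_*
    * is lane 0, so ddx = p(lane 1) - p(lane 0) and ddy = p(lane 2) - p(lane 0).
    */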
   if (ctx->program->gfx_level >= GFX8) {
      Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
      ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
      ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
      Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
      ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
      ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
   } else {
      Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
      ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
      ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
      ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
      ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1);

      Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
      ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
      ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2);
      ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
      ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
   }

   /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
   aco_opcode mad =
      ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
   Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
   Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
   tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
   tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp1, tmp2);
   set_wqm(ctx, true);
   return;
}

Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
Temp lanecount_to_mask(isel_context* ctx, Temp count);
void pops_await_overlapped_waves(isel_context* ctx);

Temp
get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, enum glsl_interp_mode interp)
{
   bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
   if (intrin == nir_intrinsic_load_barycentric_pixel ||
       intrin == nir_intrinsic_load_barycentric_at_offset) {
      return get_arg(ctx, linear ? ctx->args->linear_center : ctx->args->persp_center);
   } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
      return get_arg(ctx, linear ? ctx->args->linear_centroid : ctx->args->persp_centroid);
   } else {
      assert(intrin == nir_intrinsic_load_barycentric_sample);
      return get_arg(ctx, linear ? ctx->args->linear_sample : ctx->args->persp_sample);
   }
}

void
ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
                         unsigned wave_done, unsigned* offset0, unsigned* offset1)
{
   unsigned ordered_count_index = index_operand & 0x3f;
   unsigned count_dword = (index_operand >> 24) & 0xf;

   assert(ctx->options->gfx_level >= GFX10);
   assert(count_dword >= 1 && count_dword <= 4);

   *offset0 = ordered_count_index << 2;
   *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);

   if (ctx->options->gfx_level < GFX11)
      *offset1 |= 3 /* GS shader type */ << 2;
}

struct aco_export_mrt {
   Operand out[4];
   unsigned enabled_channels;
   unsigned target;
   bool compr;
};

static void
create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
                                const struct aco_export_mrt* mrt1)
{
   Builder bld(ctx->program, ctx->block);

   aco_ptr<Pseudo_instruction> exp{create_instruction<Pseudo_instruction>(
      aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
   for (unsigned i = 0; i < 4; i++) {
      exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
      exp->operands[i].setLateKill(true);
      exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
      exp->operands[i + 4].setLateKill(true);
   }

   RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
   exp->definitions[0] = bld.def(type); /* mrt0 */
   exp->definitions[1] = bld.def(type); /* mrt1 */
   exp->definitions[2] = bld.def(bld.lm);
   exp->definitions[3] = bld.def(bld.lm);
   exp->definitions[4] = bld.def(bld.lm, vcc);
   exp->definitions[5] = bld.def(s1, scc);
   ctx->block->instructions.emplace_back(std::move(exp));

   ctx->program->has_color_exports = true;
}

static void
visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
{
   aco_opcode opcode = aco_opcode::num_opcodes;
   unsigned signed_mask = 0;
   bool clamp = false;

   switch (instr->src[0].ssa->bit_size) {
   case 16:
      switch (instr->def.bit_size) {
      case 32: opcode = aco_opcode::v_wmma_f32_16x16x16_f16; break;
      case 16: opcode = aco_opcode::v_wmma_f16_16x16x16_f16; break;
      }
      break;
   case 8:
      opcode = aco_opcode::v_wmma_i32_16x16x16_iu8;
      signed_mask = nir_intrinsic_cmat_signed_mask(instr);
      clamp = nir_intrinsic_saturate(instr);
      break;
   }

   if (opcode == aco_opcode::num_opcodes)
      unreachable("visit_cmat_muladd: invalid bit size combination");

   Builder bld(ctx->program, ctx->block);

   Temp dst = get_ssa_temp(ctx, &instr->def);
   Operand A(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
   Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
   Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));

   A.setLateKill(true);
   B.setLateKill(true);

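   /* For the iu8 WMMA variant the neg_lo bits are repurposed to mark the A and
    * B matrices as signed, and clamp saturates the i32 accumulator result.
    */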
   VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
   vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
   vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
   vop3p.clamp = clamp;

   emit_split_vector(ctx, dst, instr->def.num_components);
}

void
visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Builder bld(ctx->program, ctx->block);
   switch (instr->intrinsic) {
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid: {
      glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
      Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
      assert(bary.size() == 2);
      Temp dst = get_ssa_temp(ctx, &instr->def);
      bld.copy(Definition(dst), bary);
      emit_split_vector(ctx, dst, 2);
      break;
   }
   case nir_intrinsic_load_barycentric_model: {
      Temp model = get_arg(ctx, ctx->args->pull_model);
      assert(model.size() == 3);
      Temp dst = get_ssa_temp(ctx, &instr->def);
      bld.copy(Definition(dst), model);
      emit_split_vector(ctx, dst, 3);
      break;
   }
   case nir_intrinsic_load_barycentric_at_offset: {
      Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
      RegClass rc = RegClass(offset.type(), 1);
      Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
      bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
      Temp bary = get_interp_param(ctx, instr->intrinsic,
                                   (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
      emit_interp_center(ctx, get_ssa_temp(ctx, &instr->def), bary, pos1, pos2);
      break;
   }
   case nir_intrinsic_load_front_face: {
      bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->def)),
               Operand::zero(), get_arg(ctx, ctx->args->front_face));
      break;
   }
   case nir_intrinsic_load_view_index: {
      Temp dst = get_ssa_temp(ctx, &instr->def);
      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->view_index)));
      break;
   }
   case nir_intrinsic_load_frag_coord: {
      emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->def), 4);
      break;
   }
   case nir_intrinsic_load_frag_shading_rate:
      emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->def));
      break;
   case nir_intrinsic_load_sample_pos: {
      Temp posx = get_arg(ctx, ctx->args->frag_pos[0]);
      Temp posy = get_arg(ctx, ctx->args->frag_pos[1]);
      bld.pseudo(
         aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->def)),
         posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
         posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
      break;
   }
   case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
   case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
   case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_input_vertex:
      if (ctx->program->stage == fragment_fs)
         visit_load_fs_input(ctx, instr);
      else
         isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
      break;
   case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
   case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
   case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
   case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
   case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
   case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
   case nir_intrinsic_shared_atomic:
   case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
   case nir_intrinsic_load_shared2_amd:
   case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
   case nir_intrinsic_bindless_image_load:
   case nir_intrinsic_bindless_image_fragment_mask_load_amd:
   case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
   case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
   case nir_intrinsic_bindless_image_atomic:
   case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
   case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
   case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
   case nir_intrinsic_load_typed_buffer_amd:
   case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
   case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
   case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
   case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
   case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
   case nir_intrinsic_global_atomic_amd:
   case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
   case nir_intrinsic_ssbo_atomic:
   case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
   case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
   case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
   case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
   case nir_intrinsic_load_num_workgroups: {
      Temp dst = get_ssa_temp(ctx, &instr->def);
      if (ctx->options->load_grid_size_from_user_sgpr) {
         bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
      } else {
         Temp addr = get_arg(ctx, ctx->args->num_work_groups);
         assert(addr.regClass() == s2);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
                    bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
                    bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
      }
      emit_split_vector(ctx, dst, 3);
      break;
   }
   case nir_intrinsic_load_ray_launch_size: {
      Temp dst = get_ssa_temp(ctx, &instr->def);
      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_size)));
      emit_split_vector(ctx, dst, 3);
      break;
   }
   case nir_intrinsic_load_ray_launch_id: {
      Temp dst = get_ssa_temp(ctx, &instr->def);
      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_id)));
      emit_split_vector(ctx, dst, 3);
      break;
   }
   case nir_intrinsic_load_local_invocation_id: {
      Temp dst = get_ssa_temp(ctx, &instr->def);
      if (ctx->options->gfx_level >= GFX11) {
         Temp local_ids[3];

         /* Thread IDs are packed in VGPR0, 10 bits per component. */
         for (uint32_t i = 0; i < 3; i++) {
            if (i == 0 && ctx->shader->info.workgroup_size[1] == 1 &&
                ctx->shader->info.workgroup_size[2] == 1 &&
                !ctx->shader->info.workgroup_size_variable) {
               local_ids[i] = get_arg(ctx, ctx->args->local_invocation_ids);
            } else if (i == 2 || (i == 1 && ctx->shader->info.workgroup_size[2] == 1 &&
                                  !ctx->shader->info.workgroup_size_variable)) {
               local_ids[i] =
                  bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(i * 10u),
                           get_arg(ctx, ctx->args->local_invocation_ids));
            } else {
               local_ids[i] = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
                                       get_arg(ctx, ctx->args->local_invocation_ids),
                                       Operand::c32(i * 10u), Operand::c32(10u));
            }
         }

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), local_ids[0], local_ids[1],
                    local_ids[2]);
      } else {
         bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->local_invocation_ids)));
      }
      emit_split_vector(ctx, dst, 3);
      break;
   }
   case nir_intrinsic_load_workgroup_id: {
      Temp dst = get_ssa_temp(ctx, &instr->def);
      if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
         const struct ac_arg* ids = ctx->args->workgroup_ids;
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
                    ids[0].used ? Operand(get_arg(ctx, ids[0])) : Operand::zero(),
                    ids[1].used ? Operand(get_arg(ctx, ids[1])) : Operand::zero(),
                    ids[2].used ? Operand(get_arg(ctx, ids[2])) : Operand::zero());
         emit_split_vector(ctx, dst, 3);
      } else {
         isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
      }
      break;
   }
   case nir_intrinsic_load_local_invocation_index: {
      if (ctx->stage.hw == AC_HW_LOCAL_SHADER || ctx->stage.hw == AC_HW_HULL_SHADER) {
         if (ctx->options->gfx_level >= GFX11) {
            /* On GFX11, RelAutoIndex is WaveID * WaveSize + ThreadID. */
            Temp wave_id =
               bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                        get_arg(ctx, ctx->args->tcs_wave_id), Operand::c32(0u | (3u << 16)));

            Temp temp = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), wave_id,
                                 Operand::c32(ctx->program->wave_size));
            emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def), Operand(), Operand(temp));
         } else {
            bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
                     get_arg(ctx, ctx->args->vs_rel_patch_id));
         }
         break;
      } else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER ||
                 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
         bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), thread_id_in_threadgroup(ctx));
         break;
      } else if (ctx->program->workgroup_size <= ctx->program->wave_size) {
         emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
         break;
      }

      Temp id = emit_mbcnt(ctx, bld.tmp(v1));

      /* Bits [6:11] of tg_size contain the subgroup id; multiply it by the
       * wave size, then OR in the thread id.
       */
      if (ctx->program->wave_size == 64) {
         /* After the s_and, the subgroup id bits are already multiplied by 64
          * (left-shifted by 6), so they can be fed to v_or directly. */
         Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
                                Operand::c32(0xfc0u), get_arg(ctx, ctx->args->tg_size));
         bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num, id);
      } else {
         /* Extract the bit field and multiply it by 32 (left shift by 5), then do the OR. */
         Temp tg_num =
            bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                     get_arg(ctx, ctx->args->tg_size), Operand::c32(0x6u | (0x6u << 16)));
         bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
                  Operand::c32(0x5u), id);
      }
      break;
8385    }
8386    case nir_intrinsic_load_subgroup_invocation: {
8387       emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8388       break;
8389    }
8390    case nir_intrinsic_ballot_relaxed:
8391    case nir_intrinsic_ballot: {
8392       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8393       Temp dst = get_ssa_temp(ctx, &instr->def);
8394 
8395       if (instr->src[0].ssa->bit_size == 1) {
8396          assert(src.regClass() == bld.lm);
8397       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8398          src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8399       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8400          src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8401       } else {
8402          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8403       }
8404 
8405       /* Make sure that all inactive lanes return zero.
8406        * Value-numbering might remove the comparison above */
8407       Definition def = dst.size() == bld.lm.size() ? Definition(dst) : bld.def(bld.lm);
8408       if (instr->intrinsic == nir_intrinsic_ballot_relaxed)
8409          src = bld.copy(def, src);
8410       else
8411          src = bld.sop2(Builder::s_and, def, bld.def(s1, scc), src, Operand(exec, bld.lm));
8412       if (dst.size() != bld.lm.size()) {
8413          /* Wave32 with ballot size set to 64 */
8414          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8415       }
8416 
8417       set_wqm(ctx);
8418       break;
8419    }
8420    case nir_intrinsic_inverse_ballot: {
8421       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8422       Temp dst = get_ssa_temp(ctx, &instr->def);
8423 
8424       assert(dst.size() == bld.lm.size());
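      /* The ballot operand may be wider or narrower than the lane mask (e.g. a
       * 64-bit ballot consumed in wave32): truncate or zero-extend as needed. */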
8425       if (src.size() > dst.size()) {
8426          emit_extract_vector(ctx, src, 0, dst);
8427       } else if (src.size() < dst.size()) {
8428          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8429       } else {
8430          bld.copy(Definition(dst), src);
8431       }
8432       break;
8433    }
8434    case nir_intrinsic_shuffle:
8435    case nir_intrinsic_read_invocation: {
8436       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8437       assert(instr->def.bit_size != 1);
8438       if (!nir_src_is_divergent(instr->src[0])) {
8439          emit_uniform_subgroup(ctx, instr, src);
8440       } else {
8441          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8442          if (instr->intrinsic == nir_intrinsic_read_invocation ||
8443              !nir_src_is_divergent(instr->src[1]))
8444             tid = bld.as_uniform(tid);
8445          Temp dst = get_ssa_temp(ctx, &instr->def);
8446 
8447          src = as_vgpr(ctx, src);
8448 
8449          if (src.regClass() == v1b || src.regClass() == v2b) {
8450             Temp tmp = emit_bpermute(ctx, bld, tid, src);
8452             if (dst.type() == RegType::vgpr)
8453                bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8454                           bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8455             else
8456                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8457          } else if (src.regClass() == v1) {
8458             Temp tmp = emit_bpermute(ctx, bld, tid, src);
8459             bld.copy(Definition(dst), tmp);
8460          } else if (src.regClass() == v2) {
8461             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8462             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8463             lo = emit_bpermute(ctx, bld, tid, lo);
8464             hi = emit_bpermute(ctx, bld, tid, hi);
8465             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8466             emit_split_vector(ctx, dst, 2);
8467          } else {
8468             isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8469          }
8470          set_wqm(ctx);
8471       }
8472       break;
8473    }
8474    case nir_intrinsic_rotate: {
8475       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8476       Temp delta = get_ssa_temp(ctx, instr->src[1].ssa);
8477       Temp dst = get_ssa_temp(ctx, &instr->def);
8478       assert(nir_intrinsic_execution_scope(instr) == SCOPE_SUBGROUP);
8479       assert(instr->def.bit_size > 1 && instr->def.bit_size <= 32);
8480 
8481       if (!nir_src_is_divergent(instr->src[0])) {
8482          emit_uniform_subgroup(ctx, instr, src);
8483          break;
8484       }
8485 
8486       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
8487       cluster_size = util_next_power_of_two(
8488          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
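      /* A cluster size of 0 means "whole subgroup": clamp it to the wave size
       * and round up to a power of two (the wrap masking below assumes one). */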
8489 
8490       if (cluster_size == 1) {
8491          bld.copy(Definition(dst), src);
8492          break;
8493       }
8494 
8495       delta = bld.as_uniform(delta);
8496       src = as_vgpr(ctx, src);
8497 
8498       Temp tmp;
8499       if (nir_src_is_const(instr->src[1]) &&
8500           emit_rotate_by_constant(ctx, tmp, src, cluster_size, nir_src_as_uint(instr->src[1]))) {
8501       } else if (cluster_size == 2) {
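         /* Within a pair, a rotate is either the identity (even delta) or a
          * swap of the two lanes (odd delta): test bit 0 of delta and select
          * between the source and its lane-swapped copy. */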
8502          Temp noswap =
8503             bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), delta, Operand::c32(0));
8504          noswap = bool_to_vector_condition(ctx, noswap);
8505          Temp swapped = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, 0x1), true);
8506          tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(src.regClass()), swapped, src, noswap);
8507       } else if (ctx->program->gfx_level >= GFX10 && cluster_size <= 16) {
8508          if (cluster_size == 4) /* shift mask already does this for 8/16. */
8509             delta = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), delta,
8510                              Operand::c32(0x3));
8511          delta =
8512             bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), delta, Operand::c32(2));
8513 
8514          Temp lo = bld.copy(bld.def(s1), Operand::c32(cluster_size == 4 ? 0x32103210 : 0x76543210));
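         /* permlane16 takes a lane-select table in lo/hi, 4 bits per lane;
          * 0x76543210 is the identity selection (0x32103210 repeats it per
          * group of 4). Rotating the nibble table right by delta nibbles makes
          * lane i fetch lane (i + delta) % cluster_size. */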
8515          Temp hi;
8516 
8517          if (cluster_size <= 8) {
8518             Temp shr = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8519             if (cluster_size == 4) {
8520                Temp lotolohi = bld.copy(bld.def(s1), Operand::c32(0x4444));
8521                Temp lohi =
8522                   bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, lotolohi);
8523                lo = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), shr, lohi);
8524             } else {
8525                delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
8526                                 Operand::c32(32), delta);
8527                Temp shl =
8528                   bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8529                lo = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, shl);
8530             }
8531             Temp lotohi = bld.copy(bld.def(s1), Operand::c32(0x88888888));
8532             hi = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), lo, lotohi);
8533          } else {
8534             hi = bld.copy(bld.def(s1), Operand::c32(0xfedcba98));
8535 
8536             Temp lohi = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
8537 
8538             Temp shr = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8539             delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(64),
8540                              delta);
8541             Temp shl = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8542 
8543             lohi = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), shr, shl);
8544             lo = bld.tmp(s1);
8545             hi = bld.tmp(s1);
8546             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), lohi);
8547          }
8548 
8549          Builder::Result ret =
8550             bld.vop3(aco_opcode::v_permlane16_b32, bld.def(src.regClass()), src, lo, hi);
8551          ret->valu().opsel[0] = true; /* set FETCH_INACTIVE */
8552          ret->valu().opsel[1] = true; /* set BOUND_CTRL */
8553          tmp = ret;
8554       } else {
8555          /* Fallback to ds_bpermute if we can't find a special instruction. */
8556          Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
8557          Temp src_lane = bld.vadd32(bld.def(v1), tid, delta);
8558 
8559          if (ctx->program->gfx_level >= GFX10 && cluster_size == 32) {
8560             /* ds_bpermute is restricted to 32 lanes on GFX10+. */
8561             Temp index_x4 =
8562                bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), src_lane);
8563             tmp = bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, src);
8564          } else {
8565             /* Technically, a full-wave rotate doesn't need this masking, but omitting it breaks the pseudo ops. */
8566             src_lane = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), Operand::c32(cluster_size - 1),
8567                                 src_lane, tid);
8568             tmp = emit_bpermute(ctx, bld, src_lane, src);
8569          }
8570       }
8571 
8572       tmp = emit_extract_vector(ctx, tmp, 0, dst.regClass());
8573       bld.copy(Definition(dst), tmp);
8574       set_wqm(ctx);
8575       break;
8576    }
8577    case nir_intrinsic_load_sample_id: {
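      /* The sample id lives in bits [11:8] of the ancillary PS argument. */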
8578       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8579                get_arg(ctx, ctx->args->ancillary), Operand::c32(8u), Operand::c32(4u));
8580       break;
8581    }
8582    case nir_intrinsic_read_first_invocation: {
8583       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8584       Temp dst = get_ssa_temp(ctx, &instr->def);
8585       if (instr->def.bit_size == 1) {
8586          assert(src.regClass() == bld.lm);
8587          Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8588                              bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8589          bool_to_vector_condition(ctx, tmp, dst);
8590       } else {
8591          emit_readfirstlane(ctx, src, dst);
8592       }
8593       set_wqm(ctx);
8594       break;
8595    }
8596    case nir_intrinsic_as_uniform: {
8597       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8598       Temp dst = get_ssa_temp(ctx, &instr->def);
8599       if (src.type() == RegType::vgpr)
8600          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
8601       else
8602          bld.copy(Definition(dst), src);
8603       break;
8604    }
8605    case nir_intrinsic_vote_all: {
8606       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8607       Temp dst = get_ssa_temp(ctx, &instr->def);
8608       assert(src.regClass() == bld.lm);
8609       assert(dst.regClass() == bld.lm);
8610 
8611       Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8612       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8613                .def(1)
8614                .getTemp();
8615       Temp cond = bool_to_vector_condition(ctx, tmp);
8616       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8617       set_wqm(ctx);
8618       break;
8619    }
8620    case nir_intrinsic_vote_any: {
8621       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8622       Temp dst = get_ssa_temp(ctx, &instr->def);
8623       assert(src.regClass() == bld.lm);
8624       assert(dst.regClass() == bld.lm);
8625 
8626       Temp tmp = bool_to_scalar_condition(ctx, src);
8627       bool_to_vector_condition(ctx, tmp, dst);
8628       set_wqm(ctx);
8629       break;
8630    }
8631    case nir_intrinsic_quad_vote_any: {
8632       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8633       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8634       bld.sop1(Builder::s_wqm, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8635       set_wqm(ctx);
8636       break;
8637    }
8638    case nir_intrinsic_quad_vote_all: {
8639       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8640       src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8641       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8642       src = bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), src);
8643       bld.sop1(Builder::s_not, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8644       set_wqm(ctx);
8645       break;
8646    }
8647    case nir_intrinsic_reduce:
8648    case nir_intrinsic_inclusive_scan:
8649    case nir_intrinsic_exclusive_scan: {
8650       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8651       Temp dst = get_ssa_temp(ctx, &instr->def);
8652       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8653       unsigned cluster_size =
8654          instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8655       cluster_size = util_next_power_of_two(
8656          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8657       const unsigned bit_size = instr->src[0].ssa->bit_size;
8658       assert(bit_size != 1);
8659 
8660       if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size) {
8661          /* We use divergence analysis to assign the regclass, so check if it's
8662           * working as expected */
8663          ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8664          if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8665             expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8666          assert(instr->def.divergent == expected_divergent);
8667 
8668          if (instr->intrinsic == nir_intrinsic_reduce) {
8669             if (emit_uniform_reduce(ctx, instr))
8670                break;
8671          } else if (emit_uniform_scan(ctx, instr)) {
8672             break;
8673          }
8674       }
8675 
8676       src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8677       ReduceOp reduce_op = get_reduce_op(op, bit_size);
8678 
8679       aco_opcode aco_op;
8680       switch (instr->intrinsic) {
8681       case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8682       case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8683       case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8684       default: unreachable("unknown reduce intrinsic");
8685       }
8686 
8687       /* Avoid whole wave shift. */
8688       const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
8689                                                (op == nir_op_iadd || op == nir_op_ixor) &&
8690                                                dst.type() == RegType::vgpr;
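      /* For iadd/ixor, exclusive_scan(x) equals inclusive_scan(x) - x (resp.
       * ^ x) lane-wise, so one extra VALU op replaces a whole-wave shift. */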
8691       if (use_inclusive_for_exclusive)
8692          inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
8693       else
8694          emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
8695 
8696       set_wqm(ctx);
8697       break;
8698    }
8699    case nir_intrinsic_quad_broadcast:
8700    case nir_intrinsic_quad_swap_horizontal:
8701    case nir_intrinsic_quad_swap_vertical:
8702    case nir_intrinsic_quad_swap_diagonal:
8703    case nir_intrinsic_quad_swizzle_amd: {
8704       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8705 
8706       if (!instr->def.divergent) {
8707          emit_uniform_subgroup(ctx, instr, src);
8708          break;
8709       }
8710 
8711       /* Quad broadcast lane. */
8712       unsigned lane = 0;
8713       /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8714       bool bool_use_valu = instr->def.bit_size == 1;
8715 
8716       uint16_t dpp_ctrl = 0;
8717 
8718       bool allow_fi = true;
8719       switch (instr->intrinsic) {
8720       case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8721       case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8722       case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
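      /* dpp_quad_perm(a, b, c, d): lane i of every quad reads the quad's lane
       * given by the i-th argument, e.g. (1, 0, 3, 2) swaps horizontal
       * neighbours within each 2x2 quad. */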
8723       case nir_intrinsic_quad_swizzle_amd:
8724          dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
8725          allow_fi &= nir_intrinsic_fetch_inactive(instr);
8726          break;
8727       case nir_intrinsic_quad_broadcast:
8728          lane = nir_src_as_const_value(instr->src[1])->u32;
8729          dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8730          bool_use_valu = false;
8731          break;
8732       default: break;
8733       }
8734 
8735       Temp dst = get_ssa_temp(ctx, &instr->def);
8736 
8737       /* Setup source. */
8738       if (bool_use_valu)
8739          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8740                             Operand::c32(-1), src);
8741       else if (instr->def.bit_size != 1)
8742          src = as_vgpr(ctx, src);
8743 
8744       if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8745          /* Special case for quad broadcast using SALU only. */
8746          assert(src.regClass() == bld.lm && dst.regClass() == bld.lm);
8747 
8748          uint32_t half_mask = 0x11111111u << lane;
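         /* half_mask selects bit `lane` of every quad; ANDing it with the
          * active-bool mask keeps, per quad, only the broadcast lane's bit,
          * and s_wqm then replicates that bit to all four lanes of the quad. */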
8749          Operand mask_tmp = bld.lm.bytes() == 4
8750                                ? Operand::c32(half_mask)
8751                                : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8752                                             Operand::c32(half_mask), Operand::c32(half_mask));
8753 
8754          src =
8755             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8756          src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8757          bld.sop1(Builder::s_wqm, Definition(dst), bld.def(s1, scc), src);
8758       } else if (instr->def.bit_size <= 32 || bool_use_valu) {
8759          unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8760          Definition def = (excess_bytes || bool_use_valu) ? bld.def(v1) : Definition(dst);
8761 
8762          if (ctx->program->gfx_level >= GFX8)
8763             bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl, 0xf, 0xf, true, allow_fi);
8764          else
8765             bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8766 
8767          if (excess_bytes)
8768             bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8769                        bld.def(RegClass::get(dst.type(), excess_bytes)), def.getTemp());
8770          if (bool_use_valu)
8771             bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), def.getTemp());
8772       } else if (instr->def.bit_size == 64) {
8773          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8774          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8775 
8776          if (ctx->program->gfx_level >= GFX8) {
8777             lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl, 0xf, 0xf, true,
8778                               allow_fi);
8779             hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl, 0xf, 0xf, true,
8780                               allow_fi);
8781          } else {
8782             lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8783             hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8784          }
8785 
8786          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8787          emit_split_vector(ctx, dst, 2);
8788       } else {
8789          isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8790       }
8791 
8792       set_wqm(ctx);
8793       break;
8794    }
8795    case nir_intrinsic_masked_swizzle_amd: {
8796       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8797       if (!instr->def.divergent) {
8798          emit_uniform_subgroup(ctx, instr, src);
8799          break;
8800       }
8801       Temp dst = get_ssa_temp(ctx, &instr->def);
8802       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8803       bool allow_fi = nir_intrinsic_fetch_inactive(instr);
8804 
8805       if (instr->def.bit_size != 1)
8806          src = as_vgpr(ctx, src);
8807 
8808       if (instr->def.bit_size == 1) {
8809          assert(src.regClass() == bld.lm);
8810          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8811                             Operand::c32(-1), src);
8812          src = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8813          bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), src);
8814       } else if (dst.regClass() == v1b || dst.regClass() == v2b) {
8815          Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8816          emit_extract_vector(ctx, tmp, 0, dst);
8820       } else if (dst.regClass() == v1) {
8821          bld.copy(Definition(dst), emit_masked_swizzle(ctx, bld, src, mask, allow_fi));
8822       } else if (dst.regClass() == v2) {
8823          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8824          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8825          lo = emit_masked_swizzle(ctx, bld, lo, mask, allow_fi);
8826          hi = emit_masked_swizzle(ctx, bld, hi, mask, allow_fi);
8827          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8828          emit_split_vector(ctx, dst, 2);
8829       } else {
8830          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8831       }
8832       set_wqm(ctx);
8833       break;
8834    }
8835    case nir_intrinsic_write_invocation_amd: {
8836       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8837       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8838       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8839       Temp dst = get_ssa_temp(ctx, &instr->def);
8840       if (dst.regClass() == v1) {
8841          /* src2 is ignored for writelane. RA assigns the same reg for dst */
8842          bld.writelane(Definition(dst), val, lane, src);
8843       } else if (dst.regClass() == v2) {
8844          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8845          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8846          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8847          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
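         /* v_writelane only handles 32-bit VGPRs, so write each half separately. */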
8848          Temp lo = bld.writelane(bld.def(v1), val_lo, lane, src_lo);
8849          Temp hi = bld.writelane(bld.def(v1), val_hi, lane, src_hi);
8850          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8851          emit_split_vector(ctx, dst, 2);
8852       } else {
8853          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8854       }
8855       break;
8856    }
8857    case nir_intrinsic_mbcnt_amd: {
8858       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8859       Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8860       Temp dst = get_ssa_temp(ctx, &instr->def);
8861       /* For wave32, truncate the 64-bit mask to the 32-bit lane mask. */
8862       src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8863       emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
8864       set_wqm(ctx);
8865       break;
8866    }
8867    case nir_intrinsic_lane_permute_16_amd: {
8868       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8869       Temp dst = get_ssa_temp(ctx, &instr->def);
8870       assert(ctx->program->gfx_level >= GFX10);
8871 
8872       if (src.regClass() == s1) {
8873          bld.copy(Definition(dst), src);
8874       } else if (dst.regClass() == v1 && src.regClass() == v1) {
8875          bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8876                   bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8877                   bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8878       } else {
8879          isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8880       }
8881       break;
8882    }
8883    case nir_intrinsic_load_helper_invocation:
8884    case nir_intrinsic_is_helper_invocation: {
8885       /* load_helper() after demote() gets lowered to is_helper().
8886        * Otherwise, these two behave the same. */
8887       Temp dst = get_ssa_temp(ctx, &instr->def);
8888       bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8889       ctx->program->needs_exact = true;
8890       break;
8891    }
8892    case nir_intrinsic_demote:
8893    case nir_intrinsic_demote_if: {
8894       Operand cond = Operand::c32(-1u);
8895       if (instr->intrinsic == nir_intrinsic_demote_if) {
8896          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8897          assert(src.regClass() == bld.lm);
8898          cond =
8899             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8900       }
8901 
8902       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8903 
8904       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8905          ctx->cf_info.exec_potentially_empty_discard = true;
8906 
8907       ctx->block->kind |= block_kind_uses_discard;
8908       ctx->program->needs_exact = true;
8909 
8910       /* Enable WQM in order to prevent helper lanes from getting terminated. */
8911       if (ctx->shader->info.maximally_reconverges)
8912          ctx->program->needs_wqm = true;
8913 
8914       break;
8915    }
8916    case nir_intrinsic_terminate:
8917    case nir_intrinsic_terminate_if:
8918    case nir_intrinsic_discard:
8919    case nir_intrinsic_discard_if: {
8920       Operand cond = Operand::c32(-1u);
8921       if (instr->intrinsic == nir_intrinsic_discard_if ||
8922           instr->intrinsic == nir_intrinsic_terminate_if) {
8923          Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8924          assert(src.regClass() == bld.lm);
8925          cond =
8926             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8927 
8928          ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
8929       }
8930 
8931       bld.pseudo(aco_opcode::p_discard_if, cond);
8932 
8933       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8934          ctx->cf_info.exec_potentially_empty_discard = true;
8935       ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
8936       ctx->block->kind |= block_kind_uses_discard;
8937       ctx->program->needs_exact = true;
8938       break;
8939    }
8940    case nir_intrinsic_first_invocation: {
8941       bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
8942                Operand(exec, bld.lm));
8943       set_wqm(ctx);
8944       break;
8945    }
8946    case nir_intrinsic_last_invocation: {
8947       Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8948       bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8949                Operand::c32(ctx->program->wave_size - 1u), flbit);
8950       set_wqm(ctx);
8951       break;
8952    }
8953    case nir_intrinsic_elect: {
8954       /* p_elect is lowered in aco_insert_exec_mask.
8955        * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8956        * two p_elect with different exec masks as the same.
8957        */
8958       bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
8959                  Operand(exec, bld.lm));
8960       set_wqm(ctx);
8961       break;
8962    }
8963    case nir_intrinsic_shader_clock: {
8964       Temp dst = get_ssa_temp(ctx, &instr->def);
8965       if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8966           ctx->options->gfx_level >= GFX10_3) {
8967          /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8968          Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
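         /* Here: ((20 - 1) << 11) | 29 = 0x981d, i.e. read 20 bits starting at
          * offset 0 of hardware register 29. */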
8969          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8970       } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
8971                  ctx->options->gfx_level >= GFX11) {
8972          bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
8973                   Operand::c32(sendmsg_rtn_get_realtime));
8974       } else {
8975          aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
8976                                 ? aco_opcode::s_memrealtime
8977                                 : aco_opcode::s_memtime;
8978          bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8979       }
8980       emit_split_vector(ctx, dst, 2);
8981       break;
8982    }
8983    case nir_intrinsic_load_vertex_id_zero_base: {
8984       Temp dst = get_ssa_temp(ctx, &instr->def);
8985       bld.copy(Definition(dst), get_arg(ctx, ctx->args->vertex_id));
8986       break;
8987    }
8988    case nir_intrinsic_load_first_vertex: {
8989       Temp dst = get_ssa_temp(ctx, &instr->def);
8990       bld.copy(Definition(dst), get_arg(ctx, ctx->args->base_vertex));
8991       break;
8992    }
8993    case nir_intrinsic_load_base_instance: {
8994       Temp dst = get_ssa_temp(ctx, &instr->def);
8995       bld.copy(Definition(dst), get_arg(ctx, ctx->args->start_instance));
8996       break;
8997    }
8998    case nir_intrinsic_load_instance_id: {
8999       Temp dst = get_ssa_temp(ctx, &instr->def);
9000       bld.copy(Definition(dst), get_arg(ctx, ctx->args->instance_id));
9001       break;
9002    }
9003    case nir_intrinsic_load_draw_id: {
9004       Temp dst = get_ssa_temp(ctx, &instr->def);
9005       bld.copy(Definition(dst), get_arg(ctx, ctx->args->draw_id));
9006       break;
9007    }
9008    case nir_intrinsic_load_invocation_id: {
9009       Temp dst = get_ssa_temp(ctx, &instr->def);
9010 
9011       if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
9012          if (ctx->options->gfx_level >= GFX10)
9013             bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
9014                          get_arg(ctx, ctx->args->gs_invocation_id));
9015          else
9016             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_invocation_id));
9017       } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
9018          bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->tcs_rel_ids),
9019                   Operand::c32(8u), Operand::c32(5u));
9020       } else {
9021          unreachable("Unsupported stage for load_invocation_id");
9022       }
9023 
9024       break;
9025    }
9026    case nir_intrinsic_load_primitive_id: {
9027       Temp dst = get_ssa_temp(ctx, &instr->def);
9028 
9029       switch (ctx->shader->info.stage) {
9030       case MESA_SHADER_GEOMETRY:
9031          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
9032          break;
9033       case MESA_SHADER_TESS_CTRL:
9034          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tcs_patch_id));
9035          break;
9036       case MESA_SHADER_TESS_EVAL:
9037          bld.copy(Definition(dst), get_arg(ctx, ctx->args->tes_patch_id));
9038          break;
9039       default:
9040          if (ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && !ctx->stage.has(SWStage::GS)) {
9041             /* In case of NGG, the GS threads always have the primitive ID
9042              * even if there is no SW GS. */
9043             bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
9044             break;
9045          } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
9046             bld.copy(Definition(dst), get_arg(ctx, ctx->args->vs_prim_id));
9047             break;
9048          }
9049          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
9050       }
9051 
9052       break;
9053    }
9054    case nir_intrinsic_sendmsg_amd: {
9055       unsigned imm = nir_intrinsic_base(instr);
9056       Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9057       bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), -1, imm);
9058       break;
9059    }
9060    case nir_intrinsic_load_gs_wave_id_amd: {
9061       Temp dst = get_ssa_temp(ctx, &instr->def);
9062       if (ctx->args->merged_wave_info.used)
9063          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
9064                     get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), Operand::c32(8u),
9065                     Operand::zero());
9066       else if (ctx->args->gs_wave_id.used)
9067          bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_wave_id));
9068       else
9069          unreachable("Shader doesn't have GS wave ID.");
9070       break;
9071    }
9072    case nir_intrinsic_is_subgroup_invocation_lt_amd: {
9073       Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9074       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src));
9075       break;
9076    }
9077    case nir_intrinsic_gds_atomic_add_amd: {
9078       Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
9079       Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
9080       Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
9081       Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
9082       bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
9083              true);
9084       break;
9085    }
9086    case nir_intrinsic_load_sbt_base_amd: {
9087       Temp dst = get_ssa_temp(ctx, &instr->def);
9088       Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
9089       assert(addr.regClass() == s2);
9090       bld.copy(Definition(dst), Operand(addr));
9091       break;
9092    }
9093    case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
9094    case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
9095       bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
9096                get_arg(ctx, ctx->args->rt.dynamic_callable_stack_base));
9097       break;
9098    case nir_intrinsic_load_resume_shader_address_amd: {
9099       bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
9100                  bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
9101       break;
9102    }
9103    case nir_intrinsic_overwrite_vs_arguments_amd: {
9104       ctx->arg_temps[ctx->args->vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9105       ctx->arg_temps[ctx->args->instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9106       break;
9107    }
9108    case nir_intrinsic_overwrite_tes_arguments_amd: {
9109       ctx->arg_temps[ctx->args->tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9110       ctx->arg_temps[ctx->args->tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9111       ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9112       ctx->arg_temps[ctx->args->tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[2].ssa);
9113       break;
9114    }
9115    case nir_intrinsic_load_scalar_arg_amd:
9116    case nir_intrinsic_load_vector_arg_amd: {
9117       assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
9118       Temp dst = get_ssa_temp(ctx, &instr->def);
9119       Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
9120       assert(src.id());
9121       assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
9122                                                                                   : RegType::vgpr));
9123       bld.copy(Definition(dst), src);
9124       emit_split_vector(ctx, dst, dst.size());
9125       break;
9126    }
9127    case nir_intrinsic_ordered_xfb_counter_add_amd: {
9128       Temp dst = get_ssa_temp(ctx, &instr->def);
9129       Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
9130       Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
9131 
9132       Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
9133       unsigned offset0, offset1;
9134       Instruction* ds_instr;
9135       Operand m;
9136 
9137       /* Lock a GDS mutex. */
9138       ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
9139       m = bld.m0(bld.as_uniform(ordered_id));
9140       ds_instr =
9141          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
9142       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
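      /* ds_ordered_count releases waves in ordered_id order, so the
       * ds_add_gs_reg_rtn atomics below effectively form one globally ordered
       * append across waves. */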
9143 
9144       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9145          aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
9146       unsigned write_mask = nir_intrinsic_write_mask(instr);
9147 
9148       for (unsigned i = 0; i < instr->num_components; i++) {
9149          if (write_mask & (1 << i)) {
9150             Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9151 
9152             ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9153                               i * 4, 0u, true);
9154             ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9155 
9156             vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
9157          } else {
9158             vec->operands[i] = Operand::zero();
9159          }
9160       }
9161 
9162       vec->definitions[0] = Definition(dst);
9163       ctx->block->instructions.emplace_back(std::move(vec));
9164 
9165       /* Unlock a GDS mutex. */
9166       ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
9167       m = bld.m0(bld.as_uniform(ordered_id));
9168       ds_instr =
9169          bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
9170       ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
9171 
9172       emit_split_vector(ctx, dst, instr->num_components);
9173       break;
9174    }
9175    case nir_intrinsic_xfb_counter_sub_amd: {
9176       unsigned write_mask = nir_intrinsic_write_mask(instr);
9177       Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
9178 
9179       u_foreach_bit (i, write_mask) {
9180          Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9181          Instruction* ds_instr;
9182 
9183          ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9184                            i * 4, 0u, true);
9185          ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9186       }
9187       break;
9188    }
9189    case nir_intrinsic_export_amd:
9190    case nir_intrinsic_export_row_amd: {
9191       unsigned flags = nir_intrinsic_flags(instr);
9192       unsigned target = nir_intrinsic_base(instr);
9193       unsigned write_mask = nir_intrinsic_write_mask(instr);
9194 
9195       /* Mark vertex export block. */
9196       if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
9197          ctx->block->kind |= block_kind_export_end;
9198 
9199       if (target < V_008DFC_SQ_EXP_MRTZ)
9200          ctx->program->has_color_exports = true;
9201 
9202       const bool row_en = instr->intrinsic == nir_intrinsic_export_row_amd;
9203 
9204       aco_ptr<Export_instruction> exp{
9205          create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4 + row_en, 0)};
9206 
9207       exp->dest = target;
9208       exp->enabled_mask = write_mask;
9209       exp->compressed = flags & AC_EXP_FLAG_COMPRESSED;
9210 
9211       /* ACO may reorder position/MRT export instructions and marks only the
9212        * last one done, so don't honor the NIR AC_EXP_FLAG_DONE for
9213        * position/MRT exports here; leave it to ACO.
9214        */
9215       if (target == V_008DFC_SQ_EXP_PRIM)
9216          exp->done = flags & AC_EXP_FLAG_DONE;
9217       else
9218          exp->done = false;
9219 
9220       /* Likewise, ACO may reorder MRT export instructions and sets the valid
9221        * mask only on the last one, so don't honor the NIR
9222        * AC_EXP_FLAG_VALID_MASK for MRT exports here; leave it to ACO.
9223        */
9224       if (target > V_008DFC_SQ_EXP_NULL)
9225          exp->valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
9226       else
9227          exp->valid_mask = false;
9228 
9229       exp->row_en = row_en;
9230 
9231       /* Compressed export uses two bits for a channel. */
9232       uint32_t channel_mask =
9233          exp->compressed ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0) : write_mask;
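      /* E.g. write_mask 0xf compresses to channel_mask 0x3: two dwords, each
       * carrying a pair of packed 16-bit channels. */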
9234 
9235       Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
9236       for (unsigned i = 0; i < 4; i++) {
9237          exp->operands[i] = channel_mask & BITFIELD_BIT(i)
9238                                ? Operand(emit_extract_vector(ctx, value, i, v1))
9239                                : Operand(v1);
9240       }
9241 
9242       if (row_en) {
9243          Temp row = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
9244          /* Hack to prevent the RA from moving the source into m0 and then back to a normal SGPR. */
9245          row = bld.copy(bld.def(s1, m0), row);
9246          exp->operands[4] = bld.m0(row);
9247       }
9248 
9249       ctx->block->instructions.emplace_back(std::move(exp));
9250       break;
9251    }
9252    case nir_intrinsic_export_dual_src_blend_amd: {
9253       Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
9254       Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
9255       unsigned write_mask = nir_intrinsic_write_mask(instr);
9256 
9257       struct aco_export_mrt mrt0, mrt1;
9258       for (unsigned i = 0; i < 4; i++) {
9259          mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
9260                                                     : Operand(v1);
9261 
9262          mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
9263                                                     : Operand(v1);
9264       }
9265       mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
9266 
9267       create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
9268 
9269       ctx->block->kind |= block_kind_export_end;
9270       break;
9271    }
9272    case nir_intrinsic_strict_wqm_coord_amd: {
9273       Temp dst = get_ssa_temp(ctx, &instr->def);
9274       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9275       Temp tmp = bld.tmp(RegClass::get(RegType::vgpr, dst.bytes()));
9276       unsigned begin_size = nir_intrinsic_base(instr);
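      /* base is the number of bytes of undefined padding prepended before the
       * coordinate components inside the linear VGPR. */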
9277 
9278       unsigned num_src = 1;
9279       auto it = ctx->allocated_vec.find(src.id());
9280       if (it != ctx->allocated_vec.end())
9281          num_src = src.bytes() / it->second[0].bytes();
9282 
9283       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9284          aco_opcode::p_create_vector, Format::PSEUDO, num_src + !!begin_size, 1)};
9285 
9286       if (begin_size)
9287          vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
9288       for (unsigned i = 0; i < num_src; i++) {
9289          Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
9290          vec->operands[i + !!begin_size] = Operand(comp);
9291       }
9292 
9293       vec->definitions[0] = Definition(tmp);
9294       ctx->block->instructions.emplace_back(std::move(vec));
9295 
9296       bld.pseudo(aco_opcode::p_start_linear_vgpr, Definition(dst), tmp);
9297       break;
9298    }
9299    case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
9300       Temp dst = get_ssa_temp(ctx, &instr->def);
9301       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9302                Operand::c32(aco_symbol_lds_ngg_scratch_base));
9303       break;
9304    }
9305    case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
9306       Temp dst = get_ssa_temp(ctx, &instr->def);
9307       bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9308                Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
9309       break;
9310    }
9311    case nir_intrinsic_store_scalar_arg_amd: {
9312       ctx->arg_temps[nir_intrinsic_base(instr)] =
9313          bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9314       break;
9315    }
9316    case nir_intrinsic_store_vector_arg_amd: {
9317       ctx->arg_temps[nir_intrinsic_base(instr)] =
9318          as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9319       break;
9320    }
9321    case nir_intrinsic_begin_invocation_interlock: {
9322       pops_await_overlapped_waves(ctx);
9323       break;
9324    }
9325    case nir_intrinsic_end_invocation_interlock: {
9326       if (ctx->options->gfx_level < GFX11)
9327          bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
9328       break;
9329    }
9330    case nir_intrinsic_cmat_muladd_amd: visit_cmat_muladd(ctx, instr); break;
9331    default:
9332       isel_err(&instr->instr, "Unimplemented intrinsic instr");
9333       abort();
9336    }
9337 }
9338 
9339 void
9340 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9341 {
9342    if (vec->parent_instr->type != nir_instr_type_alu)
9343       return;
9344    nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9345    if (vec_instr->op != nir_op_vec(vec->num_components))
9346       return;
9347 
9348    for (unsigned i = 0; i < vec->num_components; i++) {
9349       cv[i] =
9350          vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9351    }
9352 }
9353 
9354 void
9355 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9356 {
9357    assert(instr->op != nir_texop_samples_identical);
9358 
9359    Builder bld(ctx->program, ctx->block);
9360    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9361         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9362         has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9363    Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9364                            offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9365                            coord = Temp(), wqm_coord = Temp();
9366    std::vector<Temp> coords;
9367    std::vector<Temp> derivs;
9368    nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9369 
9370    for (unsigned i = 0; i < instr->num_srcs; i++) {
9371       switch (instr->src[i].src_type) {
9372       case nir_tex_src_texture_handle:
9373          resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9374          break;
9375       case nir_tex_src_sampler_handle:
9376          sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9377          break;
9378       default: break;
9379       }
9380    }
9381 
9382    bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9383                                   (instr->dest_type & (nir_type_int | nir_type_uint));
9384    bool tg4_integer_cube_workaround =
9385       tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9386 
9387    bool a16 = false, g16 = false;
9388 
9389    int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9390    if (coord_idx >= 0)
9391       a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9392 
9393    int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9394    if (ddx_idx >= 0)
9395       g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9396 
9397    for (unsigned i = 0; i < instr->num_srcs; i++) {
9398       switch (instr->src[i].src_type) {
9399       case nir_tex_src_coord: {
9400          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9401          coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9402          break;
9403       }
9404       case nir_tex_src_backend1: {
9405          assert(instr->src[i].src.ssa->bit_size == 32);
9406          wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9407          has_wqm_coord = true;
9408          break;
9409       }
9410       case nir_tex_src_bias:
9411          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9412          /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9413          bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9414          has_bias = true;
9415          break;
9416       case nir_tex_src_lod: {
9417          if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9418             level_zero = true;
9419          } else {
9420             assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9421             lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9422             has_lod = true;
9423          }
9424          break;
9425       }
9426       case nir_tex_src_min_lod:
9427          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9428          clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9429          has_clamped_lod = true;
9430          break;
9431       case nir_tex_src_comparator:
9432          if (instr->is_shadow) {
9433             assert(instr->src[i].src.ssa->bit_size == 32);
9434             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9435             has_compare = true;
9436          }
9437          break;
9438       case nir_tex_src_offset:
9439       case nir_tex_src_backend2:
9440          assert(instr->src[i].src.ssa->bit_size == 32);
9441          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9442          get_const_vec(instr->src[i].src.ssa, const_offset);
9443          has_offset = true;
9444          break;
9445       case nir_tex_src_ddx:
9446          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9447          ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9448          has_ddx = true;
9449          break;
9450       case nir_tex_src_ddy:
9451          assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9452          ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9453          has_ddy = true;
9454          break;
9455       case nir_tex_src_ms_index:
9456          assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9457          sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9458          has_sample_index = true;
9459          break;
9460       case nir_tex_src_texture_offset:
9461       case nir_tex_src_sampler_offset:
9462       default: break;
9463       }
9464    }
9465 
9466    if (has_wqm_coord) {
9467       assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9468              instr->op == nir_texop_lod);
9469       assert(wqm_coord.regClass().is_linear_vgpr());
9470       assert(!a16 && !g16);
9471    }
9472 
9473    if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9474       level_zero = true;
9475 
9476    if (has_offset) {
9477       assert(instr->op != nir_texop_txf);
9478 
9479       aco_ptr<Instruction> tmp_instr;
9480       Temp acc, pack = Temp();
9481 
9482       uint32_t pack_const = 0;
9483       for (unsigned i = 0; i < offset.size(); i++) {
9484          if (!const_offset[i])
9485             continue;
9486          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9487       }
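      /* Each image offset is a 6-bit field, one byte per component: e.g. a
       * constant offset (x=1, y=-1) packs to (1 & 0x3f) | ((-1 & 0x3f) << 8)
       * = 0x3f01. */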
9488 
9489       if (offset.type() == RegType::sgpr) {
9490          for (unsigned i = 0; i < offset.size(); i++) {
9491             if (const_offset[i])
9492                continue;
9493 
9494             acc = emit_extract_vector(ctx, offset, i, s1);
9495             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9496                            Operand::c32(0x3Fu));
9497 
9498             if (i) {
9499                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9500                               Operand::c32(8u * i));
9501             }
9502 
9503             if (pack == Temp()) {
9504                pack = acc;
9505             } else {
9506                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9507             }
9508          }
9509 
9510          if (pack_const && pack != Temp())
9511             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9512                             Operand::c32(pack_const), pack);
9513       } else {
9514          for (unsigned i = 0; i < offset.size(); i++) {
9515             if (const_offset[i])
9516                continue;
9517 
9518             acc = emit_extract_vector(ctx, offset, i, v1);
9519             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9520 
9521             if (i) {
9522                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9523             }
9524 
9525             if (pack == Temp()) {
9526                pack = acc;
9527             } else {
9528                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9529             }
9530          }
9531 
9532          if (pack_const && pack != Temp())
9533             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9534       }
9535       if (pack == Temp())
9536          offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9537       else
9538          offset = pack;
9539    }
9540 
9541    std::vector<Temp> unpacked_coord;
9542    if (coord != Temp())
9543       unpacked_coord.push_back(coord);
9544    if (has_sample_index)
9545       unpacked_coord.push_back(sample_index);
9546    if (has_lod)
9547       unpacked_coord.push_back(lod);
9548    if (has_clamped_lod)
9549       unpacked_coord.push_back(clamped_lod);
9550 
9551    coords = emit_pack_v1(ctx, unpacked_coord);
9552 
9553    /* pack derivatives */
9554    if (has_ddx || has_ddy) {
9555       assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9556       std::array<Temp, 2> ddxddy = {ddx, ddy};
9557       for (Temp tmp : ddxddy) {
9558          if (tmp == Temp())
9559             continue;
9560          std::vector<Temp> unpacked = {tmp};
9561          for (Temp derv : emit_pack_v1(ctx, unpacked))
9562             derivs.push_back(derv);
9563       }
9564       has_derivs = true;
9565    }
9566 
9567    unsigned dim = 0;
9568    bool da = false;
9569    if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9570       dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9571       da = should_declare_array((ac_image_dim)dim);
9572    }
9573 
9574    /* Build tex instruction */
9575    unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9576    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9577       dmask = u_bit_consecutive(0, util_last_bit(dmask));
9578    if (instr->is_sparse)
9579       dmask = MAX2(dmask, 1) | 0x10;
9580    bool d16 = instr->def.bit_size == 16;
9581    Temp dst = get_ssa_temp(ctx, &instr->def);
9582    Temp tmp_dst = dst;
9583 
9584    /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9585    if (instr->op == nir_texop_tg4) {
9586       assert(instr->def.num_components == (4 + instr->is_sparse));
9587       if (instr->is_shadow)
9588          dmask = 1;
9589       else
9590          dmask = 1 << instr->component;
9591       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9592          tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9593    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9594       tmp_dst = bld.tmp(v1);
9595    } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9596       unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9597       tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9598    }
9599 
9600    Temp tg4_compare_cube_wa64 = Temp();
9601 
9602    if (tg4_integer_workarounds) {
9603       Temp half_texel[2];
9604       if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
9605          half_texel[0] = half_texel[1] = bld.copy(bld.def(v1), Operand::c32(0xbf000000 /*-0.5*/));
9606       } else {
9607          Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9608          Temp size = bld.tmp(v2);
9609          MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9610                                            Operand(s4), std::vector<Temp>{tg4_lod});
9611          tex->dim = dim;
9612          tex->dmask = 0x3;
9613          tex->da = da;
9614          emit_split_vector(ctx, size, size.size());
9615 
9616          for (unsigned i = 0; i < 2; i++) {
9617             half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9618             half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9619             half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9620             half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9621                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9622          }
9623 
9624          if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9625             /* In Vulkan, whether the sampler uses unnormalized
9626              * coordinates or not is a dynamic property of the
9627              * sampler. Hence, to figure out whether or not we
9628              * need to divide by the texture size, we need to test
9629              * the sampler at runtime. This tests the bit set by
9630              * radv_init_sampler().
9631              */
9632             unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9633             Temp dword0 = emit_extract_vector(ctx, sampler, 0, s1);
9634             Temp not_needed =
9635                bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), dword0, Operand::c32(bit_idx));
9636 
9637             not_needed = bool_to_vector_condition(ctx, not_needed);
9638             half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9639                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9640             half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9641                                      Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9642          }
9643       }
9644 
9645       Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9646                             bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9647 
9648       if (tg4_integer_cube_workaround) {
9649          /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9650          Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9651          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9652             aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9653          split->operands[0] = Operand(resource);
9654          for (unsigned i = 0; i < resource.size(); i++) {
9655             desc[i] = bld.tmp(s1);
9656             split->definitions[i] = Definition(desc[i]);
9657          }
9658          ctx->block->instructions.emplace_back(std::move(split));
9659 
9660          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9661                               Operand::c32(20u | (6u << 16)));
9662          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9663                                          Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9664 
9665          Temp nfmt;
9666          if (instr->dest_type & nir_type_uint) {
9667             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9668                             Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9669                             Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9670          } else {
9671             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9672                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9673                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9674          }
9675          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9676          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9677 
9678          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9679                          Operand::c32(26u));
9680 
9681          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9682                             Operand::c32(C_008F14_NUM_FORMAT));
9683          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9684 
9685          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9686             aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9687          for (unsigned i = 0; i < resource.size(); i++)
9688             vec->operands[i] = Operand(desc[i]);
9689          resource = bld.tmp(resource.regClass());
9690          vec->definitions[0] = Definition(resource);
9691          ctx->block->instructions.emplace_back(std::move(vec));
9692 
9693          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9694                                   tg4_compare_cube_wa64);
9695          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9696                                   tg4_compare_cube_wa64);
9697       }
9698       coords[0] = new_coords[0];
9699       coords[1] = new_coords[1];
9700    }
9701 
9702    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9703       // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9704       // ac_build_buffer_load_format_gfx9_safe()
9705 
9706       assert(coords.size() == 1);
9707       aco_opcode op;
9708       if (d16) {
9709          switch (util_last_bit(dmask & 0xf)) {
9710          case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9711          case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9712          case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9713          case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9714          default: unreachable("Tex instruction loads more than 4 components.");
9715          }
9716       } else {
9717          switch (util_last_bit(dmask & 0xf)) {
9718          case 1: op = aco_opcode::buffer_load_format_x; break;
9719          case 2: op = aco_opcode::buffer_load_format_xy; break;
9720          case 3: op = aco_opcode::buffer_load_format_xyz; break;
9721          case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9722          default: unreachable("Tex instruction loads more than 4 components.");
9723          }
9724       }
9725 
9726       aco_ptr<MUBUF_instruction> mubuf{
9727          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9728       mubuf->operands[0] = Operand(resource);
9729       mubuf->operands[1] = Operand(coords[0]);
9730       mubuf->operands[2] = Operand::c32(0);
9731       mubuf->definitions[0] = Definition(tmp_dst);
9732       mubuf->idxen = true;
9733       mubuf->tfe = instr->is_sparse;
9734       if (mubuf->tfe)
9735          mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9736       ctx->block->instructions.emplace_back(std::move(mubuf));
9737 
9738       expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9739       return;
9740    }
9741 
9742    /* gather MIMG address components */
9743    std::vector<Temp> args;
9744    if (has_wqm_coord) {
9745       args.emplace_back(wqm_coord);
9746       if (!(ctx->block->kind & block_kind_top_level))
9747          ctx->unended_linear_vgprs.push_back(wqm_coord);
9748    }
9749    if (has_offset)
9750       args.emplace_back(offset);
9751    if (has_bias)
9752       args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9753    if (has_compare)
9754       args.emplace_back(compare);
9755    if (has_derivs)
9756       args.insert(args.end(), derivs.begin(), derivs.end());
9757 
9758    args.insert(args.end(), coords.begin(), coords.end());
9759 
9760    if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9761        instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
9762       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9763                             instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9764                          ? aco_opcode::image_load
9765                          : aco_opcode::image_load_mip;
9766       Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9767       MIMG_instruction* tex = emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, vdata);
9768       if (instr->op == nir_texop_fragment_mask_fetch_amd)
9769          tex->dim = da ? ac_image_2darray : ac_image_2d;
9770       else
9771          tex->dim = dim;
9772       tex->dmask = dmask & 0xf;
9773       tex->unrm = true;
9774       tex->da = da;
9775       tex->tfe = instr->is_sparse;
9776       tex->d16 = d16;
9777       tex->a16 = a16;
9778 
9779       if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9780          /* Use 0x76543210 if the image doesn't have FMASK. */
9781          assert(dmask == 1 && dst.bytes() == 4);
9782          assert(dst.id() != tmp_dst.id());
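         /* A null FMASK descriptor has dword1 == 0, which the comparison
          * below tests; 0x76543210 is the identity swizzle, mapping each
          * sample i to fragment i. */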
9783 
9784          if (dst.regClass() == s1) {
9785             Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9786                                         emit_extract_vector(ctx, resource, 1, s1));
9787             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
9788                      Operand::c32(0x76543210), bld.scc(is_not_null));
9789          } else {
9790             Temp is_not_null = bld.tmp(bld.lm);
9791             bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9792                          emit_extract_vector(ctx, resource, 1, s1));
9793             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9794                      bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9795          }
9796       } else {
9797          expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9798       }
9799       return;
9800    }
9801 
9802    bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
9803 
9804    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
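   /* Opcode suffixes: _c = compare (shadow), _b = LOD bias, _d = explicit
    * derivatives, _l = explicit LOD, _lz = level zero, _cl = LOD clamp,
    * _o = offsets, _g16 = 16-bit derivatives. The chains below pick the
    * most specific variant for the operands that are present. */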
9805    aco_opcode opcode = aco_opcode::image_sample;
9806    if (has_offset) { /* image_sample_*_o */
9807       if (has_clamped_lod) {
9808          if (has_compare) {
9809             opcode = aco_opcode::image_sample_c_cl_o;
9810             if (separate_g16)
9811                opcode = aco_opcode::image_sample_c_d_cl_o_g16;
9812             else if (has_derivs)
9813                opcode = aco_opcode::image_sample_c_d_cl_o;
9814             if (has_bias)
9815                opcode = aco_opcode::image_sample_c_b_cl_o;
9816          } else {
9817             opcode = aco_opcode::image_sample_cl_o;
9818             if (separate_g16)
9819                opcode = aco_opcode::image_sample_d_cl_o_g16;
9820             else if (has_derivs)
9821                opcode = aco_opcode::image_sample_d_cl_o;
9822             if (has_bias)
9823                opcode = aco_opcode::image_sample_b_cl_o;
9824          }
9825       } else if (has_compare) {
9826          opcode = aco_opcode::image_sample_c_o;
9827          if (separate_g16)
9828             opcode = aco_opcode::image_sample_c_d_o_g16;
9829          else if (has_derivs)
9830             opcode = aco_opcode::image_sample_c_d_o;
9831          if (has_bias)
9832             opcode = aco_opcode::image_sample_c_b_o;
9833          if (level_zero)
9834             opcode = aco_opcode::image_sample_c_lz_o;
9835          if (has_lod)
9836             opcode = aco_opcode::image_sample_c_l_o;
9837       } else {
9838          opcode = aco_opcode::image_sample_o;
9839          if (separate_g16)
9840             opcode = aco_opcode::image_sample_d_o_g16;
9841          else if (has_derivs)
9842             opcode = aco_opcode::image_sample_d_o;
9843          if (has_bias)
9844             opcode = aco_opcode::image_sample_b_o;
9845          if (level_zero)
9846             opcode = aco_opcode::image_sample_lz_o;
9847          if (has_lod)
9848             opcode = aco_opcode::image_sample_l_o;
9849       }
9850    } else if (has_clamped_lod) { /* image_sample_*_cl */
9851       if (has_compare) {
9852          opcode = aco_opcode::image_sample_c_cl;
9853          if (separate_g16)
9854             opcode = aco_opcode::image_sample_c_d_cl_g16;
9855          else if (has_derivs)
9856             opcode = aco_opcode::image_sample_c_d_cl;
9857          if (has_bias)
9858             opcode = aco_opcode::image_sample_c_b_cl;
9859       } else {
9860          opcode = aco_opcode::image_sample_cl;
9861          if (separate_g16)
9862             opcode = aco_opcode::image_sample_d_cl_g16;
9863          else if (has_derivs)
9864             opcode = aco_opcode::image_sample_d_cl;
9865          if (has_bias)
9866             opcode = aco_opcode::image_sample_b_cl;
9867       }
9868    } else { /* no offset */
9869       if (has_compare) {
9870          opcode = aco_opcode::image_sample_c;
9871          if (separate_g16)
9872             opcode = aco_opcode::image_sample_c_d_g16;
9873          else if (has_derivs)
9874             opcode = aco_opcode::image_sample_c_d;
9875          if (has_bias)
9876             opcode = aco_opcode::image_sample_c_b;
9877          if (level_zero)
9878             opcode = aco_opcode::image_sample_c_lz;
9879          if (has_lod)
9880             opcode = aco_opcode::image_sample_c_l;
9881       } else {
9882          opcode = aco_opcode::image_sample;
9883          if (separate_g16)
9884             opcode = aco_opcode::image_sample_d_g16;
9885          else if (has_derivs)
9886             opcode = aco_opcode::image_sample_d;
9887          if (has_bias)
9888             opcode = aco_opcode::image_sample_b;
9889          if (level_zero)
9890             opcode = aco_opcode::image_sample_lz;
9891          if (has_lod)
9892             opcode = aco_opcode::image_sample_l;
9893       }
9894    }
9895 
9896    if (instr->op == nir_texop_tg4) {
9897       /* GFX11 supports implicit LOD, but the extension is unsupported. */
9898       assert(level_zero || ctx->options->gfx_level < GFX11);
9899 
9900       if (has_offset) { /* image_gather4_*_o */
9901          if (has_compare) {
9902             opcode = aco_opcode::image_gather4_c_o;
9903             if (level_zero)
9904                opcode = aco_opcode::image_gather4_c_lz_o;
9905             if (has_lod)
9906                opcode = aco_opcode::image_gather4_c_l_o;
9907             if (has_bias)
9908                opcode = aco_opcode::image_gather4_c_b_o;
9909          } else {
9910             opcode = aco_opcode::image_gather4_o;
9911             if (level_zero)
9912                opcode = aco_opcode::image_gather4_lz_o;
9913             if (has_lod)
9914                opcode = aco_opcode::image_gather4_l_o;
9915             if (has_bias)
9916                opcode = aco_opcode::image_gather4_b_o;
9917          }
9918       } else {
9919          if (has_compare) {
9920             opcode = aco_opcode::image_gather4_c;
9921             if (level_zero)
9922                opcode = aco_opcode::image_gather4_c_lz;
9923             if (has_lod)
9924                opcode = aco_opcode::image_gather4_c_l;
9925             if (has_bias)
9926                opcode = aco_opcode::image_gather4_c_b;
9927          } else {
9928             opcode = aco_opcode::image_gather4;
9929             if (level_zero)
9930                opcode = aco_opcode::image_gather4_lz;
9931             if (has_lod)
9932                opcode = aco_opcode::image_gather4_l;
9933             if (has_bias)
9934                opcode = aco_opcode::image_gather4_b;
9935          }
9936       }
9937    } else if (instr->op == nir_texop_lod) {
9938       opcode = aco_opcode::image_get_lod;
9939    }
9940 
9941    bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9942                           !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9943                           instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
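   /* Implicitly computed derivatives read the neighbouring lanes of each
    * quad, so the sample instruction must run in whole-quad mode; hence
    * the set_wqm() call below. */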
9944 
9945    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9946    MIMG_instruction* tex = emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
9947    tex->dim = dim;
9948    tex->dmask = dmask & 0xf;
9949    tex->da = da;
9950    tex->unrm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
9951    tex->tfe = instr->is_sparse;
9952    tex->d16 = d16;
9953    tex->a16 = a16;
9954    if (implicit_derivs)
9955       set_wqm(ctx, true);
9956 
9957    if (tg4_integer_cube_workaround) {
9958       assert(tmp_dst.id() != dst.id());
9959       assert(tmp_dst.size() == dst.size());
9960 
9961       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9962       Temp val[4];
9963       for (unsigned i = 0; i < 4; i++) {
9964          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9965          Temp cvt_val;
9966          if (instr->dest_type & nir_type_uint)
9967             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9968          else
9969             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9970          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9971                            tg4_compare_cube_wa64);
9972       }
9973 
9974       Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9975       if (instr->is_sparse)
9976          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9977                               val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9978       else
9979          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9980                               val[3]);
9981    }
9982    unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9983    expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
9984 }
9985 
9986 Operand
9987 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc, bool logical)
9988 {
9989    Temp tmp = get_ssa_temp(ctx, ssa);
9990    if (ssa->parent_instr->type == nir_instr_type_undef) {
9991       return Operand(rc);
9992    } else if (logical && ssa->bit_size == 1 &&
9993               ssa->parent_instr->type == nir_instr_type_load_const) {
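      /* Booleans are represented as lane masks, so a constant true becomes
       * an all-ones mask of wave size rather than the scalar 1. */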
9994       bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
9995       return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
9996    } else {
9997       return Operand(tmp);
9998    }
9999 }
10000 
10001 void
10002 visit_phi(isel_context* ctx, nir_phi_instr* instr)
10003 {
10004    aco_ptr<Pseudo_instruction> phi;
10005    Temp dst = get_ssa_temp(ctx, &instr->def);
10006    assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
10007 
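   /* p_phi merges per-lane values over the logical CFG; p_linear_phi merges
    * scalar values (e.g. lane masks) over the linear CFG. */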
10008    bool logical = !dst.is_linear() || instr->def.divergent;
10009    logical |= (ctx->block->kind & block_kind_merge) != 0;
10010    aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
10011 
10012    /* we want a sorted list of sources, since the predecessor list is also sorted */
10013    std::map<unsigned, nir_def*> phi_src;
10014    nir_foreach_phi_src (src, instr)
10015       phi_src[src->pred->index] = src->src.ssa;
10016 
10017    std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
10018    unsigned num_operands = 0;
10019    Operand* const operands = (Operand*)alloca(
10020       (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
10021    unsigned num_defined = 0;
10022    unsigned cur_pred_idx = 0;
10023    for (std::pair<unsigned, nir_def*> src : phi_src) {
10024       if (cur_pred_idx < preds.size()) {
10025          /* handle missing preds (IF merges with discard/break) and extra preds
10026           * (loop exit with discard) */
10027          unsigned block = ctx->cf_info.nir_to_aco[src.first];
10028          unsigned skipped = 0;
10029          while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
10030             skipped++;
10031          if (cur_pred_idx + skipped < preds.size()) {
10032             for (unsigned i = 0; i < skipped; i++)
10033                operands[num_operands++] = Operand(dst.regClass());
10034             cur_pred_idx += skipped;
10035          } else {
10036             continue;
10037          }
10038       }
10039       /* Handle missing predecessors at the end. This shouldn't happen with loop
10040        * headers and we can't ignore these sources for loop header phis. */
10041       if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
10042          continue;
10043       cur_pred_idx++;
10044       Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
10045       operands[num_operands++] = op;
10046       num_defined += !op.isUndefined();
10047    }
10048    /* handle block_kind_continue_or_break at loop exit blocks */
10049    while (cur_pred_idx++ < preds.size())
10050       operands[num_operands++] = Operand(dst.regClass());
10051 
10052    /* If the loop ends with a break, still add a linear continue edge in case
10053     * that break is divergent or continue_or_break is used. We'll either remove
10054     * this operand later in visit_loop() if it's not necessary or replace the
10055     * undef with something correct. */
10056    if (!logical && ctx->block->kind & block_kind_loop_header) {
10057       nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
10058       nir_block* last = nir_loop_last_block(loop);
10059       if (last->successors[0] != instr->instr.block)
10060          operands[num_operands++] = Operand(RegClass());
10061    }
10062 
10063    /* we can use a linear phi in some cases if one src is undef */
10064    if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
10065       phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
10066                                                        num_operands, 1));
10067 
10068       Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
10069       Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
10070       assert(invert->kind & block_kind_invert);
10071 
10072       unsigned then_block = invert->linear_preds[0];
10073 
10074       Block* insert_block = NULL;
10075       for (unsigned i = 0; i < num_operands; i++) {
10076          Operand op = operands[i];
10077          if (op.isUndefined())
10078             continue;
10079          insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
10080          phi->operands[0] = op;
10081          break;
10082       }
10083       assert(insert_block); /* should be handled by the "num_defined == 0" case above */
10084       phi->operands[1] = Operand(dst.regClass());
10085       phi->definitions[0] = Definition(dst);
10086       insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
10087       return;
10088    }
10089 
10090    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
10091    for (unsigned i = 0; i < num_operands; i++)
10092       phi->operands[i] = operands[i];
10093    phi->definitions[0] = Definition(dst);
10094    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
10095 }
10096 
10097 void
10098 visit_undef(isel_context* ctx, nir_undef_instr* instr)
10099 {
10100    Temp dst = get_ssa_temp(ctx, &instr->def);
10101 
10102    assert(dst.type() == RegType::sgpr);
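   /* NIR undefs are treated as uniform, so the destination is always in
    * SGPRs; any well-defined value is valid, zero is chosen here. */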
10103 
10104    if (dst.size() == 1) {
10105       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
10106    } else {
10107       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
10108          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
10109       for (unsigned i = 0; i < dst.size(); i++)
10110          vec->operands[i] = Operand::zero();
10111       vec->definitions[0] = Definition(dst);
10112       ctx->block->instructions.emplace_back(std::move(vec));
10113    }
10114 }
10115 
10116 void
10117 begin_loop(isel_context* ctx, loop_context* lc)
10118 {
10119    // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
10120    append_logical_end(ctx->block);
10121    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
10122    Builder bld(ctx->program, ctx->block);
10123    bld.branch(aco_opcode::p_branch, bld.def(s2));
10124    unsigned loop_preheader_idx = ctx->block->index;
10125 
10126    lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
10127 
10128    ctx->program->next_loop_depth++;
10129 
10130    Block* loop_header = ctx->program->create_and_insert_block();
10131    loop_header->kind |= block_kind_loop_header;
10132    add_edge(loop_preheader_idx, loop_header);
10133    ctx->block = loop_header;
10134 
10135    append_logical_start(ctx->block);
10136 
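   /* Save the surrounding control-flow state in the loop_context so that
    * end_loop() can restore it after the loop body has been visited. */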
10137    lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
10138    lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
10139    lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
10140    lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
10141    lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
10142 }
10143 
10144 void
10145 end_loop(isel_context* ctx, loop_context* lc)
10146 {
10147    // TODO: what if a loop ends with an unconditional or uniformly branched continue
10148    //       and this branch is never taken?
10149    if (!ctx->cf_info.has_branch) {
10150       unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10151       Builder bld(ctx->program, ctx->block);
10152       append_logical_end(ctx->block);
10153 
10154       if (ctx->cf_info.exec_potentially_empty_discard ||
10155           ctx->cf_info.exec_potentially_empty_break) {
10156          /* Discards can result in code running with an empty exec mask.
10157           * This would result in divergent breaks not ever being taken. As a
10158           * workaround, break the loop when the loop mask is empty instead of
10159           * always continuing. */
10160          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
10161          unsigned block_idx = ctx->block->index;
10162 
10163          /* create helper blocks to avoid critical edges */
10164          Block* break_block = ctx->program->create_and_insert_block();
10165          break_block->kind = block_kind_uniform;
10166          bld.reset(break_block);
10167          bld.branch(aco_opcode::p_branch, bld.def(s2));
10168          add_linear_edge(block_idx, break_block);
10169          add_linear_edge(break_block->index, &lc->loop_exit);
10170 
10171          Block* continue_block = ctx->program->create_and_insert_block();
10172          continue_block->kind = block_kind_uniform;
10173          bld.reset(continue_block);
10174          bld.branch(aco_opcode::p_branch, bld.def(s2));
10175          add_linear_edge(block_idx, continue_block);
10176          add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10177 
10178          if (!ctx->cf_info.parent_loop.has_divergent_branch)
10179             add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10180          ctx->block = &ctx->program->blocks[block_idx];
10181       } else {
10182          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10183          if (!ctx->cf_info.parent_loop.has_divergent_branch)
10184             add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10185          else
10186             add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10187       }
10188 
10189       bld.reset(ctx->block);
10190       bld.branch(aco_opcode::p_branch, bld.def(s2));
10191    }
10192 
10193    ctx->cf_info.has_branch = false;
10194    ctx->program->next_loop_depth--;
10195 
10196    // TODO: if the loop does not have a single exit, we must add one
10197    /* emit loop successor block */
10198    ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10199    append_logical_start(ctx->block);
10200 
10201 #if 0
10202    // TODO: check if it is beneficial to not branch on continues
10203    /* trim linear phis in loop header */
10204    for (auto&& instr : loop_entry->instructions) {
10205       if (instr->opcode == aco_opcode::p_linear_phi) {
10206          aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10207          new_phi->definitions[0] = instr->definitions[0];
10208          for (unsigned i = 0; i < new_phi->operands.size(); i++)
10209             new_phi->operands[i] = instr->operands[i];
10210          /* check that the remaining operands are all the same */
10211          for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10212             assert(instr->operands[i].tempId() == instr->operands.back().tempId());
10213          instr.swap(new_phi);
10214       } else if (instr->opcode == aco_opcode::p_phi) {
10215          continue;
10216       } else {
10217          break;
10218       }
10219    }
10220 #endif
10221 
10222    ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10223    ctx->cf_info.parent_loop.exit = lc->exit_old;
10224    ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10225    ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10226    ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10227    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10228       ctx->cf_info.exec_potentially_empty_discard = false;
10229 }
10230 
10231 void
10232 emit_loop_jump(isel_context* ctx, bool is_break)
10233 {
10234    Builder bld(ctx->program, ctx->block);
10235    Block* logical_target;
10236    append_logical_end(ctx->block);
10237    unsigned idx = ctx->block->index;
10238 
10239    if (is_break) {
10240       logical_target = ctx->cf_info.parent_loop.exit;
10241       add_logical_edge(idx, logical_target);
10242       ctx->block->kind |= block_kind_break;
10243 
10244       if (!ctx->cf_info.parent_if.is_divergent &&
10245           !ctx->cf_info.parent_loop.has_divergent_continue) {
10246          /* uniform break - directly jump out of the loop */
10247          ctx->block->kind |= block_kind_uniform;
10248          ctx->cf_info.has_branch = true;
10249          bld.branch(aco_opcode::p_branch, bld.def(s2));
10250          add_linear_edge(idx, logical_target);
10251          return;
10252       }
10253       ctx->cf_info.parent_loop.has_divergent_branch = true;
10254    } else {
10255       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10256       add_logical_edge(idx, logical_target);
10257       ctx->block->kind |= block_kind_continue;
10258 
10259       if (!ctx->cf_info.parent_if.is_divergent) {
10260          /* uniform continue - directly jump to the loop header */
10261          ctx->block->kind |= block_kind_uniform;
10262          ctx->cf_info.has_branch = true;
10263          bld.branch(aco_opcode::p_branch, bld.def(s2));
10264          add_linear_edge(idx, logical_target);
10265          return;
10266       }
10267 
10268       /* for potential uniform breaks after this continue,
10269          we must ensure that they are handled correctly */
10270       ctx->cf_info.parent_loop.has_divergent_continue = true;
10271       ctx->cf_info.parent_loop.has_divergent_branch = true;
10272    }
10273 
10274    if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10275       ctx->cf_info.exec_potentially_empty_break = true;
10276       ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10277    }
10278 
10279    /* remove critical edges from linear CFG */
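   /* (this block keeps a linear fall-through successor besides the jump
    * target, and the target has multiple predecessors, so direct edges
    * would be critical) */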
10280    bld.branch(aco_opcode::p_branch, bld.def(s2));
10281    Block* break_block = ctx->program->create_and_insert_block();
10282    break_block->kind |= block_kind_uniform;
10283    add_linear_edge(idx, break_block);
10284    /* the loop_header pointer might be invalidated by this point */
10285    if (!is_break)
10286       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10287    add_linear_edge(break_block->index, logical_target);
10288    bld.reset(break_block);
10289    bld.branch(aco_opcode::p_branch, bld.def(s2));
10290 
10291    Block* continue_block = ctx->program->create_and_insert_block();
10292    add_linear_edge(idx, continue_block);
10293    append_logical_start(continue_block);
10294    ctx->block = continue_block;
10295 }
10296 
10297 void
10298 emit_loop_break(isel_context* ctx)
10299 {
10300    emit_loop_jump(ctx, true);
10301 }
10302 
10303 void
10304 emit_loop_continue(isel_context* ctx)
10305 {
10306    emit_loop_jump(ctx, false);
10307 }
10308 
10309 void
10310 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10311 {
10312    /* visit_block() would usually do this, but divergent jumps update ctx->block */
10313    ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10314 
10315    switch (instr->type) {
10316    case nir_jump_break: emit_loop_break(ctx); break;
10317    case nir_jump_continue: emit_loop_continue(ctx); break;
10318    default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10319    }
10320 }
10321 
10322 void
10323 visit_block(isel_context* ctx, nir_block* block)
10324 {
10325    if (ctx->block->kind & block_kind_top_level) {
10326       Builder bld(ctx->program, ctx->block);
10327       for (Temp tmp : ctx->unended_linear_vgprs)
10328          bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
10329       ctx->unended_linear_vgprs.clear();
10330    }
10331 
10332    ctx->block->instructions.reserve(ctx->block->instructions.size() +
10333                                     exec_list_length(&block->instr_list) * 2);
10334    nir_foreach_instr (instr, block) {
10335       switch (instr->type) {
10336       case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10337       case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10338       case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10339       case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10340       case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10341       case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10342       case nir_instr_type_deref: break;
10343       case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10344       default: isel_err(instr, "Unknown NIR instr type");
10345       }
10346    }
10347 
10348    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10349       ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10350 }
10351 
10352 static Operand
10353 create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10354                      aco_ptr<Instruction>& header_phi, Operand* vals)
10355 {
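   /* Walk the blocks of the loop body and compute, for each one, the value
    * the header phi would have on that block's outgoing linear edge; the
    * return value is the one to use on the final continue edge. */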
10356    vals[0] = Operand(header_phi->definitions[0].getTemp());
10357    RegClass rc = vals[0].regClass();
10358 
10359    unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10360 
10361    unsigned next_pred = 1;
10362 
10363    for (unsigned idx = first + 1; idx <= last; idx++) {
10364       Block& block = ctx->program->blocks[idx];
10365       if (block.loop_nest_depth != loop_nest_depth) {
10366          vals[idx - first] = vals[idx - 1 - first];
10367          continue;
10368       }
10369 
10370       if ((block.kind & block_kind_continue) && block.index != last) {
10371          vals[idx - first] = header_phi->operands[next_pred];
10372          next_pred++;
10373          continue;
10374       }
10375 
10376       bool all_same = true;
10377       for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10378          all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10379 
10380       Operand val;
10381       if (all_same) {
10382          val = vals[block.linear_preds[0] - first];
10383       } else {
10384          aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10385             aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10386          for (unsigned i = 0; i < block.linear_preds.size(); i++)
10387             phi->operands[i] = vals[block.linear_preds[i] - first];
10388          val = Operand(ctx->program->allocateTmp(rc));
10389          phi->definitions[0] = Definition(val.getTemp());
10390          block.instructions.emplace(block.instructions.begin(), std::move(phi));
10391       }
10392       vals[idx - first] = val;
10393    }
10394 
10395    return vals[last - first];
10396 }
10397 
10398 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10399 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10400 static void end_uniform_if(isel_context* ctx, if_context* ic);
10401 
10402 static void
10403 visit_loop(isel_context* ctx, nir_loop* loop)
10404 {
10405    assert(!nir_loop_has_continue_construct(loop));
10406    loop_context lc;
10407    begin_loop(ctx, &lc);
10408 
10409    bool unreachable = visit_cf_list(ctx, &loop->body);
10410 
10411    unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10412 
10413    /* Fixup phis in loop header from unreachable blocks.
10414     * has_branch/has_divergent_branch also indicate if the loop ends with a
10415     * break/continue instruction, but we don't emit those if unreachable=true */
10416    if (unreachable) {
10417       assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10418       bool linear = ctx->cf_info.has_branch;
10419       bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10420       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10421          if ((logical && instr->opcode == aco_opcode::p_phi) ||
10422              (linear && instr->opcode == aco_opcode::p_linear_phi)) {
10423             /* the last operand should be the one that needs to be removed */
10424             instr->operands.pop_back();
10425          } else if (!is_phi(instr)) {
10426             break;
10427          }
10428       }
10429    }
10430 
10431    /* Fixup linear phis in the loop header that expect a continue. This fixup
10432     * and the previous one shouldn't happen at once, because a break in the
10433     * merge block would get CSE'd */
10434    if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
10435       unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10436       Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10437       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10438          if (instr->opcode == aco_opcode::p_linear_phi) {
10439             if (ctx->cf_info.has_branch)
10440                instr->operands.pop_back();
10441             else
10442                instr->operands.back() =
10443                   create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10444          } else if (!is_phi(instr)) {
10445             break;
10446          }
10447       }
10448    }
10449 
10450    /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
10451     * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
10452     */
10453    if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10454       Builder bld(ctx->program, ctx->block);
10455       Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10456       if_context ic;
10457       begin_uniform_if_then(ctx, &ic, cond);
10458       emit_loop_break(ctx);
10459       begin_uniform_if_else(ctx, &ic);
10460       end_uniform_if(ctx, &ic);
10461    }
10462 
10463    end_loop(ctx, &lc);
10464 }
10465 
10466 static void
10467 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10468                         nir_selection_control sel_ctrl = nir_selection_control_none)
10469 {
10470    ic->cond = cond;
10471 
10472    append_logical_end(ctx->block);
10473    ctx->block->kind |= block_kind_branch;
10474 
10475    /* branch to linear then block */
10476    assert(cond.regClass() == ctx->program->lane_mask);
10477    aco_ptr<Pseudo_branch_instruction> branch;
10478    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10479                                                               Format::PSEUDO_BRANCH, 1, 1));
10480    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10481    branch->operands[0] = Operand(cond);
10482    branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10483                                       sel_ctrl == nir_selection_control_divergent_always_taken;
10484    ctx->block->instructions.push_back(std::move(branch));
10485 
10486    ic->BB_if_idx = ctx->block->index;
10487    ic->BB_invert = Block();
10488    /* Invert blocks are intentionally not marked as top level because they
10489     * are not part of the logical cfg. */
10490    ic->BB_invert.kind |= block_kind_invert;
10491    ic->BB_endif = Block();
10492    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10493 
10494    ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10495    ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10496    ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10497    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10498    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10499    ctx->cf_info.parent_if.is_divergent = true;
10500 
10501    /* divergent branches use cbranch_execz */
10502    ctx->cf_info.exec_potentially_empty_discard = false;
10503    ctx->cf_info.exec_potentially_empty_break = false;
10504    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10505 
10506    /** emit logical then block */
10507    ctx->program->next_divergent_if_logical_depth++;
10508    Block* BB_then_logical = ctx->program->create_and_insert_block();
10509    add_edge(ic->BB_if_idx, BB_then_logical);
10510    ctx->block = BB_then_logical;
10511    append_logical_start(BB_then_logical);
10512 }
10513 
10514 static void
10515 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10516                         nir_selection_control sel_ctrl = nir_selection_control_none)
10517 {
10518    Block* BB_then_logical = ctx->block;
10519    append_logical_end(BB_then_logical);
10520    /* branch from logical then block to invert block */
10521    aco_ptr<Pseudo_branch_instruction> branch;
10522    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10523                                                               Format::PSEUDO_BRANCH, 0, 1));
10524    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10525    BB_then_logical->instructions.emplace_back(std::move(branch));
10526    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10527    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10528       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10529    BB_then_logical->kind |= block_kind_uniform;
10530    assert(!ctx->cf_info.has_branch);
10531    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10532    ctx->cf_info.parent_loop.has_divergent_branch = false;
10533    ctx->program->next_divergent_if_logical_depth--;
10534 
10535    /** emit linear then block */
10536    Block* BB_then_linear = ctx->program->create_and_insert_block();
10537    BB_then_linear->kind |= block_kind_uniform;
10538    add_linear_edge(ic->BB_if_idx, BB_then_linear);
10539    /* branch from linear then block to invert block */
10540    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10541                                                               Format::PSEUDO_BRANCH, 0, 1));
10542    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10543    BB_then_linear->instructions.emplace_back(std::move(branch));
10544    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10545 
10546    /** emit invert merge block */
10547    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10548    ic->invert_idx = ctx->block->index;
10549 
10550    /* branch to linear else block (skip else) */
10551    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10552                                                               Format::PSEUDO_BRANCH, 0, 1));
10553    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10554    branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10555                                       sel_ctrl == nir_selection_control_divergent_always_taken;
10556    ctx->block->instructions.push_back(std::move(branch));
10557 
10558    ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10559    ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10560    ic->exec_potentially_empty_break_depth_old = std::min(
10561       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10562    /* divergent branches use cbranch_execz */
10563    ctx->cf_info.exec_potentially_empty_discard = false;
10564    ctx->cf_info.exec_potentially_empty_break = false;
10565    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10566 
10567    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10568    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10569 
10570    /** emit logical else block */
10571    ctx->program->next_divergent_if_logical_depth++;
10572    Block* BB_else_logical = ctx->program->create_and_insert_block();
10573    add_logical_edge(ic->BB_if_idx, BB_else_logical);
10574    add_linear_edge(ic->invert_idx, BB_else_logical);
10575    ctx->block = BB_else_logical;
10576    append_logical_start(BB_else_logical);
10577 }
10578 
10579 static void
10580 end_divergent_if(isel_context* ctx, if_context* ic)
10581 {
10582    Block* BB_else_logical = ctx->block;
10583    append_logical_end(BB_else_logical);
10584 
10585    /* branch from logical else block to endif block */
10586    aco_ptr<Pseudo_branch_instruction> branch;
10587    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10588                                                               Format::PSEUDO_BRANCH, 0, 1));
10589    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10590    BB_else_logical->instructions.emplace_back(std::move(branch));
10591    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10592    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10593       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10594    BB_else_logical->kind |= block_kind_uniform;
10595    ctx->program->next_divergent_if_logical_depth--;
10596 
10597    assert(!ctx->cf_info.has_branch);
10598    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10599 
10600    /** emit linear else block */
10601    Block* BB_else_linear = ctx->program->create_and_insert_block();
10602    BB_else_linear->kind |= block_kind_uniform;
10603    add_linear_edge(ic->invert_idx, BB_else_linear);
10604 
10605    /* branch from linear else block to endif block */
10606    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10607                                                               Format::PSEUDO_BRANCH, 0, 1));
10608    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10609    BB_else_linear->instructions.emplace_back(std::move(branch));
10610    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10611 
10612    /** emit endif merge block */
10613    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10614    append_logical_start(ctx->block);
10615 
10616    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10617    ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10618    ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10619    ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10620       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10621    if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10622        !ctx->cf_info.parent_if.is_divergent) {
10623       ctx->cf_info.exec_potentially_empty_break = false;
10624       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10625    }
10626    /* uniform control flow never has an empty exec-mask */
10627    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10628       ctx->cf_info.exec_potentially_empty_discard = false;
10629       ctx->cf_info.exec_potentially_empty_break = false;
10630       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10631    }
10632    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10633 }
10634 
10635 static void
10636 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10637 {
10638    assert(cond.regClass() == s1);
10639 
10640    append_logical_end(ctx->block);
10641    ctx->block->kind |= block_kind_uniform;
10642 
10643    aco_ptr<Pseudo_branch_instruction> branch;
10644    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10645    branch.reset(
10646       create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10647    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10648    branch->operands[0] = Operand(cond);
10649    branch->operands[0].setFixed(scc);
10650    ctx->block->instructions.emplace_back(std::move(branch));
10651 
10652    ic->BB_if_idx = ctx->block->index;
10653    ic->BB_endif = Block();
10654    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10655 
10656    ctx->cf_info.has_branch = false;
10657    ctx->cf_info.parent_loop.has_divergent_branch = false;
10658 
10659    ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10660 
10661    /** emit then block */
10662    ctx->program->next_uniform_if_depth++;
10663    Block* BB_then = ctx->program->create_and_insert_block();
10664    add_edge(ic->BB_if_idx, BB_then);
10665    append_logical_start(BB_then);
10666    ctx->block = BB_then;
10667 }
10668 
10669 static void
10670 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10671 {
10672    Block* BB_then = ctx->block;
10673 
10674    ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10675    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10676 
10677    if (!ic->uniform_has_then_branch) {
10678       append_logical_end(BB_then);
10679       /* branch from then block to endif block */
10680       aco_ptr<Pseudo_branch_instruction> branch;
10681       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10682                                                                  Format::PSEUDO_BRANCH, 0, 1));
10683       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10684       BB_then->instructions.emplace_back(std::move(branch));
10685       add_linear_edge(BB_then->index, &ic->BB_endif);
10686       if (!ic->then_branch_divergent)
10687          add_logical_edge(BB_then->index, &ic->BB_endif);
10688       BB_then->kind |= block_kind_uniform;
10689    }
10690 
10691    ctx->cf_info.has_branch = false;
10692    ctx->cf_info.parent_loop.has_divergent_branch = false;
10693 
10694    ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10695    ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10696 
10697    /** emit else block */
10698    Block* BB_else = ctx->program->create_and_insert_block();
10699    add_edge(ic->BB_if_idx, BB_else);
10700    append_logical_start(BB_else);
10701    ctx->block = BB_else;
10702 }
10703 
10704 static void
10705 end_uniform_if(isel_context* ctx, if_context* ic)
10706 {
10707    Block* BB_else = ctx->block;
10708 
10709    if (!ctx->cf_info.has_branch) {
10710       append_logical_end(BB_else);
10711       /* branch from then block to endif block */
10712       aco_ptr<Pseudo_branch_instruction> branch;
10713       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10714                                                                  Format::PSEUDO_BRANCH, 0, 1));
10715       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10716       BB_else->instructions.emplace_back(std::move(branch));
10717       add_linear_edge(BB_else->index, &ic->BB_endif);
10718       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10719          add_logical_edge(BB_else->index, &ic->BB_endif);
10720       BB_else->kind |= block_kind_uniform;
10721    }
10722 
10723    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10724    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10725    ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10726 
10727    /** emit endif merge block */
10728    ctx->program->next_uniform_if_depth--;
10729    if (!ctx->cf_info.has_branch) {
10730       ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10731       append_logical_start(ctx->block);
10732    }
10733 }
10734 
10735 static bool
10736 visit_if(isel_context* ctx, nir_if* if_stmt)
10737 {
10738    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10739    Builder bld(ctx->program, ctx->block);
10740    aco_ptr<Pseudo_branch_instruction> branch;
10741    if_context ic;
10742 
10743    if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10744       /**
10745        * Uniform conditionals are represented in the following way*) :
10746        *
10747        * The linear and logical CFG:
10748        *                        BB_IF
10749        *                        /    \
10750        *       BB_THEN (logical)      BB_ELSE (logical)
10751        *                        \    /
10752        *                        BB_ENDIF
10753        *
10754        * *) Exceptions may be due to break and continue statements within loops
10755        *    If a break/continue happens within uniform control flow, it branches
10756        *    to the loop exit/entry block. Otherwise, it branches to the next
10757        *    merge block.
10758        **/
10759 
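      /* The branch condition is uniform across the wave, so the lane-mask
       * boolean can be reduced to a single scalar value for the uniform
       * branch to test.
       */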
10760       assert(cond.regClass() == ctx->program->lane_mask);
10761       cond = bool_to_scalar_condition(ctx, cond);
10762 
10763       begin_uniform_if_then(ctx, &ic, cond);
10764       visit_cf_list(ctx, &if_stmt->then_list);
10765 
10766       begin_uniform_if_else(ctx, &ic);
10767       visit_cf_list(ctx, &if_stmt->else_list);
10768 
10769       end_uniform_if(ctx, &ic);
10770    } else { /* non-uniform condition */
10771       /**
10772        * To maintain a logical and linear CFG without critical edges,
10773        * non-uniform conditionals are represented in the following way*) :
10774        *
10775        * The linear CFG:
10776        *                        BB_IF
10777        *                        /    \
10778        *       BB_THEN (logical)      BB_THEN (linear)
10779        *                        \    /
10780        *                        BB_INVERT (linear)
10781        *                        /    \
10782        *       BB_ELSE (logical)      BB_ELSE (linear)
10783        *                        \    /
10784        *                        BB_ENDIF
10785        *
10786        * The logical CFG:
10787        *                        BB_IF
10788        *                        /    \
10789        *       BB_THEN (logical)      BB_ELSE (logical)
10790        *                        \    /
10791        *                        BB_ENDIF
10792        *
10793        * *) Exceptions may be due to break and continue statements within loops
10794        **/
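      /* The linear-only BB_THEN/BB_ELSE blocks and BB_INVERT carry the
       * exec-mask bookkeeping: BB_INVERT is where exec is flipped to the
       * lanes taking the else side, and the extra linear blocks keep both
       * CFGs free of critical edges.
       */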
10795 
10796       begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10797       visit_cf_list(ctx, &if_stmt->then_list);
10798 
10799       begin_divergent_if_else(ctx, &ic, if_stmt->control);
10800       visit_cf_list(ctx, &if_stmt->else_list);
10801 
10802       end_divergent_if(ctx, &ic);
10803    }
10804 
10805    return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10806 }
10807 
10808 static bool
10809 visit_cf_list(isel_context* ctx, struct exec_list* list)
10810 {
10811    foreach_list_typed (nir_cf_node, node, node, list) {
10812       switch (node->type) {
10813       case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10814       case nir_cf_node_if:
10815          if (!visit_if(ctx, nir_cf_node_as_if(node)))
10816             return true;
10817          break;
10818       case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10819       default: unreachable("unimplemented cf list type");
10820       }
10821    }
10822    return false;
10823 }
10824 
10825 static void
10826 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10827 {
10828    Builder bld(ctx->program, ctx->block);
10829 
10830    bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10831            mrt->enabled_channels, mrt->target, mrt->compr);
10832 
10833    ctx->program->has_color_exports = true;
10834 }
10835 
10836 static bool
10837 export_fs_mrt_color(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
10838                     unsigned slot, struct aco_export_mrt* mrt)
10839 {
10840    unsigned col_format = (info->spi_shader_col_format >> (slot * 4)) & 0xf;
10841 
10842    if (col_format == V_028714_SPI_SHADER_ZERO)
10843       return false;
10844 
10845    Builder bld(ctx->program, ctx->block);
10846    Operand values[4];
10847 
10848    for (unsigned i = 0; i < 4; ++i) {
10849       values[i] = Operand(colors[i]);
10850    }
10851 
10852    unsigned enabled_channels = 0;
10853    aco_opcode compr_op = aco_opcode::num_opcodes;
10854    bool compr = false;
10855    bool is_16bit = colors[0].regClass() == v2b;
10856    bool is_int8 = (info->color_is_int8 >> slot) & 1;
10857    bool is_int10 = (info->color_is_int10 >> slot) & 1;
10858    bool enable_mrt_output_nan_fixup = (ctx->options->enable_mrt_output_nan_fixup >> slot) & 1;
10859 
10860    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10861    if (enable_mrt_output_nan_fixup && !is_16bit &&
10862        (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10863         col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10864         col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10865       for (unsigned i = 0; i < 4; i++) {
10866          Temp is_not_nan =
10867             bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
10868          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
10869                               is_not_nan);
10870       }
10871    }
10872 
10873    switch (col_format) {
10874    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10875 
10876    case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10877 
10878    case V_028714_SPI_SHADER_32_AR:
10879       if (ctx->options->gfx_level >= GFX10) {
10880          /* Special case: on GFX10, the outputs are different for 32_AR */
10881          enabled_channels = 0x3;
10882          values[1] = values[3];
10883          values[3] = Operand(v1);
10884       } else {
10885          enabled_channels = 0x9;
10886       }
10887       break;
10888 
10889    case V_028714_SPI_SHADER_FP16_ABGR:
10890       for (int i = 0; i < 2; i++) {
10891          if (is_16bit) {
10892             values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), values[i * 2],
10893                                    values[i * 2 + 1]);
10894          } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
10895             values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), values[i * 2],
10896                                  values[i * 2 + 1]);
10897          } else {
10898             values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), values[i * 2],
10899                                  values[i * 2 + 1]);
10900          }
10901       }
10902       values[2] = Operand(v1);
10903       values[3] = Operand(v1);
10904       enabled_channels = 0xf;
10905       compr = true;
10906       break;
10907 
10908    case V_028714_SPI_SHADER_UNORM16_ABGR:
10909       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10910          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10911       } else {
10912          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10913       }
10914       break;
10915 
10916    case V_028714_SPI_SHADER_SNORM16_ABGR:
10917       if (is_16bit && ctx->options->gfx_level >= GFX9) {
10918          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10919       } else {
10920          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10921       }
10922       break;
10923 
10924    case V_028714_SPI_SHADER_UINT16_ABGR:
10925       compr_op = aco_opcode::v_cvt_pk_u16_u32;
10926       if (is_int8 || is_int10) {
10927          /* clamp */
10928          uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10929 
10930          for (unsigned i = 0; i < 4; i++) {
10931             uint32_t max = i == 3 && is_int10 ? 3 : max_rgb;
10932 
10933             values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
10934          }
10935       } else if (is_16bit) {
10936          for (unsigned i = 0; i < 4; i++) {
10937             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10938             values[i] = Operand(tmp);
10939          }
10940       }
10941       break;
10942 
10943    case V_028714_SPI_SHADER_SINT16_ABGR:
10944       compr_op = aco_opcode::v_cvt_pk_i16_i32;
10945       if (is_int8 || is_int10) {
10946          /* clamp */
10947          uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10948          uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
10949 
10950          for (unsigned i = 0; i < 4; i++) {
10951             uint32_t max = i == 3 && is_int10 ? 1 : max_rgb;
10952             uint32_t min = i == 3 && is_int10 ? -2u : min_rgb;
10953 
10954             values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
10955             values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
10956          }
10957       } else if (is_16bit) {
10958          for (unsigned i = 0; i < 4; i++) {
10959             Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10960             values[i] = Operand(tmp);
10961          }
10962       }
10963       break;
10964 
10965    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
10966 
10967    case V_028714_SPI_SHADER_ZERO:
10968    default: return false;
10969    }
10970 
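   /* For the packed 16-bit formats a cvt_pk* opcode was chosen above:
    * compress the four values into two VGPRs holding two 16-bit channels
    * each. Otherwise, drop the channels that aren't enabled.
    */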
10971    if (compr_op != aco_opcode::num_opcodes) {
10972       values[0] = bld.vop3(compr_op, bld.def(v1), values[0], values[1]);
10973       values[1] = bld.vop3(compr_op, bld.def(v1), values[2], values[3]);
10974       values[2] = Operand(v1);
10975       values[3] = Operand(v1);
10976       enabled_channels = 0xf;
10977       compr = true;
10978    } else if (!compr) {
10979       for (int i = 0; i < 4; i++)
10980          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10981    }
10982 
10983    if (ctx->program->gfx_level >= GFX11) {
10984       /* GFX11 doesn't use the COMPR flag for exports; a compressed export
10985        * uses a channel mask of 0x3 instead.
10986        */
10987       enabled_channels = compr ? 0x3 : enabled_channels;
10988       compr = false;
10989    }
10990 
10991    for (unsigned i = 0; i < 4; i++)
10992       mrt->out[i] = values[i];
10993    mrt->target = V_008DFC_SQ_EXP_MRT;
10994    mrt->enabled_channels = enabled_channels;
10995    mrt->compr = compr;
10996 
10997    return true;
10998 }
10999 
11000 static void
11001 export_fs_mrtz(isel_context* ctx, Temp depth, Temp stencil, Temp samplemask, Temp alpha)
11002 {
11003    Builder bld(ctx->program, ctx->block);
11004    unsigned enabled_channels = 0;
11005    bool compr = false;
11006    Operand values[4];
11007 
11008    for (unsigned i = 0; i < 4; ++i) {
11009       values[i] = Operand(v1);
11010    }
11011 
11012    /* Both stencil and sample mask only need 16 bits. */
11013    if (!depth.id() && !alpha.id() && (stencil.id() || samplemask.id())) {
11014       compr = ctx->program->gfx_level < GFX11; /* COMPR flag */
11015 
11016       if (stencil.id()) {
11017          /* Stencil should be in X[23:16]. */
11018          values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), stencil);
11019          enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3;
11020       }
11021 
11022       if (samplemask.id()) {
11023          /* SampleMask should be in Y[15:0]. */
11024          values[1] = Operand(samplemask);
11025          enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc;
11026       }
11027    } else {
11028       if (depth.id()) {
11029          values[0] = Operand(depth);
11030          enabled_channels |= 0x1;
11031       }
11032 
11033       if (stencil.id()) {
11034          values[1] = Operand(stencil);
11035          enabled_channels |= 0x2;
11036       }
11037 
11038       if (samplemask.id()) {
11039          values[2] = Operand(samplemask);
11040          enabled_channels |= 0x4;
11041       }
11042 
11043       if (alpha.id()) {
11044          assert(ctx->program->gfx_level >= GFX11);
11045          values[3] = Operand(alpha);
11046          enabled_channels |= 0x8;
11047       }
11048    }
11049 
11050    /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X
11051     * writemask component.
11052     */
11053    if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND &&
11054        ctx->options->family != CHIP_HAINAN) {
11055       enabled_channels |= 0x1;
11056    }
11057 
11058    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
11059            V_008DFC_SQ_EXP_MRTZ, compr);
11060 }
11061 
11062 static void
11063 create_fs_null_export(isel_context* ctx)
11064 {
11065    /* FS must always have exports.
11066     * So when there are none, we need to add a null export.
11067     */
11068 
11069    Builder bld(ctx->program, ctx->block);
11070    /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */
11071    unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
11072    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
11073            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
11074 
11075    ctx->program->has_color_exports = true;
11076 }
11077 
11078 static void
11079 create_fs_jump_to_epilog(isel_context* ctx)
11080 {
11081    Builder bld(ctx->program, ctx->block);
11082    std::vector<Operand> exports;
11083    unsigned vgpr = 256; /* VGPR 0 */
11084 
11085    if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
11086       exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u], PhysReg{vgpr++}));
11087 
11088    if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
11089       exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u], PhysReg{vgpr++}));
11090 
11091    if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11092       exports.emplace_back(
11093          Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u], PhysReg{vgpr++}));
11094 
11095    PhysReg exports_start(vgpr);
11096 
11097    for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
11098       unsigned color_index = slot - FRAG_RESULT_DATA0;
11099       unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
11100       unsigned write_mask = ctx->outputs.mask[slot];
11101 
11102       if (!write_mask)
11103          continue;
11104 
11105       PhysReg color_start(exports_start.reg() + color_index * 4);
11106 
11107       for (unsigned i = 0; i < 4; i++) {
11108          if (!(write_mask & BITFIELD_BIT(i))) {
11109             exports.emplace_back(Operand(v1));
11110             continue;
11111          }
11112 
11113          PhysReg chan_reg = color_start.advance(i * 4u);
11114          Operand chan(ctx->outputs.temps[slot * 4u + i]);
11115 
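         /* 16-bit color outputs are widened to full 32-bit values before
          * being pinned to the VGPRs passed to the epilog.
          */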
11116          if (color_type == ACO_TYPE_FLOAT16) {
11117             chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
11118          } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
11119             bool sign_ext = color_type == ACO_TYPE_INT16;
11120             Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
11121             chan = Operand(tmp);
11122          }
11123 
11124          chan.setFixed(chan_reg);
11125          exports.emplace_back(chan);
11126       }
11127    }
11128 
11129    Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.ps.epilog_pc));
11130 
11131    aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
11132       aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + exports.size(), 0)};
11133    jump->operands[0] = Operand(continue_pc);
11134    for (unsigned i = 0; i < exports.size(); i++) {
11135       jump->operands[i + 1] = exports[i];
11136    }
11137    ctx->block->instructions.emplace_back(std::move(jump));
11138 }
11139 
11140 PhysReg
11141 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
11142 {
11143    assert(arg.used);
11144    enum ac_arg_regfile file = args->args[arg.arg_index].file;
11145    unsigned reg = args->args[arg.arg_index].offset;
11146    return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
11147 }
11148 
11149 static Operand
11150 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
11151 {
11152    return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
11153 }
11154 
11155 static Temp
11156 get_tcs_out_current_patch_data_offset(isel_context* ctx)
11157 {
11158    Builder bld(ctx->program, ctx->block);
11159 
11160    const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 4u;
11161    const unsigned pervertex_output_patch_size =
11162       ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
11163    const unsigned output_patch_stride =
11164       pervertex_output_patch_size + ctx->program->info.tcs.num_linked_patch_outputs * 4u;
11165 
11166    Temp tcs_rel_ids = get_arg(ctx, ctx->args->tcs_rel_ids);
11167    Temp rel_patch_id =
11168       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand::c32(0u), Operand::c32(8u));
11169    Temp patch_offset = bld.v_mul_imm(bld.def(v1), rel_patch_id, output_patch_stride, false);
11170 
11171    Temp tcs_offchip_layout = get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout);
11172 
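   /* tcs_offchip_layout packs several fields; s_bfe_u32's src1 encodes
    * (width << 16) | offset, so 0x60006 extracts the 6-bit patch count at
    * bit 6 and 0x8000c the 8-bit LS/HS vertex stride at bit 12.
    */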
11173    Temp patch_control_points = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11174                                         tcs_offchip_layout, Operand::c32(0x3f));
11175 
11176    Temp num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11177                                tcs_offchip_layout, Operand::c32(0x60006));
11178 
11179    Temp lshs_vertex_stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11180                                       tcs_offchip_layout, Operand::c32(0x8000c));
11181 
11182    Temp input_patch_size =
11183       bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), patch_control_points, lshs_vertex_stride);
11184 
11185    Temp output_patch0_offset =
11186       bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, input_patch_size);
11187 
11188    Temp output_patch_offset =
11189       bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11190                      Operand::c32(pervertex_output_patch_size), output_patch0_offset);
11191 
11192    return bld.nuw().vadd32(bld.def(v1), patch_offset, output_patch_offset);
11193 }
11194 
11195 static Temp
11196 get_patch_base(isel_context* ctx)
11197 {
11198    Builder bld(ctx->program, ctx->block);
11199 
11200    const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 16u;
11201    const unsigned pervertex_output_patch_size =
11202       ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
11203 
11204    Temp num_patches =
11205       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11206                get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout), Operand::c32(0x60006));
11207 
11208    return bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches,
11209                    Operand::c32(pervertex_output_patch_size));
11210 }
11211 
11212 static void
11213 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
11214 {
11215    struct ac_arg arg;
11216    arg.used = true;
11217 
11218    for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
11219       regs.emplace_back(get_arg_for_end(ctx, arg));
11220 }
11221 
11222 static void
11223 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
11224 {
11225    aco_ptr<Pseudo_instruction> end{create_instruction<Pseudo_instruction>(
11226       aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
11227 
11228    for (unsigned i = 0; i < regs.size(); i++)
11229       end->operands[i] = regs[i];
11230 
11231    ctx->block->instructions.emplace_back(std::move(end));
11232 
11233    ctx->block->kind |= block_kind_end_with_regs;
11234 }
11235 
11236 static void
11237 create_tcs_jump_to_epilog(isel_context* ctx)
11238 {
11239    Builder bld(ctx->program, ctx->block);
11240 
11241    PhysReg vgpr_start(256); /* VGPR 0 */
11242    PhysReg sgpr_start(0);   /* SGPR 0 */
11243 
11244    /* SGPRs */
11245    Operand ring_offsets = Operand(get_arg(ctx, ctx->args->ring_offsets));
11246    ring_offsets.setFixed(sgpr_start);
11247 
11248    Operand tess_offchip_offset = Operand(get_arg(ctx, ctx->args->tess_offchip_offset));
11249    tess_offchip_offset.setFixed(sgpr_start.advance(8u));
11250 
11251    Operand tcs_factor_offset = Operand(get_arg(ctx, ctx->args->tcs_factor_offset));
11252    tcs_factor_offset.setFixed(sgpr_start.advance(12u));
11253 
11254    Operand tcs_offchip_layout = Operand(get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11255    tcs_offchip_layout.setFixed(sgpr_start.advance(16u));
11256 
11257    Operand patch_base = Operand(get_patch_base(ctx));
11258    patch_base.setFixed(sgpr_start.advance(20u));
11259 
11260    /* VGPRs */
11261    Operand tcs_out_current_patch_data_offset = Operand(get_tcs_out_current_patch_data_offset(ctx));
11262    tcs_out_current_patch_data_offset.setFixed(vgpr_start);
11263 
11264    Operand invocation_id =
11265       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11266                Operand::c32(8u), Operand::c32(5u));
11267    invocation_id.setFixed(vgpr_start.advance(4u));
11268 
11269    Operand rel_patch_id =
11270       bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11271                  Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11272    rel_patch_id.setFixed(vgpr_start.advance(8u));
11273 
11274    Temp continue_pc =
11275       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.tcs.epilog_pc));
11276 
11277    aco_ptr<Pseudo_instruction> jump{
11278       create_instruction<Pseudo_instruction>(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 9, 0)};
11279    jump->operands[0] = Operand(continue_pc);
11280    jump->operands[1] = ring_offsets;
11281    jump->operands[2] = tess_offchip_offset;
11282    jump->operands[3] = tcs_factor_offset;
11283    jump->operands[4] = tcs_offchip_layout;
11284    jump->operands[5] = patch_base;
11285    jump->operands[6] = tcs_out_current_patch_data_offset;
11286    jump->operands[7] = invocation_id;
11287    jump->operands[8] = rel_patch_id;
11288    ctx->block->instructions.emplace_back(std::move(jump));
11289 }
11290 
11291 static void
11292 create_tcs_end_for_epilog(isel_context* ctx)
11293 {
11294    std::vector<Operand> regs;
11295 
11296    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11297    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tes_offchip_addr));
11298    regs.emplace_back(get_arg_for_end(ctx, ctx->args->tess_offchip_offset));
11299    regs.emplace_back(get_arg_for_end(ctx, ctx->args->tcs_factor_offset));
11300 
11301    Builder bld(ctx->program, ctx->block);
11302 
11303    /* Leave a hole corresponding to the two input VGPRs. This ensures that
11304     * the invocation_id output does not alias the tcs_rel_ids input,
11305     * which saves a V_MOV on gfx9.
11306     */
11307    unsigned vgpr = 256 + ctx->args->num_vgprs_used;
11308 
11309    Temp rel_patch_id =
11310       bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11311                  Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11312    regs.emplace_back(Operand(rel_patch_id, PhysReg{vgpr++}));
11313 
11314    Temp invocation_id =
11315       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11316                Operand::c32(8u), Operand::c32(5u));
11317    regs.emplace_back(Operand(invocation_id, PhysReg{vgpr++}));
11318 
11319    if (ctx->program->info.tcs.pass_tessfactors_by_reg) {
11320       vgpr++; /* skip the tess factor LDS offset */
11321 
11322       unsigned slot = VARYING_SLOT_TESS_LEVEL_OUTER;
11323       u_foreach_bit (i, ctx->outputs.mask[slot]) {
11324          regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11325       }
11326       vgpr += 4;
11327 
11328       slot = VARYING_SLOT_TESS_LEVEL_INNER;
11329       u_foreach_bit (i, ctx->outputs.mask[slot]) {
11330          regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11331       }
11332    } else {
11333       Temp patch0_patch_data_offset =
11334          bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11335                   get_arg(ctx, ctx->program->info.tcs.vs_state_bits), Operand::c32(0xe000a));
11336 
11337       Temp tf_lds_offset =
11338          bld.v_mul24_imm(bld.def(v1), rel_patch_id, ctx->program->info.tcs.patch_stride);
11339       tf_lds_offset = bld.nuw().vadd32(bld.def(v1), tf_lds_offset, patch0_patch_data_offset);
11340 
11341       regs.emplace_back(Operand(tf_lds_offset, PhysReg{vgpr}));
11342    }
11343 
11344    build_end_with_regs(ctx, regs);
11345 }
11346 
11347 static void
11348 create_fs_end_for_epilog(isel_context* ctx)
11349 {
11350    Builder bld(ctx->program, ctx->block);
11351 
11352    std::vector<Operand> regs;
11353 
11354    regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.ps.alpha_reference));
11355 
11356    unsigned vgpr = 256;
11357 
11358    for (unsigned slot = FRAG_RESULT_DATA0; slot <= FRAG_RESULT_DATA7; slot++) {
11359       unsigned index = slot - FRAG_RESULT_DATA0;
11360       unsigned type = (ctx->output_color_types >> (index * 2)) & 0x3;
11361       unsigned write_mask = ctx->outputs.mask[slot];
11362 
11363       if (!write_mask)
11364          continue;
11365 
11366       if (type == ACO_TYPE_ANY32) {
11367          u_foreach_bit (i, write_mask) {
11368             regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11369          }
11370       } else {
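         /* 16-bit outputs are packed in pairs into full VGPRs; components
          * missing from the write mask leave their half undefined.
          */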
11371          for (unsigned i = 0; i < 2; i++) {
11372             unsigned mask = (write_mask >> (i * 2)) & 0x3;
11373             if (!mask)
11374                continue;
11375 
11376             unsigned chan = slot * 4 + i * 2;
11377             Operand lo = mask & 0x1 ? Operand(ctx->outputs.temps[chan]) : Operand(v2b);
11378             Operand hi = mask & 0x2 ? Operand(ctx->outputs.temps[chan + 1]) : Operand(v2b);
11379 
11380             Temp dst = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi);
11381             regs.emplace_back(Operand(dst, PhysReg{vgpr + i}));
11382          }
11383       }
11384       vgpr += 4;
11385    }
11386 
11387    if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
11388       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4], PhysReg{vgpr++}));
11389 
11390    if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
11391       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4], PhysReg{vgpr++}));
11392 
11393    if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11394       regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4], PhysReg{vgpr++}));
11395 
11396    build_end_with_regs(ctx, regs);
11397 
11398    /* Finally, exit WQM mode. */
11399    ctx->program->needs_exact = true;
11400 }
11401 
11402 Pseudo_instruction*
11403 add_startpgm(struct isel_context* ctx)
11404 {
11405    unsigned def_count = 0;
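   /* SGPR arguments that aren't aligned to their size can't be covered by
    * a single multi-dword definition, so each of their dwords is counted
    * (and defined below) separately.
    */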
11406    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11407       if (ctx->args->args[i].skip)
11408          continue;
11409       unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
11410       if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
11411          def_count += ctx->args->args[i].size;
11412       else
11413          def_count++;
11414    }
11415 
11416    Pseudo_instruction* startpgm =
11417       create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
11418    ctx->block->instructions.emplace_back(startpgm);
11419    for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
11420       if (ctx->args->args[i].skip)
11421          continue;
11422 
11423       enum ac_arg_regfile file = ctx->args->args[i].file;
11424       unsigned size = ctx->args->args[i].size;
11425       unsigned reg = ctx->args->args[i].offset;
11426       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11427 
11428       if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
11429          Temp elems[16];
11430          for (unsigned j = 0; j < size; j++) {
11431             elems[j] = ctx->program->allocateTmp(s1);
11432             startpgm->definitions[arg++] = Definition(elems[j].id(), PhysReg{reg + j}, s1);
11433          }
11434          ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
11435       } else {
11436          Temp dst = ctx->program->allocateTmp(type);
11437          Definition def(dst);
11438          def.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11439          ctx->arg_temps[i] = dst;
11440          startpgm->definitions[arg++] = def;
11441 
11442          if (ctx->args->args[i].pending_vmem) {
11443             assert(file == AC_ARG_VGPR);
11444             ctx->program->args_pending_vmem.push_back(def);
11445          }
11446       }
11447    }
11448 
11449    /* epilog has no scratch */
11450    if (ctx->args->scratch_offset.used) {
11451       if (ctx->program->gfx_level < GFX9) {
11452          /* Stash these in the program so that they can be accessed later when
11453           * handling spilling.
11454           */
11455          if (ctx->args->ring_offsets.used)
11456             ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11457 
11458          ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
11459       } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
11460          /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
11461           */
11462          Operand scratch_offset = Operand(get_arg(ctx, ctx->args->scratch_offset));
11463          scratch_offset.setLateKill(true);
11464 
11465          Operand scratch_addr = ctx->args->ring_offsets.used
11466                                    ? Operand(get_arg(ctx, ctx->args->ring_offsets))
11467                                    : Operand(s2);
11468 
11469          Builder bld(ctx->program, ctx->block);
11470          bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
11471                     scratch_offset);
11472       }
11473    }
11474 
11475    return startpgm;
11476 }
11477 
11478 void
11479 fix_ls_vgpr_init_bug(isel_context* ctx)
11480 {
11481    Builder bld(ctx->program, ctx->block);
11482    constexpr unsigned hs_idx = 1u;
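   /* Extract the HS wave lane count from byte hs_idx of merged_wave_info;
    * the SCC result of the bfe tells whether any HS threads exist.
    */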
11483    Builder::Result hs_thread_count =
11484       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11485                get_arg(ctx, ctx->args->merged_wave_info), Operand::c32((8u << 16) | (hs_idx * 8u)));
11486    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11487 
11488    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11489 
11490    Temp instance_id =
11491       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->vertex_id),
11492                get_arg(ctx, ctx->args->instance_id), ls_has_nonzero_hs_threads);
11493    Temp vs_rel_patch_id =
11494       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11495                get_arg(ctx, ctx->args->vs_rel_patch_id), ls_has_nonzero_hs_threads);
11496    Temp vertex_id =
11497       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_patch_id),
11498                get_arg(ctx, ctx->args->vertex_id), ls_has_nonzero_hs_threads);
11499 
11500    ctx->arg_temps[ctx->args->instance_id.arg_index] = instance_id;
11501    ctx->arg_temps[ctx->args->vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11502    ctx->arg_temps[ctx->args->vertex_id.arg_index] = vertex_id;
11503 }
11504 
11505 void
11506 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11507 {
11508    /* Split all arguments except for the first (ring_offsets) and the last
11509     * (exec) so that the dead channels don't stay live throughout the program.
11510     */
11511    for (int i = 1; i < startpgm->definitions.size(); i++) {
11512       if (startpgm->definitions[i].regClass().size() > 1) {
11513          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11514                            startpgm->definitions[i].regClass().size());
11515       }
11516    }
11517 }
11518 
11519 void
11520 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11521 {
11522    Program* program = ctx->program;
11523 
11524    unsigned float_controls = shader->info.float_controls_execution_mode;
11525 
11526    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11527       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11528    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11529       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11530                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11531 
11532    program->next_fp_mode.must_flush_denorms32 =
11533       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11534    program->next_fp_mode.must_flush_denorms16_64 =
11535       float_controls &
11536       (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11537 
11538    program->next_fp_mode.care_about_round32 =
11539       float_controls &
11540       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11541 
11542    program->next_fp_mode.care_about_round16_64 =
11543       float_controls &
11544       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11545        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11546 
11547    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11548     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11549    if (program->next_fp_mode.must_flush_denorms16_64)
11550       program->next_fp_mode.denorm16_64 = 0;
11551    else
11552       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11553 
11554    /* preserving fp32 denorms is expensive, so only do it if asked */
11555    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11556       program->next_fp_mode.denorm32 = fp_denorm_keep;
11557    else
11558       program->next_fp_mode.denorm32 = 0;
11559 
11560    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11561       program->next_fp_mode.round32 = fp_round_tz;
11562    else
11563       program->next_fp_mode.round32 = fp_round_ne;
11564 
11565    if (float_controls &
11566        (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11567       program->next_fp_mode.round16_64 = fp_round_tz;
11568    else
11569       program->next_fp_mode.round16_64 = fp_round_ne;
11570 
11571    ctx->block->fp_mode = program->next_fp_mode;
11572 }
11573 
11574 void
11575 cleanup_cfg(Program* program)
11576 {
11577    /* create linear_succs/logical_succs */
11578    for (Block& BB : program->blocks) {
11579       for (unsigned idx : BB.linear_preds)
11580          program->blocks[idx].linear_succs.emplace_back(BB.index);
11581       for (unsigned idx : BB.logical_preds)
11582          program->blocks[idx].logical_succs.emplace_back(BB.index);
11583    }
11584 }
11585 
11586 void
11587 finish_program(isel_context* ctx)
11588 {
11589    cleanup_cfg(ctx->program);
11590 
11591    /* Insert a single p_end_wqm instruction after the last derivative calculation */
11592    if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm && ctx->program->needs_exact) {
11593       /* Find the next BB in the top-level CFG */
11594       while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
11595          ctx->wqm_block_idx++;
11596          ctx->wqm_instruction_idx = 0;
11597       }
11598 
11599       std::vector<aco_ptr<Instruction>>* instrs =
11600          &ctx->program->blocks[ctx->wqm_block_idx].instructions;
11601       auto it = instrs->begin() + ctx->wqm_instruction_idx;
11602 
11603       /* Delay the transition to Exact to help optimizations and scheduling */
11604       while (it != instrs->end()) {
11605          aco_ptr<Instruction>& instr = *it;
11606          /* End WQM before: */
11607          if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP() ||
11608              instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
11609              instr->opcode == aco_opcode::p_jump_to_epilog ||
11610              instr->opcode == aco_opcode::p_logical_start)
11611             break;
11612 
11613          ++it;
11614 
11615          /* End WQM after: */
11616          if (instr->opcode == aco_opcode::p_logical_end ||
11617              instr->opcode == aco_opcode::p_discard_if ||
11618              instr->opcode == aco_opcode::p_demote_to_helper ||
11619              instr->opcode == aco_opcode::p_end_with_regs)
11620             break;
11621       }
11622 
11623       Builder bld(ctx->program);
11624       bld.reset(instrs, it);
11625       bld.pseudo(aco_opcode::p_end_wqm);
11626    }
11627 }
11628 
11629 Temp
11630 lanecount_to_mask(isel_context* ctx, Temp count)
11631 {
11632    assert(count.regClass() == s1);
11633 
11634    Builder bld(ctx->program, ctx->block);
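   /* s_bfm_b64 builds a mask of `count` ones starting at bit 0,
    * e.g. count == 3 yields 0b111.
    */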
11635    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11636    Temp cond;
11637 
11638    if (ctx->program->wave_size == 64) {
11639       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11640       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11641                                 Operand::c32(6u /* log2(64) */));
11642       cond =
11643          bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11644    } else {
11645       /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11646        * the register */
11647       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11648    }
11649 
11650    return cond;
11651 }
11652 
11653 Temp
11654 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11655 {
11656    Builder bld(ctx->program, ctx->block);
11657 
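   /* merged_wave_info packs one 8-bit lane count per merged stage;
    * byte i holds the count for the i-th stage.
    */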
11658    /* lanecount_to_mask() only cares about s0.u[6:0], so we need neither s_bfe nor s_and here */
11659    Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info)
11660                        : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11661                                   get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u));
11662 
11663    return lanecount_to_mask(ctx, count);
11664 }
11665 
11666 static void
11667 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11668 {
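   /* Pass every shader argument through in its original register, then
    * tail-call the next raytracing stage via s_setpc_b64.
    */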
11669    unsigned src_count = ctx.args->arg_count;
11670    Pseudo_instruction* ret =
11671       create_instruction<Pseudo_instruction>(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11672    ctx.block->instructions.emplace_back(ret);
11673 
11674    for (unsigned i = 0; i < src_count; i++) {
11675       enum ac_arg_regfile file = ctx.args->args[i].file;
11676       unsigned size = ctx.args->args[i].size;
11677       unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11678       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11679       Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11680                                          : Operand(PhysReg{reg}, type);
11681       ret->operands[i] = op;
11682    }
11683 
11684    Builder bld(ctx.program, ctx.block);
11685    bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11686 }
11687 
11688 void
11689 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11690                   const struct ac_shader_args* args)
11691 {
11692    for (unsigned i = 0; i < shader_count; i++) {
11693       if (i) {
11694          ctx.block = ctx.program->create_and_insert_block();
11695          ctx.block->kind = block_kind_top_level | block_kind_resume;
11696       }
11697 
11698       nir_shader* nir = shaders[i];
11699       init_context(&ctx, nir);
11700       setup_fp_mode(&ctx, nir);
11701 
11702       Pseudo_instruction* startpgm = add_startpgm(&ctx);
11703       append_logical_start(ctx.block);
11704       split_arguments(&ctx, startpgm);
11705       visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11706       append_logical_end(ctx.block);
11707       ctx.block->kind |= block_kind_uniform;
11708 
11709       /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11710        * shader without shader calls.
11711        */
11712       if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11713          insert_rt_jump_next(ctx, args);
11714 
11715       cleanup_context(&ctx);
11716    }
11717 
11718    ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11719    finish_program(&ctx);
11720 }
11721 
11722 void
11723 pops_await_overlapped_waves(isel_context* ctx)
11724 {
11725    ctx->program->has_pops_overlapped_waves_wait = true;
11726 
11727    Builder bld(ctx->program, ctx->block);
11728 
11729    if (ctx->program->gfx_level >= GFX11) {
11730       /* GFX11+ - waiting for the export from the overlapped waves.
11731        * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11732        */
11733       bld.sopp(aco_opcode::s_wait_event, -1, 0);
11734       return;
11735    }
11736 
11737    /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11738 
11739    const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11740 
11741    /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11742    const Temp did_overlap =
11743       bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11744    if_context did_overlap_if_context;
11745    begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11746    bld.reset(ctx->block);
11747 
11748    /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11749    if (ctx->program->gfx_level >= GFX10) {
11750       /* 2 packer ID bits on GFX10-10.3. */
11751       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11752                                       collision, Operand::c32(0x2001c));
11753       /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11754       const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11755                                                  bld.def(s1, scc), packer_id, Operand::c32(1));
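      /* The s_setreg_b32 immediate encodes ((size - 1) << 11) | (offset << 6)
       * | hwreg_id; this writes 3 bits at offset 0 of hwreg 25 (POPS_PACKER).
       */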
11756       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
11757    } else {
11758       /* 1 packer ID bit on GFX9. */
11759       const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11760                                       collision, Operand::c32(0x1001c));
11761       /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11762        * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11763        */
11764       const Temp packer_id_hwreg_bits =
11765          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11766       bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11767    }
11768 
11769    Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11770                                              collision, Operand::c32(0xa0010));
11771    if (ctx->program->gfx_level < GFX10) {
11772       /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11773        * actual wave ID by 1 in case of wraparound.
11774        */
11775       const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11776                                             collision, Operand::c32(0x3ff));
11777       const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11778          aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11779       newest_overlapped_wave_id =
11780          bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11781                   newest_overlapped_wave_id_wrapped);
11782    }
11783 
11784    /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11785     * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11786     * no more than 1023 values behind the current wave ID.
11787     * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11788     * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11789     * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11790     * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11791     * `a - (b + 1)` is `a + ~b`.
11792     * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11793     * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11794     * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11795     * expected.
11796     */
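   /* Worked example: if the 10-bit current wave ID is 5, wave_id_offset
    * below becomes ~5, so overlapped ID 4 remaps to 0xfffffffe, 5 (the
    * current wave) to UINT32_MAX, and 6 (current - 1023 after wraparound)
    * to 0, preserving order for the unsigned comparison.
    */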
11797    const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11798                                         collision, Operand::c32(0x3ff));
11799    newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11800                                         newest_overlapped_wave_id, wave_id_offset);
11801 
11802    /* Await the overlapped waves. */
11803 
11804    loop_context wait_loop_context;
11805    begin_loop(ctx, &wait_loop_context);
11806    bld.reset(ctx->block);
11807 
11808    const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11809                                            bld.def(s1, scc), wave_id_offset);
11810    /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11811     * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11812     * exited the ordered section.
11813     */
11814    const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11815                                                        newest_overlapped_wave_id, exiting_wave_id);
11816    if_context newest_overlapped_wave_exited_if_context;
11817    begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11818                          newest_overlapped_wave_exited);
11819    emit_loop_break(ctx);
11820    begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11821    end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11822    bld.reset(ctx->block);
11823 
11824    /* Sleep before rechecking to let overlapped waves run for some time. */
11825    bld.sopp(aco_opcode::s_sleep, -1, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11826 
11827    end_loop(ctx, &wait_loop_context);
11828    bld.reset(ctx->block);
11829 
11830    /* Indicate the wait has been done to subsequent compilation stages. */
11831    bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11832 
11833    begin_uniform_if_else(ctx, &did_overlap_if_context);
11834    end_uniform_if(ctx, &did_overlap_if_context);
11835    bld.reset(ctx->block);
11836 }
11837 
11838 static void
11839 create_merged_jump_to_epilog(isel_context* ctx)
11840 {
11841    Builder bld(ctx->program, ctx->block);
11842    std::vector<Operand> regs;
11843 
11844    for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11845       if (!ctx->args->args[i].preserved)
11846          continue;
11847 
11848       const enum ac_arg_regfile file = ctx->args->args[i].file;
11849       const unsigned reg = ctx->args->args[i].offset;
11850 
11851       Operand op(ctx->arg_temps[i]);
11852       op.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11853       regs.emplace_back(op);
11854    }
11855 
11856    Temp continue_pc =
11857       convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11858 
11859    aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
11860       aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11861    jump->operands[0] = Operand(continue_pc);
11862    for (unsigned i = 0; i < regs.size(); i++) {
11863       jump->operands[i + 1] = regs[i];
11864    }
11865    ctx->block->instructions.emplace_back(std::move(jump));
11866 }
11867 
11868 static void
11869 create_end_for_merged_shader(isel_context* ctx)
11870 {
11871    std::vector<Operand> regs;
11872 
11873    unsigned max_args;
11874    if (ctx->stage.sw == SWStage::VS) {
11875       assert(ctx->args->vertex_id.used);
11876       max_args = ctx->args->vertex_id.arg_index;
11877    } else {
11878       assert(ctx->stage.sw == SWStage::TES);
11879       assert(ctx->args->tes_u.used);
11880       max_args = ctx->args->tes_u.arg_index;
11881    }
11882 
11883    struct ac_arg arg;
11884    arg.used = true;
11885 
11886    for (arg.arg_index = 0; arg.arg_index < max_args; arg.arg_index++)
11887       regs.emplace_back(get_arg_for_end(ctx, arg));
11888 
11889    build_end_with_regs(ctx, regs);
11890 }
11891 
11892 void
11893 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_endpgm,
11894               const bool need_barrier, if_context* ic_merged_wave_info,
11895               const bool check_merged_wave_info, const bool endif_merged_wave_info)
11896 {
11897    init_context(&ctx, nir);
11898    setup_fp_mode(&ctx, nir);
11899 
11900    Program* program = ctx.program;
11901 
11902    if (need_startpgm) {
11903       /* Needs to be after init_context() for FS. */
11904       Pseudo_instruction* startpgm = add_startpgm(&ctx);
11905       append_logical_start(ctx.block);
11906 
11907       if (ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs &&
11908           !program->info.vs.has_prolog)
11909          fix_ls_vgpr_init_bug(&ctx);
11910 
11911       split_arguments(&ctx, startpgm);
11912 
11913       if (!program->info.vs.has_prolog &&
11914           (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11915          Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11916       }
11917    }
11918 
11919    if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11920        !program->stage.has(SWStage::GS)) {
11921       /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11922        * s_sendmsg(GS_ALLOC_REQ).
11923        */
11924       Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11925    }
11926 
11927    if (check_merged_wave_info) {
11928       const unsigned i =
11929          nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11930       const Temp cond = merged_wave_info_to_mask(&ctx, i);
11931       begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11932    }
11933 
11934    if (need_barrier) {
11935       const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11936                                      program->wave_size % nir->info.tess.tcs_vertices_out == 0
11937                                   ? scope_subgroup
11938                                   : scope_workgroup;
11939 
11940       Builder(ctx.program, ctx.block)
11941          .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11942                   scope);
11943    }
11944 
11945    nir_function_impl* func = nir_shader_get_entrypoint(nir);
11946    visit_cf_list(&ctx, &func->body);
11947 
11948    if (ctx.program->info.has_epilog) {
11949       if (ctx.stage == fragment_fs) {
11950          if (ctx.options->is_opengl)
11951             create_fs_end_for_epilog(&ctx);
11952          else
11953             create_fs_jump_to_epilog(&ctx);
11954 
11955          /* FS epilogs always have at least one color/null export. */
11956          ctx.program->has_color_exports = true;
11957       } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
11958          assert(ctx.stage == tess_control_hs || ctx.stage == vertex_tess_control_hs);
11959          if (ctx.options->is_opengl)
11960             create_tcs_end_for_epilog(&ctx);
11961          else
11962             create_tcs_jump_to_epilog(&ctx);
11963       }
11964    }
11965 
11966    if (endif_merged_wave_info) {
11967       begin_divergent_if_else(&ctx, ic_merged_wave_info);
11968       end_divergent_if(&ctx, ic_merged_wave_info);
11969    }
11970 
11971    bool is_first_stage_of_merged_shader = false;
11972 
11973    if (ctx.program->info.merged_shader_compiled_separately &&
11974        (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11975       assert(program->gfx_level >= GFX9);
11976       if (ctx.options->is_opengl)
11977          create_end_for_merged_shader(&ctx);
11978       else
11979          create_merged_jump_to_epilog(&ctx);
11980 
11981       is_first_stage_of_merged_shader = true;
11982    }
11983 
11984    cleanup_context(&ctx);
11985 
11986    if (need_endpgm) {
11987       program->config->float_mode = program->blocks[0].fp_mode.val;
11988 
11989       append_logical_end(ctx.block);
11990       ctx.block->kind |= block_kind_uniform;
11991 
11992       if ((!program->info.has_epilog && !is_first_stage_of_merged_shader) ||
11993           (nir->info.stage == MESA_SHADER_TESS_CTRL && program->gfx_level >= GFX9)) {
11994          Builder(program, ctx.block).sopp(aco_opcode::s_endpgm);
11995       }
11996 
11997       finish_program(&ctx);
11998    }
11999 }
12000 
12001 void
12002 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
12003 {
12004    if_context ic_merged_wave_info;
12005    const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
12006 
12007    for (unsigned i = 0; i < shader_count; i++) {
12008       nir_shader* nir = shaders[i];
12009 
12010       /* We always need to insert p_startpgm at the beginning of the first shader. */
12011       const bool need_startpgm = i == 0;
12012 
12013       /* Need to handle program end for last shader stage. */
12014       const bool need_endpgm = i == shader_count - 1;
12015 
12016       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
12017       nir_function_impl* func = nir_shader_get_entrypoint(nir);
12018       const bool empty_shader =
12019          nir_cf_list_is_empty_block(&func->body) &&
12020          ((nir->info.stage == MESA_SHADER_VERTEX &&
12021            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
12022           (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
12023 
12024       /* See if we need to emit a check of the merged wave info SGPR. */
12025       const bool check_merged_wave_info =
12026          ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
12027       const bool endif_merged_wave_info =
12028          ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
12029 
12030       /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
12031       const bool tcs_skip_barrier =
12032          ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
12033 
12034       /* A barrier is usually needed at the beginning of the second shader; NGG GS and the TCS case above are the exceptions. */
12035       const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
12036 
12037       select_shader(ctx, nir, need_startpgm, need_endpgm, need_barrier, &ic_merged_wave_info,
12038                     check_merged_wave_info, endif_merged_wave_info);
12039 
12040       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
12041          /* Special handling when TCS input and output patch size is the same.
12042           * Outputs of the previous stage are inputs to the next stage.
12043           */
12044          ctx.inputs = ctx.outputs;
12045          ctx.outputs = shader_io_state();
12046       }
12047    }
12048 }
12049 
12050 Temp
12051 get_tess_ring_descriptor(isel_context* ctx, const struct aco_tcs_epilog_info* einfo,
12052                          bool is_tcs_factor_ring)
12053 {
12054    Builder bld(ctx->program, ctx->block);
12055 
12056    if (!ctx->options->is_opengl) {
12057       Temp ring_offsets = get_arg(ctx, ctx->args->ring_offsets);
12058       uint32_t tess_ring_offset =
12059          is_tcs_factor_ring ? 5 /* RING_HS_TESS_FACTOR */ : 6 /* RING_HS_TESS_OFFCHIP */;
12060       return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ring_offsets,
12061                       Operand::c32(tess_ring_offset * 16u));
12062    }
12063 
12064    Temp addr = get_arg(ctx, einfo->tcs_out_lds_layout);
12065    /* TCS only receives high 13 bits of the address. */
12066    addr = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), addr,
12067                    Operand::c32(0xfff80000));
12068 
12069    if (is_tcs_factor_ring) {
12070       addr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr,
12071                       Operand::c32(einfo->tess_offchip_ring_size));
12072    }
12073 
12074    uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
12075                     S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
12076 
12077    if (ctx->options->gfx_level >= GFX11) {
12078       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
12079                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
12080    } else if (ctx->options->gfx_level >= GFX10) {
12081       rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
12082                S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
12083    } else {
12084       rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
12085                S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
12086    }
12087 
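   /* Assemble the 4-dword buffer descriptor: base address (lo/hi), num_records
    * (0xffffffff, effectively unbounded) and the format/swizzle word built above. */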
12088    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr,
12089                      Operand::c32(ctx->options->address32_hi), Operand::c32(0xffffffff),
12090                      Operand::c32(rsrc3));
12091 }
12092 
12093 void
12094 store_tess_factor_to_tess_ring(isel_context* ctx, Temp tess_ring_desc, Temp factors[],
12095                                unsigned factor_comps, Temp sbase, Temp voffset, Temp num_patches,
12096                                unsigned patch_offset)
12097 {
12098    Builder bld(ctx->program, ctx->block);
12099 
12100    Temp soffset = sbase;
12101    if (patch_offset) {
12102       Temp offset =
12103          bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, Operand::c32(patch_offset));
12104       soffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), soffset, offset);
12105    }
12106 
12107    Temp data = factor_comps == 1
12108                   ? factors[0]
12109                   : create_vec_from_array(ctx, factors, factor_comps, RegType::vgpr, 4);
12110 
12111    emit_single_mubuf_store(ctx, tess_ring_desc, voffset, soffset, Temp(), data, 0,
12112                            memory_sync_info(storage_vmem_output), true, false, false);
12113 }
12114 
12115 void
12116 emit_polygon_stipple(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12117 {
12118    Builder bld(ctx->program, ctx->block);
12119 
12120    /* Use the fixed-point gl_FragCoord input.
12121     * Since the stipple pattern is 32x32 and repeats, just take 5 bits
12122     * per coordinate to achieve the repeating effect.
12123     */
12124    Temp pos_fixed_pt = get_arg(ctx, ctx->args->pos_fixed_pt);
12125    Temp addr0 = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1f), pos_fixed_pt);
12126    Temp addr1 = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), pos_fixed_pt, Operand::c32(16u),
12127                          Operand::c32(5u));
12128 
12129    /* Load the buffer descriptor. */
12130    Temp list = get_arg(ctx, finfo->internal_bindings);
12131    list = convert_pointer_to_64_bit(ctx, list);
12132    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
12133                         Operand::c32(finfo->poly_stipple_buf_offset));
12134 
12135    /* The stipple pattern is 32x32, each row has 32 bits. */
12136    Temp offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr1);
12137    Temp row = bld.mubuf(aco_opcode::buffer_load_dword, bld.def(v1), desc, offset, Operand::c32(0u),
12138                         0, true);
12139    Temp bit = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), row, addr0, Operand::c32(1u));
12140    Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), bit);
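   /* Demote the pixel to a helper invocation when its stipple bit is 0. */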
12141    bld.pseudo(aco_opcode::p_demote_to_helper, cond);
12142 
12143    ctx->block->kind |= block_kind_uses_discard;
12144    ctx->program->needs_exact = true;
12145 }
12146 
12147 void
12148 overwrite_interp_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12149 {
12150    Builder bld(ctx->program, ctx->block);
12151 
12152    if (finfo->bc_optimize_for_persp || finfo->bc_optimize_for_linear) {
12153       /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
12154        * The hw doesn't compute CENTROID if the whole wave only
12155        * contains fully-covered quads.
12156        */
12157       Temp bc_optimize = get_arg(ctx, ctx->args->prim_mask);
12158 
12159       /* enabled when bit 31 is set */
12160       Temp cond =
12161          bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), bc_optimize, Operand::c32(31u));
12162 
12163       /* broadcast the 1-bit scc value to a wave-sized mask usable by v_cndmask */
12164       cond = bool_to_vector_condition(ctx, cond);
12165 
12166       if (finfo->bc_optimize_for_persp) {
12167          Temp center = get_arg(ctx, ctx->args->persp_center);
12168          Temp centroid = get_arg(ctx, ctx->args->persp_centroid);
12169 
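         /* Replace the centroid argument with the center value where the
          * bc_optimize bit is set (per the PRIM_MASK comment above). */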
12170          Temp dst = bld.tmp(v2);
12171          select_vec2(ctx, dst, cond, center, centroid);
12172          ctx->arg_temps[ctx->args->persp_centroid.arg_index] = dst;
12173       }
12174 
12175       if (finfo->bc_optimize_for_linear) {
12176          Temp center = get_arg(ctx, ctx->args->linear_center);
12177          Temp centroid = get_arg(ctx, ctx->args->linear_centroid);
12178 
12179          Temp dst = bld.tmp(v2);
12180          select_vec2(ctx, dst, cond, center, centroid);
12181          ctx->arg_temps[ctx->args->linear_centroid.arg_index] = dst;
12182       }
12183    }
12184 
12185    if (finfo->force_persp_sample_interp) {
12186       Temp persp_sample = get_arg(ctx, ctx->args->persp_sample);
12187       ctx->arg_temps[ctx->args->persp_center.arg_index] = persp_sample;
12188       ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_sample;
12189    }
12190 
12191    if (finfo->force_linear_sample_interp) {
12192       Temp linear_sample = get_arg(ctx, ctx->args->linear_sample);
12193       ctx->arg_temps[ctx->args->linear_center.arg_index] = linear_sample;
12194       ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_sample;
12195    }
12196 
12197    if (finfo->force_persp_center_interp) {
12198       Temp persp_center = get_arg(ctx, ctx->args->persp_center);
12199       ctx->arg_temps[ctx->args->persp_sample.arg_index] = persp_center;
12200       ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_center;
12201    }
12202 
12203    if (finfo->force_linear_center_interp) {
12204       Temp linear_center = get_arg(ctx, ctx->args->linear_center);
12205       ctx->arg_temps[ctx->args->linear_sample.arg_index] = linear_center;
12206       ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_center;
12207    }
12208 }
12209 
12210 void
12211 overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12212 {
12213    Builder bld(ctx->program, ctx->block);
12214 
12215    /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
12216     * says:
12217     *
12218     *    "When per-sample shading is active due to the use of a fragment
12219     *     input qualified by sample or due to the use of the gl_SampleID
12220     *     or gl_SamplePosition variables, only the bit for the current
12221     *     sample is set in gl_SampleMaskIn. When state specifies multiple
12222     *     fragment shader invocations for a given fragment, the sample
12223     *     mask for any single fragment shader invocation may specify a
12224     *     subset of the covered samples for the fragment. In this case,
12225     *     the bit corresponding to each covered sample will be set in
12226     *     exactly one fragment shader invocation."
12227     *
12228     * The samplemask loaded by hardware is always the coverage of the
12229     * entire pixel/fragment, so mask bits out based on the sample ID.
12230     */
12231    if (finfo->samplemask_log_ps_iter) {
12232       Temp ancillary = get_arg(ctx, ctx->args->ancillary);
12233       Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u),
12234                                Operand::c32(4u));
12235       Temp samplemask = get_arg(ctx, ctx->args->sample_coverage);
12236 
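      /* ac_get_ps_iter_mask() produces a repeating pattern with one bit set per
       * group of ps_iter_samples samples; shifting it left by the sample ID keeps
       * only the bits this invocation owns before masking the HW coverage. */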
12237       uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
12238       Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
12239 
12240       Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask);
12241       samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask);
12242 
12243       ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask;
12244    }
12245 }
12246 
12247 Temp
12248 get_interp_color(isel_context* ctx, int interp_vgpr, unsigned attr_index, unsigned comp)
12249 {
12250    Builder bld(ctx->program, ctx->block);
12251 
12252    Temp dst = bld.tmp(v1);
12253 
12254    Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
12255 
12256    if (interp_vgpr != -1) {
12257       /* interp arguments are i/j pairs occupying 2 VGPRs each, so VGPR index / 2 selects the argument */
12258       int arg_index = ctx->args->persp_sample.arg_index + interp_vgpr / 2;
12259       Temp interp_ij = ctx->arg_temps[arg_index];
12260 
12261       emit_interp_instr(ctx, attr_index, comp, interp_ij, dst, prim_mask);
12262    } else {
12263       emit_interp_mov_instr(ctx, attr_index, comp, 0, dst, prim_mask);
12264    }
12265 
12266    return dst;
12267 }
12268 
12269 void
12270 interpolate_color_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo,
12271                        std::vector<Operand>& regs)
12272 {
12273    if (!finfo->colors_read)
12274       return;
12275 
12276    Builder bld(ctx->program, ctx->block);
12277 
12278    unsigned vgpr = 256 + ctx->args->num_vgprs_used;
12279 
12280    if (finfo->color_two_side) {
12281       Temp face = get_arg(ctx, ctx->args->front_face);
12282       Temp is_face_positive =
12283          bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), face);
12284 
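      /* Assumption: front_face is non-zero for front-facing pixels, so
       * is_face_positive selects the front color in the v_cndmask below. */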
12285       u_foreach_bit (i, finfo->colors_read) {
12286          unsigned color_index = i / 4;
12287          unsigned front_index = finfo->color_attr_index[color_index];
12288          int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
12289 
12290          /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
12291           * otherwise it's at offset "num_inputs".
12292           */
12293          unsigned back_index = finfo->num_interp_inputs;
12294          if (color_index == 1 && finfo->colors_read & 0xf)
12295             back_index++;
12296 
12297          Temp front = get_interp_color(ctx, interp_vgpr, front_index, i % 4);
12298          Temp back = get_interp_color(ctx, interp_vgpr, back_index, i % 4);
12299 
12300          Temp color =
12301             bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), back, front, is_face_positive);
12302 
12303          regs.emplace_back(Operand(color, PhysReg{vgpr++}));
12304       }
12305    } else {
12306       u_foreach_bit (i, finfo->colors_read) {
12307          unsigned color_index = i / 4;
12308          unsigned attr_index = finfo->color_attr_index[color_index];
12309          int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
12310          Temp color = get_interp_color(ctx, interp_vgpr, attr_index, i % 4);
12311 
12312          regs.emplace_back(Operand(color, PhysReg{vgpr++}));
12313       }
12314    }
12315 }
12316 
12317 void
12318 emit_clamp_alpha_test(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
12319                       unsigned color_index)
12320 {
12321    Builder bld(ctx->program, ctx->block);
12322 
12323    if (info->clamp_color) {
12324       for (unsigned i = 0; i < 4; i++) {
12325          if (colors[i].regClass() == v2b) {
12326             colors[i] = bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
12327                                  Operand::c16(0x3c00), colors[i]);
12328          } else {
12329             assert(colors[i].regClass() == v1);
12330             colors[i] = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
12331                                  Operand::c32(0x3f800000u), colors[i]);
12332          }
12333       }
12334    }
12335 
12336    if (info->alpha_to_one) {
12337       if (colors[3].regClass() == v2b)
12338          colors[3] = bld.copy(bld.def(v2b), Operand::c16(0x3c00));
12339       else
12340          colors[3] = bld.copy(bld.def(v1), Operand::c32(0x3f800000u));
12341    }
12342 
12343    if (color_index == 0 && info->alpha_func != COMPARE_FUNC_ALWAYS) {
12344       Operand cond = Operand::c32(-1u);
12345       if (info->alpha_func != COMPARE_FUNC_NEVER) {
12346          aco_opcode opcode = aco_opcode::num_opcodes;
12347 
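         /* Each opcode is the inverted (unordered) form of the pass comparison,
          * evaluated as (ref OP alpha), so cond is true exactly when the alpha
          * test fails and the fragment must be discarded. */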
12348          switch (info->alpha_func) {
12349          case COMPARE_FUNC_LESS: opcode = aco_opcode::v_cmp_ngt_f32; break;
12350          case COMPARE_FUNC_EQUAL: opcode = aco_opcode::v_cmp_neq_f32; break;
12351          case COMPARE_FUNC_LEQUAL: opcode = aco_opcode::v_cmp_nge_f32; break;
12352          case COMPARE_FUNC_GREATER: opcode = aco_opcode::v_cmp_nlt_f32; break;
12353          case COMPARE_FUNC_NOTEQUAL: opcode = aco_opcode::v_cmp_nlg_f32; break;
12354          case COMPARE_FUNC_GEQUAL: opcode = aco_opcode::v_cmp_nle_f32; break;
12355          default: unreachable("invalid alpha func");
12356          }
12357 
12358          Temp ref = get_arg(ctx, info->alpha_reference);
12359 
12360          Temp alpha = colors[3].regClass() == v2b
12361                          ? bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), colors[3])
12362                          : colors[3];
12363 
12364          /* cond is true if the fragment does not pass the alpha test */
12365          cond = bld.vopc(opcode, bld.def(bld.lm), ref, alpha);
12366       }
12367 
12368       bld.pseudo(aco_opcode::p_discard_if, cond);
12369       ctx->block->kind |= block_kind_uses_discard;
12370       ctx->program->needs_exact = true;
12371    }
12372 }
12373 
12374 } /* end namespace */
12375 
12376 void
12377 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
12378                ac_shader_config* config, const struct aco_compiler_options* options,
12379                const struct aco_shader_info* info, const struct ac_shader_args* args)
12380 {
12381    isel_context ctx =
12382       setup_isel_context(program, shader_count, shaders, config, options, info, args);
12383 
12384    if (ctx.stage == raytracing_cs)
12385       return select_program_rt(ctx, shader_count, shaders, args);
12386 
12387    if (shader_count >= 2) {
12388       select_program_merged(ctx, shader_count, shaders);
12389    } else {
12390       bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
12391       if_context ic_merged_wave_info;
12392 
12393       /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
12394       if (ctx.program->info.merged_shader_compiled_separately) {
12395          assert(ctx.program->gfx_level >= GFX9);
12396          if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
12397             check_merged_wave_info = endif_merged_wave_info = true;
12398          } else {
12399             const bool ngg_gs =
12400                ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
12401             assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
12402             check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
12403             need_barrier = !ngg_gs;
12404          }
12405       }
12406 
12407       select_shader(ctx, shaders[0], true, true, need_barrier, &ic_merged_wave_info,
12408                     check_merged_wave_info, endif_merged_wave_info);
12409    }
12410 }
12411 
12412 void
12413 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
12414                            const struct aco_compiler_options* options,
12415                            const struct aco_shader_info* info, const struct ac_shader_args* args)
12416 {
12417    assert(options->gfx_level == GFX8);
12418 
12419    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12420                 config);
12421 
12422    isel_context ctx = {};
12423    ctx.program = program;
12424    ctx.args = args;
12425    ctx.options = options;
12426    ctx.stage = program->stage;
12427 
12428    ctx.block = ctx.program->create_and_insert_block();
12429    ctx.block->kind = block_kind_top_level;
12430 
12431    program->workgroup_size = 1; /* XXX */
12432 
12433    add_startpgm(&ctx);
12434    append_logical_start(ctx.block);
12435 
12436    Builder bld(ctx.program, ctx.block);
12437 
12438    /* Load the buffer descriptor from TMA. */
12439    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
12440             Operand::zero());
12441 
12442    /* Store TTMP0-TTMP1. */
12443    bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
12444             Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
12445 
12446    uint32_t hw_regs_idx[] = {
12447       2, /* HW_REG_STATUS */
12448       3, /* HW_REG_TRAP_STS */
12449       4, /* HW_REG_HW_ID */
12450       7, /* HW_REG_IB_STS */
12451    };
12452 
12453    /* Store some hardware registers. */
12454    for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
12455       /* "((size - 1) << 11) | register" */
12456       bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
12457                ((20 - 1) << 11) | hw_regs_idx[i]);
12458 
12459       bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
12460                Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
12461    }
12462 
12463    program->config->float_mode = program->blocks[0].fp_mode.val;
12464 
12465    append_logical_end(ctx.block);
12466    ctx.block->kind |= block_kind_uniform;
12467    bld.sopp(aco_opcode::s_endpgm);
12468 
12469    finish_program(&ctx);
12470 }
12471 
12472 Operand
12473 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
12474 {
12475    enum ac_arg_regfile file = args->args[arg.arg_index].file;
12476    unsigned size = args->args[arg.arg_index].size;
12477    RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
12478    return Operand(get_arg_reg(args, arg), rc);
12479 }
12480 
12481 unsigned
12482 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
12483 {
12484    unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
12485 
12486    unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
12487    if (bld.program->gfx_level >= GFX10 && num_loads > 1)
12488       bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);
12489 
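   /* Each V# descriptor is 4 dwords (16 bytes); fetch descriptors in the largest
    * power-of-two batches SMEM offers: dwordx16 (4 descriptors), dwordx8 (2) or
    * dwordx4 (1). On GFX10+ the loads were clustered into one clause above. */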
12490    for (unsigned i = 0; i < count;) {
12491       unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
12492 
12493       if (size == 4)
12494          bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
12495                   Operand::c32((start + i) * 16u));
12496       else if (size == 2)
12497          bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
12498                   Operand::c32((start + i) * 16u));
12499       else
12500          bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
12501                   Operand::c32((start + i) * 16u));
12502 
12503       dest = dest.advance(size * 16u);
12504       i += size;
12505    }
12506 
12507    return count;
12508 }
12509 
12510 Operand
12511 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
12512                             const struct aco_vs_prolog_info* pinfo, unsigned index,
12513                             Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
12514                             PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
12515 {
12516    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
12517             get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
12518 
12519    wait_imm lgkm_imm;
12520    lgkm_imm.lgkm = 0;
12521    bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->gfx_level));
12522 
12523    Definition fetch_index_def(tmp_vgpr0, v1);
12524    Operand fetch_index(tmp_vgpr0, v1);
12525 
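   /* Fast unsigned division by the instance divisor (util/fast_idiv_by_const):
    * index = mul_hi((instance_id >> pre_shift) + increment, multiplier) >> post_shift,
    * where the 8-bit pre_shift/increment/post_shift fields (names inferred) are
    * packed into div_info and extracted via SDWA byte selects when available,
    * v_bfe_u32 otherwise. */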
12526    Operand div_info(tmp_sgpr, s1);
12527    if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
12528       /* use SDWA */
12529       if (bld.program->gfx_level < GFX9) {
12530          bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
12531          div_info = Operand(tmp_vgpr1, v1);
12532       }
12533 
12534       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12535 
12536       Instruction* instr;
12537       if (bld.program->gfx_level >= GFX9)
12538          instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
12539       else
12540          instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
12541                                div_info, fetch_index)
12542                     .instr;
12543       instr->sdwa().sel[0] = SubdwordSel::ubyte1;
12544 
12545       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
12546                fetch_index);
12547 
12548       instr =
12549          bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
12550       instr->sdwa().sel[0] = SubdwordSel::ubyte2;
12551    } else {
12552       Operand tmp_op(tmp_vgpr1, v1);
12553       Definition tmp_def(tmp_vgpr1, v1);
12554 
12555       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12556 
12557       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
12558       bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
12559 
12560       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
12561                Operand(tmp_sgpr.advance(4), s1));
12562 
12563       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
12564       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
12565    }
12566 
12567    bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
12568 
12569    return fetch_index;
12570 }
12571 
12572 void
12573 select_rt_prolog(Program* program, ac_shader_config* config,
12574                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12575                  const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
12576 {
12577    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12578                 config);
12579    Block* block = program->create_and_insert_block();
12580    block->kind = block_kind_top_level;
12581    program->workgroup_size = info->workgroup_size;
12582    program->wave_size = info->workgroup_size;
12583    calc_min_waves(program);
12584    Builder bld(program, block);
12585    block->instructions.reserve(32);
12586    unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
12587    unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
12588 
12589    /* Inputs:
12590     * Ring offsets:                s[0-1]
12591     * Indirect descriptor sets:    s[2]
12592     * Push constants pointer:      s[3]
12593     * SBT descriptors:             s[4-5]
12594     * Traversal shader address:    s[6-7]
12595     * Ray launch size address:     s[8-9]
12596     * Dynamic callable stack base: s[10]
12597     * Workgroup IDs (xyz):         s[11], s[12], s[13]
12598     * Scratch offset:              s[14]
12599     * Local invocation IDs:        v[0-2]
12600     */
12601    PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
12602    PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
12603    PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
12604    PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
12605    PhysReg in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
12606    PhysReg in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
12607    PhysReg in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
12608    PhysReg in_scratch_offset;
12609    if (options->gfx_level < GFX11)
12610       in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
12611    PhysReg in_local_ids[2] = {
12612       get_arg_reg(in_args, in_args->local_invocation_ids),
12613       get_arg_reg(in_args, in_args->local_invocation_ids).advance(4),
12614    };
12615 
12616    /* Outputs:
12617     * Callee shader PC:            s[0-1]
12618     * Indirect descriptor sets:    s[2]
12619     * Push constants pointer:      s[3]
12620     * SBT descriptors:             s[4-5]
12621     * Traversal shader address:    s[6-7]
12622     * Ray launch sizes (xyz):      s[8], s[9], s[10]
12623     * Scratch offset (<GFX9 only): s[11]
12624     * Ring offsets (<GFX9 only):   s[12-13]
12625     * Ray launch IDs:              v[0-2]
12626     * Stack pointer:               v[3]
12627     * Shader VA:                   v[4-5]
12628     * Shader Record Ptr:           v[6-7]
12629     */
12630    PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12631    PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_size);
12632    PhysReg out_launch_size_y = out_launch_size_x.advance(4);
12633    PhysReg out_launch_size_z = out_launch_size_y.advance(4);
12634    PhysReg out_launch_ids[3];
12635    for (unsigned i = 0; i < 3; i++)
12636       out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_id).advance(i * 4);
12637    PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12638    PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12639 
12640    /* Temporaries: */
12641    num_sgprs = align(num_sgprs, 2);
12642    PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
12643    num_sgprs += 2;
12644    PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
12645    num_sgprs += 2;
12646 
12647    PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++};
12648 
12649    /* Confirm some assumptions about register aliasing */
12650    assert(in_ring_offsets == out_uniform_shader_addr);
12651    assert(get_arg_reg(in_args, in_args->push_constants) ==
12652           get_arg_reg(out_args, out_args->push_constants));
12653    assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12654           get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12655    assert(in_launch_size_addr == out_launch_size_x);
12656    assert(in_stack_base == out_launch_size_z);
12657    assert(in_local_ids[0] == out_launch_ids[0]);
12658 
12659    /* load raygen sbt */
12660    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12661             Operand::c32(0u));
12662 
12663    /* init scratch */
12664    if (options->gfx_level < GFX9) {
12665       /* copy ring offsets to a temporary location */
12666       bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12667                Operand(in_ring_offsets, s2));
12668    } else if (options->gfx_level < GFX11) {
12669       hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12670                       Operand(in_scratch_offset, s1));
12671    }
12672 
12673    /* set stack ptr */
12674    bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12675 
12676    /* load raygen address */
12677    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12678             Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12679 
12680    /* load ray launch sizes */
12681    bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12682             Operand(in_launch_size_addr, s2), Operand::c32(8u));
12683    bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12684             Operand(in_launch_size_addr, s2), Operand::c32(0u));
12685 
12686    /* calculate ray launch ids */
12687    if (options->gfx_level >= GFX11) {
12688       /* Thread IDs are packed in VGPR0, 10 bits per component. */
12689       bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12690                Operand::c32(10u), Operand::c32(3u));
12691       bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12692                Operand(in_local_ids[0], v1));
12693    }
12694    /* Do this backwards to reduce some RAW hazards on GFX11+ */
12695    bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12696    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1), Operand(in_wg_id_y, s1),
12697             Operand::c32(program->workgroup_size == 32 ? 4 : 8), Operand(in_local_ids[1], v1));
12698    bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12699             Operand::c32(8), Operand(in_local_ids[0], v1));
12700 
12701    if (options->gfx_level < GFX9) {
12702       /* write scratch/ring offsets to outputs, if needed */
12703       bld.sop1(aco_opcode::s_mov_b32,
12704                Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12705                Operand(in_scratch_offset, s1));
12706       bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12707                Operand(tmp_ring_offsets, s2));
12708    }
12709 
12710    /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
12711    if (options->gfx_level < GFX9) {
12712       bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12713                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12714    } else {
12715       bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12716                    Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12717    }
12718    bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12719             Operand(tmp_raygen_sbt.advance(4), s1));
12720 
12721    /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs.
12722     * The 1D launch ID is: id = local_invocation_index + (wg_id.x * wg_size).
12723     * After the shift below, in_wg_id_x holds wg_id.x * wg_size.
12724     */
12725    bld.sop2(aco_opcode::s_lshl_b32, Definition(in_wg_id_x, s1), Definition(scc, s1),
12726             Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6));
12727 
12728    /* Calculate and add local_invocation_index */
12729    bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u),
12730             Operand(in_wg_id_x, s1));
12731    if (program->wave_size == 64) {
12732       if (program->gfx_level <= GFX7)
12733          bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1),
12734                   Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12735       else
12736          bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(tmp_invocation_idx, v1),
12737                   Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12738    }
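   /* tmp_invocation_idx now holds lane_id + wg_id.x * wg_size, i.e. the flat 1D
    * launch ID used by the converted-2D fixup below. */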
12739 
12740    /* Make fixup operations a no-op if this is not a converted 2D dispatch. */
12741    bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1),
12742             Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1));
12743    bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
12744             Operand::c32_or_c64(-1u, program->wave_size == 64),
12745             Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
12746    bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
12747             Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
12748    bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
12749             Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
12750 
12751    /* jump to raygen */
12752    bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12753 
12754    program->config->float_mode = program->blocks[0].fp_mode.val;
12755    program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12756    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12757 }
12758 
12759 void
12760 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12761                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
12762                  const struct ac_shader_args* args)
12763 {
12764    assert(pinfo->num_attributes > 0);
12765 
12766    /* This should be enough for any shader/stage. */
12767    unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12768 
12769    init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12770                 config);
12771    program->dev.vgpr_limit = 256;
12772 
12773    Block* block = program->create_and_insert_block();
12774    block->kind = block_kind_top_level;
12775 
12776    program->workgroup_size = 64;
12777    calc_min_waves(program);
12778 
12779    Builder bld(program, block);
12780 
12781    block->instructions.reserve(16 + pinfo->num_attributes * 4);
12782 
12783    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
12784 
12785    uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12786    bool has_nontrivial_divisors = pinfo->nontrivial_divisors;
12787 
12788    wait_imm lgkm_imm;
12789    lgkm_imm.lgkm = 0;
12790 
12791    /* choose sgprs */
12792    PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12793    PhysReg prolog_input = vertex_buffers.advance(8);
12794    PhysReg desc(
12795       align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12796 
12797    Operand start_instance = get_arg_fixed(args, args->start_instance);
12798    Operand instance_id = get_arg_fixed(args, args->instance_id);
12799 
12800    PhysReg attributes_start(256 + args->num_vgprs_used);
12801    /* choose vgprs that won't be used for anything else until the last attribute load */
12802    PhysReg vertex_index(attributes_start.reg() + pinfo->num_attributes * 4 - 1);
12803    PhysReg instance_index(attributes_start.reg() + pinfo->num_attributes * 4 - 2);
12804    PhysReg start_instance_vgpr(attributes_start.reg() + pinfo->num_attributes * 4 - 3);
12805    PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + pinfo->num_attributes * 4 - 4);
12806    PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + pinfo->num_attributes * 4);
12807 
12808    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
12809             get_arg_fixed(args, args->vertex_buffers));
12810    if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
12811       bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
12812                options->address32_hi & 0xFFFF);
12813    } else {
12814       bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
12815                Operand::c32((unsigned)options->address32_hi));
12816    }
12817 
12818    /* calculate vgpr requirements */
12819    unsigned num_vgprs = attributes_start.reg() - 256;
12820    num_vgprs += pinfo->num_attributes * 4;
12821    if (has_nontrivial_divisors && program->gfx_level <= GFX8)
12822       num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
12823    unsigned num_sgprs = 0;
12824 
12825    const struct ac_vtx_format_info* vtx_info_table =
12826       ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
12827 
12828    for (unsigned loc = 0; loc < pinfo->num_attributes;) {
12829       unsigned num_descs =
12830          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
12831       num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
12832 
12833       if (loc == 0) {
12834          /* perform setup while we load the descriptors */
12835          if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
12836             Operand count = get_arg_fixed(args, args->merged_wave_info);
12837             bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
12838             if (program->wave_size == 64) {
12839                bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12840                         Operand::c32(6u /* log2(64) */));
12841                bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12842                         Operand(exec, s2), Operand(scc, s1));
12843             }
12844          }
12845 
12846          /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
12847          if (info->hw_stage == AC_HW_HULL_SHADER && options->has_ls_vgpr_init_bug) {
12848             /* We don't want load_vb_descs() to write vcc. */
12849             assert(program->dev.sgpr_limit <= vcc.reg());
12850 
12851             bld.sop2(aco_opcode::s_bfe_u32, Definition(vcc, s1), Definition(scc, s1),
12852                      get_arg_fixed(args, args->merged_wave_info), Operand::c32((8u << 16) | 8u));
12853             bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm), Operand::c32(-1), Operand::zero(),
12854                      Operand(scc, s1));
12855 
12856             /* These copies are ordered so that vertex_id=tcs_patch_id doesn't overwrite vertex_id
12857              * before instance_id=vertex_id. */
12858             ac_arg src_args[] = {args->vertex_id, args->tcs_rel_ids, args->tcs_patch_id};
12859             ac_arg dst_args[] = {args->instance_id, args->vs_rel_patch_id, args->vertex_id};
12860             for (unsigned i = 0; i < 3; i++) {
12861                bld.vop2(aco_opcode::v_cndmask_b32, Definition(get_arg_reg(args, dst_args[i]), v1),
12862                         get_arg_fixed(args, src_args[i]), get_arg_fixed(args, dst_args[i]),
12863                         Operand(vcc, bld.lm));
12864             }
12865          }
12866 
12867          bool needs_instance_index =
12868             pinfo->instance_rate_inputs &
12869             ~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
12870          bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
12871          bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
12872          if (needs_vertex_index)
12873             bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
12874                        get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
12875          if (needs_instance_index)
12876             bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12877                        Operand(s2), true);
12878          if (needs_start_instance)
12879             bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12880       }
12881 
12882       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
12883 
12884       for (unsigned i = 0; i < num_descs;) {
12885          PhysReg dest(attributes_start.reg() + loc * 4u);
12886 
12887          /* calculate index */
12888          Operand fetch_index = Operand(vertex_index, v1);
12889          if (pinfo->instance_rate_inputs & (1u << loc)) {
12890             if (!(pinfo->zero_divisors & (1u << loc))) {
12891                fetch_index = instance_id;
12892                if (pinfo->nontrivial_divisors & (1u << loc)) {
12893                   unsigned index = util_bitcount(pinfo->nontrivial_divisors & BITFIELD_MASK(loc));
12894                   fetch_index = calc_nontrivial_instance_id(
12895                      bld, args, pinfo, index, instance_id, start_instance, prolog_input,
12896                      nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12897                } else {
12898                   fetch_index = Operand(instance_index, v1);
12899                }
12900             } else {
12901                fetch_index = Operand(start_instance_vgpr, v1);
12902             }
12903          }
12904 
12905          /* perform load */
12906          PhysReg cur_desc = desc.advance(i * 16);
12907          if ((pinfo->misaligned_mask & (1u << loc))) {
12908             const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
12909 
12910             assert(vtx_info->has_hw_format & 0x1);
12911             unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12912             unsigned nfmt = vtx_info->hw_format[0] >> 4;
12913 
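            /* Misaligned attribute: fetch one channel at a time, reversing the
             * first three channels when post-shuffle (e.g. BGRA ordering) is set. */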
12914             for (unsigned j = 0; j < vtx_info->num_channels; j++) {
12915                bool post_shuffle = pinfo->post_shuffle & (1u << loc);
12916                unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12917 
12918                /* Use MUBUF to work around hangs for byte-aligned dword loads. The Vulkan spec
12919                 * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
12920                 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
12921                 * care).
12922                 */
12923                if (dfmt == V_008F0C_BUF_DATA_FORMAT_32)
12924                   bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
12925                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
12926                             false, true);
12927                else if (vtx_info->chan_byte_size == 8)
12928                   bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
12929                             Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
12930                             fetch_index, Operand::c32(0u), dfmt, nfmt, offset, false, true);
12931                else
12932                   bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12933                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt,
12934                             offset, false, true);
12935             }
12936             uint32_t one =
12937                nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
12938                   ? 1u
12939                   : 0x3f800000u;
12940             /* Vulkan 1.3 spec, section 22.1.1 "Attribute Location and Component Assignment":
12941              * For 64-bit data types, no default attribute values are provided. Input variables must
12942              * not use more components than provided by the attribute.
12943              */
12944             for (unsigned j = vtx_info->num_channels; vtx_info->chan_byte_size != 8 && j < 4; j++) {
12945                bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
12946                         Operand::c32(j == 3 ? one : 0u));
12947             }
12948 
12949             unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
12950             loc += slots;
12951             i += slots;
12952          } else {
12953             bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12954                       Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
12955             loc++;
12956             i++;
12957          }
12958       }
12959    }
12960 
12961    if (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi) {
12962       wait_imm vm_imm;
12963       vm_imm.vm = 0;
12964       bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->gfx_level));
12965    }
12966 
12967    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
12968     * so we may need to fix it up. */
12969    u_foreach_bit (loc, (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)) {
12970       PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
12971 
12972       unsigned alpha_adjust = (pinfo->alpha_adjust_lo >> loc) & 0x1;
12973       alpha_adjust |= ((pinfo->alpha_adjust_hi >> loc) & 0x1) << 1;
12974 
12975       if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
12976          bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
12977 
12978       /* For the integer-like cases, do a natural sign extension.
12979        *
12980        * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
12981        * and happen to contain 0, 1, 2, 3 as the two LSBs of the
12982        * exponent.
12983        */
12984       unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
12985       bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
12986                Operand::c32(offset), Operand::c32(2u));
12987 
12988       /* Convert back to the right type. */
12989       if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
12990          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12991          bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
12992                   Operand(alpha, v1));
12993       } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
12994          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12995       }
12996    }
12997 
12998    block->kind |= block_kind_uniform;
12999 
13000    /* continue on to the main shader */
13001    Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
13002    if (has_nontrivial_divisors) {
13003       bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
13004                get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
13005       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
13006       continue_pc = Operand(prolog_input, s2);
13007    }
13008 
13009    bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
13010 
13011    program->config->float_mode = program->blocks[0].fp_mode.val;
13012    /* addition on GFX6-8 requires a carry-out (we use VCC) */
13013    program->needs_vcc = program->gfx_level <= GFX8;
13014    program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
13015    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
13016 }
13017 
13018 void
13019 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
13020                  const struct aco_compiler_options* options, const struct aco_shader_info* info,
13021                  const struct ac_shader_args* args)
13022 {
13023    const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
13024    isel_context ctx =
13025       setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13026 
13027    ctx.block->fp_mode = program->next_fp_mode;
13028 
13029    add_startpgm(&ctx);
13030    append_logical_start(ctx.block);
13031 
13032    Builder bld(ctx.program, ctx.block);
13033 
13034    Temp colors[MAX_DRAW_BUFFERS][4];
13035    for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13036       if (!einfo->colors[i].used)
13037          continue;
13038 
13039       Temp color = get_arg(&ctx, einfo->colors[i]);
13040       unsigned col_types = (einfo->color_types >> (i * 2)) & 0x3;
13041 
13042       emit_split_vector(&ctx, color, col_types == ACO_TYPE_ANY32 ? 4 : 8);
13043       for (unsigned c = 0; c < 4; ++c) {
13044          colors[i][c] = emit_extract_vector(&ctx, color, c, col_types == ACO_TYPE_ANY32 ? v1 : v2b);
13045       }
13046 
13047       emit_clamp_alpha_test(&ctx, einfo, colors[i], i);
13048    }
13049 
13050    bool has_mrtz_depth = einfo->depth.used;
13051    bool has_mrtz_stencil = einfo->stencil.used;
13052    bool has_mrtz_samplemask = einfo->samplemask.used;
13053    bool has_mrtz_alpha = einfo->alpha_to_coverage_via_mrtz && einfo->colors[0].used;
13054    bool has_mrtz_export =
13055       has_mrtz_depth || has_mrtz_stencil || has_mrtz_samplemask || has_mrtz_alpha;
13056    if (has_mrtz_export) {
13057       Temp depth = has_mrtz_depth ? get_arg(&ctx, einfo->depth) : Temp();
13058       Temp stencil = has_mrtz_stencil ? get_arg(&ctx, einfo->stencil) : Temp();
13059       Temp samplemask = has_mrtz_samplemask ? get_arg(&ctx, einfo->samplemask) : Temp();
13060       Temp alpha = has_mrtz_alpha ? colors[0][3] : Temp();
13061 
13062       export_fs_mrtz(&ctx, depth, stencil, samplemask, alpha);
13063    }
13064 
13065    /* Export all color render targets */
13066    struct aco_export_mrt mrts[MAX_DRAW_BUFFERS];
13067    unsigned mrt_num = 0;
13068 
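   /* With broadcast_last_cbuf, color 0 is duplicated to every color buffer up to
    * the last one; otherwise each written color maps to its own MRT. In both
    * paths the export targets are compacted as exports are emitted. */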
13069    if (einfo->broadcast_last_cbuf) {
13070       for (unsigned i = 0; i <= einfo->broadcast_last_cbuf; i++) {
13071          struct aco_export_mrt* mrt = &mrts[mrt_num];
13072          if (export_fs_mrt_color(&ctx, einfo, colors[0], i, mrt))
13073             mrt->target += mrt_num++;
13074       }
13075    } else {
13076       for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13077          struct aco_export_mrt* mrt = &mrts[mrt_num];
13078          if (export_fs_mrt_color(&ctx, einfo, colors[i], i, mrt))
13079             mrt->target += mrt_num++;
13080       }
13081    }
13082 
13083    if (mrt_num) {
13084       if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
13085          assert(mrt_num == 2);
13086          create_fs_dual_src_export_gfx11(&ctx, &mrts[0], &mrts[1]);
13087       } else {
13088          for (unsigned i = 0; i < mrt_num; i++)
13089             export_mrt(&ctx, &mrts[i]);
13090       }
13091    } else if (!has_mrtz_export && !einfo->skip_null_export) {
13092       create_fs_null_export(&ctx);
13093    }
13094 
13095    program->config->float_mode = program->blocks[0].fp_mode.val;
13096 
13097    append_logical_end(ctx.block);
13098    ctx.block->kind |= block_kind_export_end;
13099    bld.reset(ctx.block);
13100    bld.sopp(aco_opcode::s_endpgm);
13101 
13102    finish_program(&ctx);
13103 }
13104 
13105 void
13106 select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config,
13107                   const struct aco_compiler_options* options, const struct aco_shader_info* info,
13108                   const struct ac_shader_args* args)
13109 {
   const struct aco_tcs_epilog_info* einfo = (const struct aco_tcs_epilog_info*)pinfo;
   isel_context ctx =
      setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::TCS);

   ctx.block->fp_mode = program->next_fp_mode;

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

   /* Add a barrier before loading tess factors from LDS. */
   if (!einfo->pass_tessfactors_by_reg) {
      /* Mark pending LDS access so that waitcnt insertion generates a
       * s_waitcnt lgkmcnt(0) for the barrier. */
      program->pending_lds_access = true;

      sync_scope scope = einfo->tcs_out_patch_fits_subgroup ? scope_subgroup : scope_workgroup;
      bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
                  scope);
   }

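   /* Tess factors are per-patch outputs; only TCS invocation 0 of each patch
    * stores them.
    */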
   Temp invocation_id = get_arg(&ctx, einfo->invocation_id);

   Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), invocation_id);

   if_context ic_invoc_0;
   begin_divergent_if_then(&ctx, &ic_invoc_0, cond);

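   /* Tess factor component counts per patch: isolines have 2 outer factors,
    * triangles 3 outer + 1 inner, quads 4 outer + 2 inner.
    */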
   int outer_comps, inner_comps;
   switch (einfo->primitive_mode) {
   case TESS_PRIMITIVE_ISOLINES:
      outer_comps = 2;
      inner_comps = 0;
      break;
   case TESS_PRIMITIVE_TRIANGLES:
      outer_comps = 3;
      inner_comps = 1;
      break;
   case TESS_PRIMITIVE_QUADS:
      outer_comps = 4;
      inner_comps = 2;
      break;
   default: unreachable("invalid primitive mode"); return;
   }

   bld.reset(ctx.block);

   unsigned tess_lvl_out_loc =
      ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER) * 16;
   unsigned tess_lvl_in_loc =
      ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER) * 16;

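   /* The tess factors either arrive in registers, or have to be read back
    * from the LDS locations where the TCS main part stored them.
    */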
   Temp outer[4];
   Temp inner[2];
   if (einfo->pass_tessfactors_by_reg) {
      for (int i = 0; i < outer_comps; i++)
         outer[i] = get_arg(&ctx, einfo->tess_lvl_out[i]);

      for (int i = 0; i < inner_comps; i++)
         inner[i] = get_arg(&ctx, einfo->tess_lvl_in[i]);
   } else {
      Temp addr = get_arg(&ctx, einfo->tcs_out_current_patch_data_offset);
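      /* The per-patch data offset is in dwords; LDS addresses are in bytes. */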
      addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr);

      Temp data = program->allocateTmp(RegClass(RegType::vgpr, outer_comps));
      load_lds(&ctx, 4, outer_comps, data, addr, tess_lvl_out_loc, 4);
      for (int i = 0; i < outer_comps; i++)
         outer[i] = emit_extract_vector(&ctx, data, i, v1);

      if (inner_comps) {
         data = program->allocateTmp(RegClass(RegType::vgpr, inner_comps));
         load_lds(&ctx, 4, inner_comps, data, addr, tess_lvl_in_loc, 4);
         for (int i = 0; i < inner_comps; i++)
            inner[i] = emit_extract_vector(&ctx, data, i, v1);
      }
   }

   Temp tess_factor_ring_desc = get_tess_ring_descriptor(&ctx, einfo, true);
   Temp tess_factor_ring_base = get_arg(&ctx, args->tcs_factor_offset);
   Temp rel_patch_id = get_arg(&ctx, einfo->rel_patch_id);
   unsigned tess_factor_ring_const_offset = 0;

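   /* GFX6-8 expect a control word at offset 0 of the tess factor ring. Only
    * the first patch writes it; every tess factor store below skips over it
    * via the added constant offset.
    */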
   if (program->gfx_level <= GFX8) {
      /* Store the dynamic HS control word. */
      cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), rel_patch_id);

      if_context ic_patch_0;
      begin_divergent_if_then(&ctx, &ic_patch_0, cond);

      bld.reset(ctx.block);

      Temp data = bld.copy(bld.def(v1), Operand::c32(0x80000000u));

      emit_single_mubuf_store(&ctx, tess_factor_ring_desc, Temp(0, v1), tess_factor_ring_base,
                              Temp(), data, 0, memory_sync_info(), true, false, false);

      tess_factor_ring_const_offset += 4;

      begin_divergent_if_else(&ctx, &ic_patch_0);
      end_divergent_if(&ctx, &ic_patch_0);
   }

   bld.reset(ctx.block);

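   /* Each patch stores its (outer + inner) dword tess factors contiguously in
    * the ring, so the ring offset is rel_patch_id times that stride.
    */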
   Temp tess_factor_ring_offset =
      bld.v_mul_imm(bld.def(v1), rel_patch_id, (inner_comps + outer_comps) * 4, false);

   switch (einfo->primitive_mode) {
   case TESS_PRIMITIVE_ISOLINES: {
      /* For isolines, the hardware expects tess factors in the reverse order. */
      Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), outer[1], outer[0]);
      emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
                              tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
                              memory_sync_info(), true, false, false);
      break;
   }
   case TESS_PRIMITIVE_TRIANGLES: {
      Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
                             inner[0]);
      emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
                              tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
                              memory_sync_info(), true, false, false);
      break;
   }
   case TESS_PRIMITIVE_QUADS: {
      Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
                             outer[3]);
      emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
                              tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
                              memory_sync_info(), true, false, false);

      data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), inner[0], inner[1]);
      emit_single_mubuf_store(
         &ctx, tess_factor_ring_desc, tess_factor_ring_offset, tess_factor_ring_base, Temp(), data,
         tess_factor_ring_const_offset + 16, memory_sync_info(), true, false, false);
      break;
   }
   default: unreachable("invalid primitive mode"); break;
   }

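   /* If the TES reads the tess levels as regular inputs, mirror them into the
    * off-chip tess ring at their varying slots as well.
    */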
   if (einfo->tes_reads_tessfactors) {
      Temp layout = get_arg(&ctx, einfo->tcs_offchip_layout);
      Temp num_patches, patch_base;

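      /* The two drivers pack tcs_offchip_layout differently: GL stores
       * num_patches - 1 in bits [5:0] and patch_base in the high half, while
       * otherwise num_patches sits in a 6-bit field at bit 6 (s_bfe_u32 with
       * src1 = 0x60006, i.e. width 6, offset 6) and patch_base is a separate
       * argument.
       */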
      if (ctx.options->is_opengl) {
         num_patches = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), layout,
                                Operand::c32(0x3f));
         num_patches = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), num_patches,
                                Operand::c32(1));

         patch_base = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), layout,
                               Operand::c32(16));
      } else {
         num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), layout,
                                Operand::c32(0x60006));

         patch_base = get_arg(&ctx, einfo->patch_base);
      }

      Temp tess_ring_desc = get_tess_ring_descriptor(&ctx, einfo, false);
      Temp tess_ring_base = get_arg(&ctx, args->tess_offchip_offset);

      Temp sbase =
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), tess_ring_base, patch_base);

      Temp voffset =
         bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4), rel_patch_id);

      store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, outer, outer_comps, sbase, voffset,
                                     num_patches, tess_lvl_out_loc);

      if (inner_comps) {
         store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, inner, inner_comps, sbase, voffset,
                                        num_patches, tess_lvl_in_loc);
      }
   }

   begin_divergent_if_else(&ctx, &ic_invoc_0);
   end_divergent_if(&ctx, &ic_invoc_0);

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);

   bld.reset(ctx.block);
   bld.sopp(aco_opcode::s_endpgm);

   finish_program(&ctx);
}

void
select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
                 const struct aco_compiler_options* options, const struct aco_shader_info* info,
                 const struct ac_shader_args* args)
{
   const struct aco_ps_prolog_info* finfo = (const struct aco_ps_prolog_info*)pinfo;
   isel_context ctx =
      setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);

   ctx.block->fp_mode = program->next_fp_mode;

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

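   /* The prolog runs before the main fragment shader: emit polygon stippling,
    * fix up the interpolation and sample mask arguments, interpolate the
    * color inputs, and pass every argument through to the main part.
    */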
   if (finfo->poly_stipple)
      emit_polygon_stipple(&ctx, finfo);

   overwrite_interp_args(&ctx, finfo);

   overwrite_samplemask_arg(&ctx, finfo);

   std::vector<Operand> regs;
   passthrough_all_args(&ctx, regs);

   interpolate_color_args(&ctx, finfo, regs);

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);

   build_end_with_regs(&ctx, regs);

   /* Compute all end args in WQM mode if the main part requires it. */
   if (finfo->needs_wqm)
      set_wqm(&ctx, true);

   /* Setting needs_exact ensures we exit WQM mode at the end of the program. */
   program->needs_exact = true;

   finish_program(&ctx);
}

} // namespace aco