1 /*
2  * Copyright © 2018 Valve Corporation
3  * Copyright © 2018 Google
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  *
24  */
25 
26 #include "aco_instruction_selection.h"
27 
28 #include "aco_builder.h"
29 #include "aco_ir.h"
30 
31 #include "common/ac_exp_param.h"
32 #include "common/sid.h"
33 #include "vulkan/radv_descriptor_set.h"
34 
35 #include "util/fast_idiv_by_const.h"
36 #include "util/memstream.h"
37 
38 #include <array>
39 #include <functional>
40 #include <map>
41 #include <numeric>
42 #include <stack>
43 #include <utility>
44 #include <vector>
45 
46 namespace aco {
47 namespace {
48 
49 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
50 
51 static void
52 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
53           const char* msg)
54 {
55    char* out;
56    size_t outsize;
57    struct u_memstream mem;
58    u_memstream_open(&mem, &out, &outsize);
59    FILE* const memf = u_memstream_get(&mem);
60 
61    fprintf(memf, "%s: ", msg);
62    nir_print_instr(instr, memf);
63    u_memstream_close(&mem);
64 
65    _aco_err(ctx->program, file, line, out);
66    free(out);
67 }
68 
69 struct if_context {
70    Temp cond;
71 
72    bool divergent_old;
73    bool exec_potentially_empty_discard_old;
74    bool exec_potentially_empty_break_old;
75    uint16_t exec_potentially_empty_break_depth_old;
76 
77    unsigned BB_if_idx;
78    unsigned invert_idx;
79    bool uniform_has_then_branch;
80    bool then_branch_divergent;
81    Block BB_invert;
82    Block BB_endif;
83 };
84 
85 struct loop_context {
86    Block loop_exit;
87 
88    unsigned header_idx_old;
89    Block* exit_old;
90    bool divergent_cont_old;
91    bool divergent_branch_old;
92    bool divergent_if_old;
93 };
94 
95 static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);
96 
97 static void
98 add_logical_edge(unsigned pred_idx, Block* succ)
99 {
100    succ->logical_preds.emplace_back(pred_idx);
101 }
102 
103 static void
104 add_linear_edge(unsigned pred_idx, Block* succ)
105 {
106    succ->linear_preds.emplace_back(pred_idx);
107 }
108 
109 static void
110 add_edge(unsigned pred_idx, Block* succ)
111 {
112    add_logical_edge(pred_idx, succ);
113    add_linear_edge(pred_idx, succ);
114 }
115 
116 static void
117 append_logical_start(Block* b)
118 {
119    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
120 }
121 
122 static void
123 append_logical_end(Block* b)
124 {
125    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
126 }
127 
128 Temp
129 get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def)
130 {
131    uint32_t id = ctx->first_temp_id + def->index;
132    return Temp(id, ctx->program->temp_rc[id]);
133 }
134 
135 Temp
136 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
137 {
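   /* v_mbcnt_lo/hi effectively compute base + popcount(mask & ((1ull << lane) - 1)),
    * i.e. the number of enabled lanes below the current lane. In wave32 only the
    * low half of the mask exists, so a single v_mbcnt_lo is enough. */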
138    Builder bld(ctx->program, ctx->block);
139    assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
140    assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
141 
142    if (ctx->program->wave_size == 32) {
143       Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
144       return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
145    }
146 
147    Operand mask_lo = Operand::c32(-1u);
148    Operand mask_hi = Operand::c32(-1u);
149 
150    if (mask.isTemp()) {
151       RegClass rc = RegClass(mask.regClass().type(), 1);
152       Builder::Result mask_split =
153          bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
154       mask_lo = Operand(mask_split.def(0).getTemp());
155       mask_hi = Operand(mask_split.def(1).getTemp());
156    } else if (mask.physReg() == exec) {
157       mask_lo = Operand(exec_lo, s1);
158       mask_hi = Operand(exec_hi, s1);
159    }
160 
161    Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
162 
163    if (ctx->program->chip_class <= GFX7)
164       return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
165    else
166       return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
167 }
168 
169 Temp
170 emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
171 {
172    if (!dst.id())
173       dst = bld.tmp(src.regClass());
174 
175    assert(src.size() == dst.size());
176 
177    if (bld.program->stage != fragment_fs) {
178       if (!dst.id())
179          return src;
180 
181       bld.copy(Definition(dst), src);
182       return dst;
183    }
184 
185    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
186    bld.program->needs_wqm |= program_needs_wqm;
187    return dst;
188 }
189 
190 static Temp
191 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
192 {
193    if (index.regClass() == s1)
194       return bld.readlane(bld.def(s1), data, index);
195 
196    if (ctx->options->chip_class <= GFX7) {
197       /* GFX6-7: there is no bpermute instruction */
198       Operand index_op(index);
199       Operand input_data(data);
200       index_op.setLateKill(true);
201       input_data.setLateKill(true);
202 
203       return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),
204                         index_op, input_data);
205    } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
206 
207       /* GFX10 wave64 mode: emulate full-wave bpermute */
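      /* ds_bpermute_b32 can only read lanes within the executing 32-lane half on GFX10,
       * so the p_bpermute pseudo is later lowered to a sequence that uses shared VGPRs
       * to exchange data between the two halves of the wave. */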
208       Temp index_is_lo =
209          bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
210       Builder::Result index_is_lo_split =
211          bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
212       Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
213                                      index_is_lo_split.def(1).getTemp());
214       Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
215                                      index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
216       Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
217       Operand input_data(data);
218 
219       index_x4.setLateKill(true);
220       input_data.setLateKill(true);
221       same_half.setLateKill(true);
222 
223       /* We need one pair of shared VGPRs:
224        * Note that these have twice the allocation granularity of normal VGPRs. */
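      /* For example, with a VGPR allocation granule of 4 (illustrative value), this
       * reserves 8 shared VGPRs for the cross-half data exchange. */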
225       ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
226 
227       return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
228                         index_x4, input_data, same_half);
229    } else {
230       /* GFX8-9 or GFX10 wave32: bpermute works normally */
231       Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
232       return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
233    }
234 }
235 
236 static Temp
237 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
238 {
239    if (ctx->options->chip_class >= GFX8) {
240       unsigned and_mask = mask & 0x1f;
241       unsigned or_mask = (mask >> 5) & 0x1f;
242       unsigned xor_mask = (mask >> 10) & 0x1f;
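      /* The bitmasked ds_swizzle mode reads, within each group of 32 lanes, lane
       * ((lane & and_mask) | or_mask) ^ xor_mask; the cases below map a few of these
       * patterns onto cheaper DPP controls. */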
243 
244       uint16_t dpp_ctrl = 0xffff;
245 
246       // TODO: we could use DPP8 for some swizzles
247       if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
248          unsigned res[4] = {0, 1, 2, 3};
249          for (unsigned i = 0; i < 4; i++)
250             res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
251          dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
252       } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
253          dpp_ctrl = dpp_row_rr(8);
254       } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
255          dpp_ctrl = dpp_row_mirror;
256       } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
257          dpp_ctrl = dpp_row_half_mirror;
258       }
259 
260       if (dpp_ctrl != 0xffff)
261          return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
262    }
263 
264    return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
265 }
266 
267 Temp
268 as_vgpr(isel_context* ctx, Temp val)
269 {
270    if (val.type() == RegType::sgpr) {
271       Builder bld(ctx->program, ctx->block);
272       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
273    }
274    assert(val.type() == RegType::vgpr);
275    return val;
276 }
277 
278 // assumes a != 0xffffffff
279 void
280 emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
281 {
282    assert(b != 0);
283    Builder bld(ctx->program, ctx->block);
284 
285    if (util_is_power_of_two_or_zero(b)) {
286       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);
287       return;
288    }
289 
290    util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
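   /* Per util/fast_idiv_by_const.h, the quotient is roughly computed as
    * (((a >> pre_shift) + increment) * multiplier) >> 32 >> post_shift;
    * the "a != 0xffffffff" assumption keeps the increment from overflowing. */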
291 
292    assert(info.multiplier <= 0xffffffff);
293 
294    bool pre_shift = info.pre_shift != 0;
295    bool increment = info.increment != 0;
296    bool multiply = true;
297    bool post_shift = info.post_shift != 0;
298 
299    if (!pre_shift && !increment && !multiply && !post_shift) {
300       bld.copy(Definition(dst), a);
301       return;
302    }
303 
304    Temp pre_shift_dst = a;
305    if (pre_shift) {
306       pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
307       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),
308                a);
309    }
310 
311    Temp increment_dst = pre_shift_dst;
312    if (increment) {
313       increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
314       bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);
315    }
316 
317    Temp multiply_dst = increment_dst;
318    if (multiply) {
319       multiply_dst = post_shift ? bld.tmp(v1) : dst;
320       bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
321                bld.copy(bld.def(v1), Operand::c32(info.multiplier)));
322    }
323 
324    if (post_shift) {
325       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),
326                multiply_dst);
327    }
328 }
329 
330 void
331 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
332 {
333    Builder bld(ctx->program, ctx->block);
334    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
335 }
336 
337 Temp
338 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
339 {
340    /* no need to extract the whole vector */
341    if (src.regClass() == dst_rc) {
342       assert(idx == 0);
343       return src;
344    }
345 
346    assert(src.bytes() > (idx * dst_rc.bytes()));
347    Builder bld(ctx->program, ctx->block);
348    auto it = ctx->allocated_vec.find(src.id());
349    if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
350       if (it->second[idx].regClass() == dst_rc) {
351          return it->second[idx];
352       } else {
353          assert(!dst_rc.is_subdword());
354          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
355          return bld.copy(bld.def(dst_rc), it->second[idx]);
356       }
357    }
358 
359    if (dst_rc.is_subdword())
360       src = as_vgpr(ctx, src);
361 
362    if (src.bytes() == dst_rc.bytes()) {
363       assert(idx == 0);
364       return bld.copy(bld.def(dst_rc), src);
365    } else {
366       Temp dst = bld.tmp(dst_rc);
367       emit_extract_vector(ctx, src, idx, dst);
368       return dst;
369    }
370 }
371 
372 void
373 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
374 {
375    if (num_components == 1)
376       return;
377    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
378       return;
379    RegClass rc;
380    if (num_components > vec_src.size()) {
381       if (vec_src.type() == RegType::sgpr) {
382          /* should still help get_alu_src() */
383          emit_split_vector(ctx, vec_src, vec_src.size());
384          return;
385       }
386       /* sub-dword split */
387       rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
388    } else {
389       rc = RegClass(vec_src.type(), vec_src.size() / num_components);
390    }
391    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
392       aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
393    split->operands[0] = Operand(vec_src);
394    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
395    for (unsigned i = 0; i < num_components; i++) {
396       elems[i] = ctx->program->allocateTmp(rc);
397       split->definitions[i] = Definition(elems[i]);
398    }
399    ctx->block->instructions.emplace_back(std::move(split));
400    ctx->allocated_vec.emplace(vec_src.id(), elems);
401 }
402 
403 /* This vector expansion uses a mask to determine which elements in the new vector
404  * come from the original vector. The other elements are undefined. */
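/* For example, with num_components = 3 and mask = 0b101, dst.x and dst.z receive the
 * first two components of vec_src while dst.y is left undefined (the code below
 * happens to write zero there). */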
405 void
406 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
407 {
408    emit_split_vector(ctx, vec_src, util_bitcount(mask));
409 
410    if (vec_src == dst)
411       return;
412 
413    Builder bld(ctx->program, ctx->block);
414    if (num_components == 1) {
415       if (dst.type() == RegType::sgpr)
416          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
417       else
418          bld.copy(Definition(dst), vec_src);
419       return;
420    }
421 
422    unsigned component_size = dst.size() / num_components;
423    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
424 
425    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
426       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
427    vec->definitions[0] = Definition(dst);
428    unsigned k = 0;
429    for (unsigned i = 0; i < num_components; i++) {
430       if (mask & (1 << i)) {
431          Temp src =
432             emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
433          if (dst.type() == RegType::sgpr)
434             src = bld.as_uniform(src);
435          vec->operands[i] = Operand(src);
436       } else {
437          vec->operands[i] = Operand::zero(component_size == 2 ? 8 : 4);
438       }
439       elems[i] = vec->operands[i].getTemp();
440    }
441    ctx->block->instructions.emplace_back(std::move(vec));
442    ctx->allocated_vec.emplace(dst.id(), elems);
443 }
444 
445 /* adjust misaligned small bit size loads */
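/* Given a dword-aligned scalar value and a byte offset of 0-3, the value is shifted
 * right by 8 * offset so that the requested bytes land at bit 0 of dst. */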
446 void
447 byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
448 {
449    Builder bld(ctx->program, ctx->block);
450    Operand shift;
451    Temp select = Temp();
452    if (offset.isConstant()) {
453       assert(offset.constantValue() && offset.constantValue() < 4);
454       shift = Operand::c32(offset.constantValue() * 8);
455    } else {
456       /* bit_offset = 8 * (offset & 0x3) */
457       Temp tmp =
458          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
459       select = bld.tmp(s1);
460       shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
461                        Operand::c32(3u));
462    }
463 
464    if (vec.size() == 1) {
465       bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
466    } else if (vec.size() == 2) {
467       Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
468       bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
469       if (tmp == dst)
470          emit_split_vector(ctx, dst, 2);
471       else
472          emit_extract_vector(ctx, tmp, 0, dst);
473    } else if (vec.size() == 3 || vec.size() == 4) {
474       Temp lo = bld.tmp(s2), hi;
475       if (vec.size() == 3) {
476          /* this can happen if we use VMEM for a uniform load */
477          hi = bld.tmp(s1);
478          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
479       } else {
480          hi = bld.tmp(s2);
481          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
482          hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
483       }
484       if (select != Temp())
485          hi =
486             bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
487       lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
488       Temp mid = bld.tmp(s1);
489       lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
490       hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
491       mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
492       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
493       emit_split_vector(ctx, dst, 2);
494    }
495 }
496 
497 void
498 byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
499 {
500    Builder bld(ctx->program, ctx->block);
501    if (offset.isTemp()) {
502       Temp tmp[4] = {vec, vec, vec, vec};
503 
504       if (vec.size() == 4) {
505          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
506          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
507                     Definition(tmp[2]), Definition(tmp[3]), vec);
508       } else if (vec.size() == 3) {
509          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
510          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
511                     Definition(tmp[2]), vec);
512       } else if (vec.size() == 2) {
513          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
514          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
515       }
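      /* v_alignbyte_b32 returns the low dword of ((tmp[i+1]:tmp[i]) >> (8 * offset)),
       * so each iteration gathers dst dword i from the byte-shifted pair of dwords. */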
516       for (unsigned i = 0; i < dst.size(); i++)
517          tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
518 
519       vec = tmp[0];
520       if (dst.size() == 2)
521          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
522 
523       offset = Operand::zero();
524    }
525 
526    unsigned num_components = vec.bytes() / component_size;
527    if (vec.regClass() == dst.regClass()) {
528       assert(offset.constantValue() == 0);
529       bld.copy(Definition(dst), vec);
530       emit_split_vector(ctx, dst, num_components);
531       return;
532    }
533 
534    emit_split_vector(ctx, vec, num_components);
535    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
536    RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
537 
538    assert(offset.constantValue() % component_size == 0);
539    unsigned skip = offset.constantValue() / component_size;
540    for (unsigned i = skip; i < num_components; i++)
541       elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
542 
543    if (dst.type() == RegType::vgpr) {
544       /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
545       num_components = dst.bytes() / component_size;
546       aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
547          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
548       for (unsigned i = 0; i < num_components; i++)
549          create_vec->operands[i] = Operand(elems[i]);
550       create_vec->definitions[0] = Definition(dst);
551       bld.insert(std::move(create_vec));
552 
553    } else if (skip) {
554       /* if dst is sgpr - split the src, but move the original to sgpr. */
555       vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
556       byte_align_scalar(ctx, vec, offset, dst);
557    } else {
558       assert(dst.size() == vec.size());
559       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
560    }
561 
562    ctx->allocated_vec.emplace(dst.id(), elems);
563 }
564 
565 Temp
566 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
567 {
568    Builder bld(ctx->program, ctx->block);
569    if (!dst.id())
570       dst = bld.tmp(bld.lm);
571 
572    assert(val.regClass() == s1);
573    assert(dst.regClass() == bld.lm);
574 
575    return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
576                    bld.scc(val));
577 }
578 
579 Temp
580 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
581 {
582    Builder bld(ctx->program, ctx->block);
583    if (!dst.id())
584       dst = bld.tmp(s1);
585 
586    assert(val.regClass() == bld.lm);
587    assert(dst.regClass() == s1);
588 
589    /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
590    Temp tmp = bld.tmp(s1);
591    bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
592    return emit_wqm(bld, tmp, dst);
593 }
594 
595 /**
596  * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
597  * src_bits and dst_bits are truncated.
598  *
599  * Sign extension may be applied using the sign_extend parameter. The position of the input sign
600  * bit is indicated by src_bits in this case.
601  *
602  * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
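 *
 * For example (illustrative call), convert_int(ctx, bld, v, 8, 32, true) sign-extends
 * the low 8 bits of v into a new 32-bit temporary.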
603  */
604 Temp
605 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
606             bool sign_extend, Temp dst = Temp())
607 {
608    assert(!(sign_extend && dst_bits < src_bits) &&
609           "Shrinking integers is not supported for signed inputs");
610 
611    if (!dst.id()) {
612       if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
613          dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
614       else
615          dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
616    }
617 
618    assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
619    assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
620 
621    if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
622       /* Copy the raw value, leaving an undefined value in the upper bits for
623        * the caller to handle appropriately */
624       return bld.copy(Definition(dst), src);
625    } else if (dst.bytes() < src.bytes()) {
626       return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
627    }
628 
629    Temp tmp = dst;
630    if (dst_bits == 64)
631       tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
632 
633    if (tmp == src) {
634    } else if (src.regClass() == s1) {
635       assert(src_bits < 32);
636       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
637                  Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
638    } else {
639       assert(src_bits < 32);
640       bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), Operand::c32(src_bits),
641                  Operand::c32((unsigned)sign_extend));
642    }
643 
644    if (dst_bits == 64) {
645       if (sign_extend && dst.regClass() == s2) {
646          Temp high =
647             bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
648          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
649       } else if (sign_extend && dst.regClass() == v2) {
650          Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
651          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
652       } else {
653          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
654       }
655    }
656 
657    return dst;
658 }
659 
660 enum sgpr_extract_mode {
661    sgpr_extract_sext,
662    sgpr_extract_zext,
663    sgpr_extract_undef,
664 };
665 
666 Temp
667 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
668 {
669    Temp vec = get_ssa_temp(ctx, src->src.ssa);
670    unsigned src_size = src->src.ssa->bit_size;
671    unsigned swizzle = src->swizzle[0];
672 
673    if (vec.size() > 1) {
674       assert(src_size == 16);
675       vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
676       swizzle = swizzle & 1;
677    }
678 
679    Builder bld(ctx->program, ctx->block);
680    Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
681 
682    if (mode == sgpr_extract_undef && swizzle == 0)
683       bld.copy(Definition(tmp), vec);
684    else
685       bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
686                  Operand::c32(swizzle), Operand::c32(src_size),
687                  Operand::c32((mode == sgpr_extract_sext)));
688 
689    if (dst.regClass() == s2)
690       convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
691 
692    return dst;
693 }
694 
695 Temp
696 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
697 {
698    if (src.src.ssa->num_components == 1 && size == 1)
699       return get_ssa_temp(ctx, src.src.ssa);
700 
701    Temp vec = get_ssa_temp(ctx, src.src.ssa);
702    unsigned elem_size = src.src.ssa->bit_size / 8u;
703    bool identity_swizzle = true;
704 
705    for (unsigned i = 0; identity_swizzle && i < size; i++) {
706       if (src.swizzle[i] != i)
707          identity_swizzle = false;
708    }
709    if (identity_swizzle)
710       return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
711 
712    assert(elem_size > 0);
713    assert(vec.bytes() % elem_size == 0);
714 
715    if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
716       assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
717       return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
718                                            sgpr_extract_undef);
719    }
720 
721    bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
722    if (as_uniform)
723       vec = as_vgpr(ctx, vec);
724 
725    RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
726                                     : RegClass(vec.type(), elem_size / 4);
727    if (size == 1) {
728       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
729    } else {
730       assert(size <= 4);
731       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
732       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
733          aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
734       for (unsigned i = 0; i < size; ++i) {
735          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
736          vec_instr->operands[i] = Operand{elems[i]};
737       }
738       Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
739       vec_instr->definitions[0] = Definition(dst);
740       ctx->block->instructions.emplace_back(std::move(vec_instr));
741       ctx->allocated_vec.emplace(dst.id(), elems);
742       return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
743    }
744 }
745 
746 Temp
747 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
748 {
749    /* returns v2b or v1 for vop3p usage.
750     * The source expects exactly 2 16bit components
751     * which are within the same dword
752     */
753    assert(src.src.ssa->bit_size == 16);
754    assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
755 
756    Temp tmp = get_ssa_temp(ctx, src.src.ssa);
757    if (tmp.size() == 1)
758       return tmp;
759 
760    /* the size is larger than 1 dword: check the swizzle */
761    unsigned dword = src.swizzle[0] >> 1;
762 
763    /* extract a full dword if possible */
764    if (tmp.bytes() >= (dword + 1) * 4) {
765       return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));
766    } else {
767       /* This must be a swizzled access to %a.zz where %a is v6b */
768       assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
769       assert(tmp.regClass() == v6b && dword == 1);
770       return emit_extract_vector(ctx, tmp, dword * 2, v2b);
771    }
772 }
773 
774 uint32_t
775 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
776 {
777    nir_ssa_scalar scalar =
778       nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
779    return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
780 }
781 
782 Temp
783 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
784 {
785    if (ptr.size() == 2)
786       return ptr;
787    Builder bld(ctx->program, ctx->block);
788    if (ptr.type() == RegType::vgpr && !non_uniform)
789       ptr = bld.as_uniform(ptr);
790    return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
791                      Operand::c32((unsigned)ctx->options->address32_hi));
792 }
793 
794 void
795 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
796                       bool writes_scc, uint8_t uses_ub = 0)
797 {
798    aco_ptr<SOP2_instruction> sop2{
799       create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
800    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
801    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
802    sop2->definitions[0] = Definition(dst);
803    if (instr->no_unsigned_wrap)
804       sop2->definitions[0].setNUW(true);
805    if (writes_scc)
806       sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);
807 
808    for (int i = 0; i < 2; i++) {
809       if (uses_ub & (1 << i)) {
810          uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
811          if (src_ub <= 0xffff)
812             sop2->operands[i].set16bit(true);
813          else if (src_ub <= 0xffffff)
814             sop2->operands[i].set24bit(true);
815       }
816    }
817 
818    ctx->block->instructions.emplace_back(std::move(sop2));
819 }
820 
821 void
822 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
823                       bool commutative, bool swap_srcs = false, bool flush_denorms = false,
824                       bool nuw = false, uint8_t uses_ub = 0)
825 {
826    Builder bld(ctx->program, ctx->block);
827    bld.is_precise = instr->exact;
828 
829    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
830    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
831    if (src1.type() == RegType::sgpr) {
832       if (commutative && src0.type() == RegType::vgpr) {
833          Temp t = src0;
834          src0 = src1;
835          src1 = t;
836       } else {
837          src1 = as_vgpr(ctx, src1);
838       }
839    }
840 
841    Operand op[2] = {Operand(src0), Operand(src1)};
842 
843    for (int i = 0; i < 2; i++) {
844       if (uses_ub & (1 << i)) {
845          uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
846          if (src_ub <= 0xffff)
847             op[i].set16bit(true);
848          else if (src_ub <= 0xffffff)
849             op[i].set24bit(true);
850       }
851    }
852 
853    if (flush_denorms && ctx->program->chip_class < GFX9) {
854       assert(dst.size() == 1);
855       Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]);
856       bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
857    } else {
858       if (nuw) {
859          bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
860       } else {
861          bld.vop2(opc, Definition(dst), op[0], op[1]);
862       }
863    }
864 }
865 
866 void
867 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
868 {
869    Builder bld(ctx->program, ctx->block);
870    bld.is_precise = instr->exact;
871 
872    Temp src0 = get_alu_src(ctx, instr->src[0]);
873    Temp src1 = get_alu_src(ctx, instr->src[1]);
874 
875    if (src1.type() == RegType::sgpr) {
876       assert(src0.type() == RegType::vgpr);
877       std::swap(src0, src1);
878    }
879 
880    Temp src00 = bld.tmp(src0.type(), 1);
881    Temp src01 = bld.tmp(src0.type(), 1);
882    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
883    Temp src10 = bld.tmp(v1);
884    Temp src11 = bld.tmp(v1);
885    bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
886    Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
887    Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
888    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
889 }
890 
891 void
892 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
893                        bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
894 {
895    assert(num_sources == 2 || num_sources == 3);
896    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
897    bool has_sgpr = false;
898    for (unsigned i = 0; i < num_sources; i++) {
899       src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
900       if (has_sgpr)
901          src[i] = as_vgpr(ctx, src[i]);
902       else
903          has_sgpr = src[i].type() == RegType::sgpr;
904    }
905 
906    Builder bld(ctx->program, ctx->block);
907    bld.is_precise = instr->exact;
908    if (flush_denorms && ctx->program->chip_class < GFX9) {
909       Temp tmp;
910       if (num_sources == 3)
911          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
912       else
913          tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
914       if (dst.size() == 1)
915          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
916       else
917          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
918    } else if (num_sources == 3) {
919       bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
920    } else {
921       bld.vop3(op, Definition(dst), src[0], src[1]);
922    }
923 }
924 
925 Builder::Result
926 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
927                        bool swap_srcs = false)
928 {
929    Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
930    Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
931    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
932       src1 = as_vgpr(ctx, src1);
933    assert(instr->dest.dest.ssa.num_components == 2);
934 
935    /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
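   /* Bit 0 of opsel_lo/_hi selects the high 16-bit half of the first source, bit 1 that
    * of the second source; e.g. a .yy swizzle on src0 sets bit 0 in both masks. */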
936    unsigned opsel_lo =
937       (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
938    unsigned opsel_hi =
939       (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
940 
941    Builder bld(ctx->program, ctx->block);
942    bld.is_precise = instr->exact;
943    Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
944    emit_split_vector(ctx, dst, 2);
945    return res;
946 }
947 
948 void
949 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp)
950 {
951    Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
952    bool has_sgpr = false;
953    for (unsigned i = 0; i < 3; i++) {
954       src[i] = get_alu_src(ctx, instr->src[i]);
955       if (has_sgpr)
956          src[i] = as_vgpr(ctx, src[i]);
957       else
958          has_sgpr = src[i].type() == RegType::sgpr;
959    }
960 
961    Builder bld(ctx->program, ctx->block);
962    bld.is_precise = instr->exact;
963    bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p().clamp = clamp;
964 }
965 
966 void
967 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
968 {
969    Builder bld(ctx->program, ctx->block);
970    bld.is_precise = instr->exact;
971    if (dst.type() == RegType::sgpr)
972       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
973                  bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
974    else
975       bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
976 }
977 
978 void
979 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
980 {
981    Temp src0 = get_alu_src(ctx, instr->src[0]);
982    Temp src1 = get_alu_src(ctx, instr->src[1]);
983    assert(src0.size() == src1.size());
984 
985    aco_ptr<Instruction> vopc;
986    if (src1.type() == RegType::sgpr) {
987       if (src0.type() == RegType::vgpr) {
988          /* to swap the operands, we might also have to change the opcode */
989          switch (op) {
990          case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
991          case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
992          case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
993          case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
994          case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
995          case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
996          case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
997          case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
998          case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
999          case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
1000          case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
1001          case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
1002          case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
1003          case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
1004          case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
1005          case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
1006          case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
1007          case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
1008          default: /* eq and ne are commutative */ break;
1009          }
1010          Temp t = src0;
1011          src0 = src1;
1012          src1 = t;
1013       } else {
1014          src1 = as_vgpr(ctx, src1);
1015       }
1016    }
1017 
1018    Builder bld(ctx->program, ctx->block);
1019    bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
1020 }
1021 
1022 void
1023 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1024 {
1025    Temp src0 = get_alu_src(ctx, instr->src[0]);
1026    Temp src1 = get_alu_src(ctx, instr->src[1]);
1027    Builder bld(ctx->program, ctx->block);
1028 
1029    assert(dst.regClass() == bld.lm);
1030    assert(src0.type() == RegType::sgpr);
1031    assert(src1.type() == RegType::sgpr);
1032    assert(src0.regClass() == src1.regClass());
1033 
1034    /* Emit the SALU comparison instruction */
1035    Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
1036    /* Turn the result into a per-lane bool */
1037    bool_to_vector_condition(ctx, cmp, dst);
1038 }
1039 
1040 void
1041 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1042                 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
1043                 aco_opcode s64_op = aco_opcode::num_opcodes)
1044 {
1045    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64   ? s64_op
1046                      : instr->src[0].src.ssa->bit_size == 32 ? s32_op
1047                                                              : aco_opcode::num_opcodes;
1048    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64   ? v64_op
1049                      : instr->src[0].src.ssa->bit_size == 32 ? v32_op
1050                                                              : v16_op;
1051    bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) ||
1052                    get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1053                    get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1054    aco_opcode op = use_valu ? v_op : s_op;
1055    assert(op != aco_opcode::num_opcodes);
1056    assert(dst.regClass() == ctx->program->lane_mask);
1057 
1058    if (use_valu)
1059       emit_vopc_instruction(ctx, instr, op, dst);
1060    else
1061       emit_sopc_instruction(ctx, instr, op, dst);
1062 }
1063 
1064 void
1065 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1066                    Temp dst)
1067 {
1068    Builder bld(ctx->program, ctx->block);
1069    Temp src0 = get_alu_src(ctx, instr->src[0]);
1070    Temp src1 = get_alu_src(ctx, instr->src[1]);
1071 
1072    assert(dst.regClass() == bld.lm);
1073    assert(src0.regClass() == bld.lm);
1074    assert(src1.regClass() == bld.lm);
1075 
1076    bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
1077 }
1078 
1079 void
1080 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1081 {
1082    Builder bld(ctx->program, ctx->block);
1083    Temp cond = get_alu_src(ctx, instr->src[0]);
1084    Temp then = get_alu_src(ctx, instr->src[1]);
1085    Temp els = get_alu_src(ctx, instr->src[2]);
1086 
1087    assert(cond.regClass() == bld.lm);
1088 
1089    if (dst.type() == RegType::vgpr) {
1090       aco_ptr<Instruction> bcsel;
1091       if (dst.size() == 1) {
1092          then = as_vgpr(ctx, then);
1093          els = as_vgpr(ctx, els);
1094 
1095          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
1096       } else if (dst.size() == 2) {
1097          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1098          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
1099          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1100          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
1101 
1102          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
1103          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
1104 
1105          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1106       } else {
1107          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1108       }
1109       return;
1110    }
1111 
1112    if (instr->dest.dest.ssa.bit_size == 1) {
1113       assert(dst.regClass() == bld.lm);
1114       assert(then.regClass() == bld.lm);
1115       assert(els.regClass() == bld.lm);
1116    }
1117 
1118    if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
1119       if (dst.regClass() == s1 || dst.regClass() == s2) {
1120          assert((then.regClass() == s1 || then.regClass() == s2) &&
1121                 els.regClass() == then.regClass());
1122          assert(dst.size() == then.size());
1123          aco_opcode op =
1124             dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
1125          bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1126       } else {
1127          isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
1128       }
1129       return;
1130    }
1131 
1132    /* divergent boolean bcsel
1133     * this implements bcsel on bools: dst = s0 ? s1 : s2,
1134     * which is computed as: dst = (s0 & s1) | (~s0 & s2) */
1135    assert(instr->dest.dest.ssa.bit_size == 1);
1136 
1137    if (cond.id() != then.id())
1138       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1139 
1140    if (cond.id() == els.id())
1141       bld.copy(Definition(dst), then);
1142    else
1143       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1144                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1145 }
1146 
1147 void
1148 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
1149                uint32_t undo)
1150 {
1151    /* multiply by 16777216 to handle denormals */
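   /* 0x4b800000 is 2^24; the class mask (1 << 4) | (1 << 7) matches negative and
    * positive denormals, so denormal inputs are scaled into the normal range, the
    * operation is applied, and the result is rescaled by 'undo'. */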
1152    Temp is_denormal =
1153       bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val),
1154                bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4))));
1155    Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
1156    scaled = bld.vop1(op, bld.def(v1), scaled);
1157    scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);
1158 
1159    Temp not_scaled = bld.vop1(op, bld.def(v1), val);
1160 
1161    bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
1162 }
1163 
1164 void
1165 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1166 {
1167    if (ctx->block->fp_mode.denorm32 == 0) {
1168       bld.vop1(aco_opcode::v_rcp_f32, dst, val);
1169       return;
1170    }
1171 
1172    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1173 }
1174 
1175 void
1176 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1177 {
1178    if (ctx->block->fp_mode.denorm32 == 0) {
1179       bld.vop1(aco_opcode::v_rsq_f32, dst, val);
1180       return;
1181    }
1182 
1183    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1184 }
1185 
1186 void
1187 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1188 {
1189    if (ctx->block->fp_mode.denorm32 == 0) {
1190       bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
1191       return;
1192    }
1193 
1194    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1195 }
1196 
1197 void
1198 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1199 {
1200    if (ctx->block->fp_mode.denorm32 == 0) {
1201       bld.vop1(aco_opcode::v_log_f32, dst, val);
1202       return;
1203    }
1204 
1205    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
1206 }
1207 
1208 Temp
1209 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1210 {
1211    if (ctx->options->chip_class >= GFX7)
1212       return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1213 
1214    /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1215    /* TODO: create more efficient code! */
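   /* Outline: compute the unbiased exponent e; if e < 0 the result is +/-0 (sign kept),
    * if e > 51 the value is already integral, otherwise the low 52 - e fraction bits
    * are masked off. */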
1216    if (val.type() == RegType::sgpr)
1217       val = as_vgpr(ctx, val);
1218 
1219    /* Split the input value. */
1220    Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1221    bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1222 
1223    /* Extract the exponent and compute the unbiased value. */
1224    Temp exponent =
1225       bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1226    exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1227 
1228    /* Extract the fractional part. */
1229    Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1230                                 Operand::c32(0x000fffffu));
1231    fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1232 
1233    Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1234    bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1235               fract_mask);
1236 
1237    Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1238    Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1239    fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1240    tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1241    fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1242 
1243    /* Get the sign bit. */
1244    Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1245 
1246    /* Decide the operation to apply depending on the unbiased exponent. */
1247    Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent,
1248                                Operand::zero());
1249    Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1250                           bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1251    Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1252    Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1253    dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1254    dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1255 
1256    return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1257 }
1258 
1259 Temp
1260 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1261 {
1262    if (ctx->options->chip_class >= GFX7)
1263       return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1264 
1265    /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1266     * lowered at NIR level for precision reasons). */
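   /* Roughly: floor(x) = x - min(fract(x), largest double below 1.0), with the
    * subtrahend replaced by x itself for NaN inputs so that NaN propagates. */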
1267    Temp src0 = as_vgpr(ctx, val);
1268 
1269    Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */
1270    Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1271                              Operand::c32(0x3fefffffu));
1272 
1273    Temp isnan =
1274       bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
1275    Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1276    Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1277 
1278    Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1279    bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1280    Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1281    bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1282 
1283    Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1284    Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1285 
1286    Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1287 
1288    Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1289    add->vop3().neg[1] = true;
1290 
1291    return add->definitions[0].getTemp();
1292 }
1293 
1294 Temp
1295 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1296 {
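   /* Before GFX8 there is no clamping VALU add, so the carry-out selects 0xffffffff via
    * v_cndmask; on newer chips the VOP3 clamp bit saturates the addition directly. */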
1297    if (bld.program->chip_class < GFX8) {
1298       Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1299       return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1300                           add.def(1).getTemp());
1301    }
1302 
1303    Builder::Result add(NULL);
1304    if (bld.program->chip_class >= GFX9) {
1305       add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1306    } else {
1307       add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1);
1308    }
1309    add.instr->vop3().clamp = 1;
1310    return dst.getTemp();
1311 }
1312 
1313 void
1314 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1315 {
1316    if (!instr->dest.dest.is_ssa) {
1317       isel_err(&instr->instr, "nir alu dst not in ssa");
1318       abort();
1319    }
1320    Builder bld(ctx->program, ctx->block);
1321    bld.is_precise = instr->exact;
1322    Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1323    switch (instr->op) {
1324    case nir_op_vec2:
1325    case nir_op_vec3:
1326    case nir_op_vec4:
1327    case nir_op_vec5: {
1328       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1329       unsigned num = instr->dest.dest.ssa.num_components;
1330       for (unsigned i = 0; i < num; ++i)
1331          elems[i] = get_alu_src(ctx, instr->src[i]);
1332 
1333       if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1334          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1335             aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1336          RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1337          for (unsigned i = 0; i < num; ++i) {
1338             if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1339                elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1340             vec->operands[i] = Operand{elems[i]};
1341          }
1342          vec->definitions[0] = Definition(dst);
1343          ctx->block->instructions.emplace_back(std::move(vec));
1344          ctx->allocated_vec.emplace(dst.id(), elems);
1345       } else {
1346          bool use_s_pack = ctx->program->chip_class >= GFX9;
1347          Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1));
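         /* Sub-dword SGPR vectors: mask each element to its bit size, shift it into place and
          * OR the pieces into 32-bit words (or 16-bit halves combined with s_pack_ll_b32_b16 on
          * GFX9+); constant components are accumulated in const_vals and merged at the end. */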
1348 
1349          std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1350          uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1351          for (unsigned i = 0; i < num; i++) {
1352             unsigned packed_size = use_s_pack ? 16 : 32;
1353             unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size;
1354             unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size;
1355             if (nir_src_is_const(instr->src[i].src)) {
1356                const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1357                continue;
1358             }
1359 
1360             if (offset != packed_size - instr->dest.dest.ssa.bit_size)
1361                elems[i] =
1362                   bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1363 
1364             if (offset)
1365                elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1366                                    Operand::c32(offset));
1367 
1368             if (packed[idx].id())
1369                packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1370                                       packed[idx]);
1371             else
1372                packed[idx] = elems[i];
1373          }
1374 
1375          if (use_s_pack) {
1376             for (unsigned i = 0; i < dst.size(); i++) {
1377                bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1378 
1379                if (packed[i * 2].id() && packed[i * 2 + 1].id())
1380                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1381                                        packed[i * 2 + 1]);
1382                else if (packed[i * 2 + 1].id())
1383                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1384                                        Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1385                else if (packed[i * 2].id())
1386                   packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1387                                        Operand::c32(const_vals[i * 2 + 1]));
1388 
1389                if (same)
1390                   const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1391                else
1392                   const_vals[i] = 0;
1393             }
1394          }
1395 
1396          for (unsigned i = 0; i < dst.size(); i++) {
1397             if (const_vals[i] && packed[i].id())
1398                packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1399                                     Operand::c32(const_vals[i]), packed[i]);
1400             else if (!packed[i].id())
1401                packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1402          }
1403 
1404          if (dst.size() == 1)
1405             bld.copy(Definition(dst), packed[0]);
1406          else if (dst.size() == 2)
1407             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]);
1408          else
1409             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1],
1410                        packed[2]);
1411       }
1412       break;
1413    }
1414    case nir_op_mov: {
1415       Temp src = get_alu_src(ctx, instr->src[0]);
1416       if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1417          /* use size() instead of bytes() for 8/16-bit */
1418          assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1419          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1420       } else {
1421          assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1422          bld.copy(Definition(dst), src);
1423       }
1424       break;
1425    }
1426    case nir_op_inot: {
1427       Temp src = get_alu_src(ctx, instr->src[0]);
1428       if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1429          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1430       } else if (dst.regClass() == v2) {
1431          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1432          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1433          lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1434          hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1435          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1436       } else if (dst.type() == RegType::sgpr) {
1437          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1438          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1439       } else {
1440          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1441       }
1442       break;
1443    }
1444    case nir_op_iabs: {
1445       Temp src = get_alu_src(ctx, instr->src[0]);
1446       if (dst.regClass() == s1) {
1447          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1448       } else if (dst.regClass() == v1) {
1449          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1450                   bld.vsub32(bld.def(v1), Operand::zero(), src));
1451       } else {
1452          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1453       }
1454       break;
1455    }
1456    case nir_op_isign: {
1457       Temp src = get_alu_src(ctx, instr->src[0]);
1458       if (dst.regClass() == s1) {
1459          Temp tmp =
1460             bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1461          bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1462       } else if (dst.regClass() == s2) {
1463          Temp neg =
1464             bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1465          Temp neqz;
1466          if (ctx->program->chip_class >= GFX8)
1467             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1468          else
1469             neqz =
1470                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1471                   .def(1)
1472                   .getTemp();
1473          /* SCC gets zero-extended to 64 bit */
1474          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1475       } else if (dst.regClass() == v1) {
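         /* med3(-1, x, 1) clamps x to [-1, 1], which is exactly sign(x) for integers. */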
1476          bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1477       } else if (dst.regClass() == v2) {
1478          Temp upper = emit_extract_vector(ctx, src, 1, v1);
1479          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1480          Temp gtz =
1481             bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
1482          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1483          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1484          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1485       } else {
1486          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1487       }
1488       break;
1489    }
1490    case nir_op_imax: {
1491       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1492          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1493       } else if (dst.regClass() == v2b) {
1494          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1495       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1496          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1497       } else if (dst.regClass() == v1) {
1498          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1499       } else if (dst.regClass() == s1) {
1500          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1501       } else {
1502          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1503       }
1504       break;
1505    }
1506    case nir_op_umax: {
1507       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1508          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1509       } else if (dst.regClass() == v2b) {
1510          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1511       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1512          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1513       } else if (dst.regClass() == v1) {
1514          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1515       } else if (dst.regClass() == s1) {
1516          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1517       } else {
1518          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1519       }
1520       break;
1521    }
1522    case nir_op_imin: {
1523       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1524          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1525       } else if (dst.regClass() == v2b) {
1526          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1527       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1528          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1529       } else if (dst.regClass() == v1) {
1530          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1531       } else if (dst.regClass() == s1) {
1532          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1533       } else {
1534          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1535       }
1536       break;
1537    }
1538    case nir_op_umin: {
1539       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1540          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1541       } else if (dst.regClass() == v2b) {
1542          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1543       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1544          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1545       } else if (dst.regClass() == v1) {
1546          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1547       } else if (dst.regClass() == s1) {
1548          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1549       } else {
1550          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1551       }
1552       break;
1553    }
1554    case nir_op_ior: {
1555       if (instr->dest.dest.ssa.bit_size == 1) {
1556          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1557       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1558          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1559       } else if (dst.regClass() == v2) {
1560          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1561       } else if (dst.regClass() == s1) {
1562          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1563       } else if (dst.regClass() == s2) {
1564          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1565       } else {
1566          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1567       }
1568       break;
1569    }
1570    case nir_op_iand: {
1571       if (instr->dest.dest.ssa.bit_size == 1) {
1572          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1573       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1574          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1575       } else if (dst.regClass() == v2) {
1576          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1577       } else if (dst.regClass() == s1) {
1578          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1579       } else if (dst.regClass() == s2) {
1580          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1581       } else {
1582          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1583       }
1584       break;
1585    }
1586    case nir_op_ixor: {
1587       if (instr->dest.dest.ssa.bit_size == 1) {
1588          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1589       } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1590          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1591       } else if (dst.regClass() == v2) {
1592          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1593       } else if (dst.regClass() == s1) {
1594          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1595       } else if (dst.regClass() == s2) {
1596          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1597       } else {
1598          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1599       }
1600       break;
1601    }
1602    case nir_op_ushr: {
1603       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1604          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1605       } else if (dst.regClass() == v2b) {
1606          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1607       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1608          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1609       } else if (dst.regClass() == v1) {
1610          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1611       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1612          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1613                   get_alu_src(ctx, instr->src[0]));
1614       } else if (dst.regClass() == v2) {
1615          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1616       } else if (dst.regClass() == s2) {
1617          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1618       } else if (dst.regClass() == s1) {
1619          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1620       } else {
1621          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1622       }
1623       break;
1624    }
1625    case nir_op_ishl: {
1626       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1627          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1628       } else if (dst.regClass() == v2b) {
1629          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1630       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1631          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1632       } else if (dst.regClass() == v1) {
1633          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1634                                false, 2);
1635       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1636          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1637                   get_alu_src(ctx, instr->src[0]));
1638       } else if (dst.regClass() == v2) {
1639          emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1640       } else if (dst.regClass() == s1) {
1641          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1642       } else if (dst.regClass() == s2) {
1643          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1644       } else {
1645          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1646       }
1647       break;
1648    }
1649    case nir_op_ishr: {
1650       if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1651          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1652       } else if (dst.regClass() == v2b) {
1653          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1654       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1655          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1656       } else if (dst.regClass() == v1) {
1657          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1658       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1659          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1660                   get_alu_src(ctx, instr->src[0]));
1661       } else if (dst.regClass() == v2) {
1662          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1663       } else if (dst.regClass() == s1) {
1664          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1665       } else if (dst.regClass() == s2) {
1666          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1667       } else {
1668          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1669       }
1670       break;
1671    }
1672    case nir_op_find_lsb: {
1673       Temp src = get_alu_src(ctx, instr->src[0]);
1674       if (src.regClass() == s1) {
1675          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1676       } else if (src.regClass() == v1) {
1677          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1678       } else if (src.regClass() == s2) {
1679          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1680       } else {
1681          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1682       }
1683       break;
1684    }
1685    case nir_op_ufind_msb:
1686    case nir_op_ifind_msb: {
1687       Temp src = get_alu_src(ctx, instr->src[0]);
1688       if (src.regClass() == s1 || src.regClass() == s2) {
1689          aco_opcode op = src.regClass() == s2
1690                             ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1691                                                              : aco_opcode::s_flbit_i32_i64)
1692                             : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1693                                                              : aco_opcode::s_flbit_i32);
1694          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
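         /* s_flbit counts how far the first set (or, for the signed variant, first non-sign)
          * bit is from the MSB and returns -1 if there is none. Subtracting from bits-1
          * converts this to the LSB-relative index NIR expects; the borrow from that
          * subtraction selects -1 below when nothing was found. */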
1695 
1696          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1697                                         Operand::c32(src.size() * 32u - 1u), msb_rev);
1698          Temp msb = sub.def(0).getTemp();
1699          Temp carry = sub.def(1).getTemp();
1700 
1701          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1702                   bld.scc(carry));
1703       } else if (src.regClass() == v1) {
1704          aco_opcode op =
1705             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1706          Temp msb_rev = bld.tmp(v1);
1707          emit_vop1_instruction(ctx, instr, op, msb_rev);
1708          Temp msb = bld.tmp(v1);
1709          Temp carry =
1710             bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1711          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1712       } else if (src.regClass() == v2) {
1713          aco_opcode op =
1714             instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1715 
1716          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1717          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1718 
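         /* For 64-bit sources each half is scanned separately: if the high dword has a set
          * bit its count is used, otherwise the low dword's count plus 32 (saturating, so an
          * all-zero low half stays -1); the combined MSB-relative count is then converted to
          * an LSB-relative index the same way as above. */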
1719          lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1720                          bld.vop1(op, bld.def(v1), lo));
1721          hi = bld.vop1(op, bld.def(v1), hi);
1722          Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1723 
1724          Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1725 
1726          Temp msb = bld.tmp(v1);
1727          Temp carry =
1728             bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1729          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1730       } else {
1731          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1732       }
1733       break;
1734    }
1735    case nir_op_bitfield_reverse: {
1736       if (dst.regClass() == s1) {
1737          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1738       } else if (dst.regClass() == v1) {
1739          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1740       } else {
1741          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1742       }
1743       break;
1744    }
1745    case nir_op_iadd: {
1746       if (dst.regClass() == s1) {
1747          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1748          break;
1749       } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1750          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1751          break;
1752       } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1753          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1754          break;
1755       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1756          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1757          break;
1758       }
1759 
1760       Temp src0 = get_alu_src(ctx, instr->src[0]);
1761       Temp src1 = get_alu_src(ctx, instr->src[1]);
1762       if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1763          bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1764          break;
1765       }
1766 
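      /* The remaining cases are 64-bit adds: split both operands into dwords, add the low
       * halves and propagate the carry into the high halves (s_addc_u32 on the scalar path,
       * a VALU add with carry-in on the vector path). */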
1767       assert(src0.size() == 2 && src1.size() == 2);
1768       Temp src00 = bld.tmp(src0.type(), 1);
1769       Temp src01 = bld.tmp(dst.type(), 1);
1770       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1771       Temp src10 = bld.tmp(src1.type(), 1);
1772       Temp src11 = bld.tmp(dst.type(), 1);
1773       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1774 
1775       if (dst.regClass() == s2) {
1776          Temp carry = bld.tmp(s1);
1777          Temp dst0 =
1778             bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1779          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1780                               bld.scc(carry));
1781          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1782       } else if (dst.regClass() == v2) {
1783          Temp dst0 = bld.tmp(v1);
1784          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1785          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1786          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1787       } else {
1788          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1789       }
1790       break;
1791    }
1792    case nir_op_uadd_sat: {
1793       Temp src0 = get_alu_src(ctx, instr->src[0]);
1794       Temp src1 = get_alu_src(ctx, instr->src[1]);
1795       if (dst.regClass() == s1) {
1796          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1797          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1798          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1799                   bld.scc(carry));
1800       } else if (dst.regClass() == v2b) {
1801          Instruction* add_instr;
1802          if (ctx->program->chip_class >= GFX10) {
1803             add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1804          } else {
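            /* Only one source may stay in an SGPR, so for the commutative add keep an SGPR
             * operand in src0 and copy the other source into a VGPR; the clamp bit set below
             * saturates the 16-bit result. */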
1805             if (src1.type() == RegType::sgpr)
1806                std::swap(src0, src1);
1807             add_instr =
1808                bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1809          }
1810          add_instr->vop3().clamp = 1;
1811       } else if (dst.regClass() == v1) {
1812          uadd32_sat(bld, Definition(dst), src0, src1);
1813       } else {
1814          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1815       }
1816       break;
1817    }
1818    case nir_op_iadd_sat: {
1819       Temp src0 = get_alu_src(ctx, instr->src[0]);
1820       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1821       if (dst.regClass() == v2b) {
1822          Instruction* add_instr =
1823             bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
1824          add_instr->vop3().clamp = 1;
1825       } else if (dst.regClass() == v1) {
1826          Instruction* add_instr =
1827             bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
1828          add_instr->vop3().clamp = 1;
1829       } else {
1830          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1831       }
1832       break;
1833    }
1834    case nir_op_uadd_carry: {
1835       Temp src0 = get_alu_src(ctx, instr->src[0]);
1836       Temp src1 = get_alu_src(ctx, instr->src[1]);
1837       if (dst.regClass() == s1) {
1838          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1839          break;
1840       }
1841       if (dst.regClass() == v1) {
1842          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1843          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1844                       carry);
1845          break;
1846       }
1847 
1848       Temp src00 = bld.tmp(src0.type(), 1);
1849       Temp src01 = bld.tmp(dst.type(), 1);
1850       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1851       Temp src10 = bld.tmp(src1.type(), 1);
1852       Temp src11 = bld.tmp(dst.type(), 1);
1853       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1854       if (dst.regClass() == s2) {
1855          Temp carry = bld.tmp(s1);
1856          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1857          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1858                           bld.scc(carry))
1859                     .def(1)
1860                     .getTemp();
1861          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1862       } else if (dst.regClass() == v2) {
1863          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1864          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1865          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1866                               Operand::c32(1u), carry);
1867          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1868       } else {
1869          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1870       }
1871       break;
1872    }
1873    case nir_op_isub: {
1874       if (dst.regClass() == s1) {
1875          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1876          break;
1877       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1878          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
1879          break;
1880       }
1881 
1882       Temp src0 = get_alu_src(ctx, instr->src[0]);
1883       Temp src1 = get_alu_src(ctx, instr->src[1]);
1884       if (dst.regClass() == v1) {
1885          bld.vsub32(Definition(dst), src0, src1);
1886          break;
1887       } else if (dst.bytes() <= 2) {
1888          if (ctx->program->chip_class >= GFX10)
1889             bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
1890          else if (src1.type() == RegType::sgpr)
1891             bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
1892          else if (ctx->program->chip_class >= GFX8)
1893             bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
1894          else
1895             bld.vsub32(Definition(dst), src0, src1);
1896          break;
1897       }
1898 
1899       Temp src00 = bld.tmp(src0.type(), 1);
1900       Temp src01 = bld.tmp(dst.type(), 1);
1901       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1902       Temp src10 = bld.tmp(src1.type(), 1);
1903       Temp src11 = bld.tmp(dst.type(), 1);
1904       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1905       if (dst.regClass() == s2) {
1906          Temp borrow = bld.tmp(s1);
1907          Temp dst0 =
1908             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1909          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1910                               bld.scc(borrow));
1911          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1912       } else if (dst.regClass() == v2) {
1913          Temp lower = bld.tmp(v1);
1914          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1915          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1916          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1917       } else {
1918          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1919       }
1920       break;
1921    }
1922    case nir_op_usub_borrow: {
1923       Temp src0 = get_alu_src(ctx, instr->src[0]);
1924       Temp src1 = get_alu_src(ctx, instr->src[1]);
1925       if (dst.regClass() == s1) {
1926          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1927          break;
1928       } else if (dst.regClass() == v1) {
1929          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1930          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1931                       borrow);
1932          break;
1933       }
1934 
1935       Temp src00 = bld.tmp(src0.type(), 1);
1936       Temp src01 = bld.tmp(dst.type(), 1);
1937       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1938       Temp src10 = bld.tmp(src1.type(), 1);
1939       Temp src11 = bld.tmp(dst.type(), 1);
1940       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1941       if (dst.regClass() == s2) {
1942          Temp borrow = bld.tmp(s1);
1943          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1944          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1945                            bld.scc(borrow))
1946                      .def(1)
1947                      .getTemp();
1948          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1949       } else if (dst.regClass() == v2) {
1950          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1951          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1952          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1953                                Operand::c32(1u), borrow);
1954          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1955       } else {
1956          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1957       }
1958       break;
1959    }
1960    case nir_op_imul: {
1961       if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1962          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
1963       } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1964          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
1965       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1966          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
1967       } else if (dst.type() == RegType::vgpr) {
1968          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1969          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1970 
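         /* When both factors are known to fit in 24 bits the product fits the 24x24->32-bit
          * multiplier, so the cheaper v_mul_u32_u24 is used; a known-constant factor goes
          * through v_mul_imm, and everything else falls back to v_mul_lo_u32. */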
1971          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
1972             bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
1973             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
1974                                   true /* commutative */, false, false, nuw_16bit);
1975          } else if (nir_src_is_const(instr->src[0].src)) {
1976             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
1977                           nir_src_as_uint(instr->src[0].src), false);
1978          } else if (nir_src_is_const(instr->src[1].src)) {
1979             bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
1980                           nir_src_as_uint(instr->src[1].src), false);
1981          } else {
1982             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
1983          }
1984       } else if (dst.regClass() == s1) {
1985          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1986       } else {
1987          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1988       }
1989       break;
1990    }
1991    case nir_op_umul_high: {
1992       if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1993          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
1994       } else if (dst.bytes() == 4) {
1995          uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1996          uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1997 
1998          Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
1999          if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2000             emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2001          } else {
2002             emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2003          }
2004 
2005          if (dst.regClass() == s1)
2006             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2007       } else {
2008          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2009       }
2010       break;
2011    }
2012    case nir_op_imul_high: {
2013       if (dst.regClass() == v1) {
2014          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2015       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
2016          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2017       } else if (dst.regClass() == s1) {
2018          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2019                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2020          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2021       } else {
2022          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2023       }
2024       break;
2025    }
2026    case nir_op_fmul: {
2027       if (dst.regClass() == v2b) {
2028          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2029       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2030          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2031       } else if (dst.regClass() == v1) {
2032          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2033       } else if (dst.regClass() == v2) {
2034          emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2035       } else {
2036          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2037       }
2038       break;
2039    }
2040    case nir_op_fadd: {
2041       if (dst.regClass() == v2b) {
2042          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2043       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2044          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2045       } else if (dst.regClass() == v1) {
2046          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2047       } else if (dst.regClass() == v2) {
2048          emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2049       } else {
2050          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2051       }
2052       break;
2053    }
2054    case nir_op_fsub: {
2055       if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2056          Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2057          VOP3P_instruction& sub = add->vop3p();
2058          sub.neg_lo[1] = true;
2059          sub.neg_hi[1] = true;
2060          break;
2061       }
2062 
2063       Temp src0 = get_alu_src(ctx, instr->src[0]);
2064       Temp src1 = get_alu_src(ctx, instr->src[1]);
2065       if (dst.regClass() == v2b) {
2066          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2067             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2068          else
2069             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2070       } else if (dst.regClass() == v1) {
2071          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2072             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2073          else
2074             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2075       } else if (dst.regClass() == v2) {
2076          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2077                                      as_vgpr(ctx, src1));
2078          add->vop3().neg[1] = true;
2079       } else {
2080          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2081       }
2082       break;
2083    }
2084    case nir_op_fmax: {
2085       if (dst.regClass() == v2b) {
2086          // TODO: check fp_mode.must_flush_denorms16_64
2087          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2088       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2089          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2090       } else if (dst.regClass() == v1) {
2091          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2092                                ctx->block->fp_mode.must_flush_denorms32);
2093       } else if (dst.regClass() == v2) {
2094          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2095                                 ctx->block->fp_mode.must_flush_denorms16_64);
2096       } else {
2097          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2098       }
2099       break;
2100    }
2101    case nir_op_fmin: {
2102       if (dst.regClass() == v2b) {
2103          // TODO: check fp_mode.must_flush_denorms16_64
2104          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2105       } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2106          emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2107       } else if (dst.regClass() == v1) {
2108          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2109                                ctx->block->fp_mode.must_flush_denorms32);
2110       } else if (dst.regClass() == v2) {
2111          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2112                                 ctx->block->fp_mode.must_flush_denorms16_64);
2113       } else {
2114          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2115       }
2116       break;
2117    }
2118    case nir_op_sdot_4x8_iadd: {
2119       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2120       break;
2121    }
2122    case nir_op_sdot_4x8_iadd_sat: {
2123       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2124       break;
2125    }
2126    case nir_op_udot_4x8_uadd: {
2127       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2128       break;
2129    }
2130    case nir_op_udot_4x8_uadd_sat: {
2131       emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2132       break;
2133    }
2134    case nir_op_sdot_2x16_iadd: {
2135       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2136       break;
2137    }
2138    case nir_op_sdot_2x16_iadd_sat: {
2139       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2140       break;
2141    }
2142    case nir_op_udot_2x16_uadd: {
2143       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2144       break;
2145    }
2146    case nir_op_udot_2x16_uadd_sat: {
2147       emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2148       break;
2149    }
2150    case nir_op_cube_face_coord_amd: {
2151       Temp in = get_alu_src(ctx, instr->src[0], 3);
2152       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2153                      emit_extract_vector(ctx, in, 2, v1)};
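      /* The cube instructions compute the face S/T numerators (v_cubesc/v_cubetc) and the
       * major-axis denominator (v_cubema); multiplying by its reciprocal and adding 0.5
       * yields face coordinates in [0, 1]. */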
2154       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2155       ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
2156       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2157       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2158       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2159                     bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma));
2160       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2161                     bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma));
2162       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
2163       break;
2164    }
2165    case nir_op_cube_face_index_amd: {
2166       Temp in = get_alu_src(ctx, instr->src[0], 3);
2167       Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2168                      emit_extract_vector(ctx, in, 2, v1)};
2169       bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
2170       break;
2171    }
2172    case nir_op_bcsel: {
2173       emit_bcsel(ctx, instr, dst);
2174       break;
2175    }
2176    case nir_op_frsq: {
2177       if (dst.regClass() == v2b) {
2178          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2179       } else if (dst.regClass() == v1) {
2180          Temp src = get_alu_src(ctx, instr->src[0]);
2181          emit_rsq(ctx, bld, Definition(dst), src);
2182       } else if (dst.regClass() == v2) {
2183          /* Lowered at NIR level for precision reasons. */
2184          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2185       } else {
2186          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2187       }
2188       break;
2189    }
2190    case nir_op_fneg: {
2191       if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2192          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2193          bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0xBC00),
2194                    instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2195          emit_split_vector(ctx, dst, 2);
2196          break;
2197       }
2198       Temp src = get_alu_src(ctx, instr->src[0]);
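      /* The 16- and 32-bit paths negate by multiplying with -1.0 (0xbc00 / 0xbf800000); the
       * 64-bit path flips the sign bit of the high dword, multiplying by 1.0 first when
       * denormals have to be flushed. */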
2199       if (dst.regClass() == v2b) {
2200          bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2201       } else if (dst.regClass() == v1) {
2202          bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2203                   as_vgpr(ctx, src));
2204       } else if (dst.regClass() == v2) {
2205          if (ctx->block->fp_mode.must_flush_denorms16_64)
2206             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2207                            as_vgpr(ctx, src));
2208          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2209          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2210          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2211          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2212       } else {
2213          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2214       }
2215       break;
2216    }
2217    case nir_op_fabs: {
2218       Temp src = get_alu_src(ctx, instr->src[0]);
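      /* Analogous to fneg: multiply by +1.0 (0x3c00 / 0x3f800000) with the abs input
       * modifier for 16/32-bit, or clear the sign bit of the high dword for 64-bit
       * (flushing denormals first if required). */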
2219       if (dst.regClass() == v2b) {
2220          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2221                                          Operand::c16(0x3c00), as_vgpr(ctx, src))
2222                                .instr;
2223          mul->vop3().abs[1] = true;
2224       } else if (dst.regClass() == v1) {
2225          Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2226                                          Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2227                                .instr;
2228          mul->vop3().abs[1] = true;
2229       } else if (dst.regClass() == v2) {
2230          if (ctx->block->fp_mode.must_flush_denorms16_64)
2231             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2232                            as_vgpr(ctx, src));
2233          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2234          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2235          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2236          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2237       } else {
2238          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2239       }
2240       break;
2241    }
2242    case nir_op_fsat: {
2243       if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2244          Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2245          Instruction* vop3p =
2246             bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2247                       instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2248          vop3p->vop3p().clamp = true;
2249          emit_split_vector(ctx, dst, 2);
2250          break;
2251       }
2252       Temp src = get_alu_src(ctx, instr->src[0]);
2253       if (dst.regClass() == v2b) {
2254          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2255                   src);
2256       } else if (dst.regClass() == v1) {
2257          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2258                   Operand::c32(0x3f800000u), src);
2259          /* apparently, it is not necessary to flush denorms if this instruction is used with these
2260           * operands */
2261          // TODO: confirm that this holds under any circumstances
2262       } else if (dst.regClass() == v2) {
2263          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2264          add->vop3().clamp = true;
2265       } else {
2266          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2267       }
2268       break;
2269    }
2270    case nir_op_flog2: {
2271       if (dst.regClass() == v2b) {
2272          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2273       } else if (dst.regClass() == v1) {
2274          Temp src = get_alu_src(ctx, instr->src[0]);
2275          emit_log2(ctx, bld, Definition(dst), src);
2276       } else {
2277          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2278       }
2279       break;
2280    }
2281    case nir_op_frcp: {
2282       if (dst.regClass() == v2b) {
2283          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2284       } else if (dst.regClass() == v1) {
2285          Temp src = get_alu_src(ctx, instr->src[0]);
2286          emit_rcp(ctx, bld, Definition(dst), src);
2287       } else if (dst.regClass() == v2) {
2288          /* Lowered at NIR level for precision reasons. */
2289          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2290       } else {
2291          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2292       }
2293       break;
2294    }
2295    case nir_op_fexp2: {
2296       if (dst.regClass() == v2b) {
2297          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2298       } else if (dst.regClass() == v1) {
2299          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2300       } else {
2301          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2302       }
2303       break;
2304    }
2305    case nir_op_fsqrt: {
2306       if (dst.regClass() == v2b) {
2307          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2308       } else if (dst.regClass() == v1) {
2309          Temp src = get_alu_src(ctx, instr->src[0]);
2310          emit_sqrt(ctx, bld, Definition(dst), src);
2311       } else if (dst.regClass() == v2) {
2312          /* Lowered at NIR level for precision reasons. */
2313          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2314       } else {
2315          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2316       }
2317       break;
2318    }
2319    case nir_op_ffract: {
2320       if (dst.regClass() == v2b) {
2321          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2322       } else if (dst.regClass() == v1) {
2323          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2324       } else if (dst.regClass() == v2) {
2325          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2326       } else {
2327          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2328       }
2329       break;
2330    }
2331    case nir_op_ffloor: {
2332       if (dst.regClass() == v2b) {
2333          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2334       } else if (dst.regClass() == v1) {
2335          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2336       } else if (dst.regClass() == v2) {
2337          Temp src = get_alu_src(ctx, instr->src[0]);
2338          emit_floor_f64(ctx, bld, Definition(dst), src);
2339       } else {
2340          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2341       }
2342       break;
2343    }
2344    case nir_op_fceil: {
2345       if (dst.regClass() == v2b) {
2346          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2347       } else if (dst.regClass() == v1) {
2348          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2349       } else if (dst.regClass() == v2) {
2350          if (ctx->options->chip_class >= GFX7) {
2351             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2352          } else {
2353             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2354             /* trunc = trunc(src0)
2355              * if (src0 > 0.0 && src0 != trunc)
2356              *    trunc += 1.0
2357              */
2358             Temp src0 = get_alu_src(ctx, instr->src[0]);
2359             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2360             Temp tmp0 =
2361                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2362             Temp tmp1 =
2363                bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2364             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc),
2365                                  tmp0, tmp1);
2366             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2367                                 bld.copy(bld.def(v1), Operand::zero()),
2368                                 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2369             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2370                              bld.copy(bld.def(v1), Operand::zero()), add);
2371             bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2372          }
2373       } else {
2374          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2375       }
2376       break;
2377    }
2378    case nir_op_ftrunc: {
2379       if (dst.regClass() == v2b) {
2380          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2381       } else if (dst.regClass() == v1) {
2382          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2383       } else if (dst.regClass() == v2) {
2384          Temp src = get_alu_src(ctx, instr->src[0]);
2385          emit_trunc_f64(ctx, bld, Definition(dst), src);
2386       } else {
2387          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2388       }
2389       break;
2390    }
2391    case nir_op_fround_even: {
2392       if (dst.regClass() == v2b) {
2393          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2394       } else if (dst.regClass() == v1) {
2395          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2396       } else if (dst.regClass() == v2) {
2397          if (ctx->options->chip_class >= GFX7) {
2398             emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2399          } else {
2400             /* GFX6 doesn't support V_RNDNE_F64, lower it. */
2401             Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2402             Temp src0 = get_alu_src(ctx, instr->src[0]);
2403             bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2404 
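            /* Round to the nearest integer by adding and subtracting +/-2^52
             * (high dword 0x43300000, sign copied from src0 via v_bfi),
             * relying on the default nearest-even rounding mode: for
             * |src0| < 2^52 the add pushes the fraction bits out of the
             * mantissa, e.g. 2.5 + 2^52 rounds to 2^52 + 2, and subtracting
             * 2^52 again gives 2.0. Inputs with |src0| > 0x432fffff_ffffffff
             * (about 2^52 - 0.5) are already integral (or NaN) and are passed
             * through unchanged by the final v_cndmask pair. */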
2405             Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2406                                     bld.copy(bld.def(s1), Operand::c32(-2u)));
2407             Temp bfi =
2408                bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2409                         bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2410             Temp tmp =
2411                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2412                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2413             Instruction* sub =
2414                bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2415                         bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2416             sub->vop3().neg[1] = true;
2417             tmp = sub->definitions[0].getTemp();
2418 
2419             Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2420                                 Operand::c32(0x432fffffu));
2421             Instruction* vop3 =
2422                bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2423             vop3->vop3().abs[0] = true;
2424             Temp cond = vop3->definitions[0].getTemp();
2425 
2426             Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2427             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2428             Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2429                                      as_vgpr(ctx, src0_lo), cond);
2430             Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2431                                      as_vgpr(ctx, src0_hi), cond);
2432 
2433             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2434          }
2435       } else {
2436          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2437       }
2438       break;
2439    }
2440    case nir_op_fsin:
2441    case nir_op_fcos: {
2442       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2443       aco_ptr<Instruction> norm;
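      /* The hardware sin/cos take their input in revolutions rather than
       * radians, so scale by 1/(2*pi) first: 0x3118 is 1/(2*pi) as fp16,
       * 0x3e22f983 as fp32. */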
2444       if (dst.regClass() == v2b) {
2445          Temp inv_2pi = bld.copy(bld.def(s1), Operand::c32(0x3118u));
2446          Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), inv_2pi, src);
2447          aco_opcode opcode =
2448             instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2449          bld.vop1(opcode, Definition(dst), tmp);
2450       } else if (dst.regClass() == v1) {
2451          Temp inv_2pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u));
2452          Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inv_2pi, src);
2453 
2454          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2455          if (ctx->options->chip_class < GFX9)
2456             tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2457 
2458          aco_opcode opcode =
2459             instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2460          bld.vop1(opcode, Definition(dst), tmp);
2461       } else {
2462          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2463       }
2464       break;
2465    }
2466    case nir_op_ldexp: {
2467       if (dst.regClass() == v2b) {
2468          emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2469       } else if (dst.regClass() == v1) {
2470          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2471       } else if (dst.regClass() == v2) {
2472          emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2473       } else {
2474          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2475       }
2476       break;
2477    }
2478    case nir_op_frexp_sig: {
2479       if (dst.regClass() == v2b) {
2480          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2481       } else if (dst.regClass() == v1) {
2482          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2483       } else if (dst.regClass() == v2) {
2484          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2485       } else {
2486          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2487       }
2488       break;
2489    }
2490    case nir_op_frexp_exp: {
2491       if (instr->src[0].src.ssa->bit_size == 16) {
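         /* v_frexp_exp_i16_f16 writes a 16-bit exponent; for any finite f16 it
          * fits in the low byte, so extract byte 0 and sign-extend it to the
          * 32-bit destination. */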
2492          Temp src = get_alu_src(ctx, instr->src[0]);
2493          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2494          tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2495          convert_int(ctx, bld, tmp, 8, 32, true, dst);
2496       } else if (instr->src[0].src.ssa->bit_size == 32) {
2497          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2498       } else if (instr->src[0].src.ssa->bit_size == 64) {
2499          emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2500       } else {
2501          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2502       }
2503       break;
2504    }
2505    case nir_op_fsign: {
2506       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2507       if (dst.regClass() == v2b) {
2508          assert(ctx->program->chip_class >= GFX9);
2509          /* replace negative zero with positive zero */
2510          src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2511          src =
2512             bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
2513          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2514       } else if (dst.regClass() == v1) {
2515          src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
2516          src =
2517             bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
2518          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2519       } else if (dst.regClass() == v2) {
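         /* Build sign(x) in the upper dword only (the lower dword is zero):
          * start from +1.0, keep the source's upper dword when !(0 < x) so
          * that +/-0.0 keep their sign, then select -1.0 when !(0 <= x). */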
2520          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)),
2521                               Operand::zero(), src);
2522          Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2523          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2524                                    emit_extract_vector(ctx, src, 1, v1), cond);
2525 
2526          cond =
2527             bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
2528          tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2529          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2530 
2531          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2532       } else {
2533          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2534       }
2535       break;
2536    }
2537    case nir_op_f2f16:
2538    case nir_op_f2f16_rtne: {
2539       Temp src = get_alu_src(ctx, instr->src[0]);
2540       if (instr->src[0].src.ssa->bit_size == 64)
2541          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2542       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2543          /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2544           * keep value numbering and the scheduler simpler.
2545           */
2546          bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2547       else
2548          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2549       break;
2550    }
2551    case nir_op_f2f16_rtz: {
2552       Temp src = get_alu_src(ctx, instr->src[0]);
2553       if (instr->src[0].src.ssa->bit_size == 64)
2554          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
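      /* If the current rounding mode is already towards-zero a plain
       * conversion suffices; otherwise use v_cvt_pkrtz_f16_f32, which always
       * rounds towards zero regardless of the round-mode register. */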
2555       if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2556          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2557       else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
2558          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2559       else
2560          bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2561       break;
2562    }
2563    case nir_op_f2f32: {
2564       if (instr->src[0].src.ssa->bit_size == 16) {
2565          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2566       } else if (instr->src[0].src.ssa->bit_size == 64) {
2567          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2568       } else {
2569          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2570       }
2571       break;
2572    }
2573    case nir_op_f2f64: {
2574       Temp src = get_alu_src(ctx, instr->src[0]);
2575       if (instr->src[0].src.ssa->bit_size == 16)
2576          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2577       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2578       break;
2579    }
2580    case nir_op_i2f16: {
2581       assert(dst.regClass() == v2b);
2582       Temp src = get_alu_src(ctx, instr->src[0]);
2583       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2584       if (input_size <= 16) {
2585          /* Expand integer to the size expected by the int→float converter used below */
2586          unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2587          if (input_size != target_size) {
2588             src = convert_int(ctx, bld, src, input_size, target_size, true);
2589          }
2590       } else if (input_size == 64) {
2591          /* Truncate down to 32 bits; if any of the upper bits are relevant,
2592           * the value is far outside the half-precision float range
2593           * anyway. SPIR-V does not mandate any specific behavior for such
2594           * large inputs.
2595           */
2596          src = convert_int(ctx, bld, src, 64, 32, false);
2597       }
2598 
2599       if (ctx->program->chip_class >= GFX8 && input_size <= 16) {
2600          bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2601       } else {
2602          /* Convert to f32 and then down to f16. This is needed to handle
2603           * inputs slightly outside the range [INT16_MIN, INT16_MAX],
2604           * which are representable via f16 but wouldn't be converted
2605           * correctly by v_cvt_f16_i16.
2606           *
2607           * This is also the fallback path taken on GFX7 and earlier, which
2608           * do not support direct f16⟷i16 conversions.
2609           */
2610          src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2611          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2612       }
2613       break;
2614    }
2615    case nir_op_i2f32: {
2616       assert(dst.size() == 1);
2617       Temp src = get_alu_src(ctx, instr->src[0]);
2618       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2619       if (input_size <= 32) {
2620          if (input_size <= 16) {
2621             /* Sign-extend to 32-bits */
2622             src = convert_int(ctx, bld, src, input_size, 32, true);
2623          }
2624          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2625       } else {
2626          assert(input_size == 64);
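         /* Convert via double: (double)(uint32_t)lo + (double)(int32_t)hi * 2^32,
          * then convert the f64 sum to f32. */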
2627          RegClass rc = RegClass(src.type(), 1);
2628          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2629          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2630          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2631          upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2632          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2633          upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2634          bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2635       }
2636 
2637       break;
2638    }
2639    case nir_op_i2f64: {
2640       if (instr->src[0].src.ssa->bit_size <= 32) {
2641          Temp src = get_alu_src(ctx, instr->src[0]);
2642          if (instr->src[0].src.ssa->bit_size <= 16)
2643             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2644          bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2645       } else if (instr->src[0].src.ssa->bit_size == 64) {
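         /* (double)(uint32_t)lo + (double)(int32_t)hi * 2^32 */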
2646          Temp src = get_alu_src(ctx, instr->src[0]);
2647          RegClass rc = RegClass(src.type(), 1);
2648          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2649          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2650          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2651          upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2652          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2653          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2654 
2655       } else {
2656          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2657       }
2658       break;
2659    }
2660    case nir_op_u2f16: {
2661       assert(dst.regClass() == v2b);
2662       Temp src = get_alu_src(ctx, instr->src[0]);
2663       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2664       if (input_size <= 16) {
2665          /* Expand integer to the size expected by the uint→float converter used below */
2666          unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2667          if (input_size != target_size) {
2668             src = convert_int(ctx, bld, src, input_size, target_size, false);
2669          }
2670       } else if (input_size == 64) {
2671          /* Truncate down to 32 bits; if any of the upper bits are non-zero,
2672           * the value is far outside the half-precision float range
2673           * anyway. SPIR-V does not mandate any specific behavior for such
2674           * large inputs.
2675           */
2676          src = convert_int(ctx, bld, src, 64, 32, false);
2677       }
2678 
2679       if (ctx->program->chip_class >= GFX8) {
2680          /* float16 has a range of [0, 65519]. Converting from larger
2681           * inputs is UB, so we just need to consider the lower 16 bits */
2682          bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2683       } else {
2684          /* GFX7 and earlier do not support direct f16⟷u16 conversions */
2685          src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
2686          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2687       }
2688       break;
2689    }
2690    case nir_op_u2f32: {
2691       assert(dst.size() == 1);
2692       Temp src = get_alu_src(ctx, instr->src[0]);
2693       const unsigned input_size = instr->src[0].src.ssa->bit_size;
2694       if (input_size == 8) {
2695          bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2696       } else if (input_size <= 32) {
2697          if (input_size == 16)
2698             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2699          bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2700       } else {
2701          assert(input_size == 64);
2702          RegClass rc = RegClass(src.type(), 1);
2703          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2704          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2705          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2706          upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2707          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2708          upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2709          bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2710       }
2711       break;
2712    }
2713    case nir_op_u2f64: {
2714       if (instr->src[0].src.ssa->bit_size <= 32) {
2715          Temp src = get_alu_src(ctx, instr->src[0]);
2716          if (instr->src[0].src.ssa->bit_size <= 16)
2717             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2718          bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2719       } else if (instr->src[0].src.ssa->bit_size == 64) {
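         /* (double)(uint32_t)lo + (double)(uint32_t)hi * 2^32 */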
2720          Temp src = get_alu_src(ctx, instr->src[0]);
2721          RegClass rc = RegClass(src.type(), 1);
2722          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2723          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2724          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2725          upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2726          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2727          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2728       } else {
2729          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2730       }
2731       break;
2732    }
2733    case nir_op_f2i8:
2734    case nir_op_f2i16: {
2735       if (instr->src[0].src.ssa->bit_size == 16) {
2736          if (ctx->program->chip_class >= GFX8) {
2737             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2738          } else {
2739             /* GFX7 and earlier do not support direct f16⟷i16 conversions */
2740             Temp tmp = bld.tmp(v1);
2741             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2742             tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
2743             tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2744                               (dst.type() == RegType::sgpr) ? Temp() : dst);
2745             if (dst.type() == RegType::sgpr) {
2746                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2747             }
2748          }
2749       } else if (instr->src[0].src.ssa->bit_size == 32) {
2750          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2751       } else {
2752          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2753       }
2754       break;
2755    }
2756    case nir_op_f2u8:
2757    case nir_op_f2u16: {
2758       if (instr->src[0].src.ssa->bit_size == 16) {
2759          if (ctx->program->chip_class >= GFX8) {
2760             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2761          } else {
2762             /* GFX7 and earlier do not support direct f16⟷u16 conversions */
2763             Temp tmp = bld.tmp(v1);
2764             emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2765             tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
2766             tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2767                               (dst.type() == RegType::sgpr) ? Temp() : dst);
2768             if (dst.type() == RegType::sgpr) {
2769                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2770             }
2771          }
2772       } else if (instr->src[0].src.ssa->bit_size == 32) {
2773          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2774       } else {
2775          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2776       }
2777       break;
2778    }
2779    case nir_op_f2i32: {
2780       Temp src = get_alu_src(ctx, instr->src[0]);
2781       if (instr->src[0].src.ssa->bit_size == 16) {
2782          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2783          if (dst.type() == RegType::vgpr) {
2784             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2785          } else {
2786             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2787                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2788          }
2789       } else if (instr->src[0].src.ssa->bit_size == 32) {
2790          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2791       } else if (instr->src[0].src.ssa->bit_size == 64) {
2792          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2793       } else {
2794          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2795       }
2796       break;
2797    }
2798    case nir_op_f2u32: {
2799       Temp src = get_alu_src(ctx, instr->src[0]);
2800       if (instr->src[0].src.ssa->bit_size == 16) {
2801          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2802          if (dst.type() == RegType::vgpr) {
2803             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2804          } else {
2805             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2806                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2807          }
2808       } else if (instr->src[0].src.ssa->bit_size == 32) {
2809          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2810       } else if (instr->src[0].src.ssa->bit_size == 64) {
2811          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2812       } else {
2813          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2814       }
2815       break;
2816    }
2817    case nir_op_f2i64: {
2818       Temp src = get_alu_src(ctx, instr->src[0]);
2819       if (instr->src[0].src.ssa->bit_size == 16)
2820          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2821 
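      /* There is no native f32->i64 conversion: both paths below build the
       * 24-bit significand ((src & 0x7fffff) | 0x800000), shift it up to bit
       * 39 of a 64-bit value and then right by (63 - exponent), which yields
       * trunc(|src|). Out-of-range exponents (>= 64) select a saturated
       * magnitude instead, and the sign mask (src >> 31) negates the result
       * via xor and subtract. */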
2822       if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2823          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2824          exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent,
2825                              Operand::c32(64u));
2826          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2827          Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src);
2828          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2829          mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa);
2830          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2831          Temp new_exponent = bld.tmp(v1);
2832          Temp borrow =
2833             bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp();
2834          if (ctx->program->chip_class >= GFX8)
2835             mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2836          else
2837             mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2838          Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu));
2839          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2840          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2841          lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower,
2842                               Operand::c32(0xffffffffu), borrow);
2843          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2844          lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2845          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2846          Temp new_lower = bld.tmp(v1);
2847          borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2848          Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2849          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2850 
2851       } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2852          if (src.type() == RegType::vgpr)
2853             src = bld.as_uniform(src);
2854          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2855                                   Operand::c32(0x80017u));
2856          exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2857                              Operand::c32(126u));
2858          exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2859                              exponent);
2860          exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc),
2861                              Operand::c32(64u), exponent);
2862          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2863                                   Operand::c32(0x7fffffu), src);
2864          Temp sign =
2865             bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u));
2866          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2867                              Operand::c32(0x800000u), mantissa);
2868          mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2869                              Operand::c32(7u));
2870          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2871          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2872                              Operand::c32(63u), exponent);
2873          mantissa =
2874             bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2875          Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent,
2876                               Operand::c32(0xffffffffu)); // exp >= 64
2877          Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu));
2878          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2879          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2880          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2881          lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2882          upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2883          Temp borrow = bld.tmp(s1);
2884          lower =
2885             bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2886          upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign,
2887                           bld.scc(borrow));
2888          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2889 
2890       } else if (instr->src[0].src.ssa->bit_size == 64) {
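         /* Split the truncated double into two halves: the upper 32 bits are
          * floor(trunc * 2^-32) (0x3df00000 is the high dword of 2^-32), the
          * lower 32 bits are trunc - upper * 2^32, computed with v_fma_f64 and
          * -2^32 (0xc1f00000), and both halves are converted separately. */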
2891          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2892                                Operand::c32(0x3df00000u));
2893          Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2894          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2895          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2896                           Operand::c32(0xc1f00000u));
2897          Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2898          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2899          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2900          Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2901          if (dst.type() == RegType::sgpr) {
2902             lower = bld.as_uniform(lower);
2903             upper = bld.as_uniform(upper);
2904          }
2905          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2906 
2907       } else {
2908          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2909       }
2910       break;
2911    }
2912    case nir_op_f2u64: {
2913       Temp src = get_alu_src(ctx, instr->src[0]);
2914       if (instr->src[0].src.ssa->bit_size == 16)
2915          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2916 
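      /* Like nir_op_f2i64 above, but unsigned: the significand is shifted
       * into place according to the frexp exponent, with a separate path for
       * small exponents and saturation when the exponent exceeds 64. */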
2917       if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2918          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2919          Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)),
2920                                            Operand::c32(64u), exponent);
2921          exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent);
2922          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2923          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2924          Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent);
2925          Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2926          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2927          Temp new_exponent = bld.tmp(v1);
2928          Temp cond_small =
2929             bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp();
2930          if (ctx->program->chip_class >= GFX8)
2931             mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2932          else
2933             mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2934          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2935          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2936          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2937          upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(),
2938                               cond_small);
2939          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower,
2940                           exponent_in_range);
2941          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper,
2942                           exponent_in_range);
2943          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2944 
2945       } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2946          if (src.type() == RegType::vgpr)
2947             src = bld.as_uniform(src);
2948          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2949                                   Operand::c32(0x80017u));
2950          exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2951                              Operand::c32(126u));
2952          exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2953                              exponent);
2954          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2955                                   Operand::c32(0x7fffffu), src);
2956          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2957                              Operand::c32(0x800000u), mantissa);
2958          Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2959                                         Operand::c32(24u), exponent);
2960          Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2961                                exponent_small);
2962          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2963          Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2964                                         exponent, Operand::c32(24u));
2965          mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa,
2966                              exponent_large);
2967          Temp cond =
2968             bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent);
2969          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa,
2970                              Operand::c32(0xffffffffu), cond);
2971          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2972          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2973          Temp cond_small =
2974             bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u));
2975          lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2976          upper =
2977             bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small);
2978          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2979 
2980       } else if (instr->src[0].src.ssa->bit_size == 64) {
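         /* Same high/low split via 2^-32 and v_fma_f64 as in nir_op_f2i64,
          * but both halves are converted as unsigned. */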
2981          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2982                                Operand::c32(0x3df00000u));
2983          Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2984          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2985          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2986                           Operand::c32(0xc1f00000u));
2987          Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2988          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2989          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2990          Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2991          if (dst.type() == RegType::sgpr) {
2992             lower = bld.as_uniform(lower);
2993             upper = bld.as_uniform(upper);
2994          }
2995          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2996 
2997       } else {
2998          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2999       }
3000       break;
3001    }
3002    case nir_op_b2f16: {
3003       Temp src = get_alu_src(ctx, instr->src[0]);
3004       assert(src.regClass() == bld.lm);
3005 
3006       if (dst.regClass() == s1) {
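         /* bool_to_scalar_condition yields 0 or 1 in an SGPR, so multiplying
          * by the bit pattern of 1.0 (0x3c00 for fp16) gives 0.0 or 1.0;
          * b2f32 below uses the same trick with 0x3f800000. */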
3007          src = bool_to_scalar_condition(ctx, src);
3008          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3009       } else if (dst.regClass() == v2b) {
3010          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3011          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3012       } else {
3013          unreachable("Wrong destination register class for nir_op_b2f16.");
3014       }
3015       break;
3016    }
3017    case nir_op_b2f32: {
3018       Temp src = get_alu_src(ctx, instr->src[0]);
3019       assert(src.regClass() == bld.lm);
3020 
3021       if (dst.regClass() == s1) {
3022          src = bool_to_scalar_condition(ctx, src);
3023          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3024       } else if (dst.regClass() == v1) {
3025          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3026                       Operand::c32(0x3f800000u), src);
3027       } else {
3028          unreachable("Wrong destination register class for nir_op_b2f32.");
3029       }
3030       break;
3031    }
3032    case nir_op_b2f64: {
3033       Temp src = get_alu_src(ctx, instr->src[0]);
3034       assert(src.regClass() == bld.lm);
3035 
3036       if (dst.regClass() == s2) {
3037          src = bool_to_scalar_condition(ctx, src);
3038          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3039                   Operand::zero(), bld.scc(src));
3040       } else if (dst.regClass() == v2) {
3041          Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3042          Temp upper =
3043             bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3044          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3045       } else {
3046          unreachable("Wrong destination register class for nir_op_b2f64.");
3047       }
3048       break;
3049    }
3050    case nir_op_i2i8:
3051    case nir_op_i2i16:
3052    case nir_op_i2i32:
3053    case nir_op_i2i64: {
3054       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3055          /* no need to do the extract in get_alu_src() */
3056          sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3057                                      ? sgpr_extract_sext
3058                                      : sgpr_extract_undef;
3059          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3060       } else {
3061          const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3062          const unsigned output_bitsize = instr->dest.dest.ssa.bit_size;
3063          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3064                      output_bitsize > input_bitsize, dst);
3065       }
3066       break;
3067    }
3068    case nir_op_u2u8:
3069    case nir_op_u2u16:
3070    case nir_op_u2u32:
3071    case nir_op_u2u64: {
3072       if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3073          /* no need to do the extract in get_alu_src() */
3074          sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3075                                      ? sgpr_extract_zext
3076                                      : sgpr_extract_undef;
3077          extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3078       } else {
3079          convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3080                      instr->dest.dest.ssa.bit_size, false, dst);
3081       }
3082       break;
3083    }
3084    case nir_op_b2b32:
3085    case nir_op_b2i8:
3086    case nir_op_b2i16:
3087    case nir_op_b2i32:
3088    case nir_op_b2i64: {
3089       Temp src = get_alu_src(ctx, instr->src[0]);
3090       assert(src.regClass() == bld.lm);
3091 
3092       Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
3093       if (tmp.regClass() == s1) {
3094          bool_to_scalar_condition(ctx, src, tmp);
3095       } else if (tmp.type() == RegType::vgpr) {
3096          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u),
3097                       src);
3098       } else {
3099          unreachable("Invalid register class for b2i32");
3100       }
3101 
3102       if (tmp != dst)
3103          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
3104       break;
3105    }
3106    case nir_op_b2b1:
3107    case nir_op_i2b1: {
3108       Temp src = get_alu_src(ctx, instr->src[0]);
3109       assert(dst.regClass() == bld.lm);
3110 
3111       if (src.type() == RegType::vgpr) {
3112          assert(src.regClass() == v1 || src.regClass() == v2);
3113          assert(dst.regClass() == bld.lm);
3114          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3115                   Definition(dst), Operand::zero(), src)
3116             .def(0)
3117             .setHint(vcc);
3118       } else {
3119          assert(src.regClass() == s1 || src.regClass() == s2);
3120          Temp tmp;
3121          if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
3122             tmp =
3123                bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3124                   .def(1)
3125                   .getTemp();
3126          } else {
3127             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3128                            bld.scc(bld.def(s1)), Operand::zero(), src);
3129          }
3130          bool_to_vector_condition(ctx, tmp, dst);
3131       }
3132       break;
3133    }
3134    case nir_op_unpack_64_2x32:
3135    case nir_op_unpack_32_2x16:
3136    case nir_op_unpack_64_4x16:
3137       bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3138       emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3139       break;
3140    case nir_op_pack_64_2x32_split: {
3141       Temp src0 = get_alu_src(ctx, instr->src[0]);
3142       Temp src1 = get_alu_src(ctx, instr->src[1]);
3143 
3144       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3145       break;
3146    }
3147    case nir_op_unpack_64_2x32_split_x:
3148       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3149                  get_alu_src(ctx, instr->src[0]));
3150       break;
3151    case nir_op_unpack_64_2x32_split_y:
3152       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3153                  get_alu_src(ctx, instr->src[0]));
3154       break;
3155    case nir_op_unpack_32_2x16_split_x:
3156       if (dst.type() == RegType::vgpr) {
3157          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3158                     get_alu_src(ctx, instr->src[0]));
3159       } else {
3160          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3161       }
3162       break;
3163    case nir_op_unpack_32_2x16_split_y:
3164       if (dst.type() == RegType::vgpr) {
3165          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3166                     get_alu_src(ctx, instr->src[0]));
3167       } else {
3168          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3169                     get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3170                     Operand::zero());
3171       }
3172       break;
3173    case nir_op_pack_32_2x16_split: {
3174       Temp src0 = get_alu_src(ctx, instr->src[0]);
3175       Temp src1 = get_alu_src(ctx, instr->src[1]);
3176       if (dst.regClass() == v1) {
3177          src0 = emit_extract_vector(ctx, src0, 0, v2b);
3178          src1 = emit_extract_vector(ctx, src1, 0, v2b);
3179          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3180       } else {
3181          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3182                          Operand::c32(0xFFFFu));
3183          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3184                          Operand::c32(16u));
3185          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3186       }
3187       break;
3188    }
3189    case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3190    case nir_op_pack_half_2x16_split: {
3191       if (dst.regClass() == v1) {
3192          nir_const_value* val = nir_src_as_const_value(instr->src[1].src);
3193          if (val && val->u32 == 0 && ctx->program->chip_class <= GFX9) {
3194             /* v_cvt_f16_f32 zeroes the upper 16 bits of its result on GFX6-GFX9, so the zero high half comes for free */
3195             bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), get_alu_src(ctx, instr->src[0]));
3196          } else if (!ctx->block->fp_mode.care_about_round16_64 ||
3197                     ctx->block->fp_mode.round16_64 == fp_round_tz) {
3198             if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
3199                emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3200             else
3201                emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3202          } else {
3203             Temp src0 =
3204                bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0]));
3205             Temp src1 =
3206                bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1]));
3207             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3208          }
3209       } else {
3210          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3211       }
3212       break;
3213    }
3214    case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3215    case nir_op_unpack_half_2x16_split_x: {
3216       Temp src = get_alu_src(ctx, instr->src[0]);
3217       if (src.regClass() == v1)
3218          src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3219       if (dst.regClass() == v1) {
3220          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3221                 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3222          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3223       } else {
3224          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3225       }
3226       break;
3227    }
3228    case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3229    case nir_op_unpack_half_2x16_split_y: {
3230       Temp src = get_alu_src(ctx, instr->src[0]);
3231       if (src.regClass() == s1)
3232          src =
3233             bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(16u));
3234       else
3235          src =
3236             bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3237       if (dst.regClass() == v1) {
3238          assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3239                 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3240          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3241       } else {
3242          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3243       }
3244       break;
3245    }
3246    case nir_op_sad_u8x4: {
3247       assert(dst.regClass() == v1);
3248       emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3249       break;
3250    }
3251    case nir_op_fquantize2f16: {
3252       Temp src = get_alu_src(ctx, instr->src[0]);
3253       Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
3254       Temp f32, cmp_res;
3255 
3256       if (ctx->program->chip_class >= GFX8) {
3257          Temp mask = bld.copy(
3258             bld.def(s1), Operand::c32(0x36Fu)); /* every v_cmp_class class except +/- denormal */
3259          cmp_res =
3260             bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
3261          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3262       } else {
3263          /* 0x38800000 is the smallest normalized half-float value (2^-14) as a
3264           * 32-bit float, so compare the result and flush to 0 if it's smaller.
3265           */
3266          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3267          Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3268          Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3269          tmp0->vop3().abs[0] = true;
3270          Temp tmp1 =
3271             bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), f32);
3272          cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3273                             tmp0->definitions[0].getTemp(), tmp1);
3274       }
3275 
3276       if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3277          Temp copysign_0 =
3278             bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3279          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3280       } else {
3281          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3282       }
3283       break;
3284    }
3285    case nir_op_bfm: {
3286       Temp bits = get_alu_src(ctx, instr->src[0]);
3287       Temp offset = get_alu_src(ctx, instr->src[1]);
3288 
3289       if (dst.regClass() == s1) {
3290          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3291       } else if (dst.regClass() == v1) {
3292          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3293       } else {
3294          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3295       }
3296       break;
3297    }
3298    case nir_op_bitfield_select: {
3299 
3300       /* dst = (insert & bitmask) | (base & ~bitmask) */
3301       if (dst.regClass() == s1) {
3302          Temp bitmask = get_alu_src(ctx, instr->src[0]);
3303          Temp insert = get_alu_src(ctx, instr->src[1]);
3304          Temp base = get_alu_src(ctx, instr->src[2]);
3305          aco_ptr<Instruction> sop2;
3306          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3307          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3308          Operand lhs;
3309          if (const_insert && const_bitmask) {
3310             lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3311          } else {
3312             insert =
3313                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3314             lhs = Operand(insert);
3315          }
3316 
3317          Operand rhs;
3318          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3319          if (const_base && const_bitmask) {
3320             rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3321          } else {
3322             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3323             rhs = Operand(base);
3324          }
3325 
3326          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3327 
3328       } else if (dst.regClass() == v1) {
3329          emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3330       } else {
3331          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3332       }
3333       break;
3334    }
3335    case nir_op_ubfe:
3336    case nir_op_ibfe: {
3337       if (dst.bytes() != 4)
3338          unreachable("Unsupported BFE bit size");
3339 
3340       if (dst.type() == RegType::sgpr) {
3341          Temp base = get_alu_src(ctx, instr->src[0]);
3342 
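         /* s_bfe encodes the bit field in its second source operand: the
          * offset goes in bits [4:0] and the width in bits [22:16],
          * e.g. offset=4, width=8 -> 0x00080004. */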
3343          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3344          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3345          if (const_offset && const_bits) {
3346             uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f);
3347             aco_opcode opcode =
3348                instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3349             bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3350             break;
3351          }
3352 
3353          Temp offset = get_alu_src(ctx, instr->src[1]);
3354          Temp bits = get_alu_src(ctx, instr->src[2]);
3355          if (instr->op == nir_op_ubfe) {
3356             Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3357             Temp masked =
3358                bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3359             bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3360          } else {
3361             Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16)
3362                                          : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1),
3363                                                     bld.def(s1, scc), bits, Operand::c32(16u));
3364             Operand offset_op = const_offset
3365                                    ? Operand::c32(const_offset->u32 & 0x1fu)
3366                                    : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3367                                               offset, Operand::c32(0x1fu));
3368 
3369             Temp extract =
3370                bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3371             bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3372          }
3373 
3374       } else {
3375          aco_opcode opcode =
3376             instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3377          emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3378       }
3379       break;
3380    }
3381    case nir_op_extract_u8:
3382    case nir_op_extract_i8:
3383    case nir_op_extract_u16:
3384    case nir_op_extract_i16: {
3385       bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3386       unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3387       uint32_t bits = comp == 4 ? 8 : 16;
3388       unsigned index = nir_src_as_uint(instr->src[1].src);
3389       if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3390          assert(index == 0);
3391          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3392       } else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) {
3393          Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3394          unsigned swizzle = instr->src[0].swizzle[0];
3395          if (vec.size() > 1) {
3396             vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3397             swizzle = swizzle & 1;
3398          }
3399          index += swizzle * instr->dest.dest.ssa.bit_size / bits;
3400          bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3401                     Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3402       } else {
3403          Temp src = get_alu_src(ctx, instr->src[0]);
3404          Definition def(dst);
3405          if (dst.bytes() == 8) {
3406             src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3407             index %= comp;
3408             def = bld.def(src.type(), 1);
3409          }
3410          assert(def.bytes() <= 4);
3411          if (def.regClass() == s1) {
3412             bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3413                        Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3414          } else {
3415             src = emit_extract_vector(ctx, src, 0, def.regClass());
3416             bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3417                        Operand::c32(bits), Operand::c32(is_signed));
3418          }
3419          if (dst.size() == 2)
3420             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3421                        Operand::zero());
3422       }
3423       break;
3424    }
3425    case nir_op_insert_u8:
3426    case nir_op_insert_u16: {
3427       unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3428       uint32_t bits = comp == 4 ? 8 : 16;
3429       unsigned index = nir_src_as_uint(instr->src[1].src);
3430       if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3431          assert(index == 0);
3432          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3433       } else {
3434          Temp src = get_alu_src(ctx, instr->src[0]);
3435          Definition def(dst);
3436          bool swap = false;
3437          if (dst.bytes() == 8) {
3438             src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3439             swap = index >= comp;
3440             index %= comp;
3441             def = bld.def(src.type(), 1);
3442          }
3443          if (def.regClass() == s1) {
3444             bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3445                        Operand::c32(index), Operand::c32(bits));
3446          } else {
3447             src = emit_extract_vector(ctx, src, 0, def.regClass());
3448             bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3449                        Operand::c32(bits));
3450          }
3451          if (dst.size() == 2 && swap)
3452             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3453                        def.getTemp());
3454          else if (dst.size() == 2)
3455             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3456                        Operand::zero());
3457       }
3458       break;
3459    }
3460    case nir_op_bit_count: {
3461       Temp src = get_alu_src(ctx, instr->src[0]);
3462       if (src.regClass() == s1) {
3463          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3464       } else if (src.regClass() == v1) {
3465          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3466       } else if (src.regClass() == v2) {
3467          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3468                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3469                            emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3470       } else if (src.regClass() == s2) {
3471          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3472       } else {
3473          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3474       }
3475       break;
3476    }
3477    case nir_op_flt: {
3478       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3479                       aco_opcode::v_cmp_lt_f64);
3480       break;
3481    }
3482    case nir_op_fge: {
3483       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3484                       aco_opcode::v_cmp_ge_f64);
3485       break;
3486    }
3487    case nir_op_feq: {
3488       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3489                       aco_opcode::v_cmp_eq_f64);
3490       break;
3491    }
3492    case nir_op_fneu: {
3493       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3494                       aco_opcode::v_cmp_neq_f64);
3495       break;
3496    }
3497    case nir_op_ilt: {
3498       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3499                       aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3500       break;
3501    }
3502    case nir_op_ige: {
3503       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3504                       aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3505       break;
3506    }
3507    case nir_op_ieq: {
3508       if (instr->src[0].src.ssa->bit_size == 1)
3509          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3510       else
3511          emit_comparison(
3512             ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3513             aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3514             ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3515       break;
3516    }
3517    case nir_op_ine: {
3518       if (instr->src[0].src.ssa->bit_size == 1)
3519          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3520       else
3521          emit_comparison(
3522             ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3523             aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3524             ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3525       break;
3526    }
3527    case nir_op_ult: {
3528       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3529                       aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3530       break;
3531    }
3532    case nir_op_uge: {
3533       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3534                       aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3535       break;
3536    }
3537    case nir_op_fddx:
3538    case nir_op_fddy:
3539    case nir_op_fddx_fine:
3540    case nir_op_fddy_fine:
3541    case nir_op_fddx_coarse:
3542    case nir_op_fddy_coarse: {
3543       if (!nir_src_is_divergent(instr->src[0].src)) {
3544          /* Source is the same in all lanes, so the derivative is zero.
3545           * This also avoids emitting invalid IR.
3546           */
3547          bld.copy(Definition(dst), Operand::zero());
3548          break;
3549       }
3550 
3551       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3552       uint16_t dpp_ctrl1, dpp_ctrl2;
3553       if (instr->op == nir_op_fddx_fine) {
3554          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3555          dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3556       } else if (instr->op == nir_op_fddy_fine) {
3557          dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3558          dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3559       } else {
3560          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3561          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3562             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3563          else
3564             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3565       }
3566 
3567       Temp tmp;
3568       if (ctx->program->chip_class >= GFX8) {
3569          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3570          tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3571       } else {
3572          Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3573          Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3574          tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3575       }
3576       emit_wqm(bld, tmp, dst, true);
3577       break;
3578    }
3579    default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3580    }
3581 }
3582 
3583 void
3584 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3585 {
3586    Temp dst = get_ssa_temp(ctx, &instr->def);
3587 
3588    // TODO: we really want to have the resulting type, as this would allow for 64-bit literals
3589    // which get truncated to the lsb if double and to the msb if int.
3590    // For now, we only use s_mov_b64 with 64-bit inline constants.
3591    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3592    assert(dst.type() == RegType::sgpr);
3593 
3594    Builder bld(ctx->program, ctx->block);
3595 
3596    if (instr->def.bit_size == 1) {
3597       assert(dst.regClass() == bld.lm);
3598       int val = instr->value[0].b ? -1 : 0;
3599       Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3600       bld.copy(Definition(dst), op);
3601    } else if (instr->def.bit_size == 8) {
3602       bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3603    } else if (instr->def.bit_size == 16) {
3604       /* sign-extend to use s_movk_i32 instead of a literal */
3605       bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3606    } else if (dst.size() == 1) {
3607       bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3608    } else {
3609       assert(dst.size() != 1);
3610       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3611          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3612       if (instr->def.bit_size == 64)
3613          for (unsigned i = 0; i < dst.size(); i++)
3614             vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3615       else {
3616          for (unsigned i = 0; i < dst.size(); i++)
3617             vec->operands[i] = Operand::c32(instr->value[i].u32);
3618       }
3619       vec->definitions[0] = Definition(dst);
3620       ctx->block->instructions.emplace_back(std::move(vec));
3621    }
3622 }
3623 
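/* Expand each set bit of a mask into `multiplier` consecutive set bits,
 * e.g. widen_mask(0b0101, 2) == 0b00110011. Used below to turn per-component
 * write masks into per-byte (or per-dword) masks.
 */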
3624 uint32_t
3625 widen_mask(uint32_t mask, unsigned multiplier)
3626 {
3627    uint32_t new_mask = 0;
3628    for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
3629       if (mask & (1u << i))
3630          new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
3631    return new_mask;
3632 }
3633 
3634 struct LoadEmitInfo {
3635    Operand offset;
3636    Temp dst;
3637    unsigned num_components;
3638    unsigned component_size;
3639    Temp resource = Temp(0, s1);
3640    unsigned component_stride = 0;
3641    unsigned const_offset = 0;
3642    unsigned align_mul = 0;
3643    unsigned align_offset = 0;
3644 
3645    bool glc = false;
3646    bool slc = false;
3647    unsigned swizzle_component_size = 0;
3648    memory_sync_info sync;
3649    Temp soffset = Temp(0, s1);
3650 };
3651 
3652 struct EmitLoadParameters {
3653    using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
3654                              unsigned bytes_needed, unsigned align, unsigned const_offset,
3655                              Temp dst_hint);
3656 
3657    Callback callback;
3658    bool byte_align_loads;
3659    bool supports_8bit_16bit_loads;
3660    unsigned max_const_offset_plus_one;
3661 };
3662 
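/* Generic load splitting: repeatedly asks params.callback for the largest chunk it can
 * emit for the remaining bytes (given alignment, swizzling and the constant-offset range
 * of the instruction format), then recombines the returned temporaries into info.dst with
 * p_create_vector/p_split_vector. When params.byte_align_loads is set, unaligned sub-dword
 * results are over-read and shifted right afterwards.
 */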
3663 void
3664 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
3665           const EmitLoadParameters& params)
3666 {
3667    unsigned load_size = info.num_components * info.component_size;
3668    unsigned component_size = info.component_size;
3669 
3670    unsigned num_vals = 0;
3671    Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
3672 
3673    unsigned const_offset = info.const_offset;
3674 
3675    const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
3676    unsigned align_offset = (info.align_offset + const_offset) % align_mul;
3677 
3678    unsigned bytes_read = 0;
3679    while (bytes_read < load_size) {
3680       unsigned bytes_needed = load_size - bytes_read;
3681 
3682       /* add buffer for unaligned loads */
3683       int byte_align = 0;
3684       if (params.byte_align_loads) {
3685          byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3686       }
3687 
3688       if (byte_align) {
3689          if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3690              !params.supports_8bit_16bit_loads) {
3691             if (info.component_stride) {
3692                assert(params.supports_8bit_16bit_loads && "unimplemented");
3693                bytes_needed = 2;
3694                byte_align = 0;
3695             } else {
3696                bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
3697                bytes_needed = align(bytes_needed, 4);
3698             }
3699          } else {
3700             byte_align = 0;
3701          }
3702       }
3703 
3704       if (info.swizzle_component_size)
3705          bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
3706       if (info.component_stride)
3707          bytes_needed = MIN2(bytes_needed, info.component_size);
3708 
3709       bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3710 
3711       /* reduce constant offset */
3712       Operand offset = info.offset;
3713       unsigned reduced_const_offset = const_offset;
3714       bool remove_const_offset_completely = need_to_align_offset;
3715       if (const_offset &&
3716           (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
3717          unsigned to_add = const_offset;
3718          if (remove_const_offset_completely) {
3719             reduced_const_offset = 0;
3720          } else {
3721             to_add =
3722                const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
3723             reduced_const_offset %= params.max_const_offset_plus_one;
3724          }
3725          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3726          if (offset.isConstant()) {
3727             offset = Operand::c32(offset.constantValue() + to_add);
3728          } else if (offset_tmp.regClass() == s1) {
3729             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
3730                               Operand::c32(to_add));
3731          } else if (offset_tmp.regClass() == v1) {
3732             offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
3733          } else {
3734             Temp lo = bld.tmp(offset_tmp.type(), 1);
3735             Temp hi = bld.tmp(offset_tmp.type(), 1);
3736             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3737 
3738             if (offset_tmp.regClass() == s2) {
3739                Temp carry = bld.tmp(s1);
3740                lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
3741                              Operand::c32(to_add));
3742                hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3743                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3744             } else {
3745                Temp new_lo = bld.tmp(v1);
3746                Temp carry =
3747                   bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
3748                hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
3749                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3750             }
3751          }
3752       }
3753 
3754       /* align offset down if needed */
3755       Operand aligned_offset = offset;
3756       unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3757       if (need_to_align_offset) {
3758          align = 4;
3759          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3760          if (offset.isConstant()) {
3761             aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
3762          } else if (offset_tmp.regClass() == s1) {
3763             aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3764                                       Operand::c32(0xfffffffcu), offset_tmp);
3765          } else if (offset_tmp.regClass() == s2) {
3766             aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
3767                                       Operand::c64(0xfffffffffffffffcllu), offset_tmp);
3768          } else if (offset_tmp.regClass() == v1) {
3769             aligned_offset =
3770                bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
3771          } else if (offset_tmp.regClass() == v2) {
3772             Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3773             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3774             lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
3775             aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3776          }
3777       }
3778       Temp aligned_offset_tmp =
3779          aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset);
3780 
3781       Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3782                                  reduced_const_offset, byte_align ? Temp() : info.dst);
3783 
3784       /* the callback wrote directly to dst */
3785       if (val == info.dst) {
3786          assert(num_vals == 0);
3787          emit_split_vector(ctx, info.dst, info.num_components);
3788          return;
3789       }
3790 
3791       /* shift result right if needed */
3792       if (params.byte_align_loads && info.component_size < 4) {
3793          Operand byte_align_off = Operand::c32(byte_align);
3794          if (byte_align == -1) {
3795             if (offset.isConstant())
3796                byte_align_off = Operand::c32(offset.constantValue() % 4u);
3797             else if (offset.size() == 2)
3798                byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
3799                                                             RegClass(offset.getTemp().type(), 1)));
3800             else
3801                byte_align_off = offset;
3802          }
3803 
3804          assert(val.bytes() >= load_size && "unimplemented");
3805          if (val.type() == RegType::sgpr)
3806             byte_align_scalar(ctx, val, byte_align_off, info.dst);
3807          else
3808             byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
3809          return;
3810       }
3811 
3812       /* add result to list and advance */
3813       if (info.component_stride) {
3814          assert(val.bytes() == info.component_size && "unimplemented");
3815          const_offset += info.component_stride;
3816          align_offset = (align_offset + info.component_stride) % align_mul;
3817       } else {
3818          const_offset += val.bytes();
3819          align_offset = (align_offset + val.bytes()) % align_mul;
3820       }
3821       bytes_read += val.bytes();
3822       vals[num_vals++] = val;
3823    }
3824 
3825    /* create array of components */
3826    unsigned components_split = 0;
3827    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3828    bool has_vgprs = false;
3829    for (unsigned i = 0; i < num_vals;) {
3830       Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
3831       unsigned num_tmps = 0;
3832       unsigned tmp_size = 0;
3833       RegType reg_type = RegType::sgpr;
3834       while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3835          if (vals[i].type() == RegType::vgpr)
3836             reg_type = RegType::vgpr;
3837          tmp_size += vals[i].bytes();
3838          tmp[num_tmps++] = vals[i++];
3839       }
3840       if (num_tmps > 1) {
3841          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3842             aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3843          for (unsigned j = 0; j < num_tmps; j++)
3844             vec->operands[j] = Operand(tmp[j]);
3845          tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3846          vec->definitions[0] = Definition(tmp[0]);
3847          bld.insert(std::move(vec));
3848       }
3849 
3850       if (tmp[0].bytes() % component_size) {
3851          /* trim tmp[0] */
3852          assert(i == num_vals);
3853          RegClass new_rc =
3854             RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3855          tmp[0] =
3856             bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
3857       }
3858 
3859       RegClass elem_rc = RegClass::get(reg_type, component_size);
3860 
3861       unsigned start = components_split;
3862 
3863       if (tmp_size == elem_rc.bytes()) {
3864          allocated_vec[components_split++] = tmp[0];
3865       } else {
3866          assert(tmp_size % elem_rc.bytes() == 0);
3867          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3868             aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3869          for (auto& def : split->definitions) {
3870             Temp component = bld.tmp(elem_rc);
3871             allocated_vec[components_split++] = component;
3872             def = Definition(component);
3873          }
3874          split->operands[0] = Operand(tmp[0]);
3875          bld.insert(std::move(split));
3876       }
3877 
3878       /* try to p_as_uniform early so we can create more optimizable code and
3879        * also update allocated_vec */
3880       for (unsigned j = start; j < components_split; j++) {
3881          if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
3882             allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3883          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3884       }
3885    }
3886 
3887    /* concatenate components and p_as_uniform() result if needed */
3888    if (info.dst.type() == RegType::vgpr || !has_vgprs)
3889       ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
3890 
3891    int padding_bytes =
3892       MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
3893 
3894    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3895       aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
3896    for (unsigned i = 0; i < info.num_components; i++)
3897       vec->operands[i] = Operand(allocated_vec[i]);
3898    if (padding_bytes)
3899       vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3900    if (info.dst.type() == RegType::sgpr && has_vgprs) {
3901       Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
3902       vec->definitions[0] = Definition(tmp);
3903       bld.insert(std::move(vec));
3904       bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
3905    } else {
3906       vec->definitions[0] = Definition(info.dst);
3907       bld.insert(std::move(vec));
3908    }
3909 }
3910 
3911 Operand
3912 load_lds_size_m0(Builder& bld)
3913 {
3914    /* m0 does not need to be initialized on GFX9+ */
3915    if (bld.program->chip_class >= GFX9)
3916       return Operand(s1);
3917 
3918    return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
3919 }
3920 
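/* Callback for emit_load() that emits LDS reads. It picks the widest ds_read_* the
 * alignment allows; ds_read2_b32/b64 encode two 8-bit offsets in units of the element
 * size (4 or 8 bytes), which is why const_offset is rescaled below.
 */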
3921 Temp
3922 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3923                   unsigned align, unsigned const_offset, Temp dst_hint)
3924 {
3925    offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3926 
3927    Operand m = load_lds_size_m0(bld);
3928 
3929    bool large_ds_read = bld.program->chip_class >= GFX7;
3930    bool usable_read2 = bld.program->chip_class >= GFX7;
3931 
3932    bool read2 = false;
3933    unsigned size = 0;
3934    aco_opcode op;
3935    if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3936       size = 16;
3937       op = aco_opcode::ds_read_b128;
3938    } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3939       size = 16;
3940       read2 = true;
3941       op = aco_opcode::ds_read2_b64;
3942    } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3943       size = 12;
3944       op = aco_opcode::ds_read_b96;
3945    } else if (bytes_needed >= 8 && align % 8 == 0) {
3946       size = 8;
3947       op = aco_opcode::ds_read_b64;
3948    } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
3949       size = 8;
3950       read2 = true;
3951       op = aco_opcode::ds_read2_b32;
3952    } else if (bytes_needed >= 4 && align % 4 == 0) {
3953       size = 4;
3954       op = aco_opcode::ds_read_b32;
3955    } else if (bytes_needed >= 2 && align % 2 == 0) {
3956       size = 2;
3957       op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
3958    } else {
3959       size = 1;
3960       op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
3961    }
3962 
3963    unsigned const_offset_unit = read2 ? size / 2u : 1u;
3964    unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
3965 
3966    if (const_offset > (const_offset_range - const_offset_unit)) {
3967       unsigned excess = const_offset - (const_offset % const_offset_range);
3968       offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
3969       const_offset -= excess;
3970    }
3971 
3972    const_offset /= const_offset_unit;
3973 
3974    RegClass rc = RegClass::get(RegType::vgpr, size);
3975    Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3976    Instruction* instr;
3977    if (read2)
3978       instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3979    else
3980       instr = bld.ds(op, Definition(val), offset, m, const_offset);
3981    instr->ds().sync = info.sync;
3982 
3983    if (m.isUndefined())
3984       instr->operands.pop_back();
3985 
3986    return val;
3987 }
3988 
3989 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
3990 
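/* Callback for emit_load() that emits scalar (SMEM) loads. SMEM only loads whole dwords,
 * so smem_load_params enables byte_align_loads (emit_load over-reads and shifts) and does
 * not support 8/16-bit loads directly.
 */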
3991 Temp
3992 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3993                    unsigned align, unsigned const_offset, Temp dst_hint)
3994 {
3995    unsigned size = 0;
3996    aco_opcode op;
3997    if (bytes_needed <= 4) {
3998       size = 1;
3999       op = info.resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4000    } else if (bytes_needed <= 8) {
4001       size = 2;
4002       op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4003    } else if (bytes_needed <= 16) {
4004       size = 4;
4005       op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4006    } else if (bytes_needed <= 32) {
4007       size = 8;
4008       op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4009    } else {
4010       size = 16;
4011       op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4012    }
4013    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4014    if (info.resource.id()) {
4015       load->operands[0] = Operand(info.resource);
4016       load->operands[1] = Operand(offset);
4017    } else {
4018       load->operands[0] = Operand(offset);
4019       load->operands[1] = Operand::zero();
4020    }
4021    RegClass rc(RegType::sgpr, size);
4022    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4023    load->definitions[0] = Definition(val);
4024    load->glc = info.glc;
4025    load->dlc = info.glc && bld.program->chip_class >= GFX10;
4026    load->sync = info.sync;
4027    bld.insert(std::move(load));
4028    return val;
4029 }
4030 
4031 const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4032 
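/* Callback for emit_load() that emits MUBUF buffer_load_* instructions: byte/short
 * variants for 1- or 2-byte loads and for insufficiently aligned accesses, otherwise the
 * widest dword form that fits; buffer_load_dwordx3 is not used on GFX6.
 */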
4033 Temp
4034 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4035                     unsigned align_, unsigned const_offset, Temp dst_hint)
4036 {
4037    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4038    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4039 
4040    if (info.soffset.id()) {
4041       if (soffset.isTemp())
4042          vaddr = bld.copy(bld.def(v1), soffset);
4043       soffset = Operand(info.soffset);
4044    }
4045 
4046    unsigned bytes_size = 0;
4047    aco_opcode op;
4048    if (bytes_needed == 1 || align_ % 2) {
4049       bytes_size = 1;
4050       op = aco_opcode::buffer_load_ubyte;
4051    } else if (bytes_needed == 2 || align_ % 4) {
4052       bytes_size = 2;
4053       op = aco_opcode::buffer_load_ushort;
4054    } else if (bytes_needed <= 4) {
4055       bytes_size = 4;
4056       op = aco_opcode::buffer_load_dword;
4057    } else if (bytes_needed <= 8) {
4058       bytes_size = 8;
4059       op = aco_opcode::buffer_load_dwordx2;
4060    } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
4061       bytes_size = 12;
4062       op = aco_opcode::buffer_load_dwordx3;
4063    } else {
4064       bytes_size = 16;
4065       op = aco_opcode::buffer_load_dwordx4;
4066    }
4067    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4068    mubuf->operands[0] = Operand(info.resource);
4069    mubuf->operands[1] = vaddr;
4070    mubuf->operands[2] = soffset;
4071    mubuf->offen = (offset.type() == RegType::vgpr);
4072    mubuf->glc = info.glc;
4073    mubuf->dlc = info.glc && bld.program->chip_class >= GFX10;
4074    mubuf->slc = info.slc;
4075    mubuf->sync = info.sync;
4076    mubuf->offset = const_offset;
4077    mubuf->swizzled = info.swizzle_component_size != 0;
4078    RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4079    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4080    mubuf->definitions[0] = Definition(val);
4081    bld.insert(std::move(mubuf));
4082 
4083    return val;
4084 }
4085 
4086 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4087 const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096};
4088 
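/* GFX6 has no FLAT/GLOBAL instructions, so global memory is accessed through MUBUF with a
 * "raw" buffer descriptor: the base address (or zero when addr64 addressing is used),
 * num_records = -1 (effectively unbounded) and a 32-bit data format.
 */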
4089 Temp
4090 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4091 {
4092    uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4093                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4094 
4095    if (addr.type() == RegType::vgpr)
4096       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4097                         Operand::c32(-1u), Operand::c32(rsrc_conf));
4098    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
4099                      Operand::c32(rsrc_conf));
4100 }
4101 
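/* Callback for emit_load() that emits global memory loads: MUBUF on GFX6, FLAT on
 * GFX7-GFX8 and GLOBAL on GFX9+, again choosing the widest size the remaining bytes allow.
 */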
4102 Temp
4103 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4104                      unsigned align_, unsigned const_offset, Temp dst_hint)
4105 {
4106    unsigned bytes_size = 0;
4107    bool use_mubuf = bld.program->chip_class == GFX6;
4108    bool global = bld.program->chip_class >= GFX9;
4109    aco_opcode op;
4110    if (bytes_needed == 1) {
4111       bytes_size = 1;
4112       op = use_mubuf ? aco_opcode::buffer_load_ubyte
4113            : global  ? aco_opcode::global_load_ubyte
4114                      : aco_opcode::flat_load_ubyte;
4115    } else if (bytes_needed == 2) {
4116       bytes_size = 2;
4117       op = use_mubuf ? aco_opcode::buffer_load_ushort
4118            : global  ? aco_opcode::global_load_ushort
4119                      : aco_opcode::flat_load_ushort;
4120    } else if (bytes_needed <= 4) {
4121       bytes_size = 4;
4122       op = use_mubuf ? aco_opcode::buffer_load_dword
4123            : global  ? aco_opcode::global_load_dword
4124                      : aco_opcode::flat_load_dword;
4125    } else if (bytes_needed <= 8) {
4126       bytes_size = 8;
4127       op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4128            : global  ? aco_opcode::global_load_dwordx2
4129                      : aco_opcode::flat_load_dwordx2;
4130    } else if (bytes_needed <= 12 && !use_mubuf) {
4131       bytes_size = 12;
4132       op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4133    } else {
4134       bytes_size = 16;
4135       op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4136            : global  ? aco_opcode::global_load_dwordx4
4137                      : aco_opcode::flat_load_dwordx4;
4138    }
4139    RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
4140    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4141    if (use_mubuf) {
4142       aco_ptr<MUBUF_instruction> mubuf{
4143          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4144       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
4145       mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4146       mubuf->operands[2] = Operand::zero();
4147       mubuf->glc = info.glc;
4148       mubuf->dlc = false;
4149       mubuf->offset = 0;
4150       mubuf->addr64 = offset.type() == RegType::vgpr;
4151       mubuf->disable_wqm = false;
4152       mubuf->sync = info.sync;
4153       mubuf->definitions[0] = Definition(val);
4154       bld.insert(std::move(mubuf));
4155    } else {
4156       offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
4157 
4158       aco_ptr<FLAT_instruction> flat{
4159          create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4160       flat->operands[0] = Operand(offset);
4161       flat->operands[1] = Operand(s1);
4162       flat->glc = info.glc;
4163       flat->dlc = info.glc && bld.program->chip_class >= GFX10;
4164       flat->sync = info.sync;
4165       flat->offset = 0u;
4166       flat->definitions[0] = Definition(val);
4167       bld.insert(std::move(flat));
4168    }
4169 
4170    return val;
4171 }
4172 
4173 const EmitLoadParameters global_load_params{global_load_callback, true, true, 1};
4174 
4175 Temp
4176 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4177          Temp address, unsigned base_offset, unsigned align)
4178 {
4179    assert(util_is_power_of_two_nonzero(align));
4180 
4181    Builder bld(ctx->program, ctx->block);
4182 
4183    LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4184    info.align_mul = align;
4185    info.align_offset = 0;
4186    info.sync = memory_sync_info(storage_shared);
4187    info.const_offset = base_offset;
4188    emit_load(ctx, bld, info, lds_load_params);
4189 
4190    return dst;
4191 }
4192 
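/* Splits `src` into `count` temporaries of the requested byte sizes, reusing the
 * components recorded in ctx->allocated_vec when possible to avoid redundant
 * p_split_vector/p_create_vector instructions.
 */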
4193 void
4194 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4195                  Temp src)
4196 {
4197    if (!count)
4198       return;
4199 
4200    Builder bld(ctx->program, ctx->block);
4201 
4202    /* count == 1 fast path */
4203    if (count == 1) {
4204       if (dst_type == RegType::sgpr)
4205          dst[0] = bld.as_uniform(src);
4206       else
4207          dst[0] = as_vgpr(ctx, src);
4208       return;
4209    }
4210 
4211    /* elem_size_bytes is the greatest common divisor which is a power of 2 */
4212    unsigned elem_size_bytes =
4213       1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4214 
4215    ASSERTED bool is_subdword = elem_size_bytes < 4;
4216    assert(!is_subdword || dst_type == RegType::vgpr);
4217 
4218    for (unsigned i = 0; i < count; i++)
4219       dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4220 
4221    std::vector<Temp> temps;
4222    /* use allocated_vec if possible */
4223    auto it = ctx->allocated_vec.find(src.id());
4224    if (it != ctx->allocated_vec.end()) {
4225       if (!it->second[0].id())
4226          goto split;
4227       unsigned elem_size = it->second[0].bytes();
4228       assert(src.bytes() % elem_size == 0);
4229 
4230       for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4231          if (!it->second[i].id())
4232             goto split;
4233       }
4234       if (elem_size_bytes % elem_size)
4235          goto split;
4236 
4237       temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4238       elem_size_bytes = elem_size;
4239    }
4240 
4241 split:
4242    /* split src if necessary */
4243    if (temps.empty()) {
4244       if (is_subdword && src.type() == RegType::sgpr)
4245          src = as_vgpr(ctx, src);
4246       if (dst_type == RegType::sgpr)
4247          src = bld.as_uniform(src);
4248 
4249       unsigned num_elems = src.bytes() / elem_size_bytes;
4250       aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4251          aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4252       split->operands[0] = Operand(src);
4253       for (unsigned i = 0; i < num_elems; i++) {
4254          temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4255          split->definitions[i] = Definition(temps.back());
4256       }
4257       bld.insert(std::move(split));
4258    }
4259 
4260    unsigned idx = 0;
4261    for (unsigned i = 0; i < count; i++) {
4262       unsigned op_count = dst[i].bytes() / elem_size_bytes;
4263       if (op_count == 1) {
4264          if (dst_type == RegType::sgpr)
4265             dst[i] = bld.as_uniform(temps[idx++]);
4266          else
4267             dst[i] = as_vgpr(ctx, temps[idx++]);
4268          continue;
4269       }
4270 
4271       aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4272                                                                       Format::PSEUDO, op_count, 1)};
4273       for (unsigned j = 0; j < op_count; j++) {
4274          Temp tmp = temps[idx++];
4275          if (dst_type == RegType::sgpr)
4276             tmp = bld.as_uniform(tmp);
4277          vec->operands[j] = Operand(tmp);
4278       }
4279       vec->definitions[0] = Definition(dst[i]);
4280       bld.insert(std::move(vec));
4281    }
4282    return;
4283 }
4284 
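/* Scans the next consecutive run of bits in `todo_mask` that are either all written
 * (returns true) or all skipped (returns false) according to `mask`, storing the run's
 * start and length in *start/*count. Callers then clear that run from the todo mask with
 * advance_write_mask().
 */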
4285 bool
4286 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4287 {
4288    unsigned start_elem = ffs(todo_mask) - 1;
4289    bool skip = !(mask & (1 << start_elem));
4290    if (skip)
4291       mask = ~mask & todo_mask;
4292 
4293    mask &= todo_mask;
4294 
4295    u_bit_scan_consecutive_range(&mask, start, count);
4296 
4297    return !skip;
4298 }
4299 
4300 void
4301 advance_write_mask(uint32_t* todo_mask, int start, int count)
4302 {
4303    *todo_mask &= ~u_bit_consecutive(0, count) << start;
4304 }
4305 
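/* Stores `data` to LDS at address + base_offset, honoring the per-component write mask.
 * The data is split into the largest aligned ds_write_b* chunks, and pairs of b32/b64
 * writes whose offsets differ by a multiple of the element size are merged into
 * ds_write2_b32/ds_write2_b64.
 */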
4306 void
4307 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4308           unsigned base_offset, unsigned align)
4309 {
4310    assert(util_is_power_of_two_nonzero(align));
4311    assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4312 
4313    Builder bld(ctx->program, ctx->block);
4314    bool large_ds_write = ctx->options->chip_class >= GFX7;
4315    bool usable_write2 = ctx->options->chip_class >= GFX7;
4316 
4317    unsigned write_count = 0;
4318    Temp write_datas[32];
4319    unsigned offsets[32];
4320    unsigned bytes[32];
4321    aco_opcode opcodes[32];
4322 
4323    wrmask = widen_mask(wrmask, elem_size_bytes);
4324 
4325    uint32_t todo = u_bit_consecutive(0, data.bytes());
4326    while (todo) {
4327       int offset, byte;
4328       if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
4329          offsets[write_count] = offset;
4330          bytes[write_count] = byte;
4331          opcodes[write_count] = aco_opcode::num_opcodes;
4332          write_count++;
4333          advance_write_mask(&todo, offset, byte);
4334          continue;
4335       }
4336 
4337       bool aligned2 = offset % 2 == 0 && align % 2 == 0;
4338       bool aligned4 = offset % 4 == 0 && align % 4 == 0;
4339       bool aligned8 = offset % 8 == 0 && align % 8 == 0;
4340       bool aligned16 = offset % 16 == 0 && align % 16 == 0;
4341 
4342       // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
4343       aco_opcode op = aco_opcode::num_opcodes;
4344       if (byte >= 16 && aligned16 && large_ds_write) {
4345          op = aco_opcode::ds_write_b128;
4346          byte = 16;
4347       } else if (byte >= 12 && aligned16 && large_ds_write) {
4348          op = aco_opcode::ds_write_b96;
4349          byte = 12;
4350       } else if (byte >= 8 && aligned8) {
4351          op = aco_opcode::ds_write_b64;
4352          byte = 8;
4353       } else if (byte >= 4 && aligned4) {
4354          op = aco_opcode::ds_write_b32;
4355          byte = 4;
4356       } else if (byte >= 2 && aligned2) {
4357          op = aco_opcode::ds_write_b16;
4358          byte = 2;
4359       } else if (byte >= 1) {
4360          op = aco_opcode::ds_write_b8;
4361          byte = 1;
4362       } else {
4363          assert(false);
4364       }
4365 
4366       offsets[write_count] = offset;
4367       bytes[write_count] = byte;
4368       opcodes[write_count] = op;
4369       write_count++;
4370       advance_write_mask(&todo, offset, byte);
4371    }
4372 
4373    Operand m = load_lds_size_m0(bld);
4374 
4375    split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4376 
4377    for (unsigned i = 0; i < write_count; i++) {
4378       aco_opcode op = opcodes[i];
4379       if (op == aco_opcode::num_opcodes)
4380          continue;
4381 
4382       Temp split_data = write_datas[i];
4383 
4384       unsigned second = write_count;
4385       if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
4386          for (second = i + 1; second < write_count; second++) {
4387             if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
4388                op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4389                opcodes[second] = aco_opcode::num_opcodes;
4390                break;
4391             }
4392          }
4393       }
4394 
4395       bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
4396       unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
4397 
4398       unsigned inline_offset = base_offset + offsets[i];
4399       unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
4400       Temp address_offset = address;
4401       if (inline_offset > max_offset) {
4402          address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
4403          inline_offset = offsets[i];
4404       }
4405 
4406       /* offsets[i] shouldn't be large enough for this to happen */
4407       assert(inline_offset <= max_offset);
4408 
4409       Instruction* instr;
4410       if (write2) {
4411          Temp second_data = write_datas[second];
4412          inline_offset /= split_data.bytes();
4413          instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
4414                         inline_offset + write2_off);
4415       } else {
4416          instr = bld.ds(op, address_offset, split_data, m, inline_offset);
4417       }
4418       instr->ds().sync = memory_sync_info(storage_shared);
4419 
4420       if (m.isUndefined())
4421          instr->operands.pop_back();
4422    }
4423 }
4424 
4425 aco_opcode
4426 get_buffer_store_op(unsigned bytes)
4427 {
4428    switch (bytes) {
4429    case 1: return aco_opcode::buffer_store_byte;
4430    case 2: return aco_opcode::buffer_store_short;
4431    case 4: return aco_opcode::buffer_store_dword;
4432    case 8: return aco_opcode::buffer_store_dwordx2;
4433    case 12: return aco_opcode::buffer_store_dwordx3;
4434    case 16: return aco_opcode::buffer_store_dwordx4;
4435    }
4436    unreachable("Unexpected store size");
4437    return aco_opcode::num_opcodes;
4438 }
4439 
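/* Splits `data` into store-sized chunks according to the write mask: only 1, 2, 4, 8, 12
 * and 16-byte stores are emitted, clamped to the swizzle element size and to what the
 * alignment allows, and 12-byte stores are avoided on SMEM and GFX6 VMEM.
 */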
4440 void
4441 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4442                    Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
4443                    Temp* write_datas, unsigned* offsets)
4444 {
4445    unsigned write_count_with_skips = 0;
4446    bool skips[16];
4447    unsigned bytes[16];
4448 
4449    /* determine how to split the data */
4450    unsigned todo = u_bit_consecutive(0, data.bytes());
4451    while (todo) {
4452       int offset, byte;
4453       skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
4454       offsets[write_count_with_skips] = offset;
4455       if (skips[write_count_with_skips]) {
4456          bytes[write_count_with_skips] = byte;
4457          advance_write_mask(&todo, offset, byte);
4458          write_count_with_skips++;
4459          continue;
4460       }
4461 
4462       /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
4463        * larger than swizzle_element_size */
4464       byte = MIN2(byte, swizzle_element_size);
4465       if (byte % 4)
4466          byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
4467 
4468       /* SMEM and GFX6 VMEM can't emit 12-byte stores */
4469       if ((ctx->program->chip_class == GFX6 || smem) && byte == 12)
4470          byte = 8;
4471 
4472       /* dword or larger stores have to be dword-aligned */
4473       unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
4474       unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
4475       bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
4476       if (!dword_aligned)
4477          byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
4478 
4479       bytes[write_count_with_skips] = byte;
4480       advance_write_mask(&todo, offset, byte);
4481       write_count_with_skips++;
4482    }
4483 
4484    /* actually split data */
4485    split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
4486 
4487    /* remove skips */
4488    for (unsigned i = 0; i < write_count_with_skips; i++) {
4489       if (skips[i])
4490          continue;
4491       write_datas[*write_count] = write_datas[i];
4492       offsets[*write_count] = offsets[i];
4493       (*write_count)++;
4494    }
4495 }
4496 
4497 Temp
4498 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
4499                       unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
4500 {
4501    Builder bld(ctx->program, ctx->block);
4502    unsigned dword_size = elem_size_bytes / 4;
4503 
4504    if (!dst.id())
4505       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
4506 
4507    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4508    aco_ptr<Pseudo_instruction> instr{
4509       create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
4510    instr->definitions[0] = Definition(dst);
4511 
4512    for (unsigned i = 0; i < cnt; ++i) {
4513       if (arr[i].id()) {
4514          assert(arr[i].size() == dword_size);
4515          allocated_vec[i] = arr[i];
4516          instr->operands[i] = Operand(arr[i]);
4517       } else {
4518          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
4519                               Operand::zero(dword_size == 2 ? 8 : 4));
4520          allocated_vec[i] = zero;
4521          instr->operands[i] = Operand(zero);
4522       }
4523    }
4524 
4525    bld.insert(std::move(instr));
4526 
4527    if (split_cnt)
4528       emit_split_vector(ctx, dst, split_cnt);
4529    else
4530       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
4531 
4532    return dst;
4533 }
4534 
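/* The MUBUF immediate offset field is only 12 bits (0..4095), so any larger constant
 * offset is folded into the VGPR/SGPR offset here and only the remainder is kept as the
 * immediate.
 */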
4535 inline unsigned
4536 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
4537 {
4538    if (const_offset >= 4096) {
4539       unsigned excess_const_offset = const_offset / 4096u * 4096u;
4540       const_offset %= 4096u;
4541 
4542       if (!voffset.id())
4543          voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
4544       else if (unlikely(voffset.regClass() == s1))
4545          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
4546                             Operand::c32(excess_const_offset), Operand(voffset));
4547       else if (likely(voffset.regClass() == v1))
4548          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
4549       else
4550          unreachable("Unsupported register class of voffset");
4551    }
4552 
4553    return const_offset;
4554 }
4555 
4556 void
4557 emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
4558                         unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
4559                         bool slc = false, bool swizzled = false)
4560 {
4561    assert(vdata.id());
4562    assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
4563    assert(vdata.size() >= 1 && vdata.size() <= 4);
4564 
4565    Builder bld(ctx->program, ctx->block);
4566    aco_opcode op = get_buffer_store_op(vdata.bytes());
4567    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
4568 
4569    Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
4570    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
4571    Builder::Result r =
4572       bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
4573                 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
4574                 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
4575                 /* dlc*/ false, /* slc */ slc);
4576 
4577    r.instr->mubuf().sync = sync;
4578 }
4579 
4580 void
4581 store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
4582                  unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
4583                  bool allow_combining = true, memory_sync_info sync = memory_sync_info(),
4584                  bool slc = false)
4585 {
4586    Builder bld(ctx->program, ctx->block);
4587    assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4588    assert(write_mask);
4589    write_mask = widen_mask(write_mask, elem_size_bytes);
4590 
4591    unsigned write_count = 0;
4592    Temp write_datas[32];
4593    unsigned offsets[32];
4594    split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4,
4595                       &write_count, write_datas, offsets);
4596 
4597    for (unsigned i = 0; i < write_count; i++) {
4598       unsigned const_offset = offsets[i] + base_const_offset;
4599       emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
4600                               slc, !allow_combining);
4601    }
4602 }
4603 
4604 void
4605 load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
4606                 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
4607                 unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,
4608                 bool slc = false)
4609 {
4610    assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4611    assert((num_components * elem_size_bytes) == dst.bytes());
4612    assert(!!stride != allow_combining);
4613 
4614    Builder bld(ctx->program, ctx->block);
4615 
4616    LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
4617    info.component_stride = allow_combining ? 0 : stride;
4618    info.glc = true;
4619    info.slc = slc;
4620    info.swizzle_component_size = allow_combining ? 0 : 4;
4621    info.align_mul = MIN2(elem_size_bytes, 4);
4622    info.align_offset = 0;
4623    info.soffset = soffset;
4624    info.const_offset = base_const_offset;
4625    emit_load(ctx, bld, info, mubuf_load_params);
4626 }
4627 
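/* Wave id within the workgroup, taken from bits [27:24] of merged_wave_info; the s_bfe_u32
 * operand encodes offset=24, width=4 as 24 | (4 << 16).
 */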
4628 Temp
4629 wave_id_in_threadgroup(isel_context* ctx)
4630 {
4631    Builder bld(ctx->program, ctx->block);
4632    return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
4633                    get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16)));
4634 }
4635 
4636 Temp
4637 thread_id_in_threadgroup(isel_context* ctx)
4638 {
4639    /* tid_in_tg = wave_id * wave_size + tid_in_wave */
4640 
4641    Builder bld(ctx->program, ctx->block);
4642    Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
4643 
4644    if (ctx->program->workgroup_size <= ctx->program->wave_size)
4645       return tid_in_wave;
4646 
4647    Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
4648    Temp num_pre_threads =
4649       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
4650                Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
4651    return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
4652 }
4653 
4654 Temp
4655 get_tess_rel_patch_id(isel_context* ctx)
4656 {
4657    Builder bld(ctx->program, ctx->block);
4658 
4659    switch (ctx->shader->info.stage) {
4660    case MESA_SHADER_TESS_CTRL:
4661       return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
4662                         Operand::zero(), Operand::c32(8u), Operand::zero());
4663    case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id);
4664    default: unreachable("Unsupported stage in get_tess_rel_patch_id");
4665    }
4666 }
4667 
4668 bool
4669 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
4670 {
4671    unsigned write_mask = nir_intrinsic_write_mask(instr);
4672    unsigned component = nir_intrinsic_component(instr);
4673    unsigned idx = nir_intrinsic_base(instr) * 4u + component;
4674    nir_src offset = *nir_get_io_offset_src(instr);
4675 
4676    if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
4677       return false;
4678 
4679    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4680 
4681    if (instr->src[0].ssa->bit_size == 64)
4682       write_mask = widen_mask(write_mask, 2);
4683 
4684    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4685 
4686    for (unsigned i = 0; i < 8; ++i) {
4687       if (write_mask & (1 << i)) {
4688          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4689          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4690       }
4691       idx++;
4692    }
4693 
4694    return true;
4695 }
4696 
4697 bool
4698 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
4699 {
4700    /* Only TCS per-vertex inputs are supported by this function.
4701     * Per-vertex inputs only match between the VS and TCS invocation IDs when the number of
4702     * invocations is the same.
4703     */
4704    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4705       return false;
4706 
4707    nir_src* off_src = nir_get_io_offset_src(instr);
4708    nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr);
4709    nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
4710    bool can_use_temps =
4711       nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
4712       nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4713 
4714    if (!can_use_temps)
4715       return false;
4716 
4717    unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) +
4718                   4 * nir_src_as_uint(*off_src);
4719    Temp* src = &ctx->inputs.temps[idx];
4720    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4721 
4722    return true;
4723 }
4724 
4725 static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos);
4726 
4727 void
4728 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
4729 {
4730    if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs ||
4731        ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg ||
4732        (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4733        ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4734       bool stored_to_temps = store_output_to_temps(ctx, instr);
4735       if (!stored_to_temps) {
4736          isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
4737          abort();
4738       }
4739    } else {
4740       unreachable("Shader stage not implemented");
4741    }
4742 
4743    /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports,
4744     * so we have to emit the export here. */
4745    if (ctx->stage.hw == HWStage::NGG &&
4746        (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
4747        nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
4748       export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
4749 }
4750 
4751 void
4752 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
4753                   Temp prim_mask)
4754 {
4755    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4756    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4757 
4758    Builder bld(ctx->program, ctx->block);
4759 
4760    if (dst.regClass() == v2b) {
4761       if (ctx->program->dev.has_16bank_lds) {
4762          assert(ctx->options->chip_class <= GFX8);
4763          Builder::Result interp_p1 =
4764             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
4765                        bld.m0(prim_mask), idx, component);
4766          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
4767                                 bld.m0(prim_mask), interp_p1, idx, component);
4768          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
4769                     interp_p1, idx, component);
4770       } else {
4771          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4772 
4773          if (ctx->options->chip_class == GFX8)
4774             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4775 
4776          Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
4777                                                 bld.m0(prim_mask), idx, component);
4778          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
4779                     component);
4780       }
4781    } else {
4782       Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4783                                              bld.m0(prim_mask), idx, component);
4784 
4785       if (ctx->program->dev.has_16bank_lds)
4786          interp_p1.instr->operands[0].setLateKill(true);
4787 
4788       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
4789                  idx, component);
4790    }
4791 }
4792 
4793 void
4794 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
4795 {
4796    Builder bld(ctx->program, ctx->block);
4797 
4798    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4799       aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4800    for (unsigned i = 0; i < num_components; i++) {
4801       if (ctx->args->ac.frag_pos[i].used)
4802          vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4803       else
4804          vec->operands[i] = Operand(v1);
4805    }
4806    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4807       assert(num_components == 4);
4808       vec->operands[3] =
4809          bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4810    }
4811 
4812    if (ctx->options->adjust_frag_coord_z &&
4813        G_0286CC_POS_Z_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4814       /* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */
4815       Operand frag_z = vec->operands[2];
4816       Temp adjusted_frag_z = bld.tmp(v1);
4817       Temp tmp;
4818 
4819       /* dFdx fine */
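      /* Descriptive note: quad_perm(0,0,2,2) broadcasts the left pixel of each quad row, and the
       * DPP on the subtract reads the right pixel (1,1,3,3), giving z(right) - z(left) per row. */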
4820       Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), frag_z, dpp_quad_perm(0, 0, 2, 2));
4821       tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), frag_z, tl, dpp_quad_perm(1, 1, 3, 3));
4822       emit_wqm(bld, tmp, adjusted_frag_z, true);
4823 
4824       /* adjusted_frag_z * 0.0625 + frag_z */
4825       adjusted_frag_z = bld.vop3(aco_opcode::v_fma_f32, bld.def(v1), adjusted_frag_z,
4826                                  Operand::c32(0x3d800000u /* 0.0625 */), frag_z);
4827 
4828       /* VRS Rate X = Ancillary[2:3] */
4829       Temp x_rate =
4830          bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4831                   Operand::c32(2u), Operand::c32(2u));
4832 
4833       /* frag_z = xRate == 0x1 ? adjusted_frag_z : frag_z. */
4834       Temp cond =
4835          bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4836       vec->operands[2] =
4837          bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond);
4838    }
4839 
4840    for (Operand& op : vec->operands)
4841       op = op.isUndefined() ? Operand::zero() : op;
4842 
4843    vec->definitions[0] = Definition(dst);
4844    ctx->block->instructions.emplace_back(std::move(vec));
4845    emit_split_vector(ctx, dst, num_components);
4846    return;
4847 }
4848 
4849 void
4850 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
4851 {
4852    Builder bld(ctx->program, ctx->block);
4853    Temp cond;
4854 
4855    /* VRS Rate X = Ancillary[2:3]
4856     * VRS Rate Y = Ancillary[4:5]
4857     */
4858    Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4859                           Operand::c32(2u), Operand::c32(2u));
4860    Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4861                           Operand::c32(4u), Operand::c32(2u));
4862 
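   /* The result presumably uses the SPIR-V/GLSL shading-rate bit layout: bit 2 means a 2-pixel
    * horizontal rate and bit 0 a 2-pixel vertical rate, hence the constants 4 and 1 below. */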
4863    /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
4864    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4865    x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4866                      bld.copy(bld.def(v1), Operand::c32(4u)), cond);
4867 
4868    /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
4869    cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
4870    y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4871                      bld.copy(bld.def(v1), Operand::c32(1u)), cond);
4872 
4873    bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
4874 }
4875 
4876 void
4877 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
4878 {
4879    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4880    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4881    unsigned idx = nir_intrinsic_base(instr);
4882    unsigned component = nir_intrinsic_component(instr);
4883    Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4884 
4885    assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
4886 
4887    if (instr->dest.ssa.num_components == 1) {
4888       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4889    } else {
4890       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4891          aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4892       for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) {
4893          Temp tmp = ctx->program->allocateTmp(instr->dest.ssa.bit_size == 16 ? v2b : v1);
4894          emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
4895          vec->operands[i] = Operand(tmp);
4896       }
4897       vec->definitions[0] = Definition(dst);
4898       ctx->block->instructions.emplace_back(std::move(vec));
4899    }
4900 }
4901 
4902 bool
4903 check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4904                         unsigned binding_align, unsigned channels)
4905 {
4906    unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4907    if (vtx_info->chan_byte_size != 4 && channels == 3)
4908       return false;
4909 
4910    /* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
4911     * alignment issues that trigger memory violations and eventually a GPU
4912     * hang. This can happen if the stride (static or dynamic) is unaligned and
4913     * also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO
4914     * offset is 2 for R16G16B16A16_SNORM).
4915     */
4916    return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||
4917           (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);
4918 }
4919 
4920 uint8_t
4921 get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4922                       unsigned* channels, unsigned max_channels, unsigned binding_align)
4923 {
4924    if (!vtx_info->chan_byte_size) {
4925       *channels = vtx_info->num_channels;
4926       return vtx_info->chan_format;
4927    }
4928 
4929    unsigned num_channels = *channels;
4930    if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
4931       unsigned new_channels = num_channels + 1;
4932       /* first, assume more loads is worse and try using a larger data format */
4933       while (new_channels <= max_channels &&
4934              !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
4935          new_channels++;
4936       }
4937 
4938       if (new_channels > max_channels) {
4939          /* then try decreasing load size (at the cost of more loads) */
4940          new_channels = *channels;
4941          while (new_channels > 1 &&
4942                 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
4943             new_channels--;
4944       }
4945 
4946       if (new_channels < *channels)
4947          *channels = new_channels;
4948       num_channels = new_channels;
4949    }
4950 
4951    switch (vtx_info->chan_format) {
4952    case V_008F0C_BUF_DATA_FORMAT_8:
4953       return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4954                                     V_008F0C_BUF_DATA_FORMAT_INVALID,
4955                                     V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4956    case V_008F0C_BUF_DATA_FORMAT_16:
4957       return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4958                                     V_008F0C_BUF_DATA_FORMAT_INVALID,
4959                                     V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4960    case V_008F0C_BUF_DATA_FORMAT_32:
4961       return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4962                                     V_008F0C_BUF_DATA_FORMAT_32_32_32,
4963                                     V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4964    }
4965    unreachable("shouldn't reach here");
4966    return V_008F0C_BUF_DATA_FORMAT_INVALID;
4967 }
4968 
4969 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
4970  * so we may need to fix it up. */
4971 Temp
4972 adjust_vertex_fetch_alpha(isel_context* ctx, enum radv_vs_input_alpha_adjust adjustment, Temp alpha)
4973 {
4974    Builder bld(ctx->program, ctx->block);
4975 
4976    if (adjustment == ALPHA_ADJUST_SSCALED)
4977       alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4978 
4979    /* For the integer-like cases, do a natural sign extension.
4980     *
4981     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4982     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4983     * exponent.
4984     */
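   /* Worked example (sketch): 2/3 is encoded as 0x3F2AAAAB, so bits [24:23] are 0b10;
    * v_bfe_i32 with offset 23 and width 2 sign-extends that to -2, which the conversion
    * and the clamp to -1.0 below map to the expected SNORM value. */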
4985    unsigned offset = adjustment == ALPHA_ADJUST_SNORM ? 23u : 0u;
4986    alpha =
4987       bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u));
4988 
4989    /* Convert back to the right type. */
4990    if (adjustment == ALPHA_ADJUST_SNORM) {
4991       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4992       alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha);
4993    } else if (adjustment == ALPHA_ADJUST_SSCALED) {
4994       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4995    }
4996 
4997    return alpha;
4998 }
4999 
5000 void
5001 visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
5002 {
5003    Builder bld(ctx->program, ctx->block);
5004    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5005    nir_src offset = *nir_get_io_offset_src(instr);
5006 
5007    if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.dynamic_inputs) {
5008       if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5009          isel_err(offset.ssa->parent_instr,
5010                   "Unimplemented non-zero nir_intrinsic_load_input offset");
5011 
5012       unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
5013       unsigned component = nir_intrinsic_component(instr);
5014       unsigned bitsize = instr->dest.ssa.bit_size;
5015       unsigned num_components = instr->dest.ssa.num_components;
5016 
5017       Temp input = get_arg(ctx, ctx->args->vs_inputs[location]);
5018 
5019       aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5020          aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5021       std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5022       for (unsigned i = 0; i < num_components; i++) {
5023          elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1);
5024          if (bitsize == 16) {
5025             if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float)
5026                elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]);
5027             else
5028                elems[i] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), elems[i],
5029                                      Operand::c32(0u));
5030          }
5031          vec->operands[i] = Operand(elems[i]);
5032       }
5033       vec->definitions[0] = Definition(dst);
5034       ctx->block->instructions.emplace_back(std::move(vec));
5035       ctx->allocated_vec.emplace(dst.id(), elems);
5036    } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
5037 
5038       if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5039          isel_err(offset.ssa->parent_instr,
5040                   "Unimplemented non-zero nir_intrinsic_load_input offset");
5041 
5042       Temp vertex_buffers =
5043          convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));
5044 
5045       unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
5046       unsigned component = nir_intrinsic_component(instr);
5047       unsigned bitsize = instr->dest.ssa.bit_size;
5048       unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
5049       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
5050       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
5051       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
5052       unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
5053       enum radv_vs_input_alpha_adjust alpha_adjust =
5054          ctx->options->key.vs.vertex_alpha_adjust[location];
5055 
5056       unsigned dfmt = attrib_format & 0xf;
5057       unsigned nfmt = (attrib_format >> 4) & 0x7;
5058       const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
5059 
5060       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
5061       unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
5062       bool post_shuffle = ctx->options->key.vs.vertex_post_shuffle & (1 << location);
5063       if (post_shuffle)
5064          num_channels = MAX2(num_channels, 3);
5065 
5066       unsigned desc_index =
5067          ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
5068       desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask &
5069                                  u_bit_consecutive(0, desc_index));
5070       Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));
5071       Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
5072 
5073       Temp index;
5074       if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
5075          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
5076          Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
5077          if (divisor) {
5078             Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
5079             if (divisor != 1) {
5080                Temp divided = bld.tmp(v1);
5081                emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
5082                index = bld.vadd32(bld.def(v1), start_instance, divided);
5083             } else {
5084                index = bld.vadd32(bld.def(v1), start_instance, instance_id);
5085             }
5086          } else {
5087             index = bld.copy(bld.def(v1), start_instance);
5088          }
5089       } else {
5090          index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex),
5091                             get_arg(ctx, ctx->args->ac.vertex_id));
5092       }
5093 
5094       Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));
5095       unsigned channel_start = 0;
5096       bool direct_fetch = false;
5097 
5098       /* skip unused channels at the start */
5099       if (vtx_info->chan_byte_size && !post_shuffle) {
5100          channel_start = ffs(mask) - 1;
5101          for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)
5102             channels[i] = Temp(0, s1);
5103       } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
5104          num_channels = 3 - (ffs(mask) - 1);
5105       }
5106 
5107       /* load channels */
5108       while (channel_start < num_channels) {
5109          unsigned fetch_component = num_channels - channel_start;
5110          unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
5111          bool expanded = false;
5112 
5113          /* use MUBUF when possible to avoid potential alignment issues */
5114          /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
5115          bool use_mubuf =
5116             (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
5117              nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
5118             vtx_info->chan_byte_size == 4;
5119          unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
5120          if (!use_mubuf) {
5121             fetch_dfmt =
5122                get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component,
5123                                      vtx_info->num_channels - channel_start, binding_align);
5124          } else {
5125             if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
5126                /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
5127                fetch_component = 4;
5128                expanded = true;
5129             }
5130          }
5131 
5132          unsigned fetch_bytes = fetch_component * bitsize / 8;
5133 
5134          Temp fetch_index = index;
5135          if (attrib_stride != 0 && fetch_offset > attrib_stride) {
5136             fetch_index =
5137                bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);
5138             fetch_offset = fetch_offset % attrib_stride;
5139          }
5140 
5141          Operand soffset = Operand::zero();
5142          if (fetch_offset >= 4096) {
5143             soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));
5144             fetch_offset %= 4096;
5145          }
5146 
5147          aco_opcode opcode;
5148          switch (fetch_bytes) {
5149          case 2:
5150             assert(!use_mubuf && bitsize == 16);
5151             opcode = aco_opcode::tbuffer_load_format_d16_x;
5152             break;
5153          case 4:
5154             if (bitsize == 16) {
5155                assert(!use_mubuf);
5156                opcode = aco_opcode::tbuffer_load_format_d16_xy;
5157             } else {
5158                opcode =
5159                   use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
5160             }
5161             break;
5162          case 6:
5163             assert(!use_mubuf && bitsize == 16);
5164             opcode = aco_opcode::tbuffer_load_format_d16_xyz;
5165             break;
5166          case 8:
5167             if (bitsize == 16) {
5168                assert(!use_mubuf);
5169                opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
5170             } else {
5171                opcode =
5172                   use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
5173             }
5174             break;
5175          case 12:
5176             assert(ctx->options->chip_class >= GFX7 ||
5177                    (!use_mubuf && ctx->options->chip_class == GFX6));
5178             opcode =
5179                use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
5180             break;
5181          case 16:
5182             opcode =
5183                use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
5184             break;
5185          default: unreachable("Unimplemented load_input vector size");
5186          }
5187 
5188          Temp fetch_dst;
5189          if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded &&
5190              (alpha_adjust == ALPHA_ADJUST_NONE || num_channels <= 3)) {
5191             direct_fetch = true;
5192             fetch_dst = dst;
5193          } else {
5194             fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
5195          }
5196 
5197          if (use_mubuf) {
5198             Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,
5199                                            soffset, fetch_offset, false, false, true)
5200                                     .instr;
5201             mubuf->mubuf().vtx_binding = attrib_binding + 1;
5202          } else {
5203             Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,
5204                                            soffset, fetch_dfmt, nfmt, fetch_offset, false, true)
5205                                     .instr;
5206             mtbuf->mtbuf().vtx_binding = attrib_binding + 1;
5207          }
5208 
5209          emit_split_vector(ctx, fetch_dst, fetch_dst.size());
5210 
5211          if (fetch_component == 1) {
5212             channels[channel_start] = fetch_dst;
5213          } else {
5214             for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
5215                channels[channel_start + i] =
5216                   emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);
5217          }
5218 
5219          channel_start += fetch_component;
5220       }
5221 
5222       if (!direct_fetch) {
5223          bool is_float =
5224             nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
5225 
5226          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
5227          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
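         /* The post-shuffle swizzle above presumably swaps the R and B channels, matching
          * BGRA-ordered vertex formats that the hardware fetches in RGBA order. */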
5228          const unsigned* swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
5229          unsigned num_components = instr->dest.ssa.num_components;
5230 
5231          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5232             aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5233          std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5234          unsigned num_temp = 0;
5235          for (unsigned i = 0; i < num_components; i++) {
5236             unsigned idx = i + component;
5237             if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
5238                Temp channel = channels[swizzle[idx]];
5239                if (idx == 3 && alpha_adjust != ALPHA_ADJUST_NONE)
5240                   channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
5241                vec->operands[i] = Operand(channel);
5242 
5243                num_temp++;
5244                elems[i] = channel;
5245             } else if (is_float && idx == 3) {
5246                vec->operands[i] = Operand::c32(0x3f800000u);
5247             } else if (!is_float && idx == 3) {
5248                vec->operands[i] = Operand::c32(1u);
5249             } else {
5250                vec->operands[i] = Operand::zero();
5251             }
5252          }
5253          vec->definitions[0] = Definition(dst);
5254          ctx->block->instructions.emplace_back(std::move(vec));
5255          emit_split_vector(ctx, dst, num_components);
5256 
5257          if (num_temp == num_components)
5258             ctx->allocated_vec.emplace(dst.id(), elems);
5259       }
5260    } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
5261       if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5262          isel_err(offset.ssa->parent_instr,
5263                   "Unimplemented non-zero nir_intrinsic_load_input offset");
5264 
5265       Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
5266 
5267       unsigned idx = nir_intrinsic_base(instr);
5268       unsigned component = nir_intrinsic_component(instr);
5269       unsigned vertex_id = 2; /* P0 */
5270 
5271       if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
5272          nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
5273          switch (src0->u32) {
5274          case 0:
5275             vertex_id = 2; /* P0 */
5276             break;
5277          case 1:
5278             vertex_id = 0; /* P10 */
5279             break;
5280          case 2:
5281             vertex_id = 1; /* P20 */
5282             break;
5283          default: unreachable("invalid vertex index");
5284          }
5285       }
5286 
5287       if (instr->dest.ssa.num_components == 1 &&
5288           instr->dest.ssa.bit_size != 64) {
5289          bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
5290                     bld.m0(prim_mask), idx, component);
5291       } else {
5292          unsigned num_components = instr->dest.ssa.num_components;
5293          if (instr->dest.ssa.bit_size == 64)
5294             num_components *= 2;
5295          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5296             aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5297          for (unsigned i = 0; i < num_components; i++) {
5298             unsigned chan_component = (component + i) % 4;
5299             unsigned chan_idx = idx + (component + i) / 4;
5300             vec->operands[i] = bld.vintrp(
5301                aco_opcode::v_interp_mov_f32, bld.def(instr->dest.ssa.bit_size == 16 ? v2b : v1),
5302                Operand::c32(vertex_id), bld.m0(prim_mask), chan_idx, chan_component);
5303          }
5304          vec->definitions[0] = Definition(dst);
5305          bld.insert(std::move(vec));
5306       }
5307    } else {
5308       unreachable("Shader stage not implemented");
5309    }
5310 }
5311 
5312 void
5313 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5314 {
5315    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5316 
5317    Builder bld(ctx->program, ctx->block);
5318    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5319 
5320    if (load_input_from_temps(ctx, instr, dst))
5321       return;
5322 
5323    unreachable("LDS-based TCS input should have been lowered in NIR.");
5324 }
5325 
5326 void
5327 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5328 {
5329    switch (ctx->shader->info.stage) {
5330    case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5331    default: unreachable("Unimplemented shader stage");
5332    }
5333 }
5334 
5335 void
5336 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5337 {
5338    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5339 
5340    Builder bld(ctx->program, ctx->block);
5341    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5342 
5343    Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u));
5344    Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v));
5345    Operand tes_w = Operand::zero();
5346 
5347    if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5348       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5349       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5350       tes_w = Operand(tmp);
5351    }
5352 
5353    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5354    emit_split_vector(ctx, tess_coord, 3);
5355 }
5356 
5357 Temp
5358 load_desc_ptr(isel_context* ctx, unsigned desc_set)
5359 {
5360    const struct radv_userdata_locations *user_sgprs_locs = &ctx->program->info->user_sgprs_locs;
5361 
5362    if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
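      /* Indirect path (descriptive note): descriptor_sets[0] points at a table of 32-bit set
       * addresses, so the set's entry lives at byte offset desc_set * 4 (hence desc_set << 2). */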
5363       Builder bld(ctx->program, ctx->block);
5364       Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5365       Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2));
5366       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off); //, false, false, false);
5367    }
5368 
5369    return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5370 }
5371 
5372 void
5373 visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr)
5374 {
5375    Builder bld(ctx->program, ctx->block);
5376    Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5377    if (!nir_dest_is_divergent(instr->dest))
5378       index = bld.as_uniform(index);
5379    unsigned desc_set = nir_intrinsic_desc_set(instr);
5380    unsigned binding = nir_intrinsic_binding(instr);
5381 
5382    Temp desc_ptr;
5383    radv_pipeline_layout* pipeline_layout = ctx->options->layout;
5384    radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout;
5385    unsigned offset = layout->binding[binding].offset;
5386    unsigned stride;
5387    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5388        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5389       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
5390                      layout->binding[binding].dynamic_offset_offset;
5391       desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5392       offset = pipeline_layout->push_constant_size + 16 * idx;
5393       stride = 16;
5394    } else {
5395       desc_ptr = load_desc_ptr(ctx, desc_set);
5396       stride = layout->binding[binding].size;
5397    }
5398 
5399    if (nir_src_is_const(instr->src[0])) {
5400       index =
5401          bld.copy(bld.def(s1), Operand::c32((offset + nir_src_as_uint(instr->src[0]) * stride)));
5402    } else if (index.type() == RegType::vgpr) {
5403       if (stride != 1) {
5404          bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5405          index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5406       }
5407       if (offset)
5408          index = bld.vadd32(bld.def(v1), Operand::c32(offset), index);
5409    } else {
5410       if (stride != 1)
5411          index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index);
5412       if (offset)
5413          index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5414                           Operand::c32(offset), index);
5415    }
5416 
5417    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5418    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5419    elems[0] = desc_ptr;
5420    elems[1] = index;
5421    ctx->allocated_vec.emplace(dst.id(), elems);
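   /* The "resource index" result is not a descriptor yet: it is a (set pointer, byte offset)
    * pair (plus padding) that load_buffer_rsrc() later turns into an actual buffer descriptor. */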
5422    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand::zero());
5423 }
5424 
5425 void
5426 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5427             Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5428             bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5429 {
5430    Builder bld(ctx->program, ctx->block);
5431 
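   /* SMEM needs a uniform (SGPR) destination, and coherent (glc) scalar loads are presumably
    * only usable on GFX8+, where s_load supports the GLC bit. */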
5432    bool use_smem =
5433       dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5434    if (use_smem)
5435       offset = bld.as_uniform(offset);
5436    else {
5437       /* GFX6-7 are affected by a hw bug that prevents address clamping from
5438        * working correctly when the SGPR offset is used.
5439        */
5440       if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
5441          offset = as_vgpr(ctx, offset);
5442    }
5443 
5444    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5445    info.glc = glc;
5446    info.sync = sync;
5447    info.align_mul = align_mul;
5448    info.align_offset = align_offset;
5449    if (use_smem)
5450       emit_load(ctx, bld, info, smem_load_params);
5451    else
5452       emit_load(ctx, bld, info, mubuf_load_params);
5453 }
5454 
5455 Temp
5456 load_buffer_rsrc(isel_context* ctx, Temp rsrc)
5457 {
5458    Builder bld(ctx->program, ctx->block);
5459    Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
5460    Temp binding = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5461    set_ptr = convert_pointer_to_64_bit(ctx, set_ptr);
5462    return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding);
5463 }
5464 
5465 bool
5466 is_inline_ubo(isel_context* ctx, nir_src rsrc)
5467 {
5468    nir_binding binding = nir_chase_binding(rsrc);
5469    if (!binding.success)
5470       return false;
5471 
5472    radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout;
5473    return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
5474 }
5475 
5476 void
5477 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5478 {
5479    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5480    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5481 
5482    Builder bld(ctx->program, ctx->block);
5483 
5484    if (is_inline_ubo(ctx, instr->src[0])) {
5485       Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)));
5486       Temp binding_off =
5487          bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5488       rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off);
5489 
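      /* Inline uniform block data lives directly in the descriptor set, so build a raw buffer
       * descriptor here: 32-bit base (set pointer + binding offset), the address-hi dword,
       * num_records = 0xFFFFFFFF and the format/flags dword computed below. */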
5490       uint32_t desc_type =
5491          S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5492          S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5493       if (ctx->options->chip_class >= GFX10) {
5494          desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5495                       S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5496       } else {
5497          desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5498                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5499       }
5500       rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), rsrc,
5501                         Operand::c32(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5502                         Operand::c32(0xFFFFFFFFu), Operand::c32(desc_type));
5503    } else {
5504       rsrc = load_buffer_rsrc(ctx, rsrc);
5505    }
5506    unsigned size = instr->dest.ssa.bit_size / 8;
5507    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5508                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5509 }
5510 
5511 void
5512 visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5513 {
5514    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5515    unsigned binding = nir_intrinsic_binding(instr);
5516 
5517    Builder bld(ctx->program, ctx->block);
5518    Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors));
5519    Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u));
5520    bld.smem(aco_opcode::s_load_dwordx4, Definition(dst), desc_base, desc_off);
5521 }
5522 
5523 void
5524 visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5525 {
5526    Builder bld(ctx->program, ctx->block);
5527    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5528    unsigned offset = nir_intrinsic_base(instr);
5529    unsigned count = instr->dest.ssa.num_components;
5530    nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5531 
5532    if (index_cv && instr->dest.ssa.bit_size == 32) {
5533       struct radv_userdata_info *loc =
5534          &ctx->args->shader_info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS];
5535       unsigned start = (offset + index_cv->u32) / 4u;
5536       unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? loc->num_sgprs : 0;
5537 
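      /* The inline push-constant SGPRs presumably only cover the range that is actually used,
       * so rebase the dword index to min_push_constant_used. */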
5538       start -= ctx->args->shader_info->min_push_constant_used / 4;
5539       if (start + count <= num_inline_push_consts) {
5540          std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5541          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5542             aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5543          for (unsigned i = 0; i < count; ++i) {
5544             elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5545             vec->operands[i] = Operand{elems[i]};
5546          }
5547          vec->definitions[0] = Definition(dst);
5548          ctx->block->instructions.emplace_back(std::move(vec));
5549          ctx->allocated_vec.emplace(dst.id(), elems);
5550          return;
5551       }
5552    }
5553 
5554    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5555    if (offset != 0) // TODO check if index != 0 as well
5556       index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5557                              Operand::c32(offset), index);
5558    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5559    Temp vec = dst;
5560    bool trim = false;
5561    bool aligned = true;
5562 
5563    if (instr->dest.ssa.bit_size == 8) {
5564       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5565       bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5566       if (!aligned)
5567          vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5568    } else if (instr->dest.ssa.bit_size == 16) {
5569       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5570       if (!aligned)
5571          vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5572    }
5573 
5574    aco_opcode op;
5575 
5576    switch (vec.size()) {
5577    case 1: op = aco_opcode::s_load_dword; break;
5578    case 2: op = aco_opcode::s_load_dwordx2; break;
5579    case 3:
5580       vec = bld.tmp(s4);
5581       trim = true;
5582       FALLTHROUGH;
5583    case 4: op = aco_opcode::s_load_dwordx4; break;
5584    case 6:
5585       vec = bld.tmp(s8);
5586       trim = true;
5587       FALLTHROUGH;
5588    case 8: op = aco_opcode::s_load_dwordx8; break;
5589    default: unreachable("unimplemented or forbidden load_push_constant.");
5590    }
5591 
5592    bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true;
5593 
5594    if (!aligned) {
5595       Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5596       byte_align_scalar(ctx, vec, byte_offset, dst);
5597       return;
5598    }
5599 
5600    if (trim) {
5601       emit_split_vector(ctx, vec, 4);
5602       RegClass rc = dst.size() == 3 ? s1 : s2;
5603       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5604                  emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5605    }
5606    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5607 }
5608 
5609 void
5610 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5611 {
5612    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5613 
5614    Builder bld(ctx->program, ctx->block);
5615 
5616    uint32_t desc_type =
5617       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5618       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5619    if (ctx->options->chip_class >= GFX10) {
5620       desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5621                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5622    } else {
5623       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5624                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5625    }
5626 
5627    unsigned base = nir_intrinsic_base(instr);
5628    unsigned range = nir_intrinsic_range(instr);
5629 
5630    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5631    if (base && offset.type() == RegType::sgpr)
5632       offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5633                               Operand::c32(base));
5634    else if (base && offset.type() == RegType::vgpr)
5635       offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5636 
5637    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5638                           bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5639                                      Operand::c32(ctx->constant_data_offset)),
5640                           Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5641                           Operand::c32(desc_type));
5642    unsigned size = instr->dest.ssa.bit_size / 8;
5643    // TODO: get alignment information for subdword constants
5644    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5645 }
5646 
5647 void
5648 visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr)
5649 {
5650    if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5651       ctx->cf_info.exec_potentially_empty_discard = true;
5652 
5653    ctx->program->needs_exact = true;
5654 
5655    // TODO: optimize uniform conditions
5656    Builder bld(ctx->program, ctx->block);
5657    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5658    assert(src.regClass() == bld.lm);
5659    src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5660    bld.pseudo(aco_opcode::p_discard_if, src);
5661    ctx->block->kind |= block_kind_uses_discard_if;
5662    return;
5663 }
5664 
5665 void
5666 visit_discard(isel_context* ctx, nir_intrinsic_instr* instr)
5667 {
5668    Builder bld(ctx->program, ctx->block);
5669 
5670    if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5671       ctx->cf_info.exec_potentially_empty_discard = true;
5672 
5673    bool divergent =
5674       ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue;
5675 
5676    if (ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) {
5677       /* we handle discards the same way as jump instructions */
5678       append_logical_end(ctx->block);
5679 
5680       /* in loops, discard behaves like break */
5681       Block* linear_target = ctx->cf_info.parent_loop.exit;
5682       ctx->block->kind |= block_kind_discard;
5683 
5684       /* uniform discard - loop ends here */
5685       assert(nir_instr_is_last(&instr->instr));
5686       ctx->block->kind |= block_kind_uniform;
5687       ctx->cf_info.has_branch = true;
5688       bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
5689       add_linear_edge(ctx->block->index, linear_target);
5690       return;
5691    }
5692 
5693    /* it can currently happen that NIR doesn't remove the unreachable code */
5694    if (!nir_instr_is_last(&instr->instr)) {
5695       ctx->program->needs_exact = true;
5696       /* save exec somewhere temporarily so that it doesn't get
5697        * overwritten before the discard from outer exec masks */
5698       Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
5699                            Operand::c32(0xFFFFFFFF), Operand(exec, bld.lm));
5700       bld.pseudo(aco_opcode::p_discard_if, cond);
5701       ctx->block->kind |= block_kind_uses_discard_if;
5702       return;
5703    }
5704 
5705    /* This condition is incorrect for uniformly branched discards in a loop
5706     * predicated by a divergent condition, but the above code catches that case
5707     * and the discard would end up turning into a discard_if.
5708     * For example:
5709     * if (divergent) {
5710     *    while (...) {
5711     *       if (uniform) {
5712     *          discard;
5713     *       }
5714     *    }
5715     * }
5716     */
5717    if (!ctx->cf_info.parent_if.is_divergent) {
5718       /* program just ends here */
5719       ctx->block->kind |= block_kind_uses_discard_if;
5720       bld.pseudo(aco_opcode::p_discard_if, Operand::c32(0xFFFFFFFFu));
5721       // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
5722    } else {
5723       ctx->block->kind |= block_kind_discard;
5724       /* branch and linear edge is added by visit_if() */
5725    }
5726 }
5727 
5728 enum aco_descriptor_type {
5729    ACO_DESC_IMAGE,
5730    ACO_DESC_FMASK,
5731    ACO_DESC_SAMPLER,
5732    ACO_DESC_BUFFER,
5733    ACO_DESC_PLANE_0,
5734    ACO_DESC_PLANE_1,
5735    ACO_DESC_PLANE_2,
5736 };
5737 
5738 static bool
5739 should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)
5740 {
5741    if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5742       return false;
5743    ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5744    return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5745           dim == ac_image_2darraymsaa;
5746 }
5747 
5748 Temp
5749 get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr,
5750                  enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write)
5751 {
5752    /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
5753       std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type <<
5754       32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second;
5755    */
5756    Temp index = Temp();
5757    bool index_set = false;
5758    unsigned constant_index = 0;
5759    unsigned descriptor_set;
5760    unsigned base_index;
5761    Builder bld(ctx->program, ctx->block);
5762 
5763    if (!deref_instr) {
5764       assert(tex_instr);
5765       descriptor_set = 0;
5766       base_index = tex_instr->sampler_index;
5767    } else {
5768       while (deref_instr->deref_type != nir_deref_type_var) {
5769          unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5770          if (!array_size)
5771             array_size = 1;
5772 
5773          assert(deref_instr->deref_type == nir_deref_type_array);
5774          nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index);
5775          if (const_value) {
5776             constant_index += array_size * const_value->u32;
5777          } else {
5778             Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5779             if (indirect.type() == RegType::vgpr)
5780                indirect = bld.as_uniform(indirect);
5781 
5782             if (array_size != 1)
5783                indirect =
5784                   bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(array_size), indirect);
5785 
5786             if (!index_set) {
5787                index = indirect;
5788                index_set = true;
5789             } else {
5790                index =
5791                   bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5792             }
5793          }
5794 
5795          deref_instr = nir_src_as_deref(deref_instr->parent);
5796       }
5797       descriptor_set = deref_instr->var->data.descriptor_set;
5798       base_index = deref_instr->var->data.binding;
5799    }
5800 
5801    Temp list = load_desc_ptr(ctx, descriptor_set);
5802    list = convert_pointer_to_64_bit(ctx, list);
5803 
5804    struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout;
5805    struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index;
5806    unsigned offset = binding->offset;
5807    unsigned stride = binding->size;
5808    aco_opcode opcode;
5809    RegClass type;
5810 
5811    assert(base_index < layout->binding_count);
5812 
5813    switch (desc_type) {
5814    case ACO_DESC_IMAGE:
5815       type = s8;
5816       opcode = aco_opcode::s_load_dwordx8;
5817       break;
5818    case ACO_DESC_FMASK:
5819       type = s8;
5820       opcode = aco_opcode::s_load_dwordx8;
5821       offset += 32;
5822       break;
5823    case ACO_DESC_SAMPLER:
5824       type = s4;
5825       opcode = aco_opcode::s_load_dwordx4;
5826       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5827          offset += radv_combined_image_descriptor_sampler_offset(binding);
5828       break;
5829    case ACO_DESC_BUFFER:
5830       type = s4;
5831       opcode = aco_opcode::s_load_dwordx4;
5832       break;
5833    case ACO_DESC_PLANE_0:
5834    case ACO_DESC_PLANE_1:
5835       type = s8;
5836       opcode = aco_opcode::s_load_dwordx8;
5837       offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5838       break;
5839    case ACO_DESC_PLANE_2:
5840       type = s4;
5841       opcode = aco_opcode::s_load_dwordx4;
5842       offset += 64;
5843       break;
5844    default: unreachable("invalid desc_type\n");
5845    }
5846 
5847    offset += constant_index * stride;
5848 
5849    if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5850        (!index_set || binding->immutable_samplers_equal)) {
5851       if (binding->immutable_samplers_equal)
5852          constant_index = 0;
5853 
5854       const uint32_t* samplers = radv_immutable_samplers(layout, binding);
5855       uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu;
5856       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5857                         Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask),
5858                         Operand::c32(samplers[constant_index * 4 + 1]),
5859                         Operand::c32(samplers[constant_index * 4 + 2]),
5860                         Operand::c32(samplers[constant_index * 4 + 3]));
5861    }
5862 
5863    Operand off;
5864    if (!index_set) {
5865       off = bld.copy(bld.def(s1), Operand::c32(offset));
5866    } else {
5867       off = Operand(
5868          (Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset),
5869                         bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index)));
5870    }
5871 
5872    Temp res = bld.smem(opcode, bld.def(type), list, off);
5873 
5874    if (desc_type == ACO_DESC_PLANE_2) {
5875       Temp components[8];
5876       for (unsigned i = 0; i < 8; i++)
5877          components[i] = bld.tmp(s1);
5878       bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5879                  Definition(components[2]), Definition(components[3]), res);
5880 
5881       Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write);
5882       bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5883                  Definition(components[4]), Definition(components[5]), Definition(components[6]),
5884                  Definition(components[7]), desc2);
5885 
5886       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5887                        components[2], components[3], components[4], components[5], components[6],
5888                        components[7]);
5889    } else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr &&
5890               !write) {
5891       Temp components[8];
5892       for (unsigned i = 0; i < 8; i++)
5893          components[i] = bld.tmp(s1);
5894 
5895       bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5896                  Definition(components[2]), Definition(components[3]), Definition(components[4]),
5897                  Definition(components[5]), Definition(components[6]), Definition(components[7]),
5898                  res);
5899 
6900       /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to work around a
6901        * hardware bug.
5902        */
5903       components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6],
5904                                bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE)));
5905 
5906       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5907                        components[2], components[3], components[4], components[5], components[6],
5908                        components[7]);
5909    } else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) {
5910       Temp components[4];
5911       for (unsigned i = 0; i < 4; i++)
5912          components[i] = bld.tmp(s1);
5913 
5914       bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5915                  Definition(components[2]), Definition(components[3]), res);
5916 
5917       /* We want to always use the linear filtering truncation behaviour for
5918        * nir_texop_tg4, even if the sampler uses nearest/point filtering.
5919        */
5920       components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0],
5921                                Operand::c32(C_008F30_TRUNC_COORD));
5922 
5923       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1],
5924                        components[2], components[3]);
5925    }
5926 
5927    return res;
5928 }
5929 
5930 static int
5931 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5932 {
5933    switch (dim) {
5934    case GLSL_SAMPLER_DIM_BUF: return 1;
5935    case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5936    case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5937    case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3;
5938    case GLSL_SAMPLER_DIM_3D:
5939    case GLSL_SAMPLER_DIM_CUBE: return 3;
5940    case GLSL_SAMPLER_DIM_RECT:
5941    case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5942    case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3;
5943    default: break;
5944    }
5945    return 0;
5946 }
5947 
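/* Note: emit_mimg builds a MIMG instruction with operands
 * [rsrc, samp, vdata, coords...]. On GFX10+ the coordinates can stay as
 * separate NSA operands (up to max_nsa_size); otherwise they are packed into
 * a single coordinate vector. wqm_mask marks which coordinates must be
 * computed in WQM. */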
5948 static MIMG_instruction*
5949 emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp,
5950           std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1))
5951 {
5952    /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */
5953    unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5;
5954    bool use_nsa = bld.program->chip_class >= GFX10 && coords.size() <= max_nsa_size;
5955 
5956    if (!use_nsa) {
5957       Temp coord = coords[0];
5958       if (coords.size() > 1) {
5959          coord = bld.tmp(RegType::vgpr, coords.size());
5960 
5961          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5962             aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5963          for (unsigned i = 0; i < coords.size(); i++)
5964             vec->operands[i] = Operand(coords[i]);
5965          vec->definitions[0] = Definition(coord);
5966          bld.insert(std::move(vec));
5967       } else if (coord.type() == RegType::sgpr) {
5968          coord = bld.copy(bld.def(v1), coord);
5969       }
5970 
5971       if (wqm_mask) {
5972          /* We don't need the bias, sample index, compare value or offset to be
5973           * computed in WQM, but if the p_create_vector copies the coordinates, then
5974           * the copied vector needs to be in WQM. */
5975          coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
5976       }
5977 
5978       coords[0] = coord;
5979       coords.resize(1);
5980    } else {
5981       for (unsigned i = 0; i < coords.size(); i++) {
5982          if (wqm_mask & (1u << i))
5983             coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
5984       }
5985 
5986       for (Temp& coord : coords) {
5987          if (coord.type() == RegType::sgpr)
5988             coord = bld.copy(bld.def(v1), coord);
5989       }
5990    }
5991 
5992    aco_ptr<MIMG_instruction> mimg{
5993       create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
5994    if (dst.isTemp())
5995       mimg->definitions[0] = dst;
5996    mimg->operands[0] = Operand(rsrc);
5997    mimg->operands[1] = samp;
5998    mimg->operands[2] = vdata;
5999    for (unsigned i = 0; i < coords.size(); i++)
6000       mimg->operands[3 + i] = Operand(coords[i]);
6001 
6002    MIMG_instruction* res = mimg.get();
6003    bld.insert(std::move(mimg));
6004    return res;
6005 }
6006 
6007 void
6008 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6009 {
6010    Builder bld(ctx->program, ctx->block);
6011    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6012    Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6013    Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6014    Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6015    Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6016    Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6017    Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6018 
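   /* Note: the ray query payload (64-bit node pointer, tmax, origin, direction
    * and inverse direction) is passed through the MIMG coordinate slots; the
    * BVH descriptor is the resource operand and Operand(s4) is an undefined
    * placeholder since no sampler is used. */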
6019    std::vector<Temp> args;
6020    args.push_back(emit_extract_vector(ctx, node, 0, v1));
6021    args.push_back(emit_extract_vector(ctx, node, 1, v1));
6022    args.push_back(as_vgpr(ctx, tmax));
6023    args.push_back(emit_extract_vector(ctx, origin, 0, v1));
6024    args.push_back(emit_extract_vector(ctx, origin, 1, v1));
6025    args.push_back(emit_extract_vector(ctx, origin, 2, v1));
6026    args.push_back(emit_extract_vector(ctx, dir, 0, v1));
6027    args.push_back(emit_extract_vector(ctx, dir, 1, v1));
6028    args.push_back(emit_extract_vector(ctx, dir, 2, v1));
6029    args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1));
6030    args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1));
6031    args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1));
6032 
6033    MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst),
6034                                       resource, Operand(s4), args);
6035    mimg->dim = ac_image_1d;
6036    mimg->dmask = 0xf;
6037    mimg->unrm = true;
6038    mimg->r128 = true;
6039 }
6040 
6041 static std::vector<Temp>
6042 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6043 {
6044 
6045    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6046    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6047    bool is_array = nir_intrinsic_image_array(instr);
6048    ASSERTED bool add_frag_pos =
6049       (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6050    assert(!add_frag_pos && "Input attachments should be lowered.");
6051    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6052    bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6053    int count = image_type_to_components_count(dim, is_array);
6054    std::vector<Temp> coords(count);
6055    Builder bld(ctx->program, ctx->block);
6056 
6057    if (is_ms)
6058       coords[--count] = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1);
6059 
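   /* Note: GFX9 addresses 1D images as 2D, so a zero Y coordinate is inserted
    * and the layer index (for arrays) moves to the third component. */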
6060    if (gfx9_1d) {
6061       coords[0] = emit_extract_vector(ctx, src0, 0, v1);
6062       coords.resize(coords.size() + 1);
6063       coords[1] = bld.copy(bld.def(v1), Operand::zero());
6064       if (is_array)
6065          coords[2] = emit_extract_vector(ctx, src0, 1, v1);
6066    } else {
6067       for (int i = 0; i < count; i++)
6068          coords[i] = emit_extract_vector(ctx, src0, i, v1);
6069    }
6070 
6071    if (instr->intrinsic == nir_intrinsic_image_deref_load ||
6072        instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
6073        instr->intrinsic == nir_intrinsic_image_deref_store) {
6074       int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3;
6075       bool level_zero =
6076          nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
6077 
6078       if (!level_zero)
6079          coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
6080    }
6081 
6082    return coords;
6083 }
6084 
6085 memory_sync_info
6086 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6087 {
6088    /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6089    if (semantics & semantic_atomicrmw)
6090       return memory_sync_info(storage, semantics);
6091 
6092    unsigned access = nir_intrinsic_access(instr);
6093 
6094    if (access & ACCESS_VOLATILE)
6095       semantics |= semantic_volatile;
6096    if (access & ACCESS_CAN_REORDER)
6097       semantics |= semantic_can_reorder | semantic_private;
6098 
6099    return memory_sync_info(storage, semantics);
6100 }
6101 
6102 Operand
6103 emit_tfe_init(Builder& bld, Temp dst)
6104 {
6105    Temp tmp = bld.tmp(dst.regClass());
6106 
6107    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6108       aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6109    for (unsigned i = 0; i < dst.size(); i++)
6110       vec->operands[i] = Operand::zero();
6111    vec->definitions[0] = Definition(tmp);
6112    /* Since this is fixed to an instruction's definition register, any CSE will
6113     * just create copies. Copying costs about the same as zero-initialization,
6114     * but these copies can break up clauses.
6115     */
6116    vec->definitions[0].setNoCSE(true);
6117    bld.insert(std::move(vec));
6118 
6119    return Operand(tmp);
6120 }
6121 
6122 void
6123 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6124 {
6125    Builder bld(ctx->program, ctx->block);
6126    const nir_variable* var =
6127       nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6128    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6129    bool is_array = nir_intrinsic_image_array(instr);
6130    bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;
6131    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6132 
6133    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6134    unsigned access = var->data.access | nir_intrinsic_access(instr);
6135 
6136    unsigned result_size = instr->dest.ssa.num_components - is_sparse;
6137    unsigned expand_mask =
6138       nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size);
6139    expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
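   /* Note: buffer_load_format_* can only return the first N components, so for
    * buffer images the component mask is rounded up to a contiguous prefix. */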
6140    if (dim == GLSL_SAMPLER_DIM_BUF)
6141       expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6142    unsigned dmask = expand_mask;
6143    if (instr->dest.ssa.bit_size == 64) {
6144       expand_mask &= 0x9;
6145       /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6146       dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6147    }
6148    if (is_sparse)
6149       expand_mask |= 1 << result_size;
6150    unsigned num_components = util_bitcount(dmask) + is_sparse;
6151 
6152    Temp tmp;
6153    if (num_components == dst.size() && dst.type() == RegType::vgpr)
6154       tmp = dst;
6155    else
6156       tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
6157 
6158    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6159                                     dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,
6160                                     nullptr, false);
6161 
6162    if (dim == GLSL_SAMPLER_DIM_BUF) {
6163       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6164 
6165       aco_opcode opcode;
6166       switch (util_bitcount(dmask)) {
6167       case 1: opcode = aco_opcode::buffer_load_format_x; break;
6168       case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6169       case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6170       case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6171       default: unreachable(">4 channel buffer image load");
6172       }
6173       aco_ptr<MUBUF_instruction> load{
6174          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6175       load->operands[0] = Operand(resource);
6176       load->operands[1] = Operand(vindex);
6177       load->operands[2] = Operand::c32(0);
6178       load->definitions[0] = Definition(tmp);
6179       load->idxen = true;
6180       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6181       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6182       load->sync = sync;
6183       load->tfe = is_sparse;
6184       if (load->tfe)
6185          load->operands[3] = emit_tfe_init(bld, tmp);
6186       ctx->block->instructions.emplace_back(std::move(load));
6187    } else {
6188       std::vector<Temp> coords = get_image_coords(ctx, instr);
6189 
6190       bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6191       aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6192 
6193       Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6194       MIMG_instruction* load =
6195          emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata);
6196       load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6197       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6198       load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6199       load->dmask = dmask;
6200       load->unrm = true;
6201       load->da = should_declare_array(ctx, dim, is_array);
6202       load->sync = sync;
6203       load->tfe = is_sparse;
6204    }
6205 
6206    if (is_sparse && instr->dest.ssa.bit_size == 64) {
6207       /* The result components are 64-bit but the sparse residency code is
6208        * 32-bit. So add a zero to the end so expand_vector() works correctly.
6209        */
6210       tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6211                        Operand::zero());
6212    }
6213 
6214    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask);
6215 }
6216 
6217 void
6218 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6219 {
6220    const nir_variable* var =
6221       nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6222    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6223    bool is_array = nir_intrinsic_image_array(instr);
6224    Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6225 
6226    /* only R64_UINT and R64_SINT supported */
6227    if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6228       data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6229    data = as_vgpr(ctx, data);
6230 
6231    memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6232    unsigned access = var->data.access | nir_intrinsic_access(instr);
6233    bool glc = ctx->options->chip_class == GFX6 ||
6234                     access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)
6235                  ? 1
6236                  : 0;
6237 
6238    if (dim == GLSL_SAMPLER_DIM_BUF) {
6239       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6240                                    ACO_DESC_BUFFER, nullptr, true);
6241       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6242       aco_opcode opcode;
6243       switch (data.size()) {
6244       case 1: opcode = aco_opcode::buffer_store_format_x; break;
6245       case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6246       case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6247       case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6248       default: unreachable(">4 channel buffer image store");
6249       }
6250       aco_ptr<MUBUF_instruction> store{
6251          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6252       store->operands[0] = Operand(rsrc);
6253       store->operands[1] = Operand(vindex);
6254       store->operands[2] = Operand::c32(0);
6255       store->operands[3] = Operand(data);
6256       store->idxen = true;
6257       store->glc = glc;
6258       store->dlc = false;
6259       store->disable_wqm = true;
6260       store->sync = sync;
6261       ctx->program->needs_exact = true;
6262       ctx->block->instructions.emplace_back(std::move(store));
6263       return;
6264    }
6265 
6266    assert(data.type() == RegType::vgpr);
6267    std::vector<Temp> coords = get_image_coords(ctx, instr);
6268    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6269                                     ACO_DESC_IMAGE, nullptr, true);
6270 
6271    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6272    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6273 
6274    Builder bld(ctx->program, ctx->block);
6275    MIMG_instruction* store =
6276       emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data));
6277    store->glc = glc;
6278    store->dlc = false;
6279    store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6280    store->dmask = (1 << data.size()) - 1;
6281    store->unrm = true;
6282    store->da = should_declare_array(ctx, dim, is_array);
6283    store->disable_wqm = true;
6284    store->sync = sync;
6285    ctx->program->needs_exact = true;
6286    return;
6287 }
6288 
6289 void
6290 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6291 {
6292    bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6293    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6294    bool is_array = nir_intrinsic_image_array(instr);
6295    Builder bld(ctx->program, ctx->block);
6296 
6297    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6298    bool is_64bit = data.bytes() == 8;
6299    assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6300 
6301    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
6302       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6303                         get_ssa_temp(ctx, instr->src[4].ssa), data);
6304 
6305    aco_opcode buf_op, buf_op64, image_op;
6306    switch (instr->intrinsic) {
6307    case nir_intrinsic_image_deref_atomic_add:
6308       buf_op = aco_opcode::buffer_atomic_add;
6309       buf_op64 = aco_opcode::buffer_atomic_add_x2;
6310       image_op = aco_opcode::image_atomic_add;
6311       break;
6312    case nir_intrinsic_image_deref_atomic_umin:
6313       buf_op = aco_opcode::buffer_atomic_umin;
6314       buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6315       image_op = aco_opcode::image_atomic_umin;
6316       break;
6317    case nir_intrinsic_image_deref_atomic_imin:
6318       buf_op = aco_opcode::buffer_atomic_smin;
6319       buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6320       image_op = aco_opcode::image_atomic_smin;
6321       break;
6322    case nir_intrinsic_image_deref_atomic_umax:
6323       buf_op = aco_opcode::buffer_atomic_umax;
6324       buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6325       image_op = aco_opcode::image_atomic_umax;
6326       break;
6327    case nir_intrinsic_image_deref_atomic_imax:
6328       buf_op = aco_opcode::buffer_atomic_smax;
6329       buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6330       image_op = aco_opcode::image_atomic_smax;
6331       break;
6332    case nir_intrinsic_image_deref_atomic_and:
6333       buf_op = aco_opcode::buffer_atomic_and;
6334       buf_op64 = aco_opcode::buffer_atomic_and_x2;
6335       image_op = aco_opcode::image_atomic_and;
6336       break;
6337    case nir_intrinsic_image_deref_atomic_or:
6338       buf_op = aco_opcode::buffer_atomic_or;
6339       buf_op64 = aco_opcode::buffer_atomic_or_x2;
6340       image_op = aco_opcode::image_atomic_or;
6341       break;
6342    case nir_intrinsic_image_deref_atomic_xor:
6343       buf_op = aco_opcode::buffer_atomic_xor;
6344       buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6345       image_op = aco_opcode::image_atomic_xor;
6346       break;
6347    case nir_intrinsic_image_deref_atomic_exchange:
6348       buf_op = aco_opcode::buffer_atomic_swap;
6349       buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6350       image_op = aco_opcode::image_atomic_swap;
6351       break;
6352    case nir_intrinsic_image_deref_atomic_comp_swap:
6353       buf_op = aco_opcode::buffer_atomic_cmpswap;
6354       buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6355       image_op = aco_opcode::image_atomic_cmpswap;
6356       break;
6357    case nir_intrinsic_image_deref_atomic_fmin:
6358       buf_op = aco_opcode::buffer_atomic_fmin;
6359       buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6360       image_op = aco_opcode::image_atomic_fmin;
6361       break;
6362    case nir_intrinsic_image_deref_atomic_fmax:
6363       buf_op = aco_opcode::buffer_atomic_fmax;
6364       buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6365       image_op = aco_opcode::image_atomic_fmax;
6366       break;
6367    default:
6368       unreachable("visit_image_atomic should only be called with "
6369                   "nir_intrinsic_image_deref_atomic_* instructions.");
6370    }
6371 
6372    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6373    memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6374 
6375    if (dim == GLSL_SAMPLER_DIM_BUF) {
6376       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6377       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6378                                        ACO_DESC_BUFFER, nullptr, true);
6379       // assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet
6380       // implemented.");
6381       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6382          is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6383       mubuf->operands[0] = Operand(resource);
6384       mubuf->operands[1] = Operand(vindex);
6385       mubuf->operands[2] = Operand::c32(0);
6386       mubuf->operands[3] = Operand(data);
6387       if (return_previous)
6388          mubuf->definitions[0] = Definition(dst);
6389       mubuf->offset = 0;
6390       mubuf->idxen = true;
6391       mubuf->glc = return_previous;
6392       mubuf->dlc = false; /* Not needed for atomics */
6393       mubuf->disable_wqm = true;
6394       mubuf->sync = sync;
6395       ctx->program->needs_exact = true;
6396       ctx->block->instructions.emplace_back(std::move(mubuf));
6397       return;
6398    }
6399 
6400    std::vector<Temp> coords = get_image_coords(ctx, instr);
6401    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6402                                     ACO_DESC_IMAGE, nullptr, true);
6403    Definition def = return_previous ? Definition(dst) : Definition();
6404    MIMG_instruction* mimg =
6405       emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data));
6406    mimg->glc = return_previous;
6407    mimg->dlc = false; /* Not needed for atomics */
6408    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6409    mimg->dmask = (1 << data.size()) - 1;
6410    mimg->unrm = true;
6411    mimg->da = should_declare_array(ctx, dim, is_array);
6412    mimg->disable_wqm = true;
6413    mimg->sync = sync;
6414    ctx->program->needs_exact = true;
6415    return;
6416 }
6417 
6418 void
6419 get_buffer_size(isel_context* ctx, Temp desc, Temp dst)
6420 {
6421    if (ctx->options->chip_class == GFX8) {
6422       /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
6423       Builder bld(ctx->program, ctx->block);
6424 
6425       Temp size = emit_extract_vector(ctx, desc, 2, s1);
6426 
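      /* Note: 0xaaaaaaab is the magic reciprocal of 3, i.e. (2^33 + 1) / 3, so
       * (size * 0xaaaaaaab) >> 33 == size / 3 for any 32-bit size. The divide
       * by the stride then becomes a right shift by the stride's lowest set
       * bit: e.g. size / 12 == (size / 3) >> 2. */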
6427       Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1),
6428                                 bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size);
6429       size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
6430                            bld.as_uniform(size_div3), Operand::c32(1u));
6431 
6432       Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6433       stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride,
6434                         Operand::c32((5u << 16) | 16u));
6435 
6436       Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u));
6437       size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
6438 
6439       Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
6440       bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size,
6441                bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
6442       if (dst.type() == RegType::vgpr)
6443          bld.copy(Definition(dst), shr_dst);
6444 
6445       /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
6446    } else {
6447       emit_extract_vector(ctx, desc, 2, dst);
6448    }
6449 }
6450 
6451 void
6452 visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
6453 {
6454    const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6455    bool is_array = nir_intrinsic_image_array(instr);
6456    Builder bld(ctx->program, ctx->block);
6457 
6458    if (dim == GLSL_SAMPLER_DIM_BUF) {
6459       Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6460                                    ACO_DESC_BUFFER, NULL, false);
6461       return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));
6462    }
6463 
6464    /* LOD */
6465    assert(nir_src_as_uint(instr->src[1]) == 0);
6466    std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())};
6467 
6468    /* Resource */
6469    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6470                                     ACO_DESC_IMAGE, NULL, false);
6471 
6472    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6473 
6474    MIMG_instruction* mimg =
6475       emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod);
6476    uint8_t& dmask = mimg->dmask;
6477    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6478    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6479    mimg->da = is_array;
6480 
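   /* Note: GFX9 stores 1D arrays as 2D arrays, so resinfo returns
    * (width, 1, layers); dmask 0x5 selects only width and layers. */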
6481    if (ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
6482       assert(instr->dest.ssa.num_components == 2);
6483       dmask = 0x5;
6484    }
6485 
6486    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6487 }
6488 
6489 void
6490 get_image_samples(isel_context* ctx, Definition dst, Temp resource)
6491 {
6492    Builder bld(ctx->program, ctx->block);
6493 
6494    Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6495    Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
6496                                 Operand::c32(16u | 4u << 16));
6497    Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u),
6498                            samples_log2);
6499    Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
6500                         Operand::c32(28u | 4u << 16 /* offset=28, width=4 */));
6501 
6502    Operand default_sample = Operand::c32(1u);
6503    if (ctx->options->robust_buffer_access) {
6504       /* Extract the second dword of the descriptor, if it's
6505        * all zero, then it's a null descriptor.
6506        */
6507       Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
6508       Temp is_non_null_descriptor =
6509          bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero());
6510       default_sample = Operand(is_non_null_descriptor);
6511    }
6512 
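   /* Note: TYPE field values of 14 and 15 correspond to the two 2D MSAA
    * resource types, so non-MSAA images report the default sample count
    * (1, or 0 for a null descriptor when robustness is enabled). */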
6513    Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u));
6514    bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa));
6515 }
6516 
6517 void
6518 visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)
6519 {
6520    Builder bld(ctx->program, ctx->block);
6521    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6522    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6523                                     ACO_DESC_IMAGE, NULL, false);
6524    get_image_samples(ctx, Definition(dst), resource);
6525 }
6526 
6527 void
6528 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6529 {
6530    Builder bld(ctx->program, ctx->block);
6531    unsigned num_components = instr->num_components;
6532 
6533    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6534    Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6535 
6536    unsigned access = nir_intrinsic_access(instr);
6537    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6538    unsigned size = instr->dest.ssa.bit_size / 8;
6539 
6540    bool allow_smem = access & ACCESS_CAN_REORDER;
6541 
6542    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6543                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6544                get_memory_sync_info(instr, storage_buffer, 0));
6545 }
6546 
6547 void
6548 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6549 {
6550    Builder bld(ctx->program, ctx->block);
6551    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6552    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6553    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6554    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6555 
6556    Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6557 
6558    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6559    bool glc =
6560       nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6561 
6562    unsigned write_count = 0;
6563    Temp write_datas[32];
6564    unsigned offsets[32];
6565    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6566                       write_datas, offsets);
6567 
6568    /* GFX6-7 are affected by a hw bug that prevents address clamping from working
6569     * correctly when the SGPR offset is used.
6570     */
6571    if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
6572       offset = as_vgpr(ctx, offset);
6573 
6574    for (unsigned i = 0; i < write_count; i++) {
6575       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6576 
6577       aco_ptr<MUBUF_instruction> store{
6578          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6579       store->operands[0] = Operand(rsrc);
6580       store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6581       store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6582       store->operands[3] = Operand(write_datas[i]);
6583       store->offset = offsets[i];
6584       store->offen = (offset.type() == RegType::vgpr);
6585       store->glc = glc;
6586       store->dlc = false;
6587       store->disable_wqm = true;
6588       store->sync = sync;
6589       ctx->program->needs_exact = true;
6590       ctx->block->instructions.emplace_back(std::move(store));
6591    }
6592 }
6593 
6594 void
6595 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6596 {
6597    Builder bld(ctx->program, ctx->block);
6598    bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6599    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6600 
6601    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6602       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6603                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6604 
6605    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6606    Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6607 
6608    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6609 
6610    aco_opcode op32, op64;
6611    switch (instr->intrinsic) {
6612    case nir_intrinsic_ssbo_atomic_add:
6613       op32 = aco_opcode::buffer_atomic_add;
6614       op64 = aco_opcode::buffer_atomic_add_x2;
6615       break;
6616    case nir_intrinsic_ssbo_atomic_imin:
6617       op32 = aco_opcode::buffer_atomic_smin;
6618       op64 = aco_opcode::buffer_atomic_smin_x2;
6619       break;
6620    case nir_intrinsic_ssbo_atomic_umin:
6621       op32 = aco_opcode::buffer_atomic_umin;
6622       op64 = aco_opcode::buffer_atomic_umin_x2;
6623       break;
6624    case nir_intrinsic_ssbo_atomic_imax:
6625       op32 = aco_opcode::buffer_atomic_smax;
6626       op64 = aco_opcode::buffer_atomic_smax_x2;
6627       break;
6628    case nir_intrinsic_ssbo_atomic_umax:
6629       op32 = aco_opcode::buffer_atomic_umax;
6630       op64 = aco_opcode::buffer_atomic_umax_x2;
6631       break;
6632    case nir_intrinsic_ssbo_atomic_and:
6633       op32 = aco_opcode::buffer_atomic_and;
6634       op64 = aco_opcode::buffer_atomic_and_x2;
6635       break;
6636    case nir_intrinsic_ssbo_atomic_or:
6637       op32 = aco_opcode::buffer_atomic_or;
6638       op64 = aco_opcode::buffer_atomic_or_x2;
6639       break;
6640    case nir_intrinsic_ssbo_atomic_xor:
6641       op32 = aco_opcode::buffer_atomic_xor;
6642       op64 = aco_opcode::buffer_atomic_xor_x2;
6643       break;
6644    case nir_intrinsic_ssbo_atomic_exchange:
6645       op32 = aco_opcode::buffer_atomic_swap;
6646       op64 = aco_opcode::buffer_atomic_swap_x2;
6647       break;
6648    case nir_intrinsic_ssbo_atomic_comp_swap:
6649       op32 = aco_opcode::buffer_atomic_cmpswap;
6650       op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6651       break;
6652    case nir_intrinsic_ssbo_atomic_fmin:
6653       op32 = aco_opcode::buffer_atomic_fmin;
6654       op64 = aco_opcode::buffer_atomic_fmin_x2;
6655       break;
6656    case nir_intrinsic_ssbo_atomic_fmax:
6657       op32 = aco_opcode::buffer_atomic_fmax;
6658       op64 = aco_opcode::buffer_atomic_fmax_x2;
6659       break;
6660    default:
6661       unreachable(
6662          "visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6663    }
6664    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6665    aco_ptr<MUBUF_instruction> mubuf{
6666       create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6667    mubuf->operands[0] = Operand(rsrc);
6668    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6669    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6670    mubuf->operands[3] = Operand(data);
6671    if (return_previous)
6672       mubuf->definitions[0] = Definition(dst);
6673    mubuf->offset = 0;
6674    mubuf->offen = (offset.type() == RegType::vgpr);
6675    mubuf->glc = return_previous;
6676    mubuf->dlc = false; /* Not needed for atomics */
6677    mubuf->disable_wqm = true;
6678    mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6679    ctx->program->needs_exact = true;
6680    ctx->block->instructions.emplace_back(std::move(mubuf));
6681 }
6682 
6683 void
6684 visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr)
6685 {
6686 
6687    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
6688    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6689    bool non_uniform = dst.type() == RegType::vgpr;
6690 
6691    Builder bld(ctx->program, ctx->block);
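   /* Note: in the divergent case the (descriptor set pointer, binding offset)
    * pair is turned into a 64-bit address and the buffer size is loaded
    * directly from the third dword of the descriptor in memory (const_offset
    * 8), instead of building the whole buffer resource first. */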
6692    if (non_uniform) {
6693       Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
6694       Temp binding = emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1));
6695       Temp index = bld.vadd32(bld.def(v1), set_ptr, binding);
6696       index = convert_pointer_to_64_bit(ctx, index, non_uniform);
6697 
6698       LoadEmitInfo info = {Operand(index), dst, 1, 4};
6699       info.align_mul = 4;
6700       info.const_offset = 8;
6701       emit_load(ctx, bld, info, global_load_params);
6702    } else {
6703       emit_extract_vector(ctx, load_buffer_rsrc(ctx, rsrc), 2, dst);
6704    }
6705 }
6706 
6707 void
6708 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6709 {
6710    Builder bld(ctx->program, ctx->block);
6711    unsigned num_components = instr->num_components;
6712    unsigned component_size = instr->dest.ssa.bit_size / 8;
6713 
6714    LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
6715                         get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size};
6716    info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6717    info.align_mul = nir_intrinsic_align_mul(instr);
6718    info.align_offset = nir_intrinsic_align_offset(instr);
6719    info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6720    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6721     * it's safe to use SMEM */
6722    bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
6723    if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) ||
6724        !can_use_smem) {
6725       emit_load(ctx, bld, info, global_load_params);
6726    } else {
6727       info.offset = Operand(bld.as_uniform(info.offset));
6728       emit_load(ctx, bld, info, smem_load_params);
6729    }
6730 }
6731 
6732 void
6733 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6734 {
6735    Builder bld(ctx->program, ctx->block);
6736    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6737    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6738 
6739    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6740    Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6741    memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6742    bool glc =
6743       nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6744 
6745    if (ctx->options->chip_class >= GFX7)
6746       addr = as_vgpr(ctx, addr);
6747 
6748    unsigned write_count = 0;
6749    Temp write_datas[32];
6750    unsigned offsets[32];
6751    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6752                       write_datas, offsets);
6753 
6754    for (unsigned i = 0; i < write_count; i++) {
6755       if (ctx->options->chip_class >= GFX7) {
6756          unsigned offset = offsets[i];
6757          Temp store_addr = addr;
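         /* Note: before GFX9 these FLAT stores cannot take an immediate
          * offset, so a non-zero offset is folded into the 64-bit address with
          * a carry-propagating VALU add and the instruction offset stays 0. */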
6758          if (offset > 0 && ctx->options->chip_class < GFX9) {
6759             Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6760             Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6761             Temp carry = bld.tmp(bld.lm);
6762             bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6763 
6764             bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0),
6765                      bld.hint_vcc(Definition(carry)), Operand::c32(offset), addr0);
6766             bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6767                      Operand::zero(), addr1, carry)
6768                .def(1)
6769                .setHint(vcc);
6770 
6771             store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6772 
6773             offset = 0;
6774          }
6775 
6776          bool global = ctx->options->chip_class >= GFX9;
6777          aco_opcode op;
6778          switch (write_datas[i].bytes()) {
6779          case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6780          case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6781          case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6782          case 8:
6783             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6784             break;
6785          case 12:
6786             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6787             break;
6788          case 16:
6789             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6790             break;
6791          default: unreachable("store_global not implemented for this size.");
6792          }
6793 
6794          aco_ptr<FLAT_instruction> flat{
6795             create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6796          flat->operands[0] = Operand(store_addr);
6797          flat->operands[1] = Operand(s1);
6798          flat->operands[2] = Operand(write_datas[i]);
6799          flat->glc = glc;
6800          flat->dlc = false;
6801          flat->offset = offset;
6802          flat->disable_wqm = true;
6803          flat->sync = sync;
6804          ctx->program->needs_exact = true;
6805          ctx->block->instructions.emplace_back(std::move(flat));
6806       } else {
6807          assert(ctx->options->chip_class == GFX6);
6808 
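         /* Note: GFX6 has no FLAT instructions, so global stores are emulated
          * with MUBUF using a synthesized buffer resource and addr64
          * addressing when the address is in VGPRs. */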
6809          aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6810 
6811          Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6812 
6813          aco_ptr<MUBUF_instruction> mubuf{
6814             create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6815          mubuf->operands[0] = Operand(rsrc);
6816          mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6817          mubuf->operands[2] = Operand::zero();
6818          mubuf->operands[3] = Operand(write_datas[i]);
6819          mubuf->glc = glc;
6820          mubuf->dlc = false;
6821          mubuf->offset = offsets[i];
6822          mubuf->addr64 = addr.type() == RegType::vgpr;
6823          mubuf->disable_wqm = true;
6824          mubuf->sync = sync;
6825          ctx->program->needs_exact = true;
6826          ctx->block->instructions.emplace_back(std::move(mubuf));
6827       }
6828    }
6829 }
6830 
6831 void
6832 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6833 {
6834    Builder bld(ctx->program, ctx->block);
6835    bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6836    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6837    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6838 
6839    if (ctx->options->chip_class >= GFX7)
6840       addr = as_vgpr(ctx, addr);
6841 
6842    if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6843       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6844                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6845 
6846    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6847 
6848    aco_opcode op32, op64;
6849 
6850    if (ctx->options->chip_class >= GFX7) {
6851       bool global = ctx->options->chip_class >= GFX9;
6852       switch (instr->intrinsic) {
6853       case nir_intrinsic_global_atomic_add:
6854          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6855          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6856          break;
6857       case nir_intrinsic_global_atomic_imin:
6858          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6859          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6860          break;
6861       case nir_intrinsic_global_atomic_umin:
6862          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6863          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6864          break;
6865       case nir_intrinsic_global_atomic_imax:
6866          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6867          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6868          break;
6869       case nir_intrinsic_global_atomic_umax:
6870          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6871          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6872          break;
6873       case nir_intrinsic_global_atomic_and:
6874          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6875          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6876          break;
6877       case nir_intrinsic_global_atomic_or:
6878          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6879          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6880          break;
6881       case nir_intrinsic_global_atomic_xor:
6882          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6883          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6884          break;
6885       case nir_intrinsic_global_atomic_exchange:
6886          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6887          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6888          break;
6889       case nir_intrinsic_global_atomic_comp_swap:
6890          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6891          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6892          break;
6893       case nir_intrinsic_global_atomic_fmin:
6894          op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6895          op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6896          break;
6897       case nir_intrinsic_global_atomic_fmax:
6898          op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6899          op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6900          break;
6901       default:
6902          unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6903                      "instructions.");
6904       }
6905 
6906       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6907       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6908          op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6909       flat->operands[0] = Operand(addr);
6910       flat->operands[1] = Operand(s1);
6911       flat->operands[2] = Operand(data);
6912       if (return_previous)
6913          flat->definitions[0] = Definition(dst);
6914       flat->glc = return_previous;
6915       flat->dlc = false; /* Not needed for atomics */
6916       flat->offset = 0;
6917       flat->disable_wqm = true;
6918       flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6919       ctx->program->needs_exact = true;
6920       ctx->block->instructions.emplace_back(std::move(flat));
6921    } else {
6922       assert(ctx->options->chip_class == GFX6);
6923 
6924       switch (instr->intrinsic) {
6925       case nir_intrinsic_global_atomic_add:
6926          op32 = aco_opcode::buffer_atomic_add;
6927          op64 = aco_opcode::buffer_atomic_add_x2;
6928          break;
6929       case nir_intrinsic_global_atomic_imin:
6930          op32 = aco_opcode::buffer_atomic_smin;
6931          op64 = aco_opcode::buffer_atomic_smin_x2;
6932          break;
6933       case nir_intrinsic_global_atomic_umin:
6934          op32 = aco_opcode::buffer_atomic_umin;
6935          op64 = aco_opcode::buffer_atomic_umin_x2;
6936          break;
6937       case nir_intrinsic_global_atomic_imax:
6938          op32 = aco_opcode::buffer_atomic_smax;
6939          op64 = aco_opcode::buffer_atomic_smax_x2;
6940          break;
6941       case nir_intrinsic_global_atomic_umax:
6942          op32 = aco_opcode::buffer_atomic_umax;
6943          op64 = aco_opcode::buffer_atomic_umax_x2;
6944          break;
6945       case nir_intrinsic_global_atomic_and:
6946          op32 = aco_opcode::buffer_atomic_and;
6947          op64 = aco_opcode::buffer_atomic_and_x2;
6948          break;
6949       case nir_intrinsic_global_atomic_or:
6950          op32 = aco_opcode::buffer_atomic_or;
6951          op64 = aco_opcode::buffer_atomic_or_x2;
6952          break;
6953       case nir_intrinsic_global_atomic_xor:
6954          op32 = aco_opcode::buffer_atomic_xor;
6955          op64 = aco_opcode::buffer_atomic_xor_x2;
6956          break;
6957       case nir_intrinsic_global_atomic_exchange:
6958          op32 = aco_opcode::buffer_atomic_swap;
6959          op64 = aco_opcode::buffer_atomic_swap_x2;
6960          break;
6961       case nir_intrinsic_global_atomic_comp_swap:
6962          op32 = aco_opcode::buffer_atomic_cmpswap;
6963          op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6964          break;
6965       case nir_intrinsic_global_atomic_fmin:
6966          op32 = aco_opcode::buffer_atomic_fmin;
6967          op64 = aco_opcode::buffer_atomic_fmin_x2;
6968          break;
6969       case nir_intrinsic_global_atomic_fmax:
6970          op32 = aco_opcode::buffer_atomic_fmax;
6971          op64 = aco_opcode::buffer_atomic_fmax_x2;
6972          break;
6973       default:
6974          unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6975                      "instructions.");
6976       }
6977 
6978       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6979 
6980       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6981 
6982       aco_ptr<MUBUF_instruction> mubuf{
6983          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6984       mubuf->operands[0] = Operand(rsrc);
6985       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6986       mubuf->operands[2] = Operand::zero();
6987       mubuf->operands[3] = Operand(data);
6988       if (return_previous)
6989          mubuf->definitions[0] = Definition(dst);
6990       mubuf->glc = return_previous;
6991       mubuf->dlc = false;
6992       mubuf->offset = 0;
6993       mubuf->addr64 = addr.type() == RegType::vgpr;
6994       mubuf->disable_wqm = true;
6995       mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6996       ctx->program->needs_exact = true;
6997       ctx->block->instructions.emplace_back(std::move(mubuf));
6998    }
6999 }
7000 
7001 void
7002 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7003 {
7004    Builder bld(ctx->program, ctx->block);
7005 
7006    Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
7007    Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7008    Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7009    Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7010 
7011    bool swizzled = nir_intrinsic_is_swizzled(intrin);
7012    bool reorder = nir_intrinsic_can_reorder(intrin);
7013    bool slc = nir_intrinsic_slc_amd(intrin);
7014 
7015    unsigned const_offset = nir_intrinsic_base(intrin);
7016    unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
7017    unsigned num_components = intrin->dest.ssa.num_components;
7018    unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0;
7019 
7020    load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7021                    num_components, swizzle_element_size, !swizzled, reorder, slc);
7022 }
7023 
7024 void
7025 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7026 {
7027    Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7028    Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);
7029    Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);
7030    Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);
7031 
7032    bool swizzled = nir_intrinsic_is_swizzled(intrin);
7033    bool slc = nir_intrinsic_slc_amd(intrin);
7034 
7035    unsigned const_offset = nir_intrinsic_base(intrin);
7036    unsigned write_mask = nir_intrinsic_write_mask(intrin);
7037    unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7038 
7039    nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7040    memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none);
7041 
7042    store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7043                     write_mask, !swizzled, sync, slc);
7044 }
7045 
7046 sync_scope
7047 translate_nir_scope(nir_scope scope)
7048 {
7049    switch (scope) {
7050    case NIR_SCOPE_NONE:
7051    case NIR_SCOPE_INVOCATION: return scope_invocation;
7052    case NIR_SCOPE_SUBGROUP: return scope_subgroup;
7053    case NIR_SCOPE_WORKGROUP: return scope_workgroup;
7054    case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7055    case NIR_SCOPE_DEVICE: return scope_device;
7056    case NIR_SCOPE_SHADER_CALL: return scope_invocation;
7057    }
7058    unreachable("invalid scope");
7059 }
7060 
7061 void
7062 emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7063 {
7064    Builder bld(ctx->program, ctx->block);
7065 
7066    unsigned semantics = 0;
7067    unsigned storage = 0;
7068    sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7069    sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7070 
7071    /* We use shared storage for the following:
7072     * - compute shaders expose it in their API
7073     * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7074     * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7075     * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7076     */
7077    bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||
7078                               ctx->stage.hw == HWStage::HS ||
7079                               (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) ||
7080                               ctx->stage.hw == HWStage::NGG;
7081 
7082    /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7083     * They are allowed in CS, TCS, and in any NGG shader.
7084     */
7085    ASSERTED bool workgroup_scope_allowed =
7086       ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;
7087 
7088    unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7089    if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
7090       storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image
7091    if (shared_storage_used && (nir_storage & nir_var_mem_shared))
7092       storage |= storage_shared;
7093 
7094    unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7095    if (nir_semantics & NIR_MEMORY_ACQUIRE)
7096       semantics |= semantic_acquire | semantic_release;
7097    if (nir_semantics & NIR_MEMORY_RELEASE)
7098       semantics |= semantic_acquire | semantic_release;
7099 
7100    assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7101    assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7102 
7103    bld.barrier(aco_opcode::p_barrier,
7104                memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7105                exec_scope);
7106 }
7107 
7108 void
7109 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7110 {
7111    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
7112    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7113    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7114    Builder bld(ctx->program, ctx->block);
7115 
7116    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
7117    unsigned num_components = instr->dest.ssa.num_components;
7118    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7119    load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7120 }
7121 
7122 void
7123 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7124 {
7125    unsigned writemask = nir_intrinsic_write_mask(instr);
7126    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7127    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7128    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7129 
7130    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7131    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7132 }
7133 
7134 void
7135 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7136 {
7137    unsigned offset = nir_intrinsic_base(instr);
7138    Builder bld(ctx->program, ctx->block);
7139    Operand m = load_lds_size_m0(bld);
7140    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7141    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7142 
7143    unsigned num_operands = 3;
7144    aco_opcode op32, op64, op32_rtn, op64_rtn;
7145    switch (instr->intrinsic) {
7146    case nir_intrinsic_shared_atomic_add:
7147       op32 = aco_opcode::ds_add_u32;
7148       op64 = aco_opcode::ds_add_u64;
7149       op32_rtn = aco_opcode::ds_add_rtn_u32;
7150       op64_rtn = aco_opcode::ds_add_rtn_u64;
7151       break;
7152    case nir_intrinsic_shared_atomic_imin:
7153       op32 = aco_opcode::ds_min_i32;
7154       op64 = aco_opcode::ds_min_i64;
7155       op32_rtn = aco_opcode::ds_min_rtn_i32;
7156       op64_rtn = aco_opcode::ds_min_rtn_i64;
7157       break;
7158    case nir_intrinsic_shared_atomic_umin:
7159       op32 = aco_opcode::ds_min_u32;
7160       op64 = aco_opcode::ds_min_u64;
7161       op32_rtn = aco_opcode::ds_min_rtn_u32;
7162       op64_rtn = aco_opcode::ds_min_rtn_u64;
7163       break;
7164    case nir_intrinsic_shared_atomic_imax:
7165       op32 = aco_opcode::ds_max_i32;
7166       op64 = aco_opcode::ds_max_i64;
7167       op32_rtn = aco_opcode::ds_max_rtn_i32;
7168       op64_rtn = aco_opcode::ds_max_rtn_i64;
7169       break;
7170    case nir_intrinsic_shared_atomic_umax:
7171       op32 = aco_opcode::ds_max_u32;
7172       op64 = aco_opcode::ds_max_u64;
7173       op32_rtn = aco_opcode::ds_max_rtn_u32;
7174       op64_rtn = aco_opcode::ds_max_rtn_u64;
7175       break;
7176    case nir_intrinsic_shared_atomic_and:
7177       op32 = aco_opcode::ds_and_b32;
7178       op64 = aco_opcode::ds_and_b64;
7179       op32_rtn = aco_opcode::ds_and_rtn_b32;
7180       op64_rtn = aco_opcode::ds_and_rtn_b64;
7181       break;
7182    case nir_intrinsic_shared_atomic_or:
7183       op32 = aco_opcode::ds_or_b32;
7184       op64 = aco_opcode::ds_or_b64;
7185       op32_rtn = aco_opcode::ds_or_rtn_b32;
7186       op64_rtn = aco_opcode::ds_or_rtn_b64;
7187       break;
7188    case nir_intrinsic_shared_atomic_xor:
7189       op32 = aco_opcode::ds_xor_b32;
7190       op64 = aco_opcode::ds_xor_b64;
7191       op32_rtn = aco_opcode::ds_xor_rtn_b32;
7192       op64_rtn = aco_opcode::ds_xor_rtn_b64;
7193       break;
7194    case nir_intrinsic_shared_atomic_exchange:
7195       op32 = aco_opcode::ds_write_b32;
7196       op64 = aco_opcode::ds_write_b64;
7197       op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7198       op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7199       break;
7200    case nir_intrinsic_shared_atomic_comp_swap:
7201       op32 = aco_opcode::ds_cmpst_b32;
7202       op64 = aco_opcode::ds_cmpst_b64;
7203       op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7204       op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7205       num_operands = 4;
7206       break;
7207    case nir_intrinsic_shared_atomic_fadd:
7208       op32 = aco_opcode::ds_add_f32;
7209       op32_rtn = aco_opcode::ds_add_rtn_f32;
7210       op64 = aco_opcode::num_opcodes;
7211       op64_rtn = aco_opcode::num_opcodes;
7212       break;
7213    case nir_intrinsic_shared_atomic_fmin:
7214       op32 = aco_opcode::ds_min_f32;
7215       op32_rtn = aco_opcode::ds_min_rtn_f32;
7216       op64 = aco_opcode::ds_min_f64;
7217       op64_rtn = aco_opcode::ds_min_rtn_f64;
7218       break;
7219    case nir_intrinsic_shared_atomic_fmax:
7220       op32 = aco_opcode::ds_max_f32;
7221       op32_rtn = aco_opcode::ds_max_rtn_f32;
7222       op64 = aco_opcode::ds_max_f64;
7223       op64_rtn = aco_opcode::ds_max_rtn_f64;
7224       break;
7225    default: unreachable("Unhandled shared atomic intrinsic");
7226    }
7227 
7228    bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
7229 
7230    aco_opcode op;
7231    if (data.size() == 1) {
7232       assert(instr->dest.ssa.bit_size == 32);
7233       op = return_previous ? op32_rtn : op32;
7234    } else {
7235       assert(instr->dest.ssa.bit_size == 64);
7236       op = return_previous ? op64_rtn : op64;
7237    }
7238 
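   /* The DS constant-offset field is 16 bits wide, so offsets above 0xffff
    * are folded into the address register instead. */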
7239    if (offset > 65535) {
7240       address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7241       offset = 0;
7242    }
7243 
7244    aco_ptr<DS_instruction> ds;
7245    ds.reset(
7246       create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
7247    ds->operands[0] = Operand(address);
7248    ds->operands[1] = Operand(data);
7249    if (num_operands == 4) {
7250       Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7251       ds->operands[2] = Operand(data2);
7252    }
7253    ds->operands[num_operands - 1] = m;
7254    ds->offset0 = offset;
7255    if (return_previous)
7256       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
7257    ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7258 
7259    if (m.isUndefined())
7260       ds->operands.pop_back();
7261 
7262    ctx->block->instructions.emplace_back(std::move(ds));
7263 }
7264 
7265 Temp
7266 get_scratch_resource(isel_context* ctx)
7267 {
7268    Builder bld(ctx->program, ctx->block);
7269    Temp scratch_addr = ctx->program->private_segment_buffer;
7270    if (ctx->stage != compute_cs)
7271       scratch_addr =
7272          bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7273 
7274    uint32_t rsrc_conf =
7275       S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
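   /* ADD_TID_ENABLE makes the hardware add a per-lane component to scratch
    * addresses; the INDEX_STRIDE encoding (3 -> 64, 2 -> 32) is presumably chosen
    * to match the wave size so per-lane scratch slots interleave across the wave. */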
7276 
7277    if (ctx->program->chip_class >= GFX10) {
7278       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
7279                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
7280    } else if (ctx->program->chip_class <=
7281               GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
7282       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7283                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7284    }
7285 
7286    /* older generations need element size = 4 bytes; the element size field was removed in GFX9 */
7287    if (ctx->program->chip_class <= GFX8)
7288       rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
7289 
7290    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
7291                      Operand::c32(rsrc_conf));
7292 }
7293 
7294 void
7295 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7296 {
7297    Builder bld(ctx->program, ctx->block);
7298    Temp rsrc = get_scratch_resource(ctx);
7299    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7300    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7301 
7302    LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
7303                         instr->dest.ssa.bit_size / 8u, rsrc};
7304    info.align_mul = nir_intrinsic_align_mul(instr);
7305    info.align_offset = nir_intrinsic_align_offset(instr);
7306    info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
7307    info.sync = memory_sync_info(storage_scratch, semantic_private);
7308    info.soffset = ctx->program->scratch_offset;
7309    emit_load(ctx, bld, info, scratch_load_params);
7310 }
7311 
7312 void
7313 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7314 {
7315    Builder bld(ctx->program, ctx->block);
7316    Temp rsrc = get_scratch_resource(ctx);
7317    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7318    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7319 
7320    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7321    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7322 
7323    unsigned write_count = 0;
7324    Temp write_datas[32];
7325    unsigned offsets[32];
7326    unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
7327    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7328                       &write_count, write_datas, offsets);
7329 
7330    for (unsigned i = 0; i < write_count; i++) {
7331       aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7332       Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i],
7333                                      offsets[i], true, true);
7334       mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7335    }
7336 }
7337 
7338 void
7339 visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr)
7340 {
7341    uint8_t log2_ps_iter_samples;
7342    if (ctx->program->info->ps.uses_sample_shading) {
7343       log2_ps_iter_samples = util_logbase2(ctx->options->key.ps.num_samples);
7344    } else {
7345       log2_ps_iter_samples = ctx->options->key.ps.log2_ps_iter_samples;
7346    }
7347 
7348    Builder bld(ctx->program, ctx->block);
7349 
7350    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7351 
7352    if (log2_ps_iter_samples) {
7353       /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
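      /* For example, with 4x sample-rate shading the invocation covering sample 2
       * receives coverage & 0b0100, i.e. only the bit for its own sample. */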
7354       Temp sample_id =
7355          bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
7356                   Operand::c32(8u), Operand::c32(4u));
7357       Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id,
7358                            bld.copy(bld.def(v1), Operand::c32(1u)));
7359       bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask,
7360                get_arg(ctx, ctx->args->ac.sample_coverage));
7361    } else {
7362       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage));
7363    }
7364 }
7365 
7366 void
7367 visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
7368 {
7369    Builder bld(ctx->program, ctx->block);
7370 
7371    unsigned stream = nir_intrinsic_stream_id(instr);
7372    Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7373    next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
7374    nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);
7375 
7376    /* get GSVS ring */
7377    Temp gsvs_ring =
7378       bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
7379                Operand::c32(RING_GSVS_GS * 16u));
7380 
7381    unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream];
7382 
7383    unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
7384    unsigned stream_offset = 0;
7385    for (unsigned i = 0; i < stream; i++) {
7386       unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] *
7387                              ctx->shader->info.gs.vertices_out;
7388       stream_offset += prev_stride * ctx->program->wave_size;
7389    }
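   /* For example, a stream with 16 output components and vertices_out == 4 has a
    * stride of 4 * 16 * 4 = 256 bytes, and each later stream starts behind the
    * previous streams' stride * wave_size bytes. */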
7390 
7391    /* Limit on the stride field for <= GFX7. */
7392    assert(stride < (1 << 14));
7393 
7394    Temp gsvs_dwords[4];
7395    for (unsigned i = 0; i < 4; i++)
7396       gsvs_dwords[i] = bld.tmp(s1);
7397    bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),
7398               Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);
7399 
7400    if (stream_offset) {
7401       Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));
7402 
7403       Temp carry = bld.tmp(s1);
7404       gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
7405                                 gsvs_dwords[0], stream_offset_tmp);
7406       gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
7407                                 gsvs_dwords[1], Operand::zero(), bld.scc(carry));
7408    }
7409 
7410    gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],
7411                              Operand::c32(S_008F04_STRIDE(stride)));
7412    gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));
7413 
7414    gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],
7415                           gsvs_dwords[2], gsvs_dwords[3]);
7416 
7417    unsigned offset = 0;
7418    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
7419       if (ctx->program->info->gs.output_streams[i] != stream)
7420          continue;
7421 
7422       for (unsigned j = 0; j < 4; j++) {
7423          if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
7424             continue;
7425 
7426          if (ctx->outputs.mask[i] & (1 << j)) {
7427             Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
7428             unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
7429             if (const_offset >= 4096u) {
7430                if (vaddr_offset.isUndefined())
7431                   vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u));
7432                else
7433                   vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u),
7434                                             vaddr_offset);
7435                const_offset %= 4096u;
7436             }
7437 
7438             aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(
7439                aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
7440             mtbuf->operands[0] = Operand(gsvs_ring);
7441             mtbuf->operands[1] = vaddr_offset;
7442             mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));
7443             mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7444             mtbuf->offen = !vaddr_offset.isUndefined();
7445             mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
7446             mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
7447             mtbuf->offset = const_offset;
7448             mtbuf->glc = true;
7449             mtbuf->slc = true;
7450             mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
7451             bld.insert(std::move(mtbuf));
7452          }
7453 
7454          offset += ctx->shader->info.gs.vertices_out;
7455       }
7456 
7457       /* outputs for the next vertex are undefined and keeping them around can
7458        * create invalid IR with control flow */
7459       ctx->outputs.mask[i] = 0;
7460    }
7461 
7462    bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7463 }
7464 
7465 Temp
7466 emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7467 {
7468    Builder bld(ctx->program, ctx->block);
7469 
7470    if (cluster_size == 1) {
7471       return src;
7472    }
7473    if (op == nir_op_iand && cluster_size == 4) {
7474       /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */
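      /* s_wqm sets every bit whose quad-aligned group of 4 contains any set bit, so
       * wqm(exec & ~val) marks whole quads that contain an active lane where val is
       * false; inverting that yields quads where val holds for all active lanes. */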
7475       Temp tmp =
7476          bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7477       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7478                       bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7479    } else if (op == nir_op_ior && cluster_size == 4) {
7480       /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7481       return bld.sop1(
7482          Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7483          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7484    } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7485       /* subgroupAnd(val) -> (exec & ~val) == 0 */
7486       Temp tmp =
7487          bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
7488             .def(1)
7489             .getTemp();
7490       Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7491       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7492    } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7493       /* subgroupOr(val) -> (val & exec) != 0 */
7494       Temp tmp =
7495          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7496             .def(1)
7497             .getTemp();
7498       return bool_to_vector_condition(ctx, tmp);
7499    } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7500       /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7501       Temp tmp =
7502          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7503       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7504       tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7505                .def(1)
7506                .getTemp();
7507       return bool_to_vector_condition(ctx, tmp);
7508    } else {
7509       /* subgroupClustered{And,Or,Xor}(val, n):
7510        *   lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
7511        *   cluster_offset = ~(n - 1) & lane_id; cluster_mask = (1 << n) - 1
7512        * subgroupClusteredAnd():
7513        *   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7514        * subgroupClusteredOr():
7515        *   return ((val & exec) >> cluster_offset) & cluster_mask != 0
7516        * subgroupClusteredXor():
7517        *   return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7518        */
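      /* For example, with cluster_size == 8 and lane_id == 13: cluster_offset =
       * ~7 & 13 = 8 and cluster_mask = 0xff, so the ballot is shifted right by 8
       * and only the bits of lanes 8..15 (lane 13's cluster) are tested. */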
7519       Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7520       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7521                                      Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7522 
7523       Temp tmp;
7524       if (op == nir_op_iand)
7525          tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7526                         Operand(exec, bld.lm));
7527       else
7528          tmp =
7529             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7530 
7531       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
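      /* cluster_size == 32 is special-cased, likely because 1u << 32 is undefined
       * behaviour in C++; a full 32-lane cluster simply uses an all-ones mask. */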
7532 
7533       if (ctx->program->chip_class <= GFX7)
7534          tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7535       else if (ctx->program->wave_size == 64)
7536          tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7537       else
7538          tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7539       tmp = emit_extract_vector(ctx, tmp, 0, v1);
7540       if (cluster_mask != 0xffffffff)
7541          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7542 
7543       if (op == nir_op_iand) {
7544          return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::c32(cluster_mask),
7545                          tmp);
7546       } else if (op == nir_op_ior) {
7547          return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7548       } else if (op == nir_op_ixor) {
7549          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7550                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7551          return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7552       }
7553       assert(false);
7554       return Temp();
7555    }
7556 }
7557 
7558 Temp
7559 emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7560 {
7561    Builder bld(ctx->program, ctx->block);
7562    assert(src.regClass() == bld.lm);
7563 
7564    /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7565     * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7566     * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7567     */
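   /* mbcnt() counts the set bits below the current lane. For example, if lanes 0..3
    * are active and val is set only in lanes 1 and 2, lane 3 sees mbcnt(val & exec)
    * == 2 (ExclusiveOr -> true) while lane 1 sees 0 (ExclusiveOr -> false). */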
7568    Temp tmp;
7569    if (op == nir_op_iand)
7570       tmp =
7571          bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7572    else
7573       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7574 
7575    Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7576 
7577    if (op == nir_op_iand)
7578       return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7579    else if (op == nir_op_ior)
7580       return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7581    else if (op == nir_op_ixor)
7582       return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(),
7583                       bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7584 
7585    assert(false);
7586    return Temp();
7587 }
7588 
7589 Temp
7590 emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7591 {
7592    Builder bld(ctx->program, ctx->block);
7593 
7594    /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7595     * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7596     * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7597     */
7598    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7599    if (op == nir_op_iand)
7600       return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7601    else if (op == nir_op_ior)
7602       return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7603    else if (op == nir_op_ixor)
7604       return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7605 
7606    assert(false);
7607    return Temp();
7608 }
7609 
7610 ReduceOp
7611 get_reduce_op(nir_op op, unsigned bit_size)
7612 {
7613    switch (op) {
7614 #define CASEI(name)                                                                                \
7615    case nir_op_##name:                                                                             \
7616       return (bit_size == 32)   ? name##32                                                         \
7617              : (bit_size == 16) ? name##16                                                         \
7618              : (bit_size == 8)  ? name##8                                                          \
7619                                 : name##64;
7620 #define CASEF(name)                                                                                \
7621    case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7622       CASEI(iadd)
7623       CASEI(imul)
7624       CASEI(imin)
7625       CASEI(umin)
7626       CASEI(imax)
7627       CASEI(umax)
7628       CASEI(iand)
7629       CASEI(ior)
7630       CASEI(ixor)
7631       CASEF(fadd)
7632       CASEF(fmul)
7633       CASEF(fmin)
7634       CASEF(fmax)
7635    default: unreachable("unknown reduction op");
7636 #undef CASEI
7637 #undef CASEF
7638    }
7639 }
7640 
7641 void
7642 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7643 {
7644    Builder bld(ctx->program, ctx->block);
7645    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7646    assert(dst.regClass().type() != RegType::vgpr);
7647    if (src.regClass().type() == RegType::vgpr)
7648       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7649    else
7650       bld.copy(dst, src);
7651 }
7652 
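/* For a subgroup-uniform source, an addition-style reduction can be computed
 * directly from the number of contributing invocations: iadd becomes src * count,
 * fadd becomes src * float(count), and ixor becomes src * (count & 1), since
 * xor-ing an even number of identical values cancels out. */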
7653 void
7654 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7655 {
7656    Builder bld(ctx->program, ctx->block);
7657    Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7658 
7659    if (op == nir_op_fadd) {
7660       src_tmp = as_vgpr(ctx, src_tmp);
7661       Temp tmp = dst.regClass() == s1 ? bld.tmp(src_tmp.regClass()) : dst.getTemp();
7662 
7663       if (src.ssa->bit_size == 16) {
7664          count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7665          bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7666       } else {
7667          assert(src.ssa->bit_size == 32);
7668          count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7669          bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7670       }
7671 
7672       if (tmp != dst.getTemp())
7673          bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7674 
7675       return;
7676    }
7677 
7678    if (dst.regClass() == s1)
7679       src_tmp = bld.as_uniform(src_tmp);
7680 
7681    if (op == nir_op_ixor && count.type() == RegType::sgpr)
7682       count =
7683          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7684    else if (op == nir_op_ixor)
7685       count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7686 
7687    assert(dst.getTemp().type() == count.type());
7688 
7689    if (nir_src_is_const(src)) {
7690       if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7691          bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7692       else if (nir_src_as_uint(src) == 1)
7693          bld.copy(dst, count);
7694       else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2)
7695          bld.vop1(aco_opcode::v_mov_b32, dst, Operand::zero()); /* RA will use SDWA if possible */
7696       else if (nir_src_as_uint(src) == 0)
7697          bld.copy(dst, Operand::zero());
7698       else if (count.type() == RegType::vgpr)
7699          bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7700       else
7701          bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7702    } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
7703       bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7704    } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
7705       bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7706    } else if (dst.getTemp().type() == RegType::vgpr) {
7707       bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7708    } else {
7709       bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7710    }
7711 }
7712 
7713 bool
7714 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7715 {
7716    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7717    if (op == nir_op_imul || op == nir_op_fmul)
7718       return false;
7719 
7720    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7721       Builder bld(ctx->program, ctx->block);
7722       Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7723       unsigned bit_size = instr->src[0].ssa->bit_size;
7724       if (bit_size > 32)
7725          return false;
7726 
7727       Temp thread_count =
7728          bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7729 
7730       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7731    } else {
7732       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7733    }
7734 
7735    return true;
7736 }
7737 
7738 bool
7739 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7740 {
7741    Builder bld(ctx->program, ctx->block);
7742    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7743    nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7744    bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7745 
7746    if (op == nir_op_imul || op == nir_op_fmul)
7747       return false;
7748 
7749    if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7750       if (instr->src[0].ssa->bit_size > 32)
7751          return false;
7752 
7753       Temp packed_tid;
7754       if (inc)
7755          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7756       else
7757          packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7758 
7759       emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7760       return true;
7761    }
7762 
7763    assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7764           op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7765 
7766    if (inc) {
7767       emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7768       return true;
7769    }
7770 
7771    /* Copy the source and write the reduction operation identity to the first lane. */
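   /* These ops are idempotent, so for a uniform source every active lane's exclusive
    * scan equals the source itself, except the first active lane, which has no
    * predecessors and therefore receives the operation's identity (e.g. +INF for fmin). */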
7772    Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7773    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7774    ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7775    if (dst.bytes() == 8) {
7776       Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7777       bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7778       uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7779       uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7780 
7781       lo =
7782          bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_lo)), lane, lo);
7783       hi =
7784          bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_hi)), lane, hi);
7785       bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7786    } else {
7787       uint32_t identity = get_reduction_identity(reduce_op, 0);
7788       bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand::c32(identity)), lane,
7789                     as_vgpr(ctx, src));
7790    }
7791 
7792    return true;
7793 }
7794 
7795 Temp
7796 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7797                      Definition dst, Temp src)
7798 {
7799    assert(src.bytes() <= 8);
7800    assert(src.type() == RegType::vgpr);
7801 
7802    Builder bld(ctx->program, ctx->block);
7803 
7804    unsigned num_defs = 0;
7805    Definition defs[5];
7806    defs[num_defs++] = dst;
7807    defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7808 
7809    /* scalar identity temporary */
7810    bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) &&
7811                      aco_op != aco_opcode::p_reduce;
7812    if (aco_op == aco_opcode::p_exclusive_scan) {
7813       need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7814                      op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7815                      op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7816                      op == fmul64);
7817    }
7818    if (need_sitmp)
7819       defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7820 
7821    /* scc clobber */
7822    defs[num_defs++] = bld.def(s1, scc);
7823 
7824    /* vcc clobber */
7825    bool clobber_vcc = false;
7826    if ((op == iadd32 || op == imul64) && ctx->program->chip_class < GFX9)
7827       clobber_vcc = true;
7828    if ((op == iadd8 || op == iadd16) && ctx->program->chip_class < GFX8)
7829       clobber_vcc = true;
7830    if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7831       clobber_vcc = true;
7832 
7833    if (clobber_vcc)
7834       defs[num_defs++] = bld.def(bld.lm, vcc);
7835 
7836    Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7837       aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7838    reduce->operands[0] = Operand(src);
7839    /* setup_reduce_temp will update these undef operands if needed */
7840    reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7841    reduce->operands[2] = Operand(v1.as_linear());
7842    std::copy(defs, defs + num_defs, reduce->definitions.begin());
7843 
7844    reduce->reduce_op = op;
7845    reduce->cluster_size = cluster_size;
7846    bld.insert(std::move(reduce));
7847 
7848    return dst.getTemp();
7849 }
7850 
7851 void
7852 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
7853 {
7854    Builder bld(ctx->program, ctx->block);
7855    Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7856    Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7857 
7858    Temp ddx_1, ddx_2, ddy_1, ddy_2;
7859    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7860    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7861    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
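   /* Within a pixel quad, lane 0 is top-left, lane 1 top-right and lane 2 bottom-left,
    * so broadcasting lane 0 and subtracting it from lanes 1 and 2 yields the per-quad
    * derivatives in x and y respectively. */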
7862 
7863    /* Build DD X/Y */
7864    if (ctx->program->chip_class >= GFX8) {
7865       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7866       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7867       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7868       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7869       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7870       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7871    } else {
7872       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7873       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7874       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7875       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7876       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7877       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7878       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7879       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7880       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7881       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7882    }
7883 
7884    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
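   /* p_k are the barycentrics at the current position, ddx_k/ddy_k their per-quad
    * derivatives, and pos1/pos2 the x/y offsets from the center, so this is a
    * first-order extrapolation of the barycentrics to the offset position. */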
7885    aco_opcode mad =
7886       ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7887    Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7888    Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7889    tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7890    tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7891    Temp wqm1 = bld.tmp(v1);
7892    emit_wqm(bld, tmp1, wqm1, true);
7893    Temp wqm2 = bld.tmp(v1);
7894    emit_wqm(bld, tmp2, wqm2, true);
7895    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7896    return;
7897 }
7898 
7899 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7900 void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);
7901 static void create_vs_exports(isel_context* ctx);
7902 
7903 Temp
7904 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin,
7905                  enum glsl_interp_mode interp)
7906 {
7907    bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
7908    if (intrin == nir_intrinsic_load_barycentric_pixel ||
7909        intrin == nir_intrinsic_load_barycentric_at_sample ||
7910        intrin == nir_intrinsic_load_barycentric_at_offset) {
7911       return get_arg(ctx, linear ? ctx->args->ac.linear_center : ctx->args->ac.persp_center);
7912    } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
7913       return linear ? ctx->linear_centroid : ctx->persp_centroid;
7914    } else {
7915       assert(intrin == nir_intrinsic_load_barycentric_sample);
7916       return get_arg(ctx, linear ? ctx->args->ac.linear_sample : ctx->args->ac.persp_sample);
7917    }
7918 }
7919 
7920 void
7921 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
7922 {
7923    Builder bld(ctx->program, ctx->block);
7924    switch (instr->intrinsic) {
7925    case nir_intrinsic_load_barycentric_sample:
7926    case nir_intrinsic_load_barycentric_pixel:
7927    case nir_intrinsic_load_barycentric_centroid: {
7928       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7929       Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
7930       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7931       Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7932       Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7933       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2));
7934       emit_split_vector(ctx, dst, 2);
7935       break;
7936    }
7937    case nir_intrinsic_load_barycentric_model: {
7938       Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7939 
7940       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7941       Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7942       Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7943       Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7944       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2),
7945                  Operand(p3));
7946       emit_split_vector(ctx, dst, 3);
7947       break;
7948    }
7949    case nir_intrinsic_load_barycentric_at_sample: {
7950       uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7951       switch (ctx->options->key.ps.num_samples) {
7952       case 2: sample_pos_offset += 1 << 3; break;
7953       case 4: sample_pos_offset += 3 << 3; break;
7954       case 8: sample_pos_offset += 7 << 3; break;
7955       default: break;
7956       }
7957       Temp sample_pos;
7958       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7959       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7960       Temp private_segment_buffer = ctx->program->private_segment_buffer;
7961       // TODO: bounds checking?
7962       if (addr.type() == RegType::sgpr) {
7963          Operand offset;
7964          if (const_addr) {
7965             sample_pos_offset += const_addr->u32 << 3;
7966             offset = Operand::c32(sample_pos_offset);
7967          } else if (ctx->options->chip_class >= GFX9) {
7968             offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr,
7969                               Operand::c32(sample_pos_offset));
7970          } else {
7971             offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr,
7972                               Operand::c32(3u));
7973             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
7974                               Operand::c32(sample_pos_offset));
7975          }
7976 
7977          Operand off = bld.copy(bld.def(s1), Operand(offset));
7978          sample_pos =
7979             bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7980 
7981       } else if (ctx->options->chip_class >= GFX9) {
7982          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7983          sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr,
7984                                  private_segment_buffer, sample_pos_offset);
7985       } else if (ctx->options->chip_class >= GFX7) {
7986          /* addr += private_segment_buffer + sample_pos_offset */
7987          Temp tmp0 = bld.tmp(s1);
7988          Temp tmp1 = bld.tmp(s1);
7989          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1),
7990                     private_segment_buffer);
7991          Definition scc_tmp = bld.def(s1, scc);
7992          tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0,
7993                          Operand::c32(sample_pos_offset));
7994          tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1,
7995                          Operand::zero(), bld.scc(scc_tmp.getTemp()));
7996          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7997          Temp pck0 = bld.tmp(v1);
7998          Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7999          tmp1 = as_vgpr(ctx, tmp1);
8000          Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1),
8001                                   bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand::zero(), carry);
8002          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
8003 
8004          /* sample_pos = flat_load_dwordx2 addr */
8005          sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
8006       } else {
8007          assert(ctx->options->chip_class == GFX6);
8008 
8009          uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
8010                               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
8011          Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
8012                                 Operand::zero(), Operand::c32(rsrc_conf));
8013 
8014          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
8015          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero());
8016 
8017          sample_pos = bld.tmp(v2);
8018 
8019          aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(
8020             aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
8021          load->definitions[0] = Definition(sample_pos);
8022          load->operands[0] = Operand(rsrc);
8023          load->operands[1] = Operand(addr);
8024          load->operands[2] = Operand::zero();
8025          load->offset = sample_pos_offset;
8026          load->offen = 0;
8027          load->addr64 = true;
8028          load->glc = false;
8029          load->dlc = false;
8030          load->disable_wqm = false;
8031          ctx->block->instructions.emplace_back(std::move(load));
8032       }
8033 
8034       /* sample_pos -= 0.5 */
8035       Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
8036       Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
8037       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
8038       pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u));
8039       pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u));
8040 
8041       Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8042       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8043       break;
8044    }
8045    case nir_intrinsic_load_barycentric_at_offset: {
8046       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8047       RegClass rc = RegClass(offset.type(), 1);
8048       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8049       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8050       Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8051       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8052       break;
8053    }
8054    case nir_intrinsic_load_front_face: {
8055       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8056                Operand::zero(), get_arg(ctx, ctx->args->ac.front_face))
8057          .def(0)
8058          .setHint(vcc);
8059       break;
8060    }
8061    case nir_intrinsic_load_view_index: {
8062       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8063       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
8064       break;
8065    }
8066    case nir_intrinsic_load_frag_coord: {
8067       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
8068       break;
8069    }
8070    case nir_intrinsic_load_frag_shading_rate:
8071       emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8072       break;
8073    case nir_intrinsic_load_sample_pos: {
8074       Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
8075       Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
8076       bld.pseudo(
8077          aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8078          posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8079          posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8080       break;
8081    }
8082    case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8083    case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8084    case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8085    case nir_intrinsic_load_input:
8086    case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break;
8087    case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8088    case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8089    case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8090    case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8091    case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break;
8092    case nir_intrinsic_terminate:
8093    case nir_intrinsic_discard: visit_discard(ctx, instr); break;
8094    case nir_intrinsic_terminate_if:
8095    case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break;
8096    case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8097    case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8098    case nir_intrinsic_shared_atomic_add:
8099    case nir_intrinsic_shared_atomic_imin:
8100    case nir_intrinsic_shared_atomic_umin:
8101    case nir_intrinsic_shared_atomic_imax:
8102    case nir_intrinsic_shared_atomic_umax:
8103    case nir_intrinsic_shared_atomic_and:
8104    case nir_intrinsic_shared_atomic_or:
8105    case nir_intrinsic_shared_atomic_xor:
8106    case nir_intrinsic_shared_atomic_exchange:
8107    case nir_intrinsic_shared_atomic_comp_swap:
8108    case nir_intrinsic_shared_atomic_fadd:
8109    case nir_intrinsic_shared_atomic_fmin:
8110    case nir_intrinsic_shared_atomic_fmax: visit_shared_atomic(ctx, instr); break;
8111    case nir_intrinsic_image_deref_load:
8112    case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break;
8113    case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break;
8114    case nir_intrinsic_image_deref_atomic_add:
8115    case nir_intrinsic_image_deref_atomic_umin:
8116    case nir_intrinsic_image_deref_atomic_imin:
8117    case nir_intrinsic_image_deref_atomic_umax:
8118    case nir_intrinsic_image_deref_atomic_imax:
8119    case nir_intrinsic_image_deref_atomic_and:
8120    case nir_intrinsic_image_deref_atomic_or:
8121    case nir_intrinsic_image_deref_atomic_xor:
8122    case nir_intrinsic_image_deref_atomic_exchange:
8123    case nir_intrinsic_image_deref_atomic_comp_swap:
8124    case nir_intrinsic_image_deref_atomic_fmin:
8125    case nir_intrinsic_image_deref_atomic_fmax: visit_image_atomic(ctx, instr); break;
8126    case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break;
8127    case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break;
8128    case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8129    case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8130    case nir_intrinsic_load_global_constant:
8131    case nir_intrinsic_load_global: visit_load_global(ctx, instr); break;
8132    case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8133    case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8134    case nir_intrinsic_store_global: visit_store_global(ctx, instr); break;
8135    case nir_intrinsic_global_atomic_add:
8136    case nir_intrinsic_global_atomic_imin:
8137    case nir_intrinsic_global_atomic_umin:
8138    case nir_intrinsic_global_atomic_imax:
8139    case nir_intrinsic_global_atomic_umax:
8140    case nir_intrinsic_global_atomic_and:
8141    case nir_intrinsic_global_atomic_or:
8142    case nir_intrinsic_global_atomic_xor:
8143    case nir_intrinsic_global_atomic_exchange:
8144    case nir_intrinsic_global_atomic_comp_swap:
8145    case nir_intrinsic_global_atomic_fmin:
8146    case nir_intrinsic_global_atomic_fmax: visit_global_atomic(ctx, instr); break;
8147    case nir_intrinsic_ssbo_atomic_add:
8148    case nir_intrinsic_ssbo_atomic_imin:
8149    case nir_intrinsic_ssbo_atomic_umin:
8150    case nir_intrinsic_ssbo_atomic_imax:
8151    case nir_intrinsic_ssbo_atomic_umax:
8152    case nir_intrinsic_ssbo_atomic_and:
8153    case nir_intrinsic_ssbo_atomic_or:
8154    case nir_intrinsic_ssbo_atomic_xor:
8155    case nir_intrinsic_ssbo_atomic_exchange:
8156    case nir_intrinsic_ssbo_atomic_comp_swap:
8157    case nir_intrinsic_ssbo_atomic_fmin:
8158    case nir_intrinsic_ssbo_atomic_fmax: visit_atomic_ssbo(ctx, instr); break;
8159    case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8160    case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8161    case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break;
8162    case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
8163    case nir_intrinsic_load_num_workgroups: {
8164       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8165       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
8166       emit_split_vector(ctx, dst, 3);
8167       break;
8168    }
8169    case nir_intrinsic_load_ray_launch_size: {
8170       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8171       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.ray_launch_size)));
8172       emit_split_vector(ctx, dst, 3);
8173       break;
8174    }
8175    case nir_intrinsic_load_local_invocation_id: {
8176       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8177       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
8178       emit_split_vector(ctx, dst, 3);
8179       break;
8180    }
8181    case nir_intrinsic_load_workgroup_id: {
8182       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8183       const struct ac_arg* args = ctx->args->ac.workgroup_ids;
8184       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8185                  args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(),
8186                  args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(),
8187                  args[2].used ? Operand(get_arg(ctx, args[2])) : Operand::zero());
8188       emit_split_vector(ctx, dst, 3);
8189       break;
8190    }
8191    case nir_intrinsic_load_local_invocation_index: {
8192       if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) {
8193          bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8194                   get_arg(ctx, ctx->args->ac.vs_rel_patch_id));
8195          break;
8196       } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) {
8197          bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx));
8198          break;
8199       }
8200 
8201       Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8202 
8203       /* The tg_size bits [6:11] contain the subgroup id;
8204        * multiply it by the wave size, then OR the thread id into it.
8205        */
8206       if (ctx->program->wave_size == 64) {
8207          /* After the s_and, the bits are already multiplied by 64 (left-shifted by 6), so we
8208           * can just feed that to v_or. */
8209          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8210                                 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size));
8211          bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num,
8212                   id);
8213       } else {
8214          /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
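         /* s_bfe_u32 takes the bit offset in the low bits of its second operand and the field
          * width in bits [22:16], so 0x6 | (0x6 << 16) extracts 6 bits starting at bit 6;
          * v_lshl_or_b32 then computes (tg_num << 5) | id in a single instruction. */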
8215          Temp tg_num =
8216             bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8217                      get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16)));
8218          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8219                   tg_num, Operand::c32(0x5u), id);
8220       }
8221       break;
8222    }
8223    case nir_intrinsic_load_subgroup_id: {
8224       if (ctx->stage == compute_cs) {
8225          bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8226                   bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size),
8227                   Operand::c32(0x6u | (0x6u << 16)));
8228       } else if (ctx->stage.hw == HWStage::NGG) {
8229          /* Get the id of the current wave within the threadgroup (workgroup) */
8230          bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8231                   bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8232                   Operand::c32(24u | (4u << 16)));
8233       } else {
8234          bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero());
8235       }
8236       break;
8237    }
8238    case nir_intrinsic_load_subgroup_invocation: {
8239       emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8240       break;
8241    }
8242    case nir_intrinsic_load_num_subgroups: {
8243       if (ctx->stage == compute_cs)
8244          bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8245                   bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));
8246       else if (ctx->stage.hw == HWStage::NGG)
8247          bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8248                   bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8249                   Operand::c32(28u | (4u << 16)));
8250       else
8251          bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u));
8252       break;
8253    }
8254    case nir_intrinsic_ballot: {
8255       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8256       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8257 
8258       if (instr->src[0].ssa->bit_size == 1) {
8259          assert(src.regClass() == bld.lm);
8260       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8261          src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8262       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8263          src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8264       } else {
8265          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8266       }
8267 
8268       /* Make sure that all inactive lanes return zero.
8269        * Value-numbering might remove the comparison above */
8270       src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8271       if (dst.size() != bld.lm.size()) {
8272          /* Wave32 with ballot size set to 64 */
8273          src =
8274             bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
8275       }
8276 
8277       emit_wqm(bld, src, dst);
8278       break;
8279    }
8280    case nir_intrinsic_shuffle:
8281    case nir_intrinsic_read_invocation: {
8282       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8283       if (!nir_src_is_divergent(instr->src[0])) {
8284          emit_uniform_subgroup(ctx, instr, src);
8285       } else {
8286          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8287          if (instr->intrinsic == nir_intrinsic_read_invocation ||
8288              !nir_src_is_divergent(instr->src[1]))
8289             tid = bld.as_uniform(tid);
8290          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8291 
8292          if (instr->dest.ssa.bit_size != 1)
8293             src = as_vgpr(ctx, src);
8294 
8295          if (src.regClass() == v1b || src.regClass() == v2b) {
8296             Temp tmp = bld.tmp(v1);
8297             tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
8298             if (dst.type() == RegType::vgpr)
8299                bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8300                           bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8301             else
8302                bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8303          } else if (src.regClass() == v1) {
8304             emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
8305          } else if (src.regClass() == v2) {
8306             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8307             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8308             lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
8309             hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
8310             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8311             emit_split_vector(ctx, dst, 2);
8312          } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
8313             assert(src.regClass() == bld.lm);
8314             Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
8315             bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8316          } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
8317             assert(src.regClass() == bld.lm);
8318             Temp tmp;
8319             if (ctx->program->chip_class <= GFX7)
8320                tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
8321             else if (ctx->program->wave_size == 64)
8322                tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
8323             else
8324                tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
8325             tmp = emit_extract_vector(ctx, tmp, 0, v1);
8326             tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
8327             emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
8328                      dst);
8329          } else {
8330             isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8331          }
8332       }
8333       break;
8334    }
8335    case nir_intrinsic_load_sample_id: {
8336       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8337                get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u));
8338       break;
8339    }
8340    case nir_intrinsic_load_sample_mask_in: {
8341       visit_load_sample_mask_in(ctx, instr);
8342       break;
8343    }
8344    case nir_intrinsic_read_first_invocation: {
8345       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8346       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8347       if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
8348          emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
8349       } else if (src.regClass() == v2) {
8350          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8351          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8352          lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
8353          hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
8354          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8355          emit_split_vector(ctx, dst, 2);
8356       } else if (instr->dest.ssa.bit_size == 1) {
8357          assert(src.regClass() == bld.lm);
8358          Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8359                              bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8360          bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8361       } else {
8362          bld.copy(Definition(dst), src);
8363       }
8364       break;
8365    }
8366    case nir_intrinsic_vote_all: {
8367       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8368       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8369       assert(src.regClass() == bld.lm);
8370       assert(dst.regClass() == bld.lm);
8371 
8372       Temp tmp =
8373          bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
8374             .def(1)
8375             .getTemp();
8376       Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
8377       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8378       break;
8379    }
8380    case nir_intrinsic_vote_any: {
8381       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8382       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8383       assert(src.regClass() == bld.lm);
8384       assert(dst.regClass() == bld.lm);
8385 
8386       Temp tmp = bool_to_scalar_condition(ctx, src);
8387       bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8388       break;
8389    }
8390    case nir_intrinsic_reduce:
8391    case nir_intrinsic_inclusive_scan:
8392    case nir_intrinsic_exclusive_scan: {
8393       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8394       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8395       nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8396       unsigned cluster_size =
8397          instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8398       cluster_size = util_next_power_of_two(
8399          MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8400 
8401       if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8402           instr->dest.ssa.bit_size != 1) {
8403          /* We use divergence analysis to assign the regclass, so check if it's
8404           * working as expected */
8405          ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8406          if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8407             expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8408          assert(nir_dest_is_divergent(instr->dest) == expected_divergent);
8409 
8410          if (instr->intrinsic == nir_intrinsic_reduce) {
8411             if (emit_uniform_reduce(ctx, instr))
8412                break;
8413          } else if (emit_uniform_scan(ctx, instr)) {
8414             break;
8415          }
8416       }
8417 
8418       if (instr->dest.ssa.bit_size == 1) {
8419          if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
8420             op = nir_op_iand;
8421          else if (op == nir_op_iadd)
8422             op = nir_op_ixor;
8423          else if (op == nir_op_umax || op == nir_op_imax)
8424             op = nir_op_ior;
8425          assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
8426 
8427          switch (instr->intrinsic) {
8428          case nir_intrinsic_reduce:
8429             emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
8430             break;
8431          case nir_intrinsic_exclusive_scan:
8432             emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
8433             break;
8434          case nir_intrinsic_inclusive_scan:
8435             emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
8436             break;
8437          default: assert(false);
8438          }
8439       } else if (cluster_size == 1) {
8440          bld.copy(Definition(dst), src);
8441       } else {
8442          unsigned bit_size = instr->src[0].ssa->bit_size;
8443 
8444          src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8445 
8446          ReduceOp reduce_op = get_reduce_op(op, bit_size);
8447 
8448          aco_opcode aco_op;
8449          switch (instr->intrinsic) {
8450          case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8451          case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8452          case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8453          default: unreachable("unknown reduce intrinsic");
8454          }
8455 
8456          Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
8457                                              bld.def(dst.regClass()), src);
8458          emit_wqm(bld, tmp_dst, dst);
8459       }
8460       break;
8461    }
8462    case nir_intrinsic_quad_broadcast:
8463    case nir_intrinsic_quad_swap_horizontal:
8464    case nir_intrinsic_quad_swap_vertical:
8465    case nir_intrinsic_quad_swap_diagonal:
8466    case nir_intrinsic_quad_swizzle_amd: {
8467       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8468 
8469       if (!nir_dest_is_divergent(instr->dest)) {
8470          emit_uniform_subgroup(ctx, instr, src);
8471          break;
8472       }
8473 
8474       /* Quad broadcast lane. */
8475       unsigned lane = 0;
8476       /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8477       bool bool_use_valu = instr->dest.ssa.bit_size == 1;
8478 
8479       uint16_t dpp_ctrl = 0;
8480 
8481       switch (instr->intrinsic) {
8482       case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8483       case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8484       case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8485       case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;
8486       case nir_intrinsic_quad_broadcast:
8487          lane = nir_src_as_const_value(instr->src[1])->u32;
8488          dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8489          bool_use_valu = false;
8490          break;
8491       default: break;
8492       }
8493 
8494       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8495       Temp tmp(dst);
8496 
8497       /* Setup source. */
8498       if (bool_use_valu)
8499          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8500                             Operand::c32(-1), src);
8501       else if (instr->dest.ssa.bit_size != 1)
8502          src = as_vgpr(ctx, src);
8503 
8504       /* Setup temporary destination. */
8505       if (bool_use_valu)
8506          tmp = bld.tmp(v1);
8507       else if (ctx->program->stage == fragment_fs)
8508          tmp = bld.tmp(dst.regClass());
8509 
8510       if (instr->dest.ssa.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8511          /* Special case for quad broadcast using SALU only. */
8512          assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm);
8513 
8514          uint32_t half_mask = 0x11111111u << lane;
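         /* Illustrative: for lane == 2, half_mask == 0x44444444, i.e. one bit per quad selecting
          * lane 2 of that quad; s_wqm below then replicates each selected bit to its whole quad. */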
8515          Operand mask_tmp = bld.lm.bytes() == 4
8516                                ? Operand::c32(half_mask)
8517                                : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8518                                             Operand::c32(half_mask), Operand::c32(half_mask));
8519 
8520          src =
8521             bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8522          src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8523          bld.sop1(Builder::s_wqm, Definition(tmp), src);
8524       } else if (instr->dest.ssa.bit_size <= 32 || bool_use_valu) {
8525          unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->dest.ssa.bit_size / 8;
8526          Definition def = excess_bytes ? bld.def(v1) : Definition(tmp);
8527 
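         /* DPP is only available on GFX8+; older chips emulate the quad permute with
          * ds_swizzle (offset bit 15 selects the quad-perm mode). */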
8528          if (ctx->program->chip_class >= GFX8)
8529             bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl);
8530          else
8531             bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8532 
8533          if (excess_bytes)
8534             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp),
8535                        bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp());
8536       } else if (instr->dest.ssa.bit_size == 64) {
8537          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8538          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8539 
8540          if (ctx->program->chip_class >= GFX8) {
8541             lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl);
8542             hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl);
8543          } else {
8544             lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8545             hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8546          }
8547 
8548          bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi);
8549          emit_split_vector(ctx, tmp, 2);
8550       } else {
8551          isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8552       }
8553 
8554       if (tmp.id() != dst.id()) {
8555          if (bool_use_valu)
8556             tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
8557 
8558          /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
8559          emit_wqm(bld, tmp, dst, true);
8560       }
8561 
8562       break;
8563    }
8564    case nir_intrinsic_masked_swizzle_amd: {
8565       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8566       if (!nir_dest_is_divergent(instr->dest)) {
8567          emit_uniform_subgroup(ctx, instr, src);
8568          break;
8569       }
8570       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8571       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8572 
8573       if (instr->dest.ssa.bit_size != 1)
8574          src = as_vgpr(ctx, src);
8575 
8576       if (instr->dest.ssa.bit_size == 1) {
8577          assert(src.regClass() == bld.lm);
8578          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8579                             Operand::c32(-1), src);
8580          src = emit_masked_swizzle(ctx, bld, src, mask);
8581          Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8582          emit_wqm(bld, tmp, dst);
8583       } else if (dst.regClass() == v1b) {
8584          Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8585          emit_extract_vector(ctx, tmp, 0, dst);
8586       } else if (dst.regClass() == v2b) {
8587          Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8588          emit_extract_vector(ctx, tmp, 0, dst);
8589       } else if (dst.regClass() == v1) {
8590          emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
8591       } else if (dst.regClass() == v2) {
8592          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8593          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8594          lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
8595          hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
8596          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8597          emit_split_vector(ctx, dst, 2);
8598       } else {
8599          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8600       }
8601       break;
8602    }
8603    case nir_intrinsic_write_invocation_amd: {
8604       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8605       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8606       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8607       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8608       if (dst.regClass() == v1) {
8609          /* src2 is ignored for writelane. RA assigns the same reg for dst */
8610          emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
8611       } else if (dst.regClass() == v2) {
8612          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8613          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8614          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8615          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8616          Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
8617          Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8618          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8619          emit_split_vector(ctx, dst, 2);
8620       } else {
8621          isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8622       }
8623       break;
8624    }
8625    case nir_intrinsic_mbcnt_amd: {
8626       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8627       Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8628       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8629       /* Fit 64-bit mask for wave32 */
8630       src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8631       Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
8632       emit_wqm(bld, wqm_tmp, dst);
8633       break;
8634    }
8635    case nir_intrinsic_byte_permute_amd: {
8636       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8637       assert(dst.regClass() == v1);
8638       assert(ctx->program->chip_class >= GFX8);
8639       bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
8640                as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
8641                as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
8642       break;
8643    }
8644    case nir_intrinsic_lane_permute_16_amd: {
8645       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8646       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8647       assert(ctx->program->chip_class >= GFX10);
8648 
8649       if (src.regClass() == s1) {
8650          bld.copy(Definition(dst), src);
8651       } else if (dst.regClass() == v1 && src.regClass() == v1) {
8652          bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8653                   bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8654                   bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8655       } else {
8656          isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8657       }
8658       break;
8659    }
8660    case nir_intrinsic_load_helper_invocation:
8661    case nir_intrinsic_is_helper_invocation: {
8662       /* load_helper() after demote() gets lowered to is_helper().
8663        * Otherwise, these two behave the same. */
8664       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8665       bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8666       ctx->block->kind |= block_kind_needs_lowering;
8667       ctx->program->needs_exact = true;
8668       break;
8669    }
8670    case nir_intrinsic_demote:
8671       bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u));
8672 
8673       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8674          ctx->cf_info.exec_potentially_empty_discard = true;
8675       ctx->block->kind |= block_kind_uses_demote;
8676       ctx->program->needs_exact = true;
8677       break;
8678    case nir_intrinsic_demote_if: {
8679       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8680       assert(src.regClass() == bld.lm);
8681       Temp cond =
8682          bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8683       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8684 
8685       if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8686          ctx->cf_info.exec_potentially_empty_discard = true;
8687       ctx->block->kind |= block_kind_uses_demote;
8688       ctx->program->needs_exact = true;
8689       break;
8690    }
8691    case nir_intrinsic_first_invocation: {
8692       emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8693                get_ssa_temp(ctx, &instr->dest.ssa));
8694       break;
8695    }
8696    case nir_intrinsic_last_invocation: {
8697       Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8698       Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
8699                            Operand::c32(ctx->program->wave_size - 1u), flbit);
8700       emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa));
8701       break;
8702    }
8703    case nir_intrinsic_elect: {
8704       /* p_elect is lowered in aco_insert_exec_mask.
8705        * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8706        * two p_elect with different exec masks as the same.
8707        */
8708       Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm));
8709       emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa));
8710       ctx->block->kind |= block_kind_needs_lowering;
8711       break;
8712    }
8713    case nir_intrinsic_shader_clock: {
8714       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8715       if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP &&
8716           ctx->options->chip_class >= GFX10_3) {
8717          /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
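         /* For SHADER_CYCLES (20 bits, register 29) this is ((20 - 1) << 11) | 29 == 0x981d;
          * the upper dword of the 64-bit result is simply zeroed below. */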
8718          Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8719          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8720       } else {
8721          aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE
8722                                 ? aco_opcode::s_memrealtime
8723                                 : aco_opcode::s_memtime;
8724          bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8725       }
8726       emit_split_vector(ctx, dst, 2);
8727       break;
8728    }
8729    case nir_intrinsic_load_vertex_id_zero_base: {
8730       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8731       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8732       break;
8733    }
8734    case nir_intrinsic_load_first_vertex: {
8735       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8736       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8737       break;
8738    }
8739    case nir_intrinsic_load_base_instance: {
8740       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8741       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8742       break;
8743    }
8744    case nir_intrinsic_load_instance_id: {
8745       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8746       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8747       break;
8748    }
8749    case nir_intrinsic_load_draw_id: {
8750       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8751       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8752       break;
8753    }
8754    case nir_intrinsic_load_invocation_id: {
8755       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8756 
8757       if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8758          if (ctx->options->chip_class >= GFX10)
8759             bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
8760                          get_arg(ctx, ctx->args->ac.gs_invocation_id));
8761          else
8762             bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8763       } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8764          bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
8765                   Operand::c32(8u), Operand::c32(5u));
8766       } else {
8767          unreachable("Unsupported stage for load_invocation_id");
8768       }
8769 
8770       break;
8771    }
8772    case nir_intrinsic_load_primitive_id: {
8773       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8774 
8775       switch (ctx->shader->info.stage) {
8776       case MESA_SHADER_GEOMETRY:
8777          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8778          break;
8779       case MESA_SHADER_TESS_CTRL:
8780          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8781          break;
8782       case MESA_SHADER_TESS_EVAL:
8783          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8784          break;
8785       default:
8786          if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
8787             /* In case of NGG, the GS threads always have the primitive ID
8788              * even if there is no SW GS. */
8789             bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8790             break;
8791          }
8792          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8793       }
8794 
8795       break;
8796    }
8797    case nir_intrinsic_load_patch_vertices_in: {
8798       assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8799              ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8800 
8801       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8802       bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.tess_input_vertices));
8803       break;
8804    }
8805    case nir_intrinsic_emit_vertex_with_counter: {
8806       assert(ctx->stage.hw == HWStage::GS);
8807       visit_emit_vertex_with_counter(ctx, instr);
8808       break;
8809    }
8810    case nir_intrinsic_end_primitive_with_counter: {
8811       if (ctx->stage.hw != HWStage::NGG) {
8812          unsigned stream = nir_intrinsic_stream_id(instr);
8813          bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1,
8814                   sendmsg_gs(true, false, stream));
8815       }
8816       break;
8817    }
8818    case nir_intrinsic_set_vertex_and_primitive_count: {
8819       assert(ctx->stage.hw == HWStage::GS);
8820       /* unused in the legacy pipeline; the HW keeps track of this for us */
8821       break;
8822    }
8823    case nir_intrinsic_load_tess_rel_patch_id_amd: {
8824       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_tess_rel_patch_id(ctx));
8825       break;
8826    }
8827    case nir_intrinsic_load_ring_tess_factors_amd: {
8828       bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8829                ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_FACTOR * 16u));
8830       break;
8831    }
8832    case nir_intrinsic_load_ring_tess_factors_offset_amd: {
8833       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8834                get_arg(ctx, ctx->args->ac.tcs_factor_offset));
8835       break;
8836    }
8837    case nir_intrinsic_load_ring_tess_offchip_amd: {
8838       bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8839                ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_OFFCHIP * 16u));
8840       break;
8841    }
8842    case nir_intrinsic_load_ring_tess_offchip_offset_amd: {
8843       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8844                get_arg(ctx, ctx->args->ac.tess_offchip_offset));
8845       break;
8846    }
8847    case nir_intrinsic_load_ring_esgs_amd: {
8848       unsigned ring = ctx->stage.hw == HWStage::ES ? RING_ESGS_VS : RING_ESGS_GS;
8849       bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8850                ctx->program->private_segment_buffer, Operand::c32(ring * 16u));
8851       break;
8852    }
8853    case nir_intrinsic_load_ring_es2gs_offset_amd: {
8854       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8855                get_arg(ctx, ctx->args->ac.es2gs_offset));
8856       break;
8857    }
8858    case nir_intrinsic_load_gs_vertex_offset_amd: {
8859       /* GFX6-8 uses 6 separate args, while GFX9+ packs these into only 3 args. */
8860       unsigned b = nir_intrinsic_base(instr);
8861       assert(b <= (ctx->program->chip_class >= GFX9 ? 2 : 5));
8862       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8863                get_arg(ctx, ctx->args->ac.gs_vtx_offset[b]));
8864       break;
8865    }
8866    case nir_intrinsic_has_input_vertex_amd:
8867    case nir_intrinsic_has_input_primitive_amd: {
8868       assert(ctx->stage.hw == HWStage::NGG);
8869       unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 0 : 1;
8870       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i));
8871       break;
8872    }
8873    case nir_intrinsic_load_workgroup_num_input_vertices_amd:
8874    case nir_intrinsic_load_workgroup_num_input_primitives_amd: {
8875       assert(ctx->stage.hw == HWStage::NGG);
8876       unsigned pos =
8877          instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22;
8878       bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8879                bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info),
8880                Operand::c32(pos | (9u << 16u)));
8881       break;
8882    }
8883    case nir_intrinsic_load_initial_edgeflags_amd: {
8884       assert(ctx->stage.hw == HWStage::NGG);
8885 
8886       Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
8887       /* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */
8888       Temp flags =
8889          bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id);
8890       /* Move the bits to their desired position: 8->9, 9->19, 10->29. */
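      /* The multiply is a sum of non-overlapping shifts: x * 0x80402 == (x << 1) | (x << 10) | (x << 19)
       * for bits 8-10, so bit 8 -> 9, bit 9 -> 19 and bit 10 -> 29.
       * Illustrative: 0x700 * 0x80402 == 0x381c0e00, which the mask below reduces to 0x20080200. */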
8891       flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags);
8892       /* Remove garbage bits that are a byproduct of the multiplication. */
8893       bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8894                Operand::c32(0x20080200), flags);
8895       break;
8896    }
8897    case nir_intrinsic_load_packed_passthrough_primitive_amd: {
8898       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8899                get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
8900       break;
8901    }
8902    case nir_intrinsic_export_vertex_amd: {
8903       ctx->block->kind |= block_kind_export_end;
8904       create_vs_exports(ctx);
8905       break;
8906    }
8907    case nir_intrinsic_export_primitive_amd: {
8908       assert(ctx->stage.hw == HWStage::NGG);
8909       Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa);
8910       bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
8911               1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */,
8912               true /* done */, false /* valid mask */);
8913       break;
8914    }
8915    case nir_intrinsic_alloc_vertices_and_primitives_amd: {
8916       assert(ctx->stage.hw == HWStage::NGG);
8917       Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);
8918       Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);
8919       ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);
8920       break;
8921    }
8922    case nir_intrinsic_gds_atomic_add_amd: {
8923       Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8924       Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8925       Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8926       Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
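      /* The trailing 'true' marks this as a GDS (not LDS) access; for GDS the hardware takes the
       * base/size from m0, hence the explicit m0 copy above. */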
8927       bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8928              true);
8929       break;
8930    }
8931    case nir_intrinsic_load_shader_query_enabled_amd: {
8932       unsigned cmp_bit = 0;
8933       Temp shader_query_enabled =
8934          bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8935                   get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(cmp_bit));
8936       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8937                bool_to_vector_condition(ctx, shader_query_enabled));
8938       break;
8939    }
8940    case nir_intrinsic_load_cull_front_face_enabled_amd:
8941    case nir_intrinsic_load_cull_back_face_enabled_amd:
8942    case nir_intrinsic_load_cull_ccw_amd:
8943    case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
8944       unsigned cmp_bit;
8945       if (instr->intrinsic == nir_intrinsic_load_cull_front_face_enabled_amd)
8946          cmp_bit = 0;
8947       else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd)
8948          cmp_bit = 1;
8949       else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd)
8950          cmp_bit = 2;
8951       else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd)
8952          cmp_bit = 3;
8953       else
8954          unreachable("unimplemented culling intrinsic");
8955 
8956       Builder::Result enabled =
8957          bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8958                   get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit));
8959       enabled.instr->definitions[0].setNoCSE(true);
8960       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8961                bool_to_vector_condition(ctx, enabled));
8962       break;
8963    }
8964    case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break;
8965    case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8966    case nir_intrinsic_load_cull_any_enabled_amd: {
8967       Builder::Result cull_any_enabled =
8968          bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8969                   get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0xbu));
8970       cull_any_enabled.instr->definitions[1].setNoCSE(true);
8971       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8972                bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp()));
8973       break;
8974    }
8975    case nir_intrinsic_load_cull_small_prim_precision_amd: {
8976       /* The exponent is an 8-bit signed int; move it into a signed 32-bit int. */
8977       Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc),
8978                                get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u));
8979       /* small_prim_precision = 1.0 * 2^X */
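      /* v_ldexp_f32(1.0, X) computes 2^X, e.g. X == -5 yields 2^-5 == 0.03125. */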
8980       bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8981                Operand::c32(0x3f800000u), Operand(exponent));
8982       break;
8983    }
8984    case nir_intrinsic_load_viewport_x_scale: {
8985       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8986                get_arg(ctx, ctx->args->ngg_viewport_scale[0]));
8987       break;
8988    }
8989    case nir_intrinsic_load_viewport_y_scale: {
8990       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8991                get_arg(ctx, ctx->args->ngg_viewport_scale[1]));
8992       break;
8993    }
8994    case nir_intrinsic_load_viewport_x_offset: {
8995       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8996                get_arg(ctx, ctx->args->ngg_viewport_translate[0]));
8997       break;
8998    }
8999    case nir_intrinsic_load_viewport_y_offset: {
9000       bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9001                get_arg(ctx, ctx->args->ngg_viewport_translate[1]));
9002       break;
9003    }
9004    case nir_intrinsic_overwrite_vs_arguments_amd: {
9005       ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9006       ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9007       break;
9008    }
9009    case nir_intrinsic_overwrite_tes_arguments_amd: {
9010       ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9011       ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9012       ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] =
9013          get_ssa_temp(ctx, instr->src[2].ssa);
9014       ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9015       break;
9016    }
9017    default:
9018       isel_err(&instr->instr, "Unimplemented intrinsic instr");
9019       abort();
9020 
9021       break;
9022    }
9023 }
9024 
9025 void
9026 tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr,
9027                enum glsl_base_type* stype)
9028 {
9029    nir_deref_instr* texture_deref_instr = NULL;
9030    nir_deref_instr* sampler_deref_instr = NULL;
9031    int plane = -1;
9032 
9033    for (unsigned i = 0; i < instr->num_srcs; i++) {
9034       switch (instr->src[i].src_type) {
9035       case nir_tex_src_texture_deref:
9036          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
9037          break;
9038       case nir_tex_src_sampler_deref:
9039          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
9040          break;
9041       case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break;
9042       default: break;
9043       }
9044    }
9045 
9046    *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
9047 
9048    if (!sampler_deref_instr)
9049       sampler_deref_instr = texture_deref_instr;
9050 
9051    if (plane >= 0) {
9052       assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
9053       *res_ptr = get_sampler_desc(ctx, texture_deref_instr,
9054                                   (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false);
9055    } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9056       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false);
9057    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9058       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);
9059    } else {
9060       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false);
9061    }
9062    if (samp_ptr) {
9063       *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false);
9064 
9065       if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
9066          /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
9067          Builder bld(ctx->program, ctx->block);
9068 
9069          /* to avoid unnecessary moves, we split and recombine sampler and image */
9070          Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
9071                         bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9072          Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9073          bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
9074                     Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]),
9075                     Definition(img[6]), Definition(img[7]), *res_ptr);
9076          bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
9077                     Definition(samp[2]), Definition(samp[3]), *samp_ptr);
9078 
9079          samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
9080          *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2],
9081                                img[3], img[4], img[5], img[6], img[7]);
9082          *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2],
9083                                 samp[3]);
9084       }
9085    }
9086 }
9087 
9088 void
9089 build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc,
9090                   Temp* out_tc)
9091 {
9092    Builder bld(ctx->program, ctx->block);
9093 
9094    Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
9095    Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
9096    Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
9097 
9098    Operand neg_one = Operand::c32(0xbf800000u);
9099    Operand one = Operand::c32(0x3f800000u);
9100    Operand two = Operand::c32(0x40000000u);
9101    Operand four = Operand::c32(0x40800000u);
9102 
9103    Temp is_ma_positive =
9104       bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), ma);
9105    Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
9106    Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma);
9107 
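   /* The cube face id encodes the major axis: faces 0-1 are +/-X, 2-3 are +/-Y, 4-5 are +/-Z,
    * so id >= 4 means the major axis is Z and 2 <= id < 4 means it is Y. */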
9108    Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
9109    Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
9110    is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
9111    Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)),
9112                                bld.def(s1, scc), is_ma_z, is_ma_y);
9113 
9114    /* select sc */
9115    Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
9116    Temp sgn = bld.vop2_e64(
9117       aco_opcode::v_cndmask_b32, bld.def(v1),
9118       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y);
9119    *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9120 
9121    /* select tc */
9122    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
9123    sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
9124    *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9125 
9126    /* select ma */
9127    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9128                   bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
9129                   deriv_z, is_ma_z);
9130    tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp);
9131    *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
9132 }
9133 
9134 void
9135 prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,
9136                     bool is_deriv, bool is_array)
9137 {
9138    Builder bld(ctx->program, ctx->block);
9139    Temp ma, tc, sc, id;
9140    aco_opcode madak =
9141       ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
9142    aco_opcode madmk =
9143       ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
9144 
9145    if (is_array) {
9146       coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
9147 
9148       /* see comment in ac_prepare_cube_coords() */
9149       if (ctx->options->chip_class <= GFX8)
9150          coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]);
9151    }
9152 
9153    ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9154 
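   /* invma = 1 / |ma|: v_rcp_f32 is built manually in VOP3 form so the abs modifier can be
    * applied to the source. */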
9155    aco_ptr<VOP3_instruction> vop3a{
9156       create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
9157    vop3a->operands[0] = Operand(ma);
9158    vop3a->abs[0] = true;
9159    Temp invma = bld.tmp(v1);
9160    vop3a->definitions[0] = Definition(invma);
9161    ctx->block->instructions.emplace_back(std::move(vop3a));
9162 
9163    sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9164    if (!is_deriv)
9165       sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9166 
9167    tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9168    if (!is_deriv)
9169       tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9170 
9171    id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9172 
9173    if (is_deriv) {
9174       sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
9175       tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
9176 
9177       for (unsigned i = 0; i < 2; i++) {
9178          /* see comment in ac_prepare_cube_coords() */
9179          Temp deriv_ma;
9180          Temp deriv_sc, deriv_tc;
9181          build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc);
9182 
9183          deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
9184 
9185          Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9186                            bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
9187                            bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
9188          Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9189                            bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
9190                            bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
9191          *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
9192       }
9193 
9194       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc);
9195       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc);
9196    }
9197 
9198    if (is_array)
9199       id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/));
9200    coords.resize(3);
9201    coords[0] = sc;
9202    coords[1] = tc;
9203    coords[2] = id;
9204 }
9205 
9206 void
9207 get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4])
9208 {
9209    if (vec->parent_instr->type != nir_instr_type_alu)
9210       return;
9211    nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9212    if (vec_instr->op != nir_op_vec(vec->num_components))
9213       return;
9214 
9215    for (unsigned i = 0; i < vec->num_components; i++) {
9216       cv[i] =
9217          vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9218    }
9219 }
9220 
9221 void
9222 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9223 {
9224    assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);
9225 
9226    Builder bld(ctx->program, ctx->block);
9227    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9228         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9229         has_sample_index = false, has_clamped_lod = false;
9230    Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9231                            offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp();
9232    std::vector<Temp> coords;
9233    std::vector<Temp> derivs;
9234    nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9235    enum glsl_base_type stype;
9236    tex_fetch_ptrs(ctx, instr, &resource, &sampler, &stype);
9237 
9238    bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
9239                                   (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
9240    bool tg4_integer_cube_workaround =
9241       tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9242 
9243    for (unsigned i = 0; i < instr->num_srcs; i++) {
9244       switch (instr->src[i].src_type) {
9245       case nir_tex_src_coord: {
9246          Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9247          for (unsigned j = 0; j < coord.size(); j++)
9248             coords.emplace_back(emit_extract_vector(ctx, coord, j, v1));
9249          break;
9250       }
9251       case nir_tex_src_bias:
9252          bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9253          has_bias = true;
9254          break;
9255       case nir_tex_src_lod: {
9256          if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9257             level_zero = true;
9258          } else {
9259             lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9260             has_lod = true;
9261          }
9262          break;
9263       }
9264       case nir_tex_src_min_lod:
9265          clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9266          has_clamped_lod = true;
9267          break;
9268       case nir_tex_src_comparator:
9269          if (instr->is_shadow) {
9270             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9271             has_compare = true;
9272          }
9273          break;
9274       case nir_tex_src_offset:
9275          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9276          get_const_vec(instr->src[i].src.ssa, const_offset);
9277          has_offset = true;
9278          break;
9279       case nir_tex_src_ddx:
9280          ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
9281          has_ddx = true;
9282          break;
9283       case nir_tex_src_ddy:
9284          ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
9285          has_ddy = true;
9286          break;
9287       case nir_tex_src_ms_index:
9288          sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
9289          has_sample_index = true;
9290          break;
9291       case nir_tex_src_texture_offset:
9292       case nir_tex_src_sampler_offset:
9293       default: break;
9294       }
9295    }
9296 
9297    if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9298       return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa));
9299 
9300    if (instr->op == nir_texop_texture_samples) {
9301       get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource);
9302       return;
9303    }
9304 
9305    if (has_offset && instr->op != nir_texop_txf) {
9306       aco_ptr<Instruction> tmp_instr;
9307       Temp acc, pack = Temp();
9308 
9309       uint32_t pack_const = 0;
9310       for (unsigned i = 0; i < offset.size(); i++) {
9311          if (!const_offset[i])
9312             continue;
9313          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9314       }
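      /* Illustrative: constant offsets (1, -2, 3) pack to 0x033e01, each component truncated to a
       * signed 6-bit field, 8 bits apart. */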
9315 
9316       if (offset.type() == RegType::sgpr) {
9317          for (unsigned i = 0; i < offset.size(); i++) {
9318             if (const_offset[i])
9319                continue;
9320 
9321             acc = emit_extract_vector(ctx, offset, i, s1);
9322             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9323                            Operand::c32(0x3Fu));
9324 
9325             if (i) {
9326                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9327                               Operand::c32(8u * i));
9328             }
9329 
9330             if (pack == Temp()) {
9331                pack = acc;
9332             } else {
9333                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9334             }
9335          }
9336 
9337          if (pack_const && pack != Temp())
9338             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9339                             Operand::c32(pack_const), pack);
9340       } else {
9341          for (unsigned i = 0; i < offset.size(); i++) {
9342             if (const_offset[i])
9343                continue;
9344 
9345             acc = emit_extract_vector(ctx, offset, i, v1);
9346             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9347 
9348             if (i) {
9349                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9350             }
9351 
9352             if (pack == Temp()) {
9353                pack = acc;
9354             } else {
9355                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9356             }
9357          }
9358 
9359          if (pack_const && pack != Temp())
9360             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9361       }
9362       if (pack_const && pack == Temp())
9363          offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9364       else if (pack == Temp())
9365          has_offset = false;
9366       else
9367          offset = pack;
9368    }
9369 
9370    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
9371       prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,
9372                           instr->is_array && instr->op != nir_texop_lod);
9373 
9374    /* pack derivatives */
9375    if (has_ddx || has_ddy) {
9376       if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
9377          assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
9378          Temp zero = bld.copy(bld.def(v1), Operand::zero());
9379          derivs = {ddx, zero, ddy, zero};
9380       } else {
9381          for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
9382             derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
9383          for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
9384             derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
9385       }
9386       has_derivs = true;
9387    }
9388 
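   /* The layer index of array textures arrives as a float and is rounded to the
    * nearest integer before it is used as an address component (not needed for
    * txf/fragment fetches, where it is already an integer). */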
9389    if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9390        instr->is_array && instr->op != nir_texop_txf)
9391       coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
9392 
9393    if (instr->coord_components > 2 &&
9394        (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9395         instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
9396         instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
9397        instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_fragment_fetch_amd &&
9398        instr->op != nir_texop_fragment_mask_fetch_amd)
9399       coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
9400 
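   /* GFX9 addresses 1D textures as 2D: insert a second coordinate right after x,
    * 0.5 (the texel center) for sampling ops or 0 for fetches. */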
9401    if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9402        instr->op != nir_texop_lod && instr->coord_components) {
9403       assert(coords.size() > 0 && coords.size() < 3);
9404 
9405       coords.insert(std::next(coords.begin()),
9406                     bld.copy(bld.def(v1), instr->op == nir_texop_txf ? Operand::c32(0)
9407                                                                      : Operand::c32(0x3f000000)));
9408    }
9409 
9410    bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
9411 
9412    if (has_offset && instr->op == nir_texop_txf) {
9413       for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
9414          Temp off = emit_extract_vector(ctx, offset, i, v1);
9415          coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
9416       }
9417       has_offset = false;
9418    }
9419 
9420    /* Build tex instruction */
9421    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf;
9422    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9423       dmask = u_bit_consecutive(0, util_last_bit(dmask));
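   /* Sparse ops return an extra residency dword: bit 4 of dmask is only used
    * here to size the temporary result vector; the hardware TFE flag is set
    * separately on the instruction. */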
9424    if (instr->is_sparse)
9425       dmask = MAX2(dmask, 1) | 0x10;
9426    unsigned dim =
9427       ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
9428          ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
9429          : 0;
9430    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9431    Temp tmp_dst = dst;
9432 
9433    /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9434    if (instr->op == nir_texop_tg4) {
9435       assert(instr->dest.ssa.num_components == (4 + instr->is_sparse));
9436       if (instr->is_shadow)
9437          dmask = 1;
9438       else
9439          dmask = 1 << instr->component;
9440       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9441          tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4);
9442    } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9443       tmp_dst = bld.tmp(v1);
9444    } else if (util_bitcount(dmask) != instr->dest.ssa.num_components ||
9445               dst.type() == RegType::sgpr) {
9446       tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
9447    }
9448 
9449    if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
9450       if (!has_lod)
9451          lod = bld.copy(bld.def(v1), Operand::zero());
9452 
9453       MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst),
9454                                         resource, Operand(s4), std::vector<Temp>{lod});
9455       if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs &&
9456           instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {
9457          tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
9458       } else if (instr->op == nir_texop_query_levels) {
9459          tex->dmask = 1 << 3;
9460       } else {
9461          tex->dmask = dmask;
9462       }
9463       tex->da = da;
9464       tex->dim = dim;
9465 
9466       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9467       return;
9468    }
9469 
9470    Temp tg4_compare_cube_wa64 = Temp();
9471 
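   /* tg4 with integer formats needs a workaround on older GPUs: query the image
    * size with resinfo and nudge the coordinates by half a texel so the gather
    * footprint matches what the spec requires (see also ac_nir_to_llvm.c's
    * lower_gather4_integer()). */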
9472    if (tg4_integer_workarounds) {
9473       Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9474       Temp size = bld.tmp(v2);
9475       MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size),
9476                                         resource, Operand(s4), std::vector<Temp>{tg4_lod});
9477       tex->dim = dim;
9478       tex->dmask = 0x3;
9479       tex->da = da;
9480       emit_split_vector(ctx, size, size.size());
9481 
9482       Temp half_texel[2];
9483       for (unsigned i = 0; i < 2; i++) {
9484          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9485          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9486          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9487          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9488                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9489       }
9490 
9491       if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9492          /* In vulkan, whether the sampler uses unnormalized
9493           * coordinates or not is a dynamic property of the
9494           * sampler. Hence, to figure out whether or not we
9495           * need to divide by the texture size, we need to test
9496           * the sampler at runtime. This tests the bit set by
9497           * radv_init_sampler().
9498           */
9499          unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9500          Temp not_needed =
9501             bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));
9502 
9503          not_needed = bool_to_vector_condition(ctx, not_needed);
9504          half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9505                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9506          half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9507                                   Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9508       }
9509 
9510       Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9511                             bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9512 
9513       if (tg4_integer_cube_workaround) {
9514          /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9515          Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9516          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9517             aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9518          split->operands[0] = Operand(resource);
9519          for (unsigned i = 0; i < resource.size(); i++) {
9520             desc[i] = bld.tmp(s1);
9521             split->definitions[i] = Definition(desc[i]);
9522          }
9523          ctx->block->instructions.emplace_back(std::move(split));
9524 
9525          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9526                               Operand::c32(20u | (6u << 16)));
9527          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9528                                          Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9529 
9530          Temp nfmt;
9531          if (stype == GLSL_TYPE_UINT) {
9532             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9533                             Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9534                             Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9535          } else {
9536             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9537                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9538                             Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9539          }
9540          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9541          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9542 
9543          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9544                          Operand::c32(26u));
9545 
9546          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9547                             Operand::c32(C_008F14_NUM_FORMAT));
9548          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9549 
9550          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9551             aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9552          for (unsigned i = 0; i < resource.size(); i++)
9553             vec->operands[i] = Operand(desc[i]);
9554          resource = bld.tmp(resource.regClass());
9555          vec->definitions[0] = Definition(resource);
9556          ctx->block->instructions.emplace_back(std::move(vec));
9557 
9558          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9559                                   tg4_compare_cube_wa64);
9560          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9561                                   tg4_compare_cube_wa64);
9562       }
9563       coords[0] = new_coords[0];
9564       coords[1] = new_coords[1];
9565    }
9566 
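   /* Buffer views are not sampled through MIMG; use a MUBUF format load with the
    * index as the only address component. */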
9567    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9568       // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9569       // ac_build_buffer_load_format_gfx9_safe()
9570 
9571       assert(coords.size() == 1);
9572       aco_opcode op;
9573       switch (util_last_bit(dmask & 0xf)) {
9574       case 1: op = aco_opcode::buffer_load_format_x; break;
9575       case 2: op = aco_opcode::buffer_load_format_xy; break;
9576       case 3: op = aco_opcode::buffer_load_format_xyz; break;
9577       case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9578       default: unreachable("Tex instruction loads more than 4 components.");
9579       }
9580 
9581       aco_ptr<MUBUF_instruction> mubuf{
9582          create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9583       mubuf->operands[0] = Operand(resource);
9584       mubuf->operands[1] = Operand(coords[0]);
9585       mubuf->operands[2] = Operand::c32(0);
9586       mubuf->definitions[0] = Definition(tmp_dst);
9587       mubuf->idxen = true;
9588       mubuf->tfe = instr->is_sparse;
9589       if (mubuf->tfe)
9590          mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9591       ctx->block->instructions.emplace_back(std::move(mubuf));
9592 
9593       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9594       return;
9595    }
9596 
9597    /* gather MIMG address components */
9598    std::vector<Temp> args;
9599    unsigned wqm_mask = 0;
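   /* The operands are gathered in the hardware VADDR order: offset, bias,
    * compare, derivatives, coordinates, sample index, lod / clamped lod.
    * wqm_mask marks the components that must also be valid in helper lanes
    * because they feed the implicit derivative computation. */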
9600    if (has_offset) {
9601       wqm_mask |= u_bit_consecutive(args.size(), 1);
9602       args.emplace_back(offset);
9603    }
9604    if (has_bias)
9605       args.emplace_back(bias);
9606    if (has_compare)
9607       args.emplace_back(compare);
9608    if (has_derivs)
9609       args.insert(args.end(), derivs.begin(), derivs.end());
9610 
9611    wqm_mask |= u_bit_consecutive(args.size(), coords.size());
9612    args.insert(args.end(), coords.begin(), coords.end());
9613 
9614    if (has_sample_index)
9615       args.emplace_back(sample_index);
9616    if (has_lod)
9617       args.emplace_back(lod);
9618    if (has_clamped_lod)
9619       args.emplace_back(clamped_lod);
9620 
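   /* Fetch opcodes (txf and the AMD fragment fetches) don't sample: they use
    * image_load / image_load_mip with unnormalized coordinates. */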
9621    if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9622        instr->op == nir_texop_fragment_mask_fetch_amd) {
9623       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9624                             instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9625                          ? aco_opcode::image_load
9626                          : aco_opcode::image_load_mip;
9627       Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9628       MIMG_instruction* tex =
9629          emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata);
9630       if (instr->op == nir_texop_fragment_mask_fetch_amd)
9631          tex->dim = da ? ac_image_2darray : ac_image_2d;
9632       else
9633          tex->dim = dim;
9634       tex->dmask = dmask & 0xf;
9635       tex->unrm = true;
9636       tex->da = da;
9637       tex->tfe = instr->is_sparse;
9638 
9639       if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9640          /* Use 0x76543210 if the image doesn't have FMASK. */
9641          assert(dmask == 1 && dst.bytes() == 4);
9642          assert(dst.id() != tmp_dst.id());
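         /* The image has no FMASK if the second descriptor dword is zero; select
          * the identity mapping in that case. */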
9643 
9644          if (dst.regClass() == s1) {
9645             Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9646                                         emit_extract_vector(ctx, resource, 1, s1));
9647             bld.sop2(aco_opcode::s_cselect_b32, Definition(dst),
9648                      bld.as_uniform(tmp_dst), Operand::c32(0x76543210),
9649                      bld.scc(is_not_null));
9650          } else {
9651             Temp is_not_null = bld.tmp(bld.lm);
9652             bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9653                          emit_extract_vector(ctx, resource, 1, s1))
9654                .def(0)
9655                .setHint(vcc);
9656             bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9657                      bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9658          }
9659       } else {
9660          expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9661       }
9662       return;
9663    }
9664 
9665    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
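   /* The image_sample variant is chosen from the operands that are present;
    * suffixes: _c = compare, _d = derivatives, _b = bias, _l = lod,
    * _lz = lod zero, _cl = lod clamp, _o = offsets. */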
9666    aco_opcode opcode = aco_opcode::image_sample;
9667    if (has_offset) { /* image_sample_*_o */
9668       if (has_clamped_lod) {
9669          if (has_compare) {
9670             opcode = aco_opcode::image_sample_c_cl_o;
9671             if (has_derivs)
9672                opcode = aco_opcode::image_sample_c_d_cl_o;
9673             if (has_bias)
9674                opcode = aco_opcode::image_sample_c_b_cl_o;
9675          } else {
9676             opcode = aco_opcode::image_sample_cl_o;
9677             if (has_derivs)
9678                opcode = aco_opcode::image_sample_d_cl_o;
9679             if (has_bias)
9680                opcode = aco_opcode::image_sample_b_cl_o;
9681          }
9682       } else if (has_compare) {
9683          opcode = aco_opcode::image_sample_c_o;
9684          if (has_derivs)
9685             opcode = aco_opcode::image_sample_c_d_o;
9686          if (has_bias)
9687             opcode = aco_opcode::image_sample_c_b_o;
9688          if (level_zero)
9689             opcode = aco_opcode::image_sample_c_lz_o;
9690          if (has_lod)
9691             opcode = aco_opcode::image_sample_c_l_o;
9692       } else {
9693          opcode = aco_opcode::image_sample_o;
9694          if (has_derivs)
9695             opcode = aco_opcode::image_sample_d_o;
9696          if (has_bias)
9697             opcode = aco_opcode::image_sample_b_o;
9698          if (level_zero)
9699             opcode = aco_opcode::image_sample_lz_o;
9700          if (has_lod)
9701             opcode = aco_opcode::image_sample_l_o;
9702       }
9703    } else if (has_clamped_lod) { /* image_sample_*_cl */
9704       if (has_compare) {
9705          opcode = aco_opcode::image_sample_c_cl;
9706          if (has_derivs)
9707             opcode = aco_opcode::image_sample_c_d_cl;
9708          if (has_bias)
9709             opcode = aco_opcode::image_sample_c_b_cl;
9710       } else {
9711          opcode = aco_opcode::image_sample_cl;
9712          if (has_derivs)
9713             opcode = aco_opcode::image_sample_d_cl;
9714          if (has_bias)
9715             opcode = aco_opcode::image_sample_b_cl;
9716       }
9717    } else { /* no offset */
9718       if (has_compare) {
9719          opcode = aco_opcode::image_sample_c;
9720          if (has_derivs)
9721             opcode = aco_opcode::image_sample_c_d;
9722          if (has_bias)
9723             opcode = aco_opcode::image_sample_c_b;
9724          if (level_zero)
9725             opcode = aco_opcode::image_sample_c_lz;
9726          if (has_lod)
9727             opcode = aco_opcode::image_sample_c_l;
9728       } else {
9729          opcode = aco_opcode::image_sample;
9730          if (has_derivs)
9731             opcode = aco_opcode::image_sample_d;
9732          if (has_bias)
9733             opcode = aco_opcode::image_sample_b;
9734          if (level_zero)
9735             opcode = aco_opcode::image_sample_lz;
9736          if (has_lod)
9737             opcode = aco_opcode::image_sample_l;
9738       }
9739    }
9740 
9741    if (instr->op == nir_texop_tg4) {
9742       if (has_offset) { /* image_gather4_*_o */
9743          if (has_compare) {
9744             opcode = aco_opcode::image_gather4_c_lz_o;
9745             if (has_lod)
9746                opcode = aco_opcode::image_gather4_c_l_o;
9747             if (has_bias)
9748                opcode = aco_opcode::image_gather4_c_b_o;
9749          } else {
9750             opcode = aco_opcode::image_gather4_lz_o;
9751             if (has_lod)
9752                opcode = aco_opcode::image_gather4_l_o;
9753             if (has_bias)
9754                opcode = aco_opcode::image_gather4_b_o;
9755          }
9756       } else {
9757          if (has_compare) {
9758             opcode = aco_opcode::image_gather4_c_lz;
9759             if (has_lod)
9760                opcode = aco_opcode::image_gather4_c_l;
9761             if (has_bias)
9762                opcode = aco_opcode::image_gather4_c_b;
9763          } else {
9764             opcode = aco_opcode::image_gather4_lz;
9765             if (has_lod)
9766                opcode = aco_opcode::image_gather4_l;
9767             if (has_bias)
9768                opcode = aco_opcode::image_gather4_b;
9769          }
9770       }
9771    } else if (instr->op == nir_texop_lod) {
9772       opcode = aco_opcode::image_get_lod;
9773    }
9774 
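   /* Without an explicit lod or derivatives, image_sample computes derivatives
    * implicitly from neighbouring lanes, so the address components have to be
    * computed in WQM; only in that case is wqm_mask passed on. */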
9775    bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9776                           !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9777                           instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9778 
9779    Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9780    MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler),
9781                                      args, implicit_derivs ? wqm_mask : 0, vdata);
9782    tex->dim = dim;
9783    tex->dmask = dmask & 0xf;
9784    tex->da = da;
9785    tex->tfe = instr->is_sparse;
9786 
9787    if (tg4_integer_cube_workaround) {
9788       assert(tmp_dst.id() != dst.id());
9789       assert(tmp_dst.size() == dst.size());
9790 
9791       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9792       Temp val[4];
9793       for (unsigned i = 0; i < 4; i++) {
9794          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9795          Temp cvt_val;
9796          if (stype == GLSL_TYPE_UINT)
9797             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9798          else
9799             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9800          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9801                            tg4_compare_cube_wa64);
9802       }
9803 
9804       Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9805       if (instr->is_sparse)
9806          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9807                               val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9808       else
9809          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9810                               val[3]);
9811    }
9812    unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9813    expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
9814 }
9815 
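/* Returns the operand for one phi source: undefs stay undefined, constant
 * booleans become full lane masks (all ones/zeros) for logical phis, and
 * everything else uses the SSA def's temporary. */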
9816 Operand
9817 get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical)
9818 {
9819    Temp tmp = get_ssa_temp(ctx, ssa);
9820    if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
9821       return Operand(rc);
9822    } else if (logical && ssa->bit_size == 1 &&
9823               ssa->parent_instr->type == nir_instr_type_load_const) {
9824       if (ctx->program->wave_size == 64)
9825          return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX
9826                                                                                     : 0u);
9827       else
9828          return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX
9829                                                                                     : 0u);
9830    } else {
9831       return Operand(tmp);
9832    }
9833 }
9834 
9835 void
9836 visit_phi(isel_context* ctx, nir_phi_instr* instr)
9837 {
9838    aco_ptr<Pseudo_instruction> phi;
9839    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9840    assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9841 
9842    bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
9843    logical |= (ctx->block->kind & block_kind_merge) != 0;
9844    aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9845 
9846    /* we want a sorted list of sources, since the predecessor list is also sorted */
9847    std::map<unsigned, nir_ssa_def*> phi_src;
9848    nir_foreach_phi_src (src, instr)
9849       phi_src[src->pred->index] = src->src.ssa;
9850 
9851    std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9852    unsigned num_operands = 0;
9853    Operand* const operands = (Operand*)alloca(
9854       (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
9855    unsigned num_defined = 0;
9856    unsigned cur_pred_idx = 0;
9857    for (std::pair<unsigned, nir_ssa_def*> src : phi_src) {
9858       if (cur_pred_idx < preds.size()) {
9859          /* handle missing preds (IF merges with discard/break) and extra preds
9860           * (loop exit with discard) */
9861          unsigned block = ctx->cf_info.nir_to_aco[src.first];
9862          unsigned skipped = 0;
9863          while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9864             skipped++;
9865          if (cur_pred_idx + skipped < preds.size()) {
9866             for (unsigned i = 0; i < skipped; i++)
9867                operands[num_operands++] = Operand(dst.regClass());
9868             cur_pred_idx += skipped;
9869          } else {
9870             continue;
9871          }
9872       }
9873       /* Handle missing predecessors at the end. This shouldn't happen with loop
9874        * headers and we can't ignore these sources for loop header phis. */
9875       if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9876          continue;
9877       cur_pred_idx++;
9878       Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9879       operands[num_operands++] = op;
9880       num_defined += !op.isUndefined();
9881    }
9882    /* handle block_kind_continue_or_break at loop exit blocks */
9883    while (cur_pred_idx++ < preds.size())
9884       operands[num_operands++] = Operand(dst.regClass());
9885 
9886    /* If the loop ends with a break, still add a linear continue edge in case
9887     * that break is divergent or continue_or_break is used. We'll either remove
9888     * this operand later in visit_loop() if it's not necessary or replace the
9889     * undef with something correct. */
9890    if (!logical && ctx->block->kind & block_kind_loop_header) {
9891       nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9892       nir_block* last = nir_loop_last_block(loop);
9893       if (last->successors[0] != instr->instr.block)
9894          operands[num_operands++] = Operand(RegClass());
9895    }
9896 
9897    /* we can use a linear phi in some cases if one src is undef */
9898    if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9899       phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
9900                                                        num_operands, 1));
9901 
9902       Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9903       Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9904       assert(invert->kind & block_kind_invert);
9905 
9906       unsigned then_block = invert->linear_preds[0];
9907 
9908       Block* insert_block = NULL;
9909       for (unsigned i = 0; i < num_operands; i++) {
9910          Operand op = operands[i];
9911          if (op.isUndefined())
9912             continue;
9913          insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9914          phi->operands[0] = op;
9915          break;
9916       }
9917       assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9918       phi->operands[1] = Operand(dst.regClass());
9919       phi->definitions[0] = Definition(dst);
9920       insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9921       return;
9922    }
9923 
9924    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9925    for (unsigned i = 0; i < num_operands; i++)
9926       phi->operands[i] = operands[i];
9927    phi->definitions[0] = Definition(dst);
9928    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9929 }
9930 
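/* ssa_undef defs are simply materialized as zero-filled scalar registers. */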
9931 void
9932 visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr)
9933 {
9934    Temp dst = get_ssa_temp(ctx, &instr->def);
9935 
9936    assert(dst.type() == RegType::sgpr);
9937 
9938    if (dst.size() == 1) {
9939       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9940    } else {
9941       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9942          aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9943       for (unsigned i = 0; i < dst.size(); i++)
9944          vec->operands[i] = Operand::zero();
9945       vec->definitions[0] = Definition(dst);
9946       ctx->block->instructions.emplace_back(std::move(vec));
9947    }
9948 }
9949 
9950 void
9951 begin_loop(isel_context* ctx, loop_context* lc)
9952 {
9953    // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
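   /* Loop structure: preheader -> header -> ...body... -> exit. All continue
    * edges point back at the header; the exit block is created here but only
    * inserted into the program once the body has been visited (see end_loop()). */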
9954    append_logical_end(ctx->block);
9955    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9956    Builder bld(ctx->program, ctx->block);
9957    bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
9958    unsigned loop_preheader_idx = ctx->block->index;
9959 
9960    lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9961 
9962    ctx->program->next_loop_depth++;
9963 
9964    Block* loop_header = ctx->program->create_and_insert_block();
9965    loop_header->kind |= block_kind_loop_header;
9966    add_edge(loop_preheader_idx, loop_header);
9967    ctx->block = loop_header;
9968 
9969    append_logical_start(ctx->block);
9970 
9971    lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
9972    lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
9973    lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
9974    lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
9975    lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
9976 }
9977 
9978 void
9979 end_loop(isel_context* ctx, loop_context* lc)
9980 {
9981    // TODO: what if a loop ends with an unconditional or uniformly branched continue
9982    //       and this branch is never taken?
9983    if (!ctx->cf_info.has_branch) {
9984       unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
9985       Builder bld(ctx->program, ctx->block);
9986       append_logical_end(ctx->block);
9987 
9988       if (ctx->cf_info.exec_potentially_empty_discard ||
9989           ctx->cf_info.exec_potentially_empty_break) {
9990          /* Discards can result in code running with an empty exec mask.
9991           * This would result in divergent breaks not ever being taken. As a
9992           * workaround, break the loop when the loop mask is empty instead of
9993           * always continuing. */
9994          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9995          unsigned block_idx = ctx->block->index;
9996 
9997          /* create helper blocks to avoid critical edges */
9998          Block* break_block = ctx->program->create_and_insert_block();
9999          break_block->kind = block_kind_uniform;
10000          bld.reset(break_block);
10001          bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10002          add_linear_edge(block_idx, break_block);
10003          add_linear_edge(break_block->index, &lc->loop_exit);
10004 
10005          Block* continue_block = ctx->program->create_and_insert_block();
10006          continue_block->kind = block_kind_uniform;
10007          bld.reset(continue_block);
10008          bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10009          add_linear_edge(block_idx, continue_block);
10010          add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10011 
10012          if (!ctx->cf_info.parent_loop.has_divergent_branch)
10013             add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10014          ctx->block = &ctx->program->blocks[block_idx];
10015       } else {
10016          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10017          if (!ctx->cf_info.parent_loop.has_divergent_branch)
10018             add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10019          else
10020             add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10021       }
10022 
10023       bld.reset(ctx->block);
10024       bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10025    }
10026 
10027    ctx->cf_info.has_branch = false;
10028    ctx->program->next_loop_depth--;
10029 
10030    // TODO: if the loop does not have a single exit, we must add one
10031    /* emit loop successor block */
10032    ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10033    append_logical_start(ctx->block);
10034 
10035 #if 0
10036    // TODO: check if it is beneficial to not branch on continues
10037    /* trim linear phis in loop header */
10038    for (auto&& instr : loop_entry->instructions) {
10039       if (instr->opcode == aco_opcode::p_linear_phi) {
10040          aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10041          new_phi->definitions[0] = instr->definitions[0];
10042          for (unsigned i = 0; i < new_phi->operands.size(); i++)
10043             new_phi->operands[i] = instr->operands[i];
10044          /* check that the remaining operands are all the same */
10045          for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10046             assert(instr->operands[i].tempId() == instr->operands.back().tempId());
10047          instr.swap(new_phi);
10048       } else if (instr->opcode == aco_opcode::p_phi) {
10049          continue;
10050       } else {
10051          break;
10052       }
10053    }
10054 #endif
10055 
10056    ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10057    ctx->cf_info.parent_loop.exit = lc->exit_old;
10058    ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10059    ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10060    ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10061    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10062       ctx->cf_info.exec_potentially_empty_discard = false;
10063 }
10064 
10065 void
10066 emit_loop_jump(isel_context* ctx, bool is_break)
10067 {
10068    Builder bld(ctx->program, ctx->block);
10069    Block* logical_target;
10070    append_logical_end(ctx->block);
10071    unsigned idx = ctx->block->index;
10072 
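   /* Breaks jump to the loop exit, continues to the loop header. Uniform jumps
    * can branch there directly; divergent ones fall through to the code below,
    * which inserts empty helper blocks so the linear CFG stays free of critical
    * edges. */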
10073    if (is_break) {
10074       logical_target = ctx->cf_info.parent_loop.exit;
10075       add_logical_edge(idx, logical_target);
10076       ctx->block->kind |= block_kind_break;
10077 
10078       if (!ctx->cf_info.parent_if.is_divergent &&
10079           !ctx->cf_info.parent_loop.has_divergent_continue) {
10080          /* uniform break - directly jump out of the loop */
10081          ctx->block->kind |= block_kind_uniform;
10082          ctx->cf_info.has_branch = true;
10083          bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10084          add_linear_edge(idx, logical_target);
10085          return;
10086       }
10087       ctx->cf_info.parent_loop.has_divergent_branch = true;
10088    } else {
10089       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10090       add_logical_edge(idx, logical_target);
10091       ctx->block->kind |= block_kind_continue;
10092 
10093       if (!ctx->cf_info.parent_if.is_divergent) {
10094          /* uniform continue - directly jump to the loop header */
10095          ctx->block->kind |= block_kind_uniform;
10096          ctx->cf_info.has_branch = true;
10097          bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10098          add_linear_edge(idx, logical_target);
10099          return;
10100       }
10101 
10102       /* for potential uniform breaks after this continue,
10103          we must ensure that they are handled correctly */
10104       ctx->cf_info.parent_loop.has_divergent_continue = true;
10105       ctx->cf_info.parent_loop.has_divergent_branch = true;
10106    }
10107 
10108    if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10109       ctx->cf_info.exec_potentially_empty_break = true;
10110       ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10111    }
10112 
10113    /* remove critical edges from linear CFG */
10114    bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10115    Block* break_block = ctx->program->create_and_insert_block();
10116    break_block->kind |= block_kind_uniform;
10117    add_linear_edge(idx, break_block);
10118    /* the loop_header pointer might be invalidated by this point */
10119    if (!is_break)
10120       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10121    add_linear_edge(break_block->index, logical_target);
10122    bld.reset(break_block);
10123    bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10124 
10125    Block* continue_block = ctx->program->create_and_insert_block();
10126    add_linear_edge(idx, continue_block);
10127    append_logical_start(continue_block);
10128    ctx->block = continue_block;
10129 }
10130 
10131 void
10132 emit_loop_break(isel_context* ctx)
10133 {
10134    emit_loop_jump(ctx, true);
10135 }
10136 
10137 void
10138 emit_loop_continue(isel_context* ctx)
10139 {
10140    emit_loop_jump(ctx, false);
10141 }
10142 
10143 void
10144 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10145 {
10146    /* visit_block() would usually do this but divergent jumps update ctx->block */
10147    ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10148 
10149    switch (instr->type) {
10150    case nir_jump_break: emit_loop_break(ctx); break;
10151    case nir_jump_continue: emit_loop_continue(ctx); break;
10152    default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10153    }
10154 }
10155 
10156 void
10157 visit_block(isel_context* ctx, nir_block* block)
10158 {
10159    nir_foreach_instr (instr, block) {
10160       switch (instr->type) {
10161       case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10162       case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10163       case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10164       case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10165       case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10166       case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;
10167       case nir_instr_type_deref: break;
10168       case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10169       default: isel_err(instr, "Unknown NIR instr type");
10170       }
10171    }
10172 
10173    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10174       ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10175 }
10176 
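/* For each block of the loop body, compute the value the given header phi
 * would carry on a continue from that block (inserting linear phis where the
 * predecessors disagree) and return the value for the back-edge block. */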
10177 static Operand
10178 create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10179                      aco_ptr<Instruction>& header_phi, Operand* vals)
10180 {
10181    vals[0] = Operand(header_phi->definitions[0].getTemp());
10182    RegClass rc = vals[0].regClass();
10183 
10184    unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10185 
10186    unsigned next_pred = 1;
10187 
10188    for (unsigned idx = first + 1; idx <= last; idx++) {
10189       Block& block = ctx->program->blocks[idx];
10190       if (block.loop_nest_depth != loop_nest_depth) {
10191          vals[idx - first] = vals[idx - 1 - first];
10192          continue;
10193       }
10194 
10195       if ((block.kind & block_kind_continue) && block.index != last) {
10196          vals[idx - first] = header_phi->operands[next_pred];
10197          next_pred++;
10198          continue;
10199       }
10200 
10201       bool all_same = true;
10202       for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10203          all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10204 
10205       Operand val;
10206       if (all_same) {
10207          val = vals[block.linear_preds[0] - first];
10208       } else {
10209          aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10210             aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10211          for (unsigned i = 0; i < block.linear_preds.size(); i++)
10212             phi->operands[i] = vals[block.linear_preds[i] - first];
10213          val = Operand(ctx->program->allocateTmp(rc));
10214          phi->definitions[0] = Definition(val.getTemp());
10215          block.instructions.emplace(block.instructions.begin(), std::move(phi));
10216       }
10217       vals[idx - first] = val;
10218    }
10219 
10220    return vals[last - first];
10221 }
10222 
10223 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10224 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10225 static void end_uniform_if(isel_context* ctx, if_context* ic);
10226 
10227 static void
10228 visit_loop(isel_context* ctx, nir_loop* loop)
10229 {
10230    loop_context lc;
10231    begin_loop(ctx, &lc);
10232 
10233    /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
10234     * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
10235     */
10236    if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10237       Builder bld(ctx->program, ctx->block);
10238       Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10239       if_context ic;
10240       begin_uniform_if_then(ctx, &ic, cond);
10241       emit_loop_break(ctx);
10242       begin_uniform_if_else(ctx, &ic);
10243       end_uniform_if(ctx, &ic);
10244    }
10245 
10246    bool unreachable = visit_cf_list(ctx, &loop->body);
10247 
10248    unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10249 
10250    /* Fix up phis in the loop header that come from unreachable blocks.
10251     * has_branch/has_divergent_branch also indicate whether the loop ends with a
10252     * break/continue instruction, but we don't emit those if unreachable=true. */
10253    if (unreachable) {
10254       assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10255       bool linear = ctx->cf_info.has_branch;
10256       bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10257       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10258          if ((logical && instr->opcode == aco_opcode::p_phi) ||
10259              (linear && instr->opcode == aco_opcode::p_linear_phi)) {
10260             /* the last operand should be the one that needs to be removed */
10261             instr->operands.pop_back();
10262          } else if (!is_phi(instr)) {
10263             break;
10264          }
10265       }
10266    }
10267 
10268    /* Fix up linear phis in the loop header that expect a continue. This fixup
10269     * and the previous one shouldn't both happen, because a break in the
10270     * merge block would get CSE'd. */
10271    if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
10272       unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10273       Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10274       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10275          if (instr->opcode == aco_opcode::p_linear_phi) {
10276             if (ctx->cf_info.has_branch)
10277                instr->operands.pop_back();
10278             else
10279                instr->operands.back() =
10280                   create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10281          } else if (!is_phi(instr)) {
10282             break;
10283          }
10284       }
10285    }
10286 
10287    end_loop(ctx, &lc);
10288 }
10289 
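/* Divergent ifs build a diamond in the linear CFG:
 *    BB_IF -> BB_then_logical / BB_then_linear -> BB_invert
 *          -> BB_else_logical / BB_else_linear -> BB_endif
 * while the logical CFG only sees BB_IF -> then/else -> BB_endif. The helpers
 * below construct that structure; the exec-mask manipulation itself is added
 * by a later pass. */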
10290 static void
10291 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
10292 {
10293    ic->cond = cond;
10294 
10295    append_logical_end(ctx->block);
10296    ctx->block->kind |= block_kind_branch;
10297 
10298    /* branch to linear then block */
10299    assert(cond.regClass() == ctx->program->lane_mask);
10300    aco_ptr<Pseudo_branch_instruction> branch;
10301    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10302                                                               Format::PSEUDO_BRANCH, 1, 1));
10303    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10304    branch->definitions[0].setHint(vcc);
10305    branch->operands[0] = Operand(cond);
10306    ctx->block->instructions.push_back(std::move(branch));
10307 
10308    ic->BB_if_idx = ctx->block->index;
10309    ic->BB_invert = Block();
10310    /* Invert blocks are intentionally not marked as top level because they
10311     * are not part of the logical cfg. */
10312    ic->BB_invert.kind |= block_kind_invert;
10313    ic->BB_endif = Block();
10314    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10315 
10316    ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10317    ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10318    ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10319    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10320    ctx->cf_info.parent_if.is_divergent = true;
10321 
10322    /* divergent branches use cbranch_execz */
10323    ctx->cf_info.exec_potentially_empty_discard = false;
10324    ctx->cf_info.exec_potentially_empty_break = false;
10325    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10326 
10327    /** emit logical then block */
10328    ctx->program->next_divergent_if_logical_depth++;
10329    Block* BB_then_logical = ctx->program->create_and_insert_block();
10330    add_edge(ic->BB_if_idx, BB_then_logical);
10331    ctx->block = BB_then_logical;
10332    append_logical_start(BB_then_logical);
10333 }
10334 
10335 static void
10336 begin_divergent_if_else(isel_context* ctx, if_context* ic)
10337 {
10338    Block* BB_then_logical = ctx->block;
10339    append_logical_end(BB_then_logical);
10340    /* branch from logical then block to invert block */
10341    aco_ptr<Pseudo_branch_instruction> branch;
10342    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10343                                                               Format::PSEUDO_BRANCH, 0, 1));
10344    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10345    branch->definitions[0].setHint(vcc);
10346    BB_then_logical->instructions.emplace_back(std::move(branch));
10347    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10348    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10349       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10350    BB_then_logical->kind |= block_kind_uniform;
10351    assert(!ctx->cf_info.has_branch);
10352    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10353    ctx->cf_info.parent_loop.has_divergent_branch = false;
10354    ctx->program->next_divergent_if_logical_depth--;
10355 
10356    /** emit linear then block */
10357    Block* BB_then_linear = ctx->program->create_and_insert_block();
10358    BB_then_linear->kind |= block_kind_uniform;
10359    add_linear_edge(ic->BB_if_idx, BB_then_linear);
10360    /* branch from linear then block to invert block */
10361    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10362                                                               Format::PSEUDO_BRANCH, 0, 1));
10363    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10364    branch->definitions[0].setHint(vcc);
10365    BB_then_linear->instructions.emplace_back(std::move(branch));
10366    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10367 
10368    /** emit invert merge block */
10369    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10370    ic->invert_idx = ctx->block->index;
10371 
10372    /* branch to linear else block (skip else) */
10373    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10374                                                               Format::PSEUDO_BRANCH, 0, 1));
10375    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10376    branch->definitions[0].setHint(vcc);
10377    ctx->block->instructions.push_back(std::move(branch));
10378 
10379    ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10380    ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10381    ic->exec_potentially_empty_break_depth_old = std::min(
10382       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10383    /* divergent branches use cbranch_execz */
10384    ctx->cf_info.exec_potentially_empty_discard = false;
10385    ctx->cf_info.exec_potentially_empty_break = false;
10386    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10387 
10388    /** emit logical else block */
10389    ctx->program->next_divergent_if_logical_depth++;
10390    Block* BB_else_logical = ctx->program->create_and_insert_block();
10391    add_logical_edge(ic->BB_if_idx, BB_else_logical);
10392    add_linear_edge(ic->invert_idx, BB_else_logical);
10393    ctx->block = BB_else_logical;
10394    append_logical_start(BB_else_logical);
10395 }
10396 
10397 static void
10398 end_divergent_if(isel_context* ctx, if_context* ic)
10399 {
10400    Block* BB_else_logical = ctx->block;
10401    append_logical_end(BB_else_logical);
10402 
10403    /* branch from logical else block to endif block */
10404    aco_ptr<Pseudo_branch_instruction> branch;
10405    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10406                                                               Format::PSEUDO_BRANCH, 0, 1));
10407    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10408    branch->definitions[0].setHint(vcc);
10409    BB_else_logical->instructions.emplace_back(std::move(branch));
10410    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10411    if (!ctx->cf_info.parent_loop.has_divergent_branch)
10412       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10413    BB_else_logical->kind |= block_kind_uniform;
10414    ctx->program->next_divergent_if_logical_depth--;
10415 
10416    assert(!ctx->cf_info.has_branch);
10417    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10418 
10419    /** emit linear else block */
10420    Block* BB_else_linear = ctx->program->create_and_insert_block();
10421    BB_else_linear->kind |= block_kind_uniform;
10422    add_linear_edge(ic->invert_idx, BB_else_linear);
10423 
10424    /* branch from linear else block to endif block */
10425    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10426                                                               Format::PSEUDO_BRANCH, 0, 1));
10427    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10428    branch->definitions[0].setHint(vcc);
10429    BB_else_linear->instructions.emplace_back(std::move(branch));
10430    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10431 
10432    /** emit endif merge block */
10433    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10434    append_logical_start(ctx->block);
10435 
10436    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10437    ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10438    ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10439    ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10440       ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10441    if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10442        !ctx->cf_info.parent_if.is_divergent) {
10443       ctx->cf_info.exec_potentially_empty_break = false;
10444       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10445    }
10446    /* uniform control flow never has an empty exec-mask */
10447    if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10448       ctx->cf_info.exec_potentially_empty_discard = false;
10449       ctx->cf_info.exec_potentially_empty_break = false;
10450       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10451    }
10452 }
10453 
10454 static void
10455 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10456 {
10457    assert(cond.regClass() == s1);
10458 
10459    append_logical_end(ctx->block);
10460    ctx->block->kind |= block_kind_uniform;
10461 
10462    aco_ptr<Pseudo_branch_instruction> branch;
10463    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10464    branch.reset(
10465       create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10466    branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10467    branch->definitions[0].setHint(vcc);
10468    branch->operands[0] = Operand(cond);
10469    branch->operands[0].setFixed(scc);
10470    ctx->block->instructions.emplace_back(std::move(branch));
10471 
10472    ic->BB_if_idx = ctx->block->index;
10473    ic->BB_endif = Block();
10474    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10475 
10476    ctx->cf_info.has_branch = false;
10477    ctx->cf_info.parent_loop.has_divergent_branch = false;
10478 
10479    /** emit then block */
10480    ctx->program->next_uniform_if_depth++;
10481    Block* BB_then = ctx->program->create_and_insert_block();
10482    add_edge(ic->BB_if_idx, BB_then);
10483    append_logical_start(BB_then);
10484    ctx->block = BB_then;
10485 }
10486 
10487 static void
10488 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10489 {
10490    Block* BB_then = ctx->block;
10491 
10492    ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10493    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10494 
10495    if (!ic->uniform_has_then_branch) {
10496       append_logical_end(BB_then);
10497       /* branch from then block to endif block */
10498       aco_ptr<Pseudo_branch_instruction> branch;
10499       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10500                                                                  Format::PSEUDO_BRANCH, 0, 1));
10501       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10502       branch->definitions[0].setHint(vcc);
10503       BB_then->instructions.emplace_back(std::move(branch));
10504       add_linear_edge(BB_then->index, &ic->BB_endif);
10505       if (!ic->then_branch_divergent)
10506          add_logical_edge(BB_then->index, &ic->BB_endif);
10507       BB_then->kind |= block_kind_uniform;
10508    }
10509 
10510    ctx->cf_info.has_branch = false;
10511    ctx->cf_info.parent_loop.has_divergent_branch = false;
10512 
10513    /** emit else block */
10514    Block* BB_else = ctx->program->create_and_insert_block();
10515    add_edge(ic->BB_if_idx, BB_else);
10516    append_logical_start(BB_else);
10517    ctx->block = BB_else;
10518 }
10519 
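/* Closes the else-block of a uniform if and emits the endif merge block, unless
 * both sides already ended in branches out of the construct.
 */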
10520 static void
10521 end_uniform_if(isel_context* ctx, if_context* ic)
10522 {
10523    Block* BB_else = ctx->block;
10524 
10525    if (!ctx->cf_info.has_branch) {
10526       append_logical_end(BB_else);
10527       /* branch from else block to endif block */
10528       aco_ptr<Pseudo_branch_instruction> branch;
10529       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10530                                                                  Format::PSEUDO_BRANCH, 0, 1));
10531       branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10532       branch->definitions[0].setHint(vcc);
10533       BB_else->instructions.emplace_back(std::move(branch));
10534       add_linear_edge(BB_else->index, &ic->BB_endif);
10535       if (!ctx->cf_info.parent_loop.has_divergent_branch)
10536          add_logical_edge(BB_else->index, &ic->BB_endif);
10537       BB_else->kind |= block_kind_uniform;
10538    }
10539 
10540    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10541    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10542 
10543    /** emit endif merge block */
10544    ctx->program->next_uniform_if_depth--;
10545    if (!ctx->cf_info.has_branch) {
10546       ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10547       append_logical_start(ctx->block);
10548    }
10549 }
10550 
10551 static bool
10552 visit_if(isel_context* ctx, nir_if* if_stmt)
10553 {
10554    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10555    Builder bld(ctx->program, ctx->block);
10556    aco_ptr<Pseudo_branch_instruction> branch;
10557    if_context ic;
10558 
10559    if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10560       /**
10561        * Uniform conditionals are represented in the following way*) :
10562        *
10563        * The linear and logical CFG:
10564        *                        BB_IF
10565        *                        /    \
10566        *       BB_THEN (logical)      BB_ELSE (logical)
10567        *                        \    /
10568        *                        BB_ENDIF
10569        *
10570        * *) Exceptions may be due to break and continue statements within loops.
10571        *    If a break/continue happens within uniform control flow, it branches
10572        *    to the loop exit/entry block. Otherwise, it branches to the next
10573        *    merge block.
10574        **/
10575 
10576       assert(cond.regClass() == ctx->program->lane_mask);
10577       cond = bool_to_scalar_condition(ctx, cond);
10578 
10579       begin_uniform_if_then(ctx, &ic, cond);
10580       visit_cf_list(ctx, &if_stmt->then_list);
10581 
10582       begin_uniform_if_else(ctx, &ic);
10583       visit_cf_list(ctx, &if_stmt->else_list);
10584 
10585       end_uniform_if(ctx, &ic);
10586    } else { /* non-uniform condition */
10587       /**
10588        * To maintain a logical and linear CFG without critical edges,
10589        * non-uniform conditionals are represented in the following way*) :
10590        *
10591        * The linear CFG:
10592        *                        BB_IF
10593        *                        /    \
10594        *       BB_THEN (logical)      BB_THEN (linear)
10595        *                        \    /
10596        *                        BB_INVERT (linear)
10597        *                        /    \
10598        *       BB_ELSE (logical)      BB_ELSE (linear)
10599        *                        \    /
10600        *                        BB_ENDIF
10601        *
10602        * The logical CFG:
10603        *                        BB_IF
10604        *                        /    \
10605        *       BB_THEN (logical)      BB_ELSE (logical)
10606        *                        \    /
10607        *                        BB_ENDIF
10608        *
10609        * *) Exceptions may be due to break and continue statements within loops
10610        **/
10611 
10612       begin_divergent_if_then(ctx, &ic, cond);
10613       visit_cf_list(ctx, &if_stmt->then_list);
10614 
10615       begin_divergent_if_else(ctx, &ic);
10616       visit_cf_list(ctx, &if_stmt->else_list);
10617 
10618       end_divergent_if(ctx, &ic);
10619    }
10620 
10621    return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10622 }
10623 
10624 static bool
10625 visit_cf_list(isel_context* ctx, struct exec_list* list)
10626 {
10627    foreach_list_typed (nir_cf_node, node, node, list) {
10628       switch (node->type) {
10629       case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10630       case nir_cf_node_if:
10631          if (!visit_if(ctx, nir_cf_node_as_if(node)))
10632             return true;
10633          break;
10634       case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10635       default: unreachable("unimplemented cf list type");
10636       }
10637    }
10638    return false;
10639 }
10640 
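/* Emits a single exp instruction for one VS output slot, either to a position
 * export (SQ_EXP_POS) or to the parameter slot given by vs_output_param_offset.
 */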
10641 static void
10642 export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos)
10643 {
10644    assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10645 
10646    int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10647                    ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
10648                    : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
10649    unsigned mask = ctx->outputs.mask[slot];
10650    if (!is_pos && !mask)
10651       return;
10652    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
10653       return;
10654    aco_ptr<Export_instruction> exp{
10655       create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10656    exp->enabled_mask = mask;
10657    for (unsigned i = 0; i < 4; ++i) {
10658       if (mask & (1 << i))
10659          exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10660       else
10661          exp->operands[i] = Operand(v1);
10662    }
10663    /* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
10664     * Setting valid_mask=1 prevents it and has no other effect.
10665     */
10666    exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
10667    exp->done = false;
10668    exp->compressed = false;
10669    if (is_pos)
10670       exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10671    else
10672       exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
10673    ctx->block->instructions.emplace_back(std::move(exp));
10674 }
10675 
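/* Packs point size, layer, viewport index and the primitive shading rate into
 * one additional position export.
 */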
10676 static void
10677 export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos)
10678 {
10679    aco_ptr<Export_instruction> exp{
10680       create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10681    exp->enabled_mask = 0;
10682    for (unsigned i = 0; i < 4; ++i)
10683       exp->operands[i] = Operand(v1);
10684    if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
10685       exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
10686       exp->enabled_mask |= 0x1;
10687    }
10688    if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
10689       exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
10690       exp->enabled_mask |= 0x4;
10691    }
10692    if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
10693       if (ctx->options->chip_class < GFX9) {
10694          exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
10695          exp->enabled_mask |= 0x8;
10696       } else {
10697          Builder bld(ctx->program, ctx->block);
10698 
10699          Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u),
10700                              Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
10701          if (exp->operands[2].isTemp())
10702             out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
10703 
10704          exp->operands[2] = Operand(out);
10705          exp->enabled_mask |= 0x4;
10706       }
10707    }
10708    if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {
10709       exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);
10710       exp->enabled_mask |= 0x2;
10711    } else if (ctx->options->force_vrs_rates) {
10712       /* Bits [2:3] = VRS rate X
10713        * Bits [4:5] = VRS rate Y
10714        *
10715        * The range is [-2, 1]. Values:
10716        *   1: 2x coarser shading rate in that direction.
10717        *   0: normal shading rate
10718        *  -1: 2x finer shading rate (sample shading, not directional)
10719        *  -2: 4x finer shading rate (sample shading, not directional)
10720        *
10721        * Sample shading can't go above 8 samples, so both numbers can't be -2
10722        * at the same time.
10723        */
10724       Builder bld(ctx->program, ctx->block);
10725       Temp rates = bld.copy(bld.def(v1), Operand::c32((unsigned)ctx->options->force_vrs_rates));
10726 
10727       /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
10728       Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand::c32(0x3f800000u),
10729                            Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3]));
10730       rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10731                        bld.copy(bld.def(v1), Operand::zero()), rates, cond);
10732 
10733       exp->operands[1] = Operand(rates);
10734       exp->enabled_mask |= 0x2;
10735    }
10736 
10737    exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
10738    exp->done = false;
10739    exp->compressed = false;
10740    exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10741    ctx->block->instructions.emplace_back(std::move(exp));
10742 }
10743 
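/* Emits all exports for a shader running on the HW VS or NGG stage:
 * gl_Position first, then psiz/layer/viewport/VRS, clip/cull distances and
 * finally the generic parameter exports.
 */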
10744 static void
10745 create_vs_exports(isel_context* ctx)
10746 {
10747    assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10748 
10749    const radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10750                                         ? &ctx->program->info->tes.outinfo
10751                                         : &ctx->program->info->vs.outinfo;
10752 
10753    ctx->block->kind |= block_kind_export_end;
10754 
10755    if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) {
10756       ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10757       if (ctx->stage.has(SWStage::TES))
10758          ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10759             get_arg(ctx, ctx->args->ac.tes_patch_id);
10760       else
10761          ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10762             get_arg(ctx, ctx->args->ac.vs_prim_id);
10763    }
10764 
10765    if (ctx->options->key.has_multiview_view_index) {
10766       ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
10767       ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] =
10768          as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
10769    }
10770 
10771    /* Hardware requires position data to always be exported, even if the
10772     * application did not write gl_Position.
10773     */
10774    ctx->outputs.mask[VARYING_SLOT_POS] = 0xf;
10775 
10776    /* the order in which these position exports are created is important */
10777    int next_pos = 0;
10778    export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
10779 
10780    bool writes_primitive_shading_rate =
10781       outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;
10782    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index ||
10783        writes_primitive_shading_rate) {
10784       export_vs_psiz_layer_viewport_vrs(ctx, &next_pos);
10785    }
10786    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10787       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
10788    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10789       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
10790 
10791    if (ctx->export_clip_dists) {
10792       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10793          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
10794       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10795          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
10796    }
10797 
10798    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
10799       if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID &&
10800           i != VARYING_SLOT_VIEWPORT)
10801          continue;
10802 
10803       export_vs_varying(ctx, i, false, NULL);
10804    }
10805 }
10806 
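/* Builds the MRTZ export carrying depth, stencil and/or sample mask. When only
 * stencil/sample mask are written, the 16-bit compressed (COMPR) layout is used.
 */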
10807 static bool
10808 export_fs_mrt_z(isel_context* ctx)
10809 {
10810    Builder bld(ctx->program, ctx->block);
10811    unsigned enabled_channels = 0;
10812    bool compr = false;
10813    Operand values[4];
10814 
10815    for (unsigned i = 0; i < 4; ++i) {
10816       values[i] = Operand(v1);
10817    }
10818 
10819    /* Both stencil and sample mask only need 16-bits. */
10820    if (!ctx->program->info->ps.writes_z &&
10821        (ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) {
10822       compr = true; /* COMPR flag */
10823 
10824       if (ctx->program->info->ps.writes_stencil) {
10825          /* Stencil should be in X[23:16]. */
10826          values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10827          values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]);
10828          enabled_channels |= 0x3;
10829       }
10830 
10831       if (ctx->program->info->ps.writes_sample_mask) {
10832          /* SampleMask should be in Y[15:0]. */
10833          values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10834          enabled_channels |= 0xc;
10835       }
10836    } else {
10837       if (ctx->program->info->ps.writes_z) {
10838          values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10839          enabled_channels |= 0x1;
10840       }
10841 
10842       if (ctx->program->info->ps.writes_stencil) {
10843          values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10844          enabled_channels |= 0x2;
10845       }
10846 
10847       if (ctx->program->info->ps.writes_sample_mask) {
10848          values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10849          enabled_channels |= 0x4;
10850       }
10851    }
10852 
10853    /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X
10854     * writemask component.
10855     */
10856    if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND &&
10857        ctx->options->family != CHIP_HAINAN) {
10858       enabled_channels |= 0x1;
10859    }
10860 
10861    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
10862            V_008DFC_SQ_EXP_MRTZ, compr);
10863 
10864    return true;
10865 }
10866 
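/* Converts and packs one color output according to the SPI color format from
 * the pipeline key and emits the corresponding MRT export. Returns false if
 * nothing is exported (SPI_SHADER_ZERO).
 */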
10867 static bool
10868 export_fs_mrt_color(isel_context* ctx, int slot)
10869 {
10870    Builder bld(ctx->program, ctx->block);
10871    unsigned write_mask = ctx->outputs.mask[slot];
10872    Operand values[4];
10873 
10874    for (unsigned i = 0; i < 4; ++i) {
10875       if (write_mask & (1 << i)) {
10876          values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10877       } else {
10878          values[i] = Operand(v1);
10879       }
10880    }
10881 
10882    unsigned target, col_format;
10883    unsigned enabled_channels = 0;
10884    aco_opcode compr_op = (aco_opcode)0;
10885    bool compr = false;
10886 
10887    slot -= FRAG_RESULT_DATA0;
10888    target = V_008DFC_SQ_EXP_MRT + slot;
10889    col_format = (ctx->options->key.ps.col_format >> (4 * slot)) & 0xf;
10890 
10891    bool is_int8 = (ctx->options->key.ps.is_int8 >> slot) & 1;
10892    bool is_int10 = (ctx->options->key.ps.is_int10 >> slot) & 1;
10893    bool is_16bit = values[0].regClass() == v2b;
10894 
10895    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10896    if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&
10897        (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10898         col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10899         col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10900       for (int i = 0; i < 4; i++) {
10901          if (!(write_mask & (1 << i)))
10902             continue;
10903 
10904          Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
10905                                values[i], bld.copy(bld.def(v1), Operand::c32(3u)));
10906          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10907                               bld.copy(bld.def(v1), Operand::zero()), isnan);
10908       }
10909    }
10910 
10911    switch (col_format) {
10912    case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10913 
10914    case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10915 
10916    case V_028714_SPI_SHADER_32_AR:
10917       if (ctx->options->chip_class >= GFX10) {
10918          /* Special case: on GFX10, the outputs are different for 32_AR */
10919          enabled_channels = 0x3;
10920          values[1] = values[3];
10921          values[3] = Operand(v1);
10922       } else {
10923          enabled_channels = 0x9;
10924       }
10925       break;
10926 
10927    case V_028714_SPI_SHADER_FP16_ABGR:
10928       for (int i = 0; i < 2; i++) {
10929          bool enabled = (write_mask >> (i * 2)) & 0x3;
10930          if (enabled) {
10931             enabled_channels |= 0x3 << (i * 2);
10932             if (is_16bit) {
10933                values[i] =
10934                   bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10935                              values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10936                              values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10937             } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {
10938                values[i] =
10939                   bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10940                            values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10941                            values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10942             } else {
10943                values[i] =
10944                   bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10945                            values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10946                            values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10947             }
10948          } else {
10949             values[i] = Operand(v1);
10950          }
10951       }
10952       values[2] = Operand(v1);
10953       values[3] = Operand(v1);
10954       compr = true;
10955       break;
10956 
10957    case V_028714_SPI_SHADER_UNORM16_ABGR:
10958       if (is_16bit && ctx->options->chip_class >= GFX9) {
10959          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10960       } else {
10961          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10962       }
10963       break;
10964 
10965    case V_028714_SPI_SHADER_SNORM16_ABGR:
10966       if (is_16bit && ctx->options->chip_class >= GFX9) {
10967          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10968       } else {
10969          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10970       }
10971       break;
10972 
10973    case V_028714_SPI_SHADER_UINT16_ABGR: {
10974       compr_op = aco_opcode::v_cvt_pk_u16_u32;
10975       if (is_int8 || is_int10) {
10976          /* clamp */
10977          uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10978          Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
10979 
10980          for (unsigned i = 0; i < 4; i++) {
10981             if ((write_mask >> i) & 1) {
10982                values[i] =
10983                   bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
10984                            i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);
10985             }
10986          }
10987       } else if (is_16bit) {
10988          for (unsigned i = 0; i < 4; i++) {
10989             if ((write_mask >> i) & 1) {
10990                Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10991                values[i] = Operand(tmp);
10992             }
10993          }
10994       }
10995       break;
10996    }
10997 
10998    case V_028714_SPI_SHADER_SINT16_ABGR:
10999       compr_op = aco_opcode::v_cvt_pk_i16_i32;
11000       if (is_int8 || is_int10) {
11001          /* clamp */
11002          uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
11003          uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
11004          Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
11005          Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));
11006 
11007          for (unsigned i = 0; i < 4; i++) {
11008             if ((write_mask >> i) & 1) {
11009                values[i] =
11010                   bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
11011                            i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);
11012                values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
11013                                     i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),
11014                                     values[i]);
11015             }
11016          }
11017       } else if (is_16bit) {
11018          for (unsigned i = 0; i < 4; i++) {
11019             if ((write_mask >> i) & 1) {
11020                Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
11021                values[i] = Operand(tmp);
11022             }
11023          }
11024       }
11025       break;
11026 
11027    case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
11028 
11029    case V_028714_SPI_SHADER_ZERO:
11030    default: return false;
11031    }
11032 
11033    if ((bool)compr_op) {
11034       for (int i = 0; i < 2; i++) {
11035          /* check if at least one of the values to be compressed is enabled */
11036          bool enabled = (write_mask >> (i * 2)) & 0x3;
11037          if (enabled) {
11038             enabled_channels |= 0x3 << (i * 2);
11039             values[i] = bld.vop3(
11040                compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
11041                values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
11042          } else {
11043             values[i] = Operand(v1);
11044          }
11045       }
11046       values[2] = Operand(v1);
11047       values[3] = Operand(v1);
11048       compr = true;
11049    } else if (!compr) {
11050       for (int i = 0; i < 4; i++)
11051          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
11052    }
11053 
11054    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target,
11055            compr);
11056    return true;
11057 }
11058 
11059 static void
11060 create_fs_null_export(isel_context* ctx)
11061 {
11062    /* FS must always have exports.
11063     * So when there are none, we need to add a null export.
11064     */
11065 
11066    Builder bld(ctx->program, ctx->block);
11067    unsigned dest = V_008DFC_SQ_EXP_NULL;
11068    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
11069            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
11070 }
11071 
11072 static void
11073 create_fs_exports(isel_context* ctx)
11074 {
11075    bool exported = false;
11076 
11077    /* Export depth, stencil and sample mask. */
11078    if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
11079        ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11080       exported |= export_fs_mrt_z(ctx);
11081 
11082    /* Export all color render targets. */
11083    for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
11084       if (ctx->outputs.mask[i])
11085          exported |= export_fs_mrt_color(ctx, i);
11086 
11087    if (!exported)
11088       create_fs_null_export(ctx);
11089 
11090    ctx->block->kind |= block_kind_export_end;
11091 }
11092 
11093 static void
11094 create_workgroup_barrier(Builder& bld)
11095 {
11096    bld.barrier(aco_opcode::p_barrier,
11097                memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup);
11098 }
11099 
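/* Stores one transform feedback output to its streamout buffer with MUBUF
 * stores, splitting vec3 stores on GFX6 which cannot store dwordx3.
 */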
11100 static void
11101 emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
11102                    const struct radv_stream_output* output)
11103 {
11104    unsigned num_comps = util_bitcount(output->component_mask);
11105    unsigned writemask = (1 << num_comps) - 1;
11106    unsigned loc = output->location;
11107    unsigned buf = output->buffer;
11108 
11109    assert(num_comps && num_comps <= 4);
11110    if (!num_comps || num_comps > 4)
11111       return;
11112 
11113    unsigned first_comp = ffs(output->component_mask) - 1;
11114 
11115    Temp out[4];
11116    bool all_undef = true;
11117    assert(ctx->stage.hw == HWStage::VS);
11118    for (unsigned i = 0; i < num_comps; i++) {
11119       out[i] = ctx->outputs.temps[loc * 4 + first_comp + i];
11120       all_undef = all_undef && !out[i].id();
11121    }
11122    if (all_undef)
11123       return;
11124 
11125    while (writemask) {
11126       int start, count;
11127       u_bit_scan_consecutive_range(&writemask, &start, &count);
11128       if (count == 3 && ctx->options->chip_class == GFX6) {
11129          /* GFX6 doesn't support storing vec3, split it. */
11130          writemask |= 1u << (start + 2);
11131          count = 2;
11132       }
11133 
11134       unsigned offset = output->offset + start * 4;
11135 
11136       Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
11137       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
11138          aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
11139       for (int i = 0; i < count; ++i)
11140          vec->operands[i] =
11141             (ctx->outputs.mask[loc] & 1 << (start + first_comp + i)) ? Operand(out[start + i]) : Operand::zero();
11142       vec->definitions[0] = Definition(write_data);
11143       ctx->block->instructions.emplace_back(std::move(vec));
11144 
11145       aco_opcode opcode;
11146       switch (count) {
11147       case 1: opcode = aco_opcode::buffer_store_dword; break;
11148       case 2: opcode = aco_opcode::buffer_store_dwordx2; break;
11149       case 3: opcode = aco_opcode::buffer_store_dwordx3; break;
11150       case 4: opcode = aco_opcode::buffer_store_dwordx4; break;
11151       default: unreachable("Unsupported dword count.");
11152       }
11153 
11154       aco_ptr<MUBUF_instruction> store{
11155          create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
11156       store->operands[0] = Operand(so_buffers[buf]);
11157       store->operands[1] = Operand(so_write_offset[buf]);
11158       store->operands[2] = Operand::c32(0);
11159       store->operands[3] = Operand(write_data);
11160       if (offset > 4095) {
11161          /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
11162          Builder bld(ctx->program, ctx->block);
11163          store->operands[0] =
11164             bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));
11165       } else {
11166          store->offset = offset;
11167       }
11168       store->offen = true;
11169       store->glc = true;
11170       store->dlc = false;
11171       store->slc = true;
11172       ctx->block->instructions.emplace_back(std::move(store));
11173    }
11174 }
11175 
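/* Emits all streamout stores for the given stream, wrapped in a divergent if
 * so that only threads whose index is below the vertex count from the
 * streamout config actually write.
 */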
11176 static void
11177 emit_streamout(isel_context* ctx, unsigned stream)
11178 {
11179    Builder bld(ctx->program, ctx->block);
11180 
11181    Temp so_vtx_count =
11182       bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11183                get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));
11184 
11185    Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
11186 
11187    Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
11188 
11189    if_context ic;
11190    begin_divergent_if_then(ctx, &ic, can_emit);
11191 
11192    bld.reset(ctx->block);
11193 
11194    Temp so_write_index =
11195       bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);
11196 
11197    Temp so_buffers[4];
11198    Temp so_write_offset[4];
11199    Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
11200 
11201    for (unsigned i = 0; i < 4; i++) {
11202       unsigned stride = ctx->program->info->so.strides[i];
11203       if (!stride)
11204          continue;
11205 
11206       so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr,
11207                                bld.copy(bld.def(s1), Operand::c32(i * 16u)));
11208 
11209       if (stride == 1) {
11210          Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11211                                 get_arg(ctx, ctx->args->ac.streamout_write_index),
11212                                 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11213          Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
11214 
11215          so_write_offset[i] =
11216             bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
11217       } else {
11218          Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
11219          Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
11220                                  get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11221          so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
11222       }
11223    }
11224 
11225    for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
11226       const struct radv_stream_output* output = &ctx->program->info->so.outputs[i];
11227       if (stream != output->stream)
11228          continue;
11229 
11230       emit_stream_output(ctx, so_buffers, so_write_offset, output);
11231    }
11232 
11233    begin_divergent_if_else(ctx, &ic);
11234    end_divergent_if(ctx, &ic);
11235 }
11236 
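/* Creates the p_startpgm pseudo instruction whose definitions make every
 * non-skipped shader argument live in its fixed physical input register
 * (SGPRs as-is, VGPRs offset by 256).
 */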
11237 Pseudo_instruction*
11238 add_startpgm(struct isel_context* ctx)
11239 {
11240    aco_ptr<Pseudo_instruction> startpgm{
11241       create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, ctx->args->ac.arg_count)};
11242    for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
11243       if (ctx->args->ac.args[i].skip)
11244          continue;
11245 
11246       enum ac_arg_regfile file = ctx->args->ac.args[i].file;
11247       unsigned size = ctx->args->ac.args[i].size;
11248       unsigned reg = ctx->args->ac.args[i].offset;
11249       RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11250       Temp dst = ctx->program->allocateTmp(type);
11251       ctx->arg_temps[i] = dst;
11252       startpgm->definitions[arg] = Definition(dst);
11253       startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11254       arg++;
11255    }
11256    Pseudo_instruction* instr = startpgm.get();
11257    ctx->block->instructions.push_back(std::move(startpgm));
11258 
11259    /* Stash these in the program so that they can be accessed later when
11260     * handling spilling.
11261     */
11262    ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11263    ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);
11264 
11265    if (ctx->stage.has(SWStage::VS) && ctx->program->info->vs.dynamic_inputs) {
11266       unsigned num_attributes = util_last_bit(ctx->program->info->vs.vb_desc_usage_mask);
11267       for (unsigned i = 0; i < num_attributes; i++) {
11268          Definition def(get_arg(ctx, ctx->args->vs_inputs[i]));
11269 
11270          unsigned idx = ctx->args->vs_inputs[i].arg_index;
11271          def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset));
11272 
11273          ctx->program->vs_inputs.push_back(def);
11274       }
11275    }
11276 
11277    return instr;
11278 }
11279 
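/* Works around the LS VGPR init bug on merged VS+HS: when the HS thread count
 * is zero, the LS input VGPRs are loaded starting at VGPR 0, so select
 * vertex_id, instance_id and the rel patch id with v_cndmask accordingly.
 */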
11280 void
11281 fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm)
11282 {
11283    assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
11284    Builder bld(ctx->program, ctx->block);
11285    constexpr unsigned hs_idx = 1u;
11286    Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11287                                               get_arg(ctx, ctx->args->ac.merged_wave_info),
11288                                               Operand::c32((8u << 16) | (hs_idx * 8u)));
11289    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11290 
11291    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11292 
11293    Temp instance_id =
11294       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id),
11295                get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads);
11296    Temp vs_rel_patch_id =
11297       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
11298                get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads);
11299    Temp vertex_id =
11300       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id),
11301                get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads);
11302 
11303    ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
11304    ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11305    ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
11306 }
11307 
11308 void
11309 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11310 {
11311    /* Split all arguments except for the first (ring_offsets) and the last
11312     * (exec) so that the dead channels don't stay live throughout the program.
11313     */
11314    for (int i = 1; i < startpgm->definitions.size(); i++) {
11315       if (startpgm->definitions[i].regClass().size() > 1) {
11316          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11317                            startpgm->definitions[i].regClass().size());
11318       }
11319    }
11320 }
11321 
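/* BC_OPTIMIZE handling: when both center and centroid barycentrics are enabled,
 * choose between them per pixel with v_cndmask, using the center values
 * whenever the sign bit of prim_mask is set.
 */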
11322 void
11323 handle_bc_optimize(isel_context* ctx)
11324 {
11325    /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
11326    Builder bld(ctx->program, ctx->block);
11327    uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
11328    bool uses_center =
11329       G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
11330    bool uses_persp_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena);
11331    bool uses_linear_centroid = G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
11332 
11333    if (uses_persp_centroid)
11334       ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
11335    if (uses_linear_centroid)
11336       ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
11337 
11338    if (uses_center && (uses_persp_centroid || uses_linear_centroid)) {
11339       Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
11340                               get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero());
11341 
11342       if (uses_persp_centroid) {
11343          Temp new_coord[2];
11344          for (unsigned i = 0; i < 2; i++) {
11345             Temp persp_centroid =
11346                emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
11347             Temp persp_center =
11348                emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
11349             new_coord[i] =
11350                bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel);
11351          }
11352          ctx->persp_centroid = bld.tmp(v2);
11353          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
11354                     Operand(new_coord[0]), Operand(new_coord[1]));
11355          emit_split_vector(ctx, ctx->persp_centroid, 2);
11356       }
11357 
11358       if (uses_linear_centroid) {
11359          Temp new_coord[2];
11360          for (unsigned i = 0; i < 2; i++) {
11361             Temp linear_centroid =
11362                emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
11363             Temp linear_center =
11364                emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
11365             new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid,
11366                                     linear_center, sel);
11367          }
11368          ctx->linear_centroid = bld.tmp(v2);
11369          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
11370                     Operand(new_coord[0]), Operand(new_coord[1]));
11371          emit_split_vector(ctx, ctx->linear_centroid, 2);
11372       }
11373    }
11374 }
11375 
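/* Translates the shader's float_controls execution modes into the program's
 * FP mode: signed-zero/inf/nan preservation, denormal flushing and rounding,
 * tracked separately for 32-bit and 16/64-bit operations.
 */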
11376 void
11377 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11378 {
11379    Program* program = ctx->program;
11380 
11381    unsigned float_controls = shader->info.float_controls_execution_mode;
11382 
11383    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11384       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11385    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11386       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11387                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11388 
11389    program->next_fp_mode.must_flush_denorms32 =
11390       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11391    program->next_fp_mode.must_flush_denorms16_64 =
11392       float_controls &
11393       (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11394 
11395    program->next_fp_mode.care_about_round32 =
11396       float_controls &
11397       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11398 
11399    program->next_fp_mode.care_about_round16_64 =
11400       float_controls &
11401       (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11402        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11403 
11404    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11405     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11406    if (program->next_fp_mode.must_flush_denorms16_64)
11407       program->next_fp_mode.denorm16_64 = 0;
11408    else
11409       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11410 
11411    /* preserving fp32 denorms is expensive, so only do it if asked */
11412    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11413       program->next_fp_mode.denorm32 = fp_denorm_keep;
11414    else
11415       program->next_fp_mode.denorm32 = 0;
11416 
11417    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11418       program->next_fp_mode.round32 = fp_round_tz;
11419    else
11420       program->next_fp_mode.round32 = fp_round_ne;
11421 
11422    if (float_controls &
11423        (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11424       program->next_fp_mode.round16_64 = fp_round_tz;
11425    else
11426       program->next_fp_mode.round16_64 = fp_round_ne;
11427 
11428    ctx->block->fp_mode = program->next_fp_mode;
11429 }
11430 
11431 void
11432 cleanup_cfg(Program* program)
11433 {
11434    /* create linear_succs/logical_succs */
11435    for (Block& BB : program->blocks) {
11436       for (unsigned idx : BB.linear_preds)
11437          program->blocks[idx].linear_succs.emplace_back(BB.index);
11438       for (unsigned idx : BB.logical_preds)
11439          program->blocks[idx].logical_succs.emplace_back(BB.index);
11440    }
11441 }
11442 
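/* Converts an active-lane count into an exec-style lane mask via s_bfm, with a
 * special case for a fully active wave64 because a size of 64 doesn't work
 * with s_bfm.
 */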
11443 Temp
11444 lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true)
11445 {
11446    assert(count.regClass() == s1);
11447 
11448    Builder bld(ctx->program, ctx->block);
11449    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11450    Temp cond;
11451 
11452    if (ctx->program->wave_size == 64) {
11453       /* If we know that all 64 threads can't be active at a time, we just use the mask as-is */
11454       if (!allow64)
11455          return mask;
11456 
11457       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11458       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11459                                 Operand::c32(6u /* log2(64) */));
11460       cond =
11461          bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11462    } else {
11463       /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11464        * the register */
11465       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11466    }
11467 
11468    return cond;
11469 }
11470 
11471 Temp
11472 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11473 {
11474    Builder bld(ctx->program, ctx->block);
11475 
11476    /* lanecount_to_mask() only cares about s0.u[6:0], so we need neither s_bfe nor s_and here */
11477    Temp count = i == 0
11478                    ? get_arg(ctx, ctx->args->ac.merged_wave_info)
11479                    : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11480                               get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));
11481 
11482    return lanecount_to_mask(ctx, count);
11483 }
11484 
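/* Emits the GS_ALLOC_REQ sendmsg that tells the SPI how many vertices and
 * primitives the NGG workgroup will export, including the Navi1x workaround
 * for workgroups with no output.
 */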
11485 void
11486 ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
11487 {
11488    assert(vtx_cnt.id() && prm_cnt.id());
11489 
11490    Builder bld(ctx->program, ctx->block);
11491    Temp prm_cnt_0;
11492 
11493    if (ctx->program->chip_class == GFX10 &&
11494        (ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {
11495       /* Navi 1x workaround: check whether the workgroup has no output.
11496        * If so, change the number of exported vertices and primitives to 1.
11497        */
11498       prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
11499       prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
11500                          bld.scc(prm_cnt_0));
11501       vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
11502                          bld.scc(prm_cnt_0));
11503    }
11504 
11505    /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
11506    Temp tmp =
11507       bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
11508    tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
11509 
11510    /* Request the SPI to allocate space for the primitives and vertices
11511     * that will be exported by the threadgroup.
11512     */
11513    bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
11514 
11515    if (prm_cnt_0.id()) {
11516       /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
11517        * It can't have all-zero positions because that would render an undesired pixel with
11518        * conservative rasterization.
11519        */
11520       Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
11521       Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
11522                            Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
11523       cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
11524                       Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));
11525 
11526       if_context ic_prim_0;
11527       begin_divergent_if_then(ctx, &ic_prim_0, cond);
11528       bld.reset(ctx->block);
11529       ctx->block->kind |= block_kind_export_end;
11530 
11531       /* Use zero: means that it's a triangle whose every vertex index is 0. */
11532       Temp zero = bld.copy(bld.def(v1), Operand::zero());
11533       /* Use NaN for the coordinates, so that the rasterizer always culls it. */
11534       Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));
11535 
11536       bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,
11537               V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,
11538               false /* valid mask */);
11539       bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,
11540               V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,
11541               true /* valid mask */);
11542 
11543       begin_divergent_if_else(ctx, &ic_prim_0);
11544       end_divergent_if(ctx, &ic_prim_0);
11545       bld.reset(ctx->block);
11546    }
11547 }
11548 
11549 } /* end namespace */
11550 
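/* Entry point of instruction selection for regular shaders: iterates over the
 * (possibly merged) shader stages, wraps merged stages in a divergent if on
 * merged_wave_info where needed, and emits the stage-specific epilogues such
 * as barriers, GS_DONE messages, VS exports and FS exports.
 */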
11551 void
11552 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11553                ac_shader_config* config, const struct radv_shader_args* args)
11554 {
11555    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
11556    if_context ic_merged_wave_info;
11557    bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
11558 
11559    for (unsigned i = 0; i < shader_count; i++) {
11560       nir_shader* nir = shaders[i];
11561       init_context(&ctx, nir);
11562 
11563       setup_fp_mode(&ctx, nir);
11564 
11565       if (!i) {
11566          /* needs to be after init_context() for FS */
11567          Pseudo_instruction* startpgm = add_startpgm(&ctx);
11568          append_logical_start(ctx.block);
11569 
11570          if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
11571             fix_ls_vgpr_init_bug(&ctx, startpgm);
11572 
11573          split_arguments(&ctx, startpgm);
11574 
11575          if (!args->shader_info->vs.has_prolog &&
11576              (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11577             Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11578          }
11579       }
11580 
11581       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11582       nir_function_impl* func = nir_shader_get_entrypoint(nir);
11583       bool empty_shader =
11584          nir_cf_list_is_empty_block(&func->body) &&
11585          ((nir->info.stage == MESA_SHADER_VERTEX &&
11586            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11587           (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11588 
11589       bool check_merged_wave_info =
11590          ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11591       bool endif_merged_wave_info =
11592          ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11593 
11594       if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG &&
11595           program->stage.num_sw_stages() == 1) {
11596          /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11597           * s_sendmsg(GS_ALLOC_REQ). */
11598          Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11599       }
11600 
11601       if (check_merged_wave_info) {
11602          Temp cond = merged_wave_info_to_mask(&ctx, i);
11603          begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
11604       }
11605 
11606       if (i) {
11607          Builder bld(ctx.program, ctx.block);
11608 
11609          /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11610          bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&
11611                                  ctx.tcs_temp_only_inputs == nir->info.inputs_read;
11612 
11613          if (!ngg_gs && !tcs_skip_barrier)
11614             create_workgroup_barrier(bld);
11615 
11616          if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
11617             ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),
11618                                         get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),
11619                                         Operand::c32(8u), Operand::zero());
11620          }
11621       } else if (ctx.stage == geometry_gs)
11622          ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);
11623 
11624       if (ctx.stage == fragment_fs)
11625          handle_bc_optimize(&ctx);
11626 
11627       visit_cf_list(&ctx, &func->body);
11628 
11629       if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
11630          emit_streamout(&ctx, 0);
11631 
11632       if (ctx.stage.hw == HWStage::VS) {
11633          create_vs_exports(&ctx);
11634       } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
11635          Builder bld(ctx.program, ctx.block);
11636          bld.barrier(aco_opcode::p_barrier,
11637                      memory_sync_info(storage_vmem_output, semantic_release, scope_device));
11638          bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,
11639                   sendmsg_gs_done(false, false, 0));
11640       }
11641 
11642       if (ctx.stage == fragment_fs) {
11643          create_fs_exports(&ctx);
11644       }
11645 
11646       if (endif_merged_wave_info) {
11647          begin_divergent_if_else(&ctx, &ic_merged_wave_info);
11648          end_divergent_if(&ctx, &ic_merged_wave_info);
11649       }
11650 
11651       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11652          /* Outputs of the previous stage are inputs to the next stage */
11653          ctx.inputs = ctx.outputs;
11654          ctx.outputs = shader_io_state();
11655       }
11656 
11657       cleanup_context(&ctx);
11658    }
11659 
11660    program->config->float_mode = program->blocks[0].fp_mode.val;
11661 
11662    append_logical_end(ctx.block);
11663    ctx.block->kind |= block_kind_uniform;
11664    Builder bld(ctx.program, ctx.block);
11665    bld.sopp(aco_opcode::s_endpgm);
11666 
11667    cleanup_cfg(program);
11668 }
11669 
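/* Builds the GS copy shader: for each vertex stream it loads the GS outputs
 * back from the GSVS ring buffer, emits streamout if needed, and re-exports
 * stream 0 as regular VS outputs.
 */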
11670 void
11671 select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
11672                       const struct radv_shader_args* args)
11673 {
11674    isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
11675 
11676    ctx.block->fp_mode = program->next_fp_mode;
11677 
11678    add_startpgm(&ctx);
11679    append_logical_start(ctx.block);
11680 
11681    Builder bld(ctx.program, ctx.block);
11682 
11683    Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4),
11684                              program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u));
11685 
11686    Operand stream_id = Operand::zero();
11687    if (args->shader_info->so.num_outputs)
11688       stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11689                            get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));
11690 
11691    Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u),
11692                               get_arg(&ctx, ctx.args->ac.vertex_id));
11693 
11694    std::stack<if_context, std::vector<if_context>> if_contexts;
11695 
11696    for (unsigned stream = 0; stream < 4; stream++) {
11697       if (stream_id.isConstant() && stream != stream_id.constantValue())
11698          continue;
11699 
11700       unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
11701       if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
11702          continue;
11703 
11704       memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
11705 
11706       if (!stream_id.isConstant()) {
11707          Temp cond =
11708             bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream));
11709          if_contexts.emplace();
11710          begin_uniform_if_then(&ctx, &if_contexts.top(), cond);
11711          bld.reset(ctx.block);
11712       }
11713 
11714       unsigned offset = 0;
11715       for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
11716          if (args->shader_info->gs.output_streams[i] != stream)
11717             continue;
11718 
11719          unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
11720          unsigned length = util_last_bit(output_usage_mask);
11721          for (unsigned j = 0; j < length; ++j) {
11722             if (!(output_usage_mask & (1 << j)))
11723                continue;
11724 
11725             Temp val = bld.tmp(v1);
11726             unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
11727             load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
11728                             true, true);
11729 
11730             ctx.outputs.mask[i] |= 1 << j;
11731             ctx.outputs.temps[i * 4u + j] = val;
11732 
11733             offset++;
11734          }
11735       }
11736 
11737       if (args->shader_info->so.num_outputs) {
11738          emit_streamout(&ctx, stream);
11739          bld.reset(ctx.block);
11740       }
11741 
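      /* Only stream 0 feeds the rasterizer, so only its outputs are exported;
       * the other streams are consumed exclusively by streamout above. */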
11742       if (stream == 0) {
11743          create_vs_exports(&ctx);
11744       }
11745 
11746       if (!stream_id.isConstant()) {
11747          begin_uniform_if_else(&ctx, &if_contexts.top());
11748          bld.reset(ctx.block);
11749       }
11750    }
11751 
11752    while (!if_contexts.empty()) {
11753       end_uniform_if(&ctx, &if_contexts.top());
11754       if_contexts.pop();
11755    }
11756 
11757    program->config->float_mode = program->blocks[0].fp_mode.val;
11758 
11759    append_logical_end(ctx.block);
11760    ctx.block->kind |= block_kind_uniform;
11761    bld.reset(ctx.block);
11762    bld.sopp(aco_opcode::s_endpgm);
11763 
11764    cleanup_cfg(program);
11765 }
11766 
11767 void
11768 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
11769                            const struct radv_shader_args* args)
11770 {
11771    assert(args->options->chip_class == GFX8);
11772 
11773    init_program(program, compute_cs, args->shader_info, args->options->chip_class,
11774                 args->options->family, args->options->wgp_mode, config);
11775 
11776    isel_context ctx = {};
11777    ctx.program = program;
11778    ctx.args = args;
11779    ctx.options = args->options;
11780    ctx.stage = program->stage;
11781 
11782    ctx.block = ctx.program->create_and_insert_block();
11783    ctx.block->kind = block_kind_top_level;
11784 
11785    program->workgroup_size = 1; /* XXX */
11786 
11787    add_startpgm(&ctx);
11788    append_logical_start(ctx.block);
11789 
11790    Builder bld(ctx.program, ctx.block);
11791 
11792    /* Load the buffer descriptor from TMA. */
11793    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
11794             Operand::zero());
11795 
11796    /* Store TTMP0-TTMP1. */
11797    bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
11798             Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
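   /* TTMP0-TTMP1 are filled by the hardware on entry to the trap handler, so
    * dumping them at offset 0 of the TMA buffer preserves that state; the
    * hardware registers below follow at offset 8. */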
11799 
11800    uint32_t hw_regs_idx[] = {
11801       2, /* HW_REG_STATUS */
11802       3, /* HW_REG_TRAP_STS */
11803       4, /* HW_REG_HW_ID */
11804       7, /* HW_REG_IB_STS */
11805    };
11806 
11807    /* Store some hardware registers. */
11808    for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
11809       /* "((size - 1) << 11) | register" */
11810       bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
11811                ((20 - 1) << 11) | hw_regs_idx[i]);
11812 
11813       bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
11814                Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
11815    }
11816 
11817    program->config->float_mode = program->blocks[0].fp_mode.val;
11818 
11819    append_logical_end(ctx.block);
11820    ctx.block->kind |= block_kind_uniform;
11821    bld.sopp(aco_opcode::s_endpgm);
11822 
11823    cleanup_cfg(program);
11824 }
11825 
11826 Operand
11827 get_arg_fixed(const struct radv_shader_args* args, struct ac_arg arg)
11828 {
11829    assert(arg.used);
11830 
11831    enum ac_arg_regfile file = args->ac.args[arg.arg_index].file;
11832    unsigned size = args->ac.args[arg.arg_index].size;
11833    unsigned reg = args->ac.args[arg.arg_index].offset;
11834 
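   /* In ACO, VGPRs are addressed starting at PhysReg 256, so VGPR arguments
    * are offset by 256. */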
11835    return Operand(PhysReg(file == AC_ARG_SGPR ? reg : reg + 256),
11836                   RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size));
11837 }
11838 
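/* Load up to `max` vertex buffer descriptors (4 dwords each), starting at index
 * `start` of the table at `base`, into consecutive SGPRs beginning at `dest`.
 * Each chunk uses the widest s_load_dwordx{16,8,4} that fits, and on GFX10+ the
 * loads are grouped into one SMEM clause. Returns how many descriptors were
 * loaded, which may be less than `max` if the SGPR budget runs out. */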
11839 unsigned
11840 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
11841 {
11842    unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
11843 
11844    unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
11845    if (bld.program->chip_class >= GFX10 && num_loads > 1)
11846       bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);
11847 
11848    for (unsigned i = 0; i < count;) {
11849       unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
11850 
11851       if (size == 4)
11852          bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
11853                   Operand::c32((start + i) * 16u));
11854       else if (size == 2)
11855          bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
11856                   Operand::c32((start + i) * 16u));
11857       else
11858          bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
11859                   Operand::c32((start + i) * 16u));
11860 
11861       dest = dest.advance(size * 16u);
11862       i += size;
11863    }
11864 
11865    return count;
11866 }
11867 
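/* Compute the instance index for an attribute with a non-trivial divisor:
 * roughly start_instance + instance_id / divisor. The division uses
 * per-attribute fast-division constants loaded from the prolog input buffer
 * (shift/add parameters packed into the first dword, the multiplier in the
 * second); the exact sequence differs between the SDWA (GFX8+) and v_bfe
 * (GFX6-7) paths below. */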
11868 Operand
11869 calc_nontrivial_instance_id(Builder& bld, const struct radv_shader_args* args, unsigned index,
11870                             Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
11871                             PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
11872 {
11873    bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
11874             get_arg_fixed(args, args->prolog_inputs), Operand::c32(8u + index * 8u));
11875 
11876    wait_imm lgkm_imm;
11877    lgkm_imm.lgkm = 0;
11878    bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->chip_class));
11879 
11880    Definition fetch_index_def(tmp_vgpr0, v1);
11881    Operand fetch_index(tmp_vgpr0, v1);
11882 
11883    Operand div_info(tmp_sgpr, s1);
11884    if (bld.program->chip_class >= GFX8) {
11885       /* use SDWA */
11886       if (bld.program->chip_class < GFX9) {
11887          bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
11888          div_info = Operand(tmp_vgpr1, v1);
11889       }
11890 
11891       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11892 
11893       Instruction* instr;
11894       if (bld.program->chip_class >= GFX9)
11895          instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
11896       else
11897          instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
11898                                div_info, fetch_index)
11899                     .instr;
11900       instr->sdwa().sel[0] = SubdwordSel::ubyte1;
11901 
11902       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
11903                fetch_index);
11904 
11905       instr =
11906          bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
11907       instr->sdwa().sel[0] = SubdwordSel::ubyte2;
11908    } else {
11909       Operand tmp_op(tmp_vgpr1, v1);
11910       Definition tmp_def(tmp_vgpr1, v1);
11911 
11912       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11913 
11914       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
11915       bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
11916 
11917       bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
11918                Operand(tmp_sgpr.advance(4), s1));
11919 
11920       bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
11921       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
11922    }
11923 
11924    bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
11925 
11926    return fetch_index;
11927 }
11928 
11929 void
11930 select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key, ac_shader_config* config,
11931                  const struct radv_shader_args* args, unsigned* num_preserved_sgprs)
11932 {
11933    assert(key->num_attributes > 0);
11934 
11935    /* This should be enough for any shader/stage. */
11936    unsigned max_user_sgprs = args->options->chip_class >= GFX9 ? 32 : 16;
11937    *num_preserved_sgprs = max_user_sgprs + 14;
11938 
11939    init_program(program, compute_cs, args->shader_info, args->options->chip_class,
11940                 args->options->family, args->options->wgp_mode, config);
11941 
11942    Block* block = program->create_and_insert_block();
11943    block->kind = block_kind_top_level;
11944 
11945    program->workgroup_size = 64;
11946    calc_min_waves(program);
11947 
11948    Builder bld(program, block);
11949 
11950    block->instructions.reserve(16 + key->num_attributes * 4);
11951 
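   /* Raise the wave priority while the prolog runs (3 is the highest user
    * priority). */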
11952    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
11953 
11954    uint32_t attrib_mask = BITFIELD_MASK(key->num_attributes);
11955    bool has_nontrivial_divisors = key->state->nontrivial_divisors & attrib_mask;
11956 
11957    wait_imm lgkm_imm;
11958    lgkm_imm.lgkm = 0;
11959 
11960    /* choose sgprs */
11961    PhysReg vertex_buffers(align(*num_preserved_sgprs, 2));
11962    PhysReg prolog_input = vertex_buffers.advance(8);
11963    PhysReg desc(
11964       align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
11965 
11966    Operand start_instance = get_arg_fixed(args, args->ac.start_instance);
11967    Operand instance_id = get_arg_fixed(args, args->ac.instance_id);
11968 
11969    PhysReg attributes_start(256 + args->ac.num_vgprs_used);
11970    /* choose vgprs that won't be used for anything else until the last attribute load */
11971    PhysReg vertex_index(attributes_start.reg() + key->num_attributes * 4 - 1);
11972    PhysReg instance_index(attributes_start.reg() + key->num_attributes * 4 - 2);
11973    PhysReg start_instance_vgpr(attributes_start.reg() + key->num_attributes * 4 - 3);
11974    PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + key->num_attributes * 4 - 4);
11975    PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + key->num_attributes * 4);
11976 
11977    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
11978             get_arg_fixed(args, args->ac.vertex_buffers));
11979    bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
11980             Operand::c32((unsigned)args->options->address32_hi));
11981 
11982    /* calculate vgpr requirements */
11983    unsigned num_vgprs = attributes_start.reg() - 256;
11984    num_vgprs += key->num_attributes * 4;
11985    if (has_nontrivial_divisors && program->chip_class <= GFX8)
11986       num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
11987    unsigned num_sgprs = 0;
11988 
11989    for (unsigned loc = 0; loc < key->num_attributes;) {
11990       unsigned num_descs =
11991          load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc);
11992       num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
11993 
11994       if (loc == 0) {
11995          /* perform setup while we load the descriptors */
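         /* For merged (GFX9+) or NGG shaders only the first lanes of the wave
          * are VS invocations: build exec as a mask of that many bits from
          * merged_wave_info. s_bfm_b64 with a count of 64 would produce 0, so
          * when the count is a full wave (bit 6 set) an all-ones mask is
          * selected instead. */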
11996          if (key->is_ngg || key->next_stage != MESA_SHADER_VERTEX) {
11997             Operand count = get_arg_fixed(args, args->ac.merged_wave_info);
11998             bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
11999             if (program->wave_size == 64) {
12000                bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12001                         Operand::c32(6u /* log2(64) */));
12002                bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12003                         Operand(exec, s2), Operand(scc, s1));
12004             }
12005          }
12006 
12007          bool needs_instance_index = false;
12008          bool needs_start_instance = false;
12009          u_foreach_bit(i, key->state->instance_rate_inputs & attrib_mask)
12010          {
12011             needs_instance_index |= key->state->divisors[i] == 1;
12012             needs_start_instance |= key->state->divisors[i] == 0;
12013          }
12014          bool needs_vertex_index = ~key->state->instance_rate_inputs & attrib_mask;
12015          if (needs_vertex_index)
12016             bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->ac.base_vertex),
12017                        get_arg_fixed(args, args->ac.vertex_id), false, Operand(s2), true);
12018          if (needs_instance_index)
12019             bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12020                        Operand(s2), true);
12021          if (needs_start_instance)
12022             bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12023       }
12024 
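      /* Wait for the vertex buffer descriptor loads to finish before using the
       * descriptors. */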
12025       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
12026 
12027       for (unsigned i = 0; i < num_descs; i++, loc++) {
12028          PhysReg dest(attributes_start.reg() + loc * 4u);
12029 
12030          /* calculate index */
12031          Operand fetch_index = Operand(vertex_index, v1);
12032          if (key->state->instance_rate_inputs & (1u << loc)) {
12033             uint32_t divisor = key->state->divisors[loc];
12034             if (divisor) {
12035                fetch_index = instance_id;
12036                if (key->state->nontrivial_divisors & (1u << loc)) {
12037                   unsigned index =
12038                      util_bitcount(key->state->nontrivial_divisors & BITFIELD_MASK(loc));
12039                   fetch_index = calc_nontrivial_instance_id(
12040                      bld, args, index, instance_id, start_instance, prolog_input,
12041                      nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12042                } else {
12043                   fetch_index = Operand(instance_index, v1);
12044                }
12045             } else {
12046                fetch_index = Operand(start_instance_vgpr, v1);
12047             }
12048          }
12049 
12050          /* perform load */
12051          PhysReg cur_desc = desc.advance(i * 16);
12052          if ((key->misaligned_mask & (1u << loc))) {
12053             unsigned dfmt = key->state->formats[loc] & 0xf;
12054             unsigned nfmt = key->state->formats[loc] >> 4;
12055             const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
12056             for (unsigned j = 0; j < vtx_info->num_channels; j++) {
12057                bool post_shuffle = key->state->post_shuffle & (1u << loc);
12058                unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12059 
12060                /* Use MUBUF to work around hangs for byte-aligned dword loads. The Vulkan spec
12061                 * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
12062                 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
12063                 * care).
12064                 */
12065                if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32)
12066                   bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
12067                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
12068                             false, true);
12069                else
12070                   bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12071                             Operand(cur_desc, s4), fetch_index, Operand::c32(0u),
12072                             vtx_info->chan_format, nfmt, offset, false, true);
12073             }
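            /* Channels not present in the format are filled with 0, except .w
             * which gets 1 (integer formats) or 1.0f (everything else). */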
12074             uint32_t one =
12075                nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
12076                   ? 1u
12077                   : 0x3f800000u;
12078             for (unsigned j = vtx_info->num_channels; j < 4; j++) {
12079                bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
12080                         Operand::c32(j == 3 ? one : 0u));
12081             }
12082          } else {
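            /* Sufficiently aligned attribute: fetch all four components with a
             * single typed buffer load. */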
12083             bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12084                       Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
12085          }
12086       }
12087    }
12088 
12089    if (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi) {
12090       wait_imm vm_imm;
12091       vm_imm.vm = 0;
12092       bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->chip_class));
12093    }
12094 
12095    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
12096     * so we may need to fix it up. */
12097    u_foreach_bit(loc, (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi))
12098    {
12099       PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
12100 
12101       unsigned alpha_adjust = (key->state->alpha_adjust_lo >> loc) & 0x1;
12102       alpha_adjust |= ((key->state->alpha_adjust_hi >> loc) & 0x1) << 1;
12103 
12104       if (alpha_adjust == ALPHA_ADJUST_SSCALED)
12105          bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
12106 
12107       /* For the integer-like cases, do a natural sign extension.
12108        *
12109        * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
12110        * and happen to contain 0, 1, 2, 3 as the two LSBs of the
12111        * exponent.
12112        */
12113       unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u;
12114       bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
12115                Operand::c32(offset), Operand::c32(2u));
12116 
12117       /* Convert back to the right type. */
12118       if (alpha_adjust == ALPHA_ADJUST_SNORM) {
12119          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12120          bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
12121                   Operand(alpha, v1));
12122       } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) {
12123          bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12124       }
12125    }
12126 
12127    block->kind |= block_kind_uniform;
12128 
12129    /* continue on to the main shader */
12130    Operand continue_pc = get_arg_fixed(args, args->prolog_inputs);
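   /* With non-trivial divisors, prolog_inputs is a pointer (the divisor
    * constants were loaded through it above), so the continue PC is read from
    * its first two dwords; otherwise the argument itself holds the PC. */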
12131    if (has_nontrivial_divisors) {
12132       bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
12133                get_arg_fixed(args, args->prolog_inputs), Operand::c32(0u));
12134       bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
12135       continue_pc = Operand(prolog_input, s2);
12136    }
12137 
12138    bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
12139 
12140    program->config->float_mode = program->blocks[0].fp_mode.val;
12141    /* addition on GFX6-8 requires a carry-out (we use VCC) */
12142    program->needs_vcc = program->chip_class <= GFX8;
12143    program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12144    program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12145 }
12146 } // namespace aco
12147