1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include "aco_instruction_selection.h"
9
10 #include "aco_builder.h"
11 #include "aco_interface.h"
12 #include "aco_ir.h"
13
14 #include "common/ac_descriptors.h"
15 #include "common/ac_gpu_info.h"
16 #include "common/nir/ac_nir.h"
17 #include "common/sid.h"
18
19 #include "util/fast_idiv_by_const.h"
20 #include "util/memstream.h"
21
22 #include <array>
23 #include <functional>
24 #include <map>
25 #include <numeric>
26 #include <stack>
27 #include <utility>
28 #include <vector>
29
30 namespace aco {
31 namespace {
32
33 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
34
35 static void
36 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
37 const char* msg)
38 {
39 char* out;
40 size_t outsize;
41 struct u_memstream mem;
42 u_memstream_open(&mem, &out, &outsize);
43 FILE* const memf = u_memstream_get(&mem);
44
45 fprintf(memf, "%s: ", msg);
46 nir_print_instr(instr, memf);
47 u_memstream_close(&mem);
48
49 _aco_err(ctx->program, file, line, out);
50 free(out);
51 }
52
53 struct loop_context {
54 Block loop_exit;
55
56 unsigned header_idx_old;
57 Block* exit_old;
58 bool divergent_cont_old;
59 bool divergent_branch_old;
60 bool divergent_if_old;
61 };
62
63 static void visit_cf_list(struct isel_context* ctx, struct exec_list* list);
64
65 static void
66 add_logical_edge(unsigned pred_idx, Block* succ)
67 {
68 succ->logical_preds.emplace_back(pred_idx);
69 }
70
71 static void
72 add_linear_edge(unsigned pred_idx, Block* succ)
73 {
74 succ->linear_preds.emplace_back(pred_idx);
75 }
76
77 static void
78 add_edge(unsigned pred_idx, Block* succ)
79 {
80 add_logical_edge(pred_idx, succ);
81 add_linear_edge(pred_idx, succ);
82 }
83
84 static void
85 append_logical_start(Block* b)
86 {
87 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
88 }
89
90 static void
91 append_logical_end(Block* b)
92 {
93 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
94 }
95
96 Temp
97 get_ssa_temp(struct isel_context* ctx, nir_def* def)
98 {
99 uint32_t id = ctx->first_temp_id + def->index;
100 return Temp(id, ctx->program->temp_rc[id]);
101 }
102
103 static Builder
104 create_alu_builder(isel_context* ctx, nir_alu_instr* instr)
105 {
106 Builder bld(ctx->program, ctx->block);
107 bld.is_precise = instr->exact;
108 bld.is_sz_preserve = nir_alu_instr_is_signed_zero_preserve(instr);
109 bld.is_inf_preserve = nir_alu_instr_is_inf_preserve(instr);
110 bld.is_nan_preserve = nir_alu_instr_is_nan_preserve(instr);
111 return bld;
112 }
113
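/* Emits mbcnt(mask) + base for the current lane, i.e. the number of bits set
 * in "mask" at positions below the lane ID. With mask = exec this gives the
 * lane's index among the active invocations. For wave64 the lo/hi halves of
 * the mask are counted with chained v_mbcnt_lo/v_mbcnt_hi.
 */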
114 Temp
115 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
116 {
117 Builder bld(ctx->program, ctx->block);
118 assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
119 assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
120
121 if (ctx->program->wave_size == 32) {
122 Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
123 return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
124 }
125
126 Operand mask_lo = Operand::c32(-1u);
127 Operand mask_hi = Operand::c32(-1u);
128
129 if (mask.isTemp()) {
130 RegClass rc = RegClass(mask.regClass().type(), 1);
131 Builder::Result mask_split =
132 bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
133 mask_lo = Operand(mask_split.def(0).getTemp());
134 mask_hi = Operand(mask_split.def(1).getTemp());
135 } else if (mask.physReg() == exec) {
136 mask_lo = Operand(exec_lo, s1);
137 mask_hi = Operand(exec_hi, s1);
138 }
139
140 Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
141
142 if (ctx->program->gfx_level <= GFX7)
143 return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
144 else
145 return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
146 }
147
148 inline void
149 set_wqm(isel_context* ctx, bool enable_helpers = false)
150 {
151 if (ctx->program->stage == fragment_fs) {
152 ctx->wqm_block_idx = ctx->block->index;
153 ctx->wqm_instruction_idx = ctx->block->instructions.size();
154 if (ctx->shader)
155 enable_helpers |= ctx->shader->info.fs.require_full_quads;
156 ctx->program->needs_wqm |= enable_helpers;
157 }
158 }
159
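/* Reads "data" from the lane selected by "index" (a wave-wide shuffle).
 * GFX6-7 lack a bpermute instruction and GFX10-11.5 wave64 cannot use
 * ds_bpermute across the full wave, so those cases are lowered via pseudo
 * instructions that are expanded later.
 */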
160 static Temp
161 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
162 {
163 if (index.regClass() == s1)
164 return bld.readlane(bld.def(s1), data, index);
165
166 /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
167 * of multiple binaries, because the VGPR use is not known when choosing
168 * which registers to use for the shared VGPRs.
169 */
170 const bool avoid_shared_vgprs =
171 ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
172 ctx->program->wave_size == 64 &&
173 (ctx->program->info.ps.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
174 ctx->program->info.vs.has_prolog || ctx->stage == raytracing_cs);
175
176 if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
177 /* GFX6-7: there is no bpermute instruction */
178 return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
179 bld.def(bld.lm, vcc), index, data);
180 } else if (ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level <= GFX11_5 &&
181 ctx->program->wave_size == 64) {
182
183 /* GFX10-11.5 wave64 mode: emulate full-wave bpermute */
184 Temp index_is_lo =
185 bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
186 Builder::Result index_is_lo_split =
187 bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
188 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
189 index_is_lo_split.def(1).getTemp());
190 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
191 index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
192 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
193
194 if (ctx->options->gfx_level <= GFX10_3) {
195 /* We need one pair of shared VGPRs.
196 * Note that these have twice the allocation granularity of normal VGPRs.
197 */
198 ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
199
200 return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
201 bld.def(s1, scc), index_x4, data, same_half);
202 } else {
203 return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
204 bld.def(s1, scc), Operand(v1.as_linear()), index_x4, data, same_half);
205 }
206 } else {
207 /* wave32 or GFX8-9, GFX12+: bpermute works normally */
208 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
209 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
210 }
211 }
212
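/* Implements a subgroup swizzle with the ds_swizzle_b32 mask encoding:
 * lane = ((id & and_mask) | or_mask) ^ xor_mask, with the three 5-bit fields
 * at bits 0, 5 and 10 of "mask". On GFX8+ the mask is matched against cheaper
 * DPP/permlane patterns before falling back to ds_swizzle_b32.
 */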
213 static Temp
214 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask, bool allow_fi)
215 {
216 if (ctx->options->gfx_level >= GFX8) {
217 unsigned and_mask = mask & 0x1f;
218 unsigned or_mask = (mask >> 5) & 0x1f;
219 unsigned xor_mask = (mask >> 10) & 0x1f;
220
221 /* Eliminate or_mask. */
222 and_mask &= ~or_mask;
223 xor_mask ^= or_mask;
224
225 uint16_t dpp_ctrl = 0xffff;
226
227 /* Prefer DPP16 over DPP8 over v_permlane(x)16_b32,
228 * because DPP16 supports modifiers and v_permlane
229 * can't be folded into valu instructions.
230 */
231 if ((and_mask & 0x1c) == 0x1c && xor_mask < 4) {
232 unsigned res[4];
233 for (unsigned i = 0; i < 4; i++)
234 res[i] = ((i & and_mask) ^ xor_mask);
235 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
236 } else if (and_mask == 0x1f && xor_mask == 8) {
237 dpp_ctrl = dpp_row_rr(8);
238 } else if (and_mask == 0x1f && xor_mask == 0xf) {
239 dpp_ctrl = dpp_row_mirror;
240 } else if (and_mask == 0x1f && xor_mask == 0x7) {
241 dpp_ctrl = dpp_row_half_mirror;
242 } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && xor_mask < 0x10) {
243 dpp_ctrl = dpp_row_share(xor_mask);
244 } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) {
245 dpp_ctrl = dpp_row_xmask(xor_mask);
246 } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) {
247 uint32_t lane_sel = 0;
248 for (unsigned i = 0; i < 8; i++)
249 lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3);
250 return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel, allow_fi);
251 } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) {
252 uint64_t lane_mask = 0;
253 for (unsigned i = 0; i < 16; i++)
254 lane_mask |= uint64_t((i & and_mask) ^ (xor_mask & 0xf)) << i * 4;
255 aco_opcode opcode =
256 xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
257 Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
258 Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
259 Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
260 ret->valu().opsel[0] = allow_fi; /* set FETCH_INACTIVE */
261 ret->valu().opsel[1] = true; /* set BOUND_CTRL */
262 return ret;
263 }
264
265 if (dpp_ctrl != 0xffff)
266 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl, 0xf, 0xf, true,
267 allow_fi);
268 }
269
270 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
271 }
272
273 Temp
274 as_vgpr(Builder& bld, Temp val)
275 {
276 if (val.type() == RegType::sgpr)
277 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
278 assert(val.type() == RegType::vgpr);
279 return val;
280 }
281
282 Temp
283 as_vgpr(isel_context* ctx, Temp val)
284 {
285 Builder bld(ctx->program, ctx->block);
286 return as_vgpr(bld, val);
287 }
288
289 void
290 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
291 {
292 Builder bld(ctx->program, ctx->block);
293 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
294 }
295
296 Temp
297 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
298 {
299 /* no need to extract the whole vector */
300 if (src.regClass() == dst_rc) {
301 assert(idx == 0);
302 return src;
303 }
304
305 assert(src.bytes() > (idx * dst_rc.bytes()));
306 Builder bld(ctx->program, ctx->block);
307 auto it = ctx->allocated_vec.find(src.id());
308 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
309 if (it->second[idx].regClass() == dst_rc) {
310 return it->second[idx];
311 } else {
312 assert(!dst_rc.is_subdword());
313 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
314 return bld.copy(bld.def(dst_rc), it->second[idx]);
315 }
316 }
317
318 if (dst_rc.is_subdword())
319 src = as_vgpr(ctx, src);
320
321 if (src.bytes() == dst_rc.bytes()) {
322 assert(idx == 0);
323 return bld.copy(bld.def(dst_rc), src);
324 } else {
325 Temp dst = bld.tmp(dst_rc);
326 emit_extract_vector(ctx, src, idx, dst);
327 return dst;
328 }
329 }
330
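/* Splits vec_src into num_components elements and caches them in
 * ctx->allocated_vec, so that later emit_extract_vector() calls can reuse the
 * individual components without emitting more instructions.
 */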
331 void
332 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
333 {
334 if (num_components == 1)
335 return;
336 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
337 return;
338 RegClass rc;
339 if (num_components > vec_src.size()) {
340 if (vec_src.type() == RegType::sgpr) {
341 /* should still help get_alu_src() */
342 emit_split_vector(ctx, vec_src, vec_src.size());
343 return;
344 }
345 /* sub-dword split */
346 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
347 } else {
348 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
349 }
350 aco_ptr<Instruction> split{
351 create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
352 split->operands[0] = Operand(vec_src);
353 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
354 for (unsigned i = 0; i < num_components; i++) {
355 elems[i] = ctx->program->allocateTmp(rc);
356 split->definitions[i] = Definition(elems[i]);
357 }
358 ctx->block->instructions.emplace_back(std::move(split));
359 ctx->allocated_vec.emplace(vec_src.id(), elems);
360 }
361
362 /* This vector expansion uses a mask to determine which elements in the new vector
363 * come from the original vector. The other elements are undefined. */
364 void
365 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
366 bool zero_padding = false)
367 {
368 assert(vec_src.type() == RegType::vgpr);
369 Builder bld(ctx->program, ctx->block);
370
371 if (dst.type() == RegType::sgpr && num_components > dst.size()) {
372 Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
373 expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
374 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
375 ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
376 return;
377 }
378
379 emit_split_vector(ctx, vec_src, util_bitcount(mask));
380
381 if (vec_src == dst)
382 return;
383
384 if (num_components == 1) {
385 if (dst.type() == RegType::sgpr)
386 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
387 else
388 bld.copy(Definition(dst), vec_src);
389 return;
390 }
391
392 unsigned component_bytes = dst.bytes() / num_components;
393 RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
394 RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
395 assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
396 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
397
398 Temp padding = Temp(0, dst_rc);
399 if (zero_padding)
400 padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));
401
402 aco_ptr<Instruction> vec{
403 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
404 vec->definitions[0] = Definition(dst);
405 unsigned k = 0;
406 for (unsigned i = 0; i < num_components; i++) {
407 if (mask & (1 << i)) {
408 Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
409 if (dst.type() == RegType::sgpr)
410 src = bld.as_uniform(src);
411 vec->operands[i] = Operand(src);
412 elems[i] = src;
413 } else {
414 vec->operands[i] = Operand::zero(component_bytes);
415 elems[i] = padding;
416 }
417 }
418 ctx->block->instructions.emplace_back(std::move(vec));
419 ctx->allocated_vec.emplace(dst.id(), elems);
420 }
421
422 Temp
423 get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
424 {
425 RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
426 Temp tmp = get_ssa_temp(ctx, def);
427 if (tmp.bytes() != rc.bytes())
428 return emit_extract_vector(ctx, tmp, 0, rc);
429 else
430 return tmp;
431 }
432
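/* Broadcasts a uniform boolean held in an SGPR (0/1, read through SCC) to a
 * lane mask: all bits set if true, zero otherwise.
 */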
433 Temp
434 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
435 {
436 Builder bld(ctx->program, ctx->block);
437 if (!dst.id())
438 dst = bld.tmp(bld.lm);
439
440 assert(val.regClass() == s1);
441 assert(dst.regClass() == bld.lm);
442
443 return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
444 bld.scc(val));
445 }
446
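/* Converts a lane-mask boolean (assumed uniform across active lanes) back to
 * a scalar 0/1 value by ANDing it with exec and taking SCC.
 */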
447 Temp
448 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
449 {
450 Builder bld(ctx->program, ctx->block);
451 if (!dst.id())
452 dst = bld.tmp(s1);
453
454 assert(val.regClass() == bld.lm);
455 assert(dst.regClass() == s1);
456
457 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
458 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
459 return dst;
460 }
461
462 /**
463 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
464 * src_bits and dst_bits are truncated.
465 *
466 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
467 * bit is indicated by src_bits in this case.
468 *
469 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
470 */
471 Temp
472 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
473 bool sign_extend, Temp dst = Temp())
474 {
475 assert(!(sign_extend && dst_bits < src_bits) &&
476 "Shrinking integers is not supported for signed inputs");
477
478 if (!dst.id()) {
479 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
480 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
481 else
482 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
483 }
484
485 assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
486 assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
487
488 if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
489 /* Copy the raw value, leaving an undefined value in the upper bits for
490 * the caller to handle appropriately */
491 return bld.copy(Definition(dst), src);
492 } else if (dst.bytes() < src.bytes()) {
493 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
494 }
495
496 Temp tmp = dst;
497 if (dst_bits == 64)
498 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
499
500 if (tmp == src) {
501 } else if (src.regClass() == s1) {
502 assert(src_bits < 32);
503 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
504 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
505 } else {
506 assert(src_bits < 32);
507 bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
508 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
509 }
510
511 if (dst_bits == 64) {
512 if (sign_extend && dst.regClass() == s2) {
513 Temp high =
514 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
515 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
516 } else if (sign_extend && dst.regClass() == v2) {
517 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
518 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
519 } else {
520 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
521 }
522 }
523
524 return dst;
525 }
526
527 enum sgpr_extract_mode {
528 sgpr_extract_sext,
529 sgpr_extract_zext,
530 sgpr_extract_undef,
531 };
532
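/* Extracts the 8- or 16-bit component selected by the NIR swizzle from an
 * SGPR source, sign-/zero-extending it or leaving the upper bits undefined
 * depending on "mode".
 */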
533 Temp
534 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
535 {
536 Temp vec = get_ssa_temp(ctx, src->src.ssa);
537 unsigned src_size = src->src.ssa->bit_size;
538 unsigned swizzle = src->swizzle[0];
539
540 if (vec.size() > 1) {
541 assert(src_size == 16);
542 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
543 swizzle = swizzle & 1;
544 }
545
546 Builder bld(ctx->program, ctx->block);
547 Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
548
549 if (mode == sgpr_extract_undef && swizzle == 0)
550 bld.copy(Definition(tmp), vec);
551 else
552 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
553 Operand::c32(swizzle), Operand::c32(src_size),
554 Operand::c32((mode == sgpr_extract_sext)));
555
556 if (dst.regClass() == s2)
557 convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
558
559 return dst;
560 }
561
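/* Returns the first "size" components of a NIR ALU source with its swizzle
 * applied. Identity swizzles reuse the existing temporary; otherwise the
 * components are extracted and re-packed with p_create_vector.
 */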
562 Temp
563 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
564 {
565 if (src.src.ssa->num_components == 1 && size == 1)
566 return get_ssa_temp(ctx, src.src.ssa);
567
568 Temp vec = get_ssa_temp(ctx, src.src.ssa);
569 unsigned elem_size = src.src.ssa->bit_size / 8u;
570 bool identity_swizzle = true;
571
572 for (unsigned i = 0; identity_swizzle && i < size; i++) {
573 if (src.swizzle[i] != i)
574 identity_swizzle = false;
575 }
576 if (identity_swizzle)
577 return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
578
579 assert(elem_size > 0);
580 assert(vec.bytes() % elem_size == 0);
581
582 if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
583 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
584 return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
585 sgpr_extract_undef);
586 }
587
588 bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
589 if (as_uniform)
590 vec = as_vgpr(ctx, vec);
591
592 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
593 : RegClass(vec.type(), elem_size / 4);
594 if (size == 1) {
595 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
596 } else {
597 assert(size <= 4);
598 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
599 aco_ptr<Instruction> vec_instr{
600 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
601 for (unsigned i = 0; i < size; ++i) {
602 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
603 vec_instr->operands[i] = Operand{elems[i]};
604 }
605 Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
606 vec_instr->definitions[0] = Definition(dst);
607 ctx->block->instructions.emplace_back(std::move(vec_instr));
608 ctx->allocated_vec.emplace(dst.id(), elems);
609 return as_uniform ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
610 }
611 }
612
613 Temp
614 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
615 {
616 /* returns v2b or v1 for vop3p usage.
617 * The source expects exactly two 16-bit components
618 * which are within the same dword.
619 */
620 assert(src.src.ssa->bit_size == 16);
621 assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
622
623 Temp tmp = get_ssa_temp(ctx, src.src.ssa);
624 if (tmp.size() == 1)
625 return tmp;
626
627 /* the size is larger than 1 dword: check the swizzle */
628 unsigned dword = src.swizzle[0] >> 1;
629
630 /* extract a full dword if possible */
631 if (tmp.bytes() >= (dword + 1) * 4) {
632 /* if the source is split into components, use p_create_vector */
633 auto it = ctx->allocated_vec.find(tmp.id());
634 if (it != ctx->allocated_vec.end()) {
635 unsigned index = dword << 1;
636 Builder bld(ctx->program, ctx->block);
637 if (it->second[index].regClass() == v2b)
638 return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
639 it->second[index + 1]);
640 }
641 return emit_extract_vector(ctx, tmp, dword, v1);
642 } else {
643 /* This must be a swizzled access to %a.zz where %a is v6b */
644 assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
645 assert(tmp.regClass() == v6b && dword == 1);
646 return emit_extract_vector(ctx, tmp, dword * 2, v2b);
647 }
648 }
649
650 uint32_t
651 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
652 {
653 nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
654 return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
655 }
656
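/* Builds a 64-bit address from a 32-bit pointer by appending the constant
 * upper address bits (options->address32_hi).
 */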
657 Temp
658 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
659 {
660 if (ptr.size() == 2)
661 return ptr;
662 Builder bld(ctx->program, ctx->block);
663 if (ptr.type() == RegType::vgpr && !non_uniform)
664 ptr = bld.as_uniform(ptr);
665 return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
666 Operand::c32((unsigned)ctx->options->address32_hi));
667 }
668
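/* The emit_*_instruction() helpers below translate a NIR ALU instruction into
 * a single ACO instruction of the corresponding encoding, fetching the
 * sources, legalizing SGPR operands for VALU forms and tagging operands whose
 * upper bound is known to fit in 16 or 24 bits.
 */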
669 void
670 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
671 bool writes_scc, uint8_t uses_ub = 0)
672 {
673 Builder bld = create_alu_builder(ctx, instr);
674 bld.is_nuw = instr->no_unsigned_wrap;
675
676 Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])),
677 Operand(get_alu_src(ctx, instr->src[1]))};
678 u_foreach_bit (i, uses_ub) {
679 uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
680 if (src_ub <= 0xffff)
681 operands[i].set16bit(true);
682 else if (src_ub <= 0xffffff)
683 operands[i].set24bit(true);
684 }
685
686 if (writes_scc)
687 bld.sop2(op, Definition(dst), bld.def(s1, scc), operands[0], operands[1]);
688 else
689 bld.sop2(op, Definition(dst), operands[0], operands[1]);
690 }
691
692 void
693 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
694 bool commutative, bool swap_srcs = false, bool flush_denorms = false,
695 bool nuw = false, uint8_t uses_ub = 0)
696 {
697 Builder bld = create_alu_builder(ctx, instr);
698 bld.is_nuw = nuw;
699
700 Operand operands[2] = {Operand(get_alu_src(ctx, instr->src[0])),
701 Operand(get_alu_src(ctx, instr->src[1]))};
702 u_foreach_bit (i, uses_ub) {
703 uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
704 if (src_ub <= 0xffff)
705 operands[i].set16bit(true);
706 else if (src_ub <= 0xffffff)
707 operands[i].set24bit(true);
708 }
709
710 if (swap_srcs)
711 std::swap(operands[0], operands[1]);
712
713 if (operands[1].isOfType(RegType::sgpr)) {
714 if (commutative && operands[0].isOfType(RegType::vgpr)) {
715 std::swap(operands[0], operands[1]);
716 } else {
717 operands[1] = bld.copy(bld.def(RegType::vgpr, operands[1].size()), operands[1]);
718 }
719 }
720
721 if (flush_denorms && ctx->program->gfx_level < GFX9) {
722 assert(dst.size() == 1);
723 Temp tmp = bld.vop2(opc, bld.def(dst.regClass()), operands[0], operands[1]);
724 if (dst.bytes() == 2)
725 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
726 else
727 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
728 } else {
729 bld.vop2(opc, Definition(dst), operands[0], operands[1]);
730 }
731 }
732
733 void
734 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
735 {
736 Builder bld = create_alu_builder(ctx, instr);
737
738 Temp src0 = get_alu_src(ctx, instr->src[0]);
739 Temp src1 = get_alu_src(ctx, instr->src[1]);
740
741 if (src1.type() == RegType::sgpr) {
742 assert(src0.type() == RegType::vgpr);
743 std::swap(src0, src1);
744 }
745
746 Temp src00 = bld.tmp(src0.type(), 1);
747 Temp src01 = bld.tmp(src0.type(), 1);
748 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
749 Temp src10 = bld.tmp(v1);
750 Temp src11 = bld.tmp(v1);
751 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
752 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
753 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
754 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
755 }
756
757 void
758 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
759 bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
760 {
761 assert(num_sources == 2 || num_sources == 3);
762 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
763 bool has_sgpr = false;
764 for (unsigned i = 0; i < num_sources; i++) {
765 src[i] = get_alu_src(ctx, instr->src[(swap_srcs && i < 2) ? 1 - i : i]);
766 if (has_sgpr)
767 src[i] = as_vgpr(ctx, src[i]);
768 else
769 has_sgpr = src[i].type() == RegType::sgpr;
770 }
771
772 Builder bld = create_alu_builder(ctx, instr);
773 if (flush_denorms && ctx->program->gfx_level < GFX9) {
774 Temp tmp;
775 if (num_sources == 3)
776 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
777 else
778 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
779 if (dst.size() == 1)
780 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
781 else
782 bld.vop3(aco_opcode::v_mul_f64_e64, Definition(dst), Operand::c64(0x3FF0000000000000),
783 tmp);
784 } else if (num_sources == 3) {
785 bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
786 } else {
787 bld.vop3(op, Definition(dst), src[0], src[1]);
788 }
789 }
790
791 Builder::Result
792 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
793 bool swap_srcs = false)
794 {
795 Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
796 Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
797 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
798 src1 = as_vgpr(ctx, src1);
799 assert(instr->def.num_components == 2);
800
801 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
802 unsigned opsel_lo =
803 (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
804 unsigned opsel_hi =
805 (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
806
807 Builder bld = create_alu_builder(ctx, instr);
808 Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
809 emit_split_vector(ctx, dst, 2);
810 return res;
811 }
812
813 void
814 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
815 unsigned neg_lo = 0)
816 {
817 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
818 bool has_sgpr = false;
819 for (unsigned i = 0; i < 3; i++) {
820 src[i] = get_alu_src(ctx, instr->src[i]);
821 if (has_sgpr)
822 src[i] = as_vgpr(ctx, src[i]);
823 else
824 has_sgpr = src[i].type() == RegType::sgpr;
825 }
826
827 Builder bld = create_alu_builder(ctx, instr);
828 VALU_instruction& vop3p =
829 bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
830 vop3p.clamp = clamp;
831 vop3p.neg_lo = neg_lo;
832 }
833
834 void
835 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
836 {
837 Builder bld = create_alu_builder(ctx, instr);
838 if (dst.type() == RegType::sgpr)
839 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
840 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
841 else
842 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
843 }
844
845 void
846 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
847 {
848 Temp src0 = get_alu_src(ctx, instr->src[0]);
849 Temp src1 = get_alu_src(ctx, instr->src[1]);
850 assert(src0.size() == src1.size());
851
852 aco_ptr<Instruction> vopc;
853 if (src1.type() == RegType::sgpr) {
854 if (src0.type() == RegType::vgpr) {
855 /* to swap the operands, we might also have to change the opcode */
856 op = get_vcmp_swapped(op);
857 Temp t = src0;
858 src0 = src1;
859 src1 = t;
860 } else {
861 src1 = as_vgpr(ctx, src1);
862 }
863 }
864
865 Builder bld = create_alu_builder(ctx, instr);
866 bld.vopc(op, Definition(dst), src0, src1);
867 }
868
869 void
870 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
871 {
872 Temp src0 = get_alu_src(ctx, instr->src[0]);
873 Temp src1 = get_alu_src(ctx, instr->src[1]);
874 Builder bld = create_alu_builder(ctx, instr);
875
876 assert(dst.regClass() == bld.lm);
877 assert(src0.type() == RegType::sgpr);
878 assert(src1.type() == RegType::sgpr);
879
880 /* Emit the SALU comparison instruction */
881 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
882 /* Turn the result into a per-lane bool */
883 bool_to_vector_condition(ctx, cmp, dst);
884 }
885
886 void
887 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
888 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s16_op = aco_opcode::num_opcodes,
889 aco_opcode s32_op = aco_opcode::num_opcodes,
890 aco_opcode s64_op = aco_opcode::num_opcodes)
891 {
892 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op
893 : instr->src[0].src.ssa->bit_size == 32 ? s32_op
894 : s16_op;
895 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op
896 : instr->src[0].src.ssa->bit_size == 32 ? v32_op
897 : v16_op;
898 bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
899 get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
900 get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
901 aco_opcode op = use_valu ? v_op : s_op;
902 assert(op != aco_opcode::num_opcodes);
903 assert(dst.regClass() == ctx->program->lane_mask);
904
905 if (use_valu)
906 emit_vopc_instruction(ctx, instr, op, dst);
907 else
908 emit_sopc_instruction(ctx, instr, op, dst);
909 }
910
911 void
912 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
913 Temp dst)
914 {
915 Builder bld(ctx->program, ctx->block);
916 Temp src0 = get_alu_src(ctx, instr->src[0]);
917 Temp src1 = get_alu_src(ctx, instr->src[1]);
918
919 assert(dst.regClass() == bld.lm);
920 assert(src0.regClass() == bld.lm);
921 assert(src1.regClass() == bld.lm);
922
923 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
924 }
925
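/* Per-lane select of a 64-bit value: splits both sources into dwords and
 * emits two v_cndmask_b32 before recombining the result.
 */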
926 void
927 select_vec2(isel_context* ctx, Temp dst, Temp cond, Temp then, Temp els)
928 {
929 Builder bld(ctx->program, ctx->block);
930
931 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
932 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
933 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
934 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
935
936 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
937 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
938
939 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
940 }
941
942 void
943 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
944 {
945 Builder bld(ctx->program, ctx->block);
946 Temp cond = get_alu_src(ctx, instr->src[0]);
947 Temp then = get_alu_src(ctx, instr->src[1]);
948 Temp els = get_alu_src(ctx, instr->src[2]);
949
950 assert(cond.regClass() == bld.lm);
951
952 if (dst.type() == RegType::vgpr) {
953 aco_ptr<Instruction> bcsel;
954 if (dst.size() == 1) {
955 then = as_vgpr(ctx, then);
956 els = as_vgpr(ctx, els);
957
958 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
959 } else if (dst.size() == 2) {
960 select_vec2(ctx, dst, cond, then, els);
961 } else {
962 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
963 }
964 return;
965 }
966
967 if (instr->def.bit_size == 1) {
968 assert(dst.regClass() == bld.lm);
969 assert(then.regClass() == bld.lm);
970 assert(els.regClass() == bld.lm);
971 }
972
973 if (!nir_src_is_divergent(&instr->src[0].src)) { /* uniform condition and values in sgpr */
974 if (dst.regClass() == s1 || dst.regClass() == s2) {
975 assert((then.regClass() == s1 || then.regClass() == s2) &&
976 els.regClass() == then.regClass());
977 assert(dst.size() == then.size());
978 aco_opcode op =
979 dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
980 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
981 } else {
982 isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
983 }
984 return;
985 }
986
987 /* divergent boolean bcsel
988 * this implements bcsel on bools: dst = s0 ? s1 : s2,
989 * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
990 assert(instr->def.bit_size == 1);
991
992 if (cond.id() != then.id())
993 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
994
995 if (cond.id() == els.id())
996 bld.copy(Definition(dst), then);
997 else
998 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
999 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1000 }
1001
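/* Emits a 32-bit transcendental (rcp/rsq/sqrt/log) with a denormal
 * workaround: when denormals are enabled, denormal inputs are pre-scaled by
 * 2^24 (0x4b800000) and the result is corrected with the op-specific "undo"
 * constant; otherwise the instruction is emitted directly.
 */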
1002 void
1003 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode vop,
1004 aco_opcode sop, uint32_t undo)
1005 {
1006 if (ctx->block->fp_mode.denorm32 == 0) {
1007 if (dst.regClass() == v1)
1008 bld.vop1(vop, dst, val);
1009 else if (ctx->options->gfx_level >= GFX12)
1010 bld.vop3(sop, dst, val);
1011 else
1012 bld.pseudo(aco_opcode::p_as_uniform, dst, bld.vop1(vop, bld.def(v1), val));
1013 return;
1014 }
1015
1016 /* multiply by 16777216 to handle denormals */
1017 Temp scale, unscale;
1018 if (val.regClass() == v1) {
1019 val = as_vgpr(bld, val);
1020 Temp is_denormal = bld.tmp(bld.lm);
1021 VALU_instruction& valu = bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal),
1022 val, Operand::c32(1u << 4))
1023 ->valu();
1024 valu.neg[0] = true;
1025 valu.abs[0] = true;
1026 scale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000),
1027 bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), is_denormal);
1028 unscale = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0x3f800000),
1029 bld.copy(bld.def(s1), Operand::c32(undo)), is_denormal);
1030 } else {
1031 Temp abs = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), val,
1032 bld.copy(bld.def(s1), Operand::c32(0x7fffffff)));
1033 Temp denorm_cmp = bld.copy(bld.def(s1), Operand::c32(0x00800000));
1034 Temp is_denormal = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, denorm_cmp);
1035 scale = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
1036 bld.copy(bld.def(s1), Operand::c32(0x4b800000u)), Operand::c32(0x3f800000),
1037 bld.scc(is_denormal));
1038 unscale =
1039 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), bld.copy(bld.def(s1), Operand::c32(undo)),
1040 Operand::c32(0x3f800000), bld.scc(is_denormal));
1041 }
1042
1043 if (dst.regClass() == v1) {
1044 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), scale, as_vgpr(bld, val));
1045 scaled = bld.vop1(vop, bld.def(v1), scaled);
1046 bld.vop2(aco_opcode::v_mul_f32, dst, unscale, scaled);
1047 } else {
1048 assert(ctx->options->gfx_level >= GFX11_5);
1049 Temp scaled = bld.sop2(aco_opcode::s_mul_f32, bld.def(s1), scale, val);
1050 if (ctx->options->gfx_level >= GFX12)
1051 scaled = bld.vop3(sop, bld.def(s1), scaled);
1052 else
1053 scaled = bld.as_uniform(bld.vop1(vop, bld.def(v1), scaled));
1054 bld.sop2(aco_opcode::s_mul_f32, dst, unscale, scaled);
1055 }
1056 }
1057
1058 void
1059 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1060 {
1061 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, aco_opcode::v_s_rcp_f32, 0x4b800000u);
1062 }
1063
1064 void
1065 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1066 {
1067 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, aco_opcode::v_s_rsq_f32, 0x45800000u);
1068 }
1069
1070 void
1071 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1072 {
1073 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, aco_opcode::v_s_sqrt_f32,
1074 0x39800000u);
1075 }
1076
1077 void
1078 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1079 {
1080 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, aco_opcode::v_s_log_f32, 0xc1c00000u);
1081 }
1082
1083 Temp
1084 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1085 {
1086 if (ctx->options->gfx_level >= GFX7)
1087 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1088
1089 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1090 /* TODO: create more efficient code! */
1091 if (val.type() == RegType::sgpr)
1092 val = as_vgpr(ctx, val);
1093
1094 /* Split the input value. */
1095 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1096 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1097
1098 /* Extract the exponent and compute the unbiased value. */
1099 Temp exponent =
1100 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1101 exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1102
1103 /* Extract the fractional part. */
1104 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1105 Operand::c32(0x000fffffu));
1106 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1107
1108 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1109 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1110 fract_mask);
1111
1112 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1113 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1114 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1115 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1116 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1117
1118 /* Get the sign bit. */
1119 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1120
1121 /* Decide the operation to apply depending on the unbiased exponent. */
1122 Temp exp_lt0 =
1123 bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
1124 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1125 bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1126 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1127 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1128 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1129 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1130
1131 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1132 }
1133
1134 Temp
1135 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1136 {
1137 if (ctx->options->gfx_level >= GFX7)
1138 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1139
1140 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1141 * lowered at NIR level for precision reasons). */
1142 Temp src0 = as_vgpr(ctx, val);
1143
1144 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1145 Operand::c32(0x3fefffffu));
1146
1147 Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
1148 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1149 Temp min = bld.vop3(aco_opcode::v_min_f64_e64, bld.def(v2), fract, min_val);
1150
1151 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1152 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1153 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1154 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1155
1156 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1157 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1158
1159 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1160
1161 Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src0, v);
1162 add->valu().neg[1] = true;
1163
1164 return add->definitions[0].getTemp();
1165 }
1166
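/* Unsigned 32-bit saturating add: pre-GFX8 uses the carry-out and a
 * v_cndmask_b32 to clamp to UINT32_MAX, GFX8+ uses the VALU clamp bit.
 * usub32_sat below is the analogous subtraction clamping to zero.
 */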
1167 Temp
1168 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1169 {
1170 if (bld.program->gfx_level < GFX8) {
1171 Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1172 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1173 add.def(1).getTemp());
1174 }
1175
1176 Builder::Result add(NULL);
1177 if (bld.program->gfx_level >= GFX9) {
1178 add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1179 } else {
1180 add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
1181 }
1182 add->valu().clamp = 1;
1183 return dst.getTemp();
1184 }
1185
1186 Temp
1187 usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1188 {
1189 if (bld.program->gfx_level < GFX8) {
1190 Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
1191 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
1192 sub.def(1).getTemp());
1193 }
1194
1195 Builder::Result sub(NULL);
1196 if (bld.program->gfx_level >= GFX9) {
1197 sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
1198 } else {
1199 sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
1200 }
1201 sub->valu().clamp = 1;
1202 return dst.getTemp();
1203 }
1204
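/* Converts a two-component f32 source to packed f16 using the
 * round-toward-zero pack instructions (v_cvt_pkrtz_f16_f32, or
 * s_cvt_pk_rtz_f16_f32 for uniform results).
 */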
1205 void
1206 emit_vec2_f2f16(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1207 {
1208 Builder bld = create_alu_builder(ctx, instr);
1209 Temp src = get_ssa_temp(ctx, instr->src[0].src.ssa);
1210 RegClass rc = RegClass(src.regClass().type(), instr->src[0].src.ssa->bit_size / 32);
1211 Temp src0 = emit_extract_vector(ctx, src, instr->src[0].swizzle[0], rc);
1212 Temp src1 = emit_extract_vector(ctx, src, instr->src[0].swizzle[1], rc);
1213
1214 if (dst.regClass() == s1) {
1215 bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src0, src1);
1216 } else {
1217 src1 = as_vgpr(ctx, src1);
1218 if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
1219 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src0, src1);
1220 else
1221 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
1222 emit_split_vector(ctx, dst, 2);
1223 }
1224 }
1225
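/* Translates a single NIR ALU instruction. Each case picks SALU or VALU
 * opcodes based on the destination register class, bit size and GPU
 * generation.
 */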
1226 void
1227 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1228 {
1229 Builder bld = create_alu_builder(ctx, instr);
1230 Temp dst = get_ssa_temp(ctx, &instr->def);
1231 switch (instr->op) {
1232 case nir_op_vec2:
1233 case nir_op_vec3:
1234 case nir_op_vec4:
1235 case nir_op_vec5:
1236 case nir_op_vec8:
1237 case nir_op_vec16: {
1238 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1239 unsigned num = instr->def.num_components;
1240 for (unsigned i = 0; i < num; ++i)
1241 elems[i] = get_alu_src(ctx, instr->src[i]);
1242
1243 if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
1244 aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
1245 instr->def.num_components, 1)};
1246 RegClass elem_rc = RegClass::get(dst.type(), instr->def.bit_size / 8u);
1247 for (unsigned i = 0; i < num; ++i) {
1248 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1249 elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1250
1251 if (nir_src_is_undef(instr->src[i].src))
1252 vec->operands[i] = Operand{elem_rc};
1253 else
1254 vec->operands[i] = Operand{elems[i]};
1255 }
1256 vec->definitions[0] = Definition(dst);
1257 ctx->block->instructions.emplace_back(std::move(vec));
1258 ctx->allocated_vec.emplace(dst.id(), elems);
1259 } else {
1260 bool use_s_pack = ctx->program->gfx_level >= GFX9;
1261 Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));
1262
1263 std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1264 uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1265 bitarray32 undef_mask = UINT32_MAX;
1266 for (unsigned i = 0; i < num; i++) {
1267 unsigned packed_size = use_s_pack ? 16 : 32;
1268 unsigned idx = i * instr->def.bit_size / packed_size;
1269 unsigned offset = i * instr->def.bit_size % packed_size;
1270 if (nir_src_is_undef(instr->src[i].src))
1271 continue;
1272 else
1273 undef_mask[idx] = false;
1274
1275 if (nir_src_is_const(instr->src[i].src)) {
1276 const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1277 continue;
1278 }
1279
1280 if (offset != packed_size - instr->def.bit_size)
1281 elems[i] =
1282 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1283
1284 if (offset)
1285 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1286 Operand::c32(offset));
1287
1288 if (packed[idx].id())
1289 packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1290 packed[idx]);
1291 else
1292 packed[idx] = elems[i];
1293 }
1294
1295 if (use_s_pack) {
1296 for (unsigned i = 0; i < dst.size(); i++) {
1297 bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1298
1299 if (packed[i * 2].id() && packed[i * 2 + 1].id())
1300 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1301 packed[i * 2 + 1]);
1302 else if (packed[i * 2 + 1].id())
1303 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1304 Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1305 else if (packed[i * 2].id())
1306 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1307 Operand::c32(const_vals[i * 2 + 1]));
1308 else
1309 packed[i] = Temp(0, s1); /* Both constants, so reset the entry */
1310
1311 undef_mask[i] = undef_mask[i * 2] && undef_mask[i * 2 + 1];
1312
1313 if (same)
1314 const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1315 else
1316 const_vals[i] = 0;
1317 }
1318 }
1319
1320 for (unsigned i = 0; i < dst.size(); i++) {
1321 if (const_vals[i] && packed[i].id())
1322 packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1323 Operand::c32(const_vals[i]), packed[i]);
1324 else if (!packed[i].id() && !undef_mask[i])
1325 packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1326 }
1327
1328 if (dst.size() == 1 && packed[0].id())
1329 bld.copy(Definition(dst), packed[0]);
1330 else {
1331 aco_ptr<Instruction> vec{
1332 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
1333 vec->definitions[0] = Definition(dst);
1334 for (unsigned i = 0; i < dst.size(); ++i)
1335 vec->operands[i] = Operand(packed[i]);
1336 bld.insert(std::move(vec));
1337 }
1338 }
1339 break;
1340 }
1341 case nir_op_mov: {
1342 Temp src = get_alu_src(ctx, instr->src[0]);
1343 if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1344 /* use size() instead of bytes() for 8/16-bit */
1345 assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1346 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1347 } else {
1348 assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1349 bld.copy(Definition(dst), src);
1350 }
1351 break;
1352 }
1353 case nir_op_inot: {
1354 Temp src = get_alu_src(ctx, instr->src[0]);
1355 if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1356 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1357 } else if (dst.regClass() == v2) {
1358 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1359 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1360 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1361 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1362 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1363 } else if (dst.type() == RegType::sgpr) {
1364 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1365 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1366 } else {
1367 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1368 }
1369 break;
1370 }
1371 case nir_op_iabs: {
1372 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1373 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
1374
1375 unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
1376 unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;
1377
1378 Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
1379 src, opsel_lo, opsel_hi);
1380 bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
1381 emit_split_vector(ctx, dst, 2);
1382 break;
1383 }
1384 Temp src = get_alu_src(ctx, instr->src[0]);
1385 if (dst.regClass() == s1) {
1386 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1387 } else if (dst.regClass() == v1) {
1388 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1389 bld.vsub32(bld.def(v1), Operand::zero(), src));
1390 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1391 bld.vop3(
1392 aco_opcode::v_max_i16_e64, Definition(dst), src,
1393 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1394 } else if (dst.regClass() == v2b) {
1395 src = as_vgpr(ctx, src);
1396 bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
1397 bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1398 } else {
1399 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1400 }
1401 break;
1402 }
1403 case nir_op_isign: {
1404 Temp src = get_alu_src(ctx, instr->src[0]);
1405 if (dst.regClass() == s1) {
1406 Temp tmp =
1407 bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1408 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1409 } else if (dst.regClass() == s2) {
1410 Temp neg =
1411 bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1412 Temp neqz;
1413 if (ctx->program->gfx_level >= GFX8)
1414 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1415 else
1416 neqz =
1417 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1418 .def(1)
1419 .getTemp();
1420 /* SCC gets zero-extended to 64 bit */
1421 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1422 } else if (dst.regClass() == v1) {
1423 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1424 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
1425 bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
1426 } else if (dst.regClass() == v2b) {
1427 src = as_vgpr(ctx, src);
1428 bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
1429 bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
1430 } else if (dst.regClass() == v2) {
1431 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1432 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1433 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
1434 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1435 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1436 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1437 } else {
1438 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1439 }
1440 break;
1441 }
1442 case nir_op_imax: {
1443 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1444 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1445 } else if (dst.regClass() == v2b) {
1446 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1447 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1448 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1449 } else if (dst.regClass() == v1) {
1450 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1451 } else if (dst.regClass() == s1) {
1452 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1453 } else {
1454 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1455 }
1456 break;
1457 }
1458 case nir_op_umax: {
1459 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1460 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1461 } else if (dst.regClass() == v2b) {
1462 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1463 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1464 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1465 } else if (dst.regClass() == v1) {
1466 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1467 } else if (dst.regClass() == s1) {
1468 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1469 } else {
1470 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1471 }
1472 break;
1473 }
1474 case nir_op_imin: {
1475 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1476 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1477 } else if (dst.regClass() == v2b) {
1478 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1479 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1480 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1481 } else if (dst.regClass() == v1) {
1482 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1483 } else if (dst.regClass() == s1) {
1484 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1485 } else {
1486 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1487 }
1488 break;
1489 }
1490 case nir_op_umin: {
1491 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1492 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1493 } else if (dst.regClass() == v2b) {
1494 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1495 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1496 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1497 } else if (dst.regClass() == v1) {
1498 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1499 } else if (dst.regClass() == s1) {
1500 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1501 } else {
1502 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1503 }
1504 break;
1505 }
1506 case nir_op_ior: {
1507 if (instr->def.bit_size == 1) {
1508 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1509 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1510 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1511 } else if (dst.regClass() == v2) {
1512 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1513 } else if (dst.regClass() == s1) {
1514 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1515 } else if (dst.regClass() == s2) {
1516 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1517 } else {
1518 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1519 }
1520 break;
1521 }
1522 case nir_op_iand: {
1523 if (instr->def.bit_size == 1) {
1524 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1525 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1526 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1527 } else if (dst.regClass() == v2) {
1528 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1529 } else if (dst.regClass() == s1) {
1530 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1531 } else if (dst.regClass() == s2) {
1532 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1533 } else {
1534 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1535 }
1536 break;
1537 }
1538 case nir_op_ixor: {
1539 if (instr->def.bit_size == 1) {
1540 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1541 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1542 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1543 } else if (dst.regClass() == v2) {
1544 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1545 } else if (dst.regClass() == s1) {
1546 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1547 } else if (dst.regClass() == s2) {
1548 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1549 } else {
1550 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1551 }
1552 break;
1553 }
1554 case nir_op_ushr: {
1555 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1556 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1557 } else if (dst.regClass() == v2b) {
1558 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1559 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1560 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1561 } else if (dst.regClass() == v1) {
1562 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1563 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1564 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1565 get_alu_src(ctx, instr->src[0]));
1566 } else if (dst.regClass() == v2) {
1567 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1568 } else if (dst.regClass() == s2) {
1569 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1570 } else if (dst.regClass() == s1) {
1571 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1572 } else {
1573 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1574 }
1575 break;
1576 }
1577 case nir_op_ishl: {
1578 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1579 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1580 } else if (dst.regClass() == v2b) {
1581 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1582 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1583 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1584 } else if (dst.regClass() == v1) {
1585 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1586 false, 1);
1587 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1588 bld.vop3(aco_opcode::v_lshlrev_b64_e64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1589 get_alu_src(ctx, instr->src[0]));
1590 } else if (dst.regClass() == v2) {
1591 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1592 } else if (dst.regClass() == s1) {
1593 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1594 } else if (dst.regClass() == s2) {
1595 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1596 } else {
1597 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1598 }
1599 break;
1600 }
1601 case nir_op_ishr: {
1602 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1603 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1604 } else if (dst.regClass() == v2b) {
1605 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1606 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1607 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1608 } else if (dst.regClass() == v1) {
1609 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1610 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1611 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1612 get_alu_src(ctx, instr->src[0]));
1613 } else if (dst.regClass() == v2) {
1614 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1615 } else if (dst.regClass() == s1) {
1616 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1617 } else if (dst.regClass() == s2) {
1618 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1619 } else {
1620 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1621 }
1622 break;
1623 }
1624 case nir_op_find_lsb: {
1625 Temp src = get_alu_src(ctx, instr->src[0]);
1626 if (src.regClass() == s1) {
1627 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1628 } else if (src.regClass() == v1) {
1629 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1630 } else if (src.regClass() == s2) {
1631 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1632 } else if (src.regClass() == v2) {
1633 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1634 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1635 lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo);
1636 hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi);
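/* v_ffbl returns -1 when no bit is set. OR the high-half result with 32 so that v_min_u32
 * picks the low half when it has a set bit, and -1 only remains when both halves are zero. */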
1637 hi = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32u), hi);
1638 bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi);
1639 } else {
1640 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1641 }
1642 break;
1643 }
1644 case nir_op_ufind_msb:
1645 case nir_op_ifind_msb: {
1646 Temp src = get_alu_src(ctx, instr->src[0]);
1647 if (src.regClass() == s1 || src.regClass() == s2) {
1648 aco_opcode op = src.regClass() == s2
1649 ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1650 : aco_opcode::s_flbit_i32_i64)
1651 : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1652 : aco_opcode::s_flbit_i32);
1653 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1654
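/* s_flbit counts from the MSB, so the bit index is (bit_size - 1) - msb_rev. When no suitable
 * bit is found it returns -1, the unsigned subtraction borrows (SCC set) and -1 is selected. */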
1655 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1656 Operand::c32(src.size() * 32u - 1u), msb_rev);
1657 Temp msb = sub.def(0).getTemp();
1658 Temp carry = sub.def(1).getTemp();
1659
1660 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1661 bld.scc(carry));
1662 } else if (src.regClass() == v1) {
1663 aco_opcode op =
1664 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1665 Temp msb_rev = bld.tmp(v1);
1666 emit_vop1_instruction(ctx, instr, op, msb_rev);
1667 Temp msb = bld.tmp(v1);
1668 Temp carry =
1669 bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1670 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1671 } else if (src.regClass() == v2) {
1672 aco_opcode op =
1673 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1674
1675 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1676 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1677
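/* Combine the per-half counts: OR the low-half result with 32 (equivalent to adding 32 here)
 * so that v_min_u32 yields the count from bit 63. The index is then 63 - msb_rev, with the
 * same -1 handling as the 32-bit VALU path above. */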
1678 lo = bld.vop1(op, bld.def(v1), lo);
1679 lo = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(32), lo);
1680 hi = bld.vop1(op, bld.def(v1), hi);
1681 Temp msb_rev = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), lo, hi);
1682
1683 Temp msb = bld.tmp(v1);
1684 Temp carry =
1685 bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1686 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1687 } else {
1688 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1689 }
1690 break;
1691 }
1692 case nir_op_ufind_msb_rev:
1693 case nir_op_ifind_msb_rev: {
1694 Temp src = get_alu_src(ctx, instr->src[0]);
1695 if (src.regClass() == s1) {
1696 aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
1697 : aco_opcode::s_flbit_i32;
1698 bld.sop1(op, Definition(dst), src);
1699 } else if (src.regClass() == v1) {
1700 aco_opcode op =
1701 instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1702 emit_vop1_instruction(ctx, instr, op, dst);
1703 } else {
1704 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1705 }
1706 break;
1707 }
1708 case nir_op_bitfield_reverse: {
1709 if (dst.regClass() == s1) {
1710 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1711 } else if (dst.regClass() == v1) {
1712 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1713 } else {
1714 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1715 }
1716 break;
1717 }
1718 case nir_op_iadd: {
1719 if (dst.regClass() == s1) {
1720 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1721 break;
1722 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
1723 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1724 break;
1725 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
1726 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1727 break;
1728 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1729 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1730 break;
1731 }
1732
1733 Temp src0 = get_alu_src(ctx, instr->src[0]);
1734 Temp src1 = get_alu_src(ctx, instr->src[1]);
1735 if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1736 if (instr->no_unsigned_wrap)
1737 bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
1738 else
1739 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1740 break;
1741 }
1742
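/* 64-bit addition: add the low halves, then the high halves with the carry chained in. */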
1743 assert(src0.size() == 2 && src1.size() == 2);
1744 Temp src00 = bld.tmp(src0.type(), 1);
1745 Temp src01 = bld.tmp(dst.type(), 1);
1746 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1747 Temp src10 = bld.tmp(src1.type(), 1);
1748 Temp src11 = bld.tmp(dst.type(), 1);
1749 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1750
1751 if (dst.regClass() == s2) {
1752 Temp carry = bld.tmp(s1);
1753 Temp dst0 =
1754 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1755 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1756 bld.scc(carry));
1757 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1758 } else if (dst.regClass() == v2) {
1759 Temp dst0 = bld.tmp(v1);
1760 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1761 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1762 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1763 } else {
1764 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1765 }
1766 break;
1767 }
1768 case nir_op_uadd_sat: {
1769 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1770 Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1771 add_instr->valu().clamp = 1;
1772 break;
1773 }
1774 Temp src0 = get_alu_src(ctx, instr->src[0]);
1775 Temp src1 = get_alu_src(ctx, instr->src[1]);
1776 if (dst.regClass() == s1) {
1777 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1778 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1779 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1780 bld.scc(carry));
1781 break;
1782 } else if (dst.regClass() == v2b) {
1783 Instruction* add_instr;
1784 if (ctx->program->gfx_level >= GFX10) {
1785 add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1786 } else {
1787 if (src1.type() == RegType::sgpr)
1788 std::swap(src0, src1);
1789 add_instr =
1790 bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1791 }
1792 add_instr->valu().clamp = 1;
1793 break;
1794 } else if (dst.regClass() == v1) {
1795 uadd32_sat(bld, Definition(dst), src0, src1);
1796 break;
1797 }
1798
1799 assert(src0.size() == 2 && src1.size() == 2);
1800
1801 Temp src00 = bld.tmp(src0.type(), 1);
1802 Temp src01 = bld.tmp(src0.type(), 1);
1803 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1804 Temp src10 = bld.tmp(src1.type(), 1);
1805 Temp src11 = bld.tmp(src1.type(), 1);
1806 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1807
1808 if (dst.regClass() == s2) {
1809 Temp carry0 = bld.tmp(s1);
1810 Temp carry1 = bld.tmp(s1);
1811
1812 Temp no_sat0 =
1813 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
1814 Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
1815 src01, src11, bld.scc(carry0));
1816
1817 Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
1818
1819 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
1820 bld.scc(carry1));
1821 } else if (dst.regClass() == v2) {
1822 Temp no_sat0 = bld.tmp(v1);
1823 Temp dst0 = bld.tmp(v1);
1824 Temp dst1 = bld.tmp(v1);
1825
1826 Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
1827 Temp carry1;
1828
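/* Saturate to UINT64_MAX on carry-out: on GFX8+ the clamp bit on v_addc_co_u32 already
 * saturates the high half, and the final carry also forces the low half to all-ones below. */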
1829 if (ctx->program->gfx_level >= GFX8) {
1830 carry1 = bld.tmp(bld.lm);
1831 bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
1832 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
1833 ->valu()
1834 .clamp = 1;
1835 } else {
1836 Temp no_sat1 = bld.tmp(v1);
1837 carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
1838 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
1839 carry1);
1840 }
1841
1842 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
1843 carry1);
1844 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1845 } else {
1846 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1847 }
1848 break;
1849 }
1850 case nir_op_iadd_sat: {
1851 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1852 Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
1853 add_instr->valu().clamp = 1;
1854 break;
1855 }
1856 Temp src0 = get_alu_src(ctx, instr->src[0]);
1857 Temp src1 = get_alu_src(ctx, instr->src[1]);
1858 if (dst.regClass() == s1) {
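/* bound = INT32_MAX + (src1 < 0), i.e. INT32_MAX for non-negative src1 and INT32_MIN
 * (wrapping) otherwise. s_add_i32 sets SCC on signed overflow, which selects the bound. */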
1859 Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
1860 Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
1861 Operand::c32(INT32_MAX), cond);
1862 Temp overflow = bld.tmp(s1);
1863 Temp add =
1864 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
1865 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
1866 break;
1867 }
1868
1869 src1 = as_vgpr(ctx, src1);
1870
1871 if (dst.regClass() == v2b) {
1872 Instruction* add_instr =
1873 bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
1874 add_instr->valu().clamp = 1;
1875 } else if (dst.regClass() == v1) {
1876 Instruction* add_instr =
1877 bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
1878 add_instr->valu().clamp = 1;
1879 } else {
1880 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1881 }
1882 break;
1883 }
1884 case nir_op_uadd_carry: {
1885 Temp src0 = get_alu_src(ctx, instr->src[0]);
1886 Temp src1 = get_alu_src(ctx, instr->src[1]);
1887 if (dst.regClass() == s1) {
1888 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1889 break;
1890 }
1891 if (dst.regClass() == v1) {
1892 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1893 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1894 carry);
1895 break;
1896 }
1897
1898 Temp src00 = bld.tmp(src0.type(), 1);
1899 Temp src01 = bld.tmp(dst.type(), 1);
1900 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1901 Temp src10 = bld.tmp(src1.type(), 1);
1902 Temp src11 = bld.tmp(dst.type(), 1);
1903 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1904 if (dst.regClass() == s2) {
1905 Temp carry = bld.tmp(s1);
1906 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1907 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1908 bld.scc(carry))
1909 .def(1)
1910 .getTemp();
1911 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1912 } else if (dst.regClass() == v2) {
1913 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1914 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1915 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1916 Operand::c32(1u), carry);
1917 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1918 } else {
1919 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1920 }
1921 break;
1922 }
1923 case nir_op_isub: {
1924 if (dst.regClass() == s1) {
1925 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1926 break;
1927 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1928 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
1929 break;
1930 }
1931
1932 Temp src0 = get_alu_src(ctx, instr->src[0]);
1933 Temp src1 = get_alu_src(ctx, instr->src[1]);
1934 if (dst.regClass() == v1) {
1935 bld.vsub32(Definition(dst), src0, src1);
1936 break;
1937 } else if (dst.bytes() <= 2) {
1938 if (ctx->program->gfx_level >= GFX10)
1939 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
1940 else if (src1.type() == RegType::sgpr)
1941 bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
1942 else if (ctx->program->gfx_level >= GFX8)
1943 bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
1944 else
1945 bld.vsub32(Definition(dst), src0, src1);
1946 break;
1947 }
1948
1949 Temp src00 = bld.tmp(src0.type(), 1);
1950 Temp src01 = bld.tmp(dst.type(), 1);
1951 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1952 Temp src10 = bld.tmp(src1.type(), 1);
1953 Temp src11 = bld.tmp(dst.type(), 1);
1954 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1955 if (dst.regClass() == s2) {
1956 Temp borrow = bld.tmp(s1);
1957 Temp dst0 =
1958 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1959 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1960 bld.scc(borrow));
1961 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1962 } else if (dst.regClass() == v2) {
1963 Temp lower = bld.tmp(v1);
1964 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1965 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1966 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1967 } else {
1968 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1969 }
1970 break;
1971 }
1972 case nir_op_usub_borrow: {
1973 Temp src0 = get_alu_src(ctx, instr->src[0]);
1974 Temp src1 = get_alu_src(ctx, instr->src[1]);
1975 if (dst.regClass() == s1) {
1976 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1977 break;
1978 } else if (dst.regClass() == v1) {
1979 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1980 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1981 borrow);
1982 break;
1983 }
1984
1985 Temp src00 = bld.tmp(src0.type(), 1);
1986 Temp src01 = bld.tmp(dst.type(), 1);
1987 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1988 Temp src10 = bld.tmp(src1.type(), 1);
1989 Temp src11 = bld.tmp(dst.type(), 1);
1990 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1991 if (dst.regClass() == s2) {
1992 Temp borrow = bld.tmp(s1);
1993 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1994 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1995 bld.scc(borrow))
1996 .def(1)
1997 .getTemp();
1998 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1999 } else if (dst.regClass() == v2) {
2000 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
2001 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
2002 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2003 Operand::c32(1u), borrow);
2004 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2005 } else {
2006 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2007 }
2008 break;
2009 }
2010 case nir_op_usub_sat: {
2011 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2012 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2013 sub_instr->valu().clamp = 1;
2014 break;
2015 }
2016 Temp src0 = get_alu_src(ctx, instr->src[0]);
2017 Temp src1 = get_alu_src(ctx, instr->src[1]);
2018 if (dst.regClass() == s1) {
2019 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
2020 bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
2021 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
2022 break;
2023 } else if (dst.regClass() == v2b) {
2024 Instruction* sub_instr;
2025 if (ctx->program->gfx_level >= GFX10) {
2026 sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
2027 } else {
2028 aco_opcode op = aco_opcode::v_sub_u16;
2029 if (src1.type() == RegType::sgpr) {
2030 std::swap(src0, src1);
2031 op = aco_opcode::v_subrev_u16;
2032 }
2033 sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
2034 }
2035 sub_instr->valu().clamp = 1;
2036 break;
2037 } else if (dst.regClass() == v1) {
2038 usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
2039 break;
2040 }
2041
2042 assert(src0.size() == 2 && src1.size() == 2);
2043 Temp src00 = bld.tmp(src0.type(), 1);
2044 Temp src01 = bld.tmp(src0.type(), 1);
2045 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2046 Temp src10 = bld.tmp(src1.type(), 1);
2047 Temp src11 = bld.tmp(src1.type(), 1);
2048 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2049
2050 if (dst.regClass() == s2) {
2051 Temp carry0 = bld.tmp(s1);
2052 Temp carry1 = bld.tmp(s1);
2053
2054 Temp no_sat0 =
2055 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
2056 Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
2057 src01, src11, bld.scc(carry0));
2058
2059 Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
2060
2061 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
2062 bld.scc(carry1));
2063 } else if (dst.regClass() == v2) {
2064 Temp no_sat0 = bld.tmp(v1);
2065 Temp dst0 = bld.tmp(v1);
2066 Temp dst1 = bld.tmp(v1);
2067
2068 Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
2069 Temp carry1;
2070
2071 if (ctx->program->gfx_level >= GFX8) {
2072 carry1 = bld.tmp(bld.lm);
2073 bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
2074 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
2075 ->valu()
2076 .clamp = 1;
2077 } else {
2078 Temp no_sat1 = bld.tmp(v1);
2079 carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
2080 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
2081 carry1);
2082 }
2083
2084 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
2085 carry1);
2086 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2087 } else {
2088 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2089 }
2090 break;
2091 }
2092 case nir_op_isub_sat: {
2093 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2094 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
2095 sub_instr->valu().clamp = 1;
2096 break;
2097 }
2098 Temp src0 = get_alu_src(ctx, instr->src[0]);
2099 Temp src1 = get_alu_src(ctx, instr->src[1]);
2100 if (dst.regClass() == s1) {
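/* Mirror of iadd_sat: the bound is INT32_MIN when src1 > 0 and INT32_MAX otherwise;
 * s_sub_i32 sets SCC on signed overflow. */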
2101 Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
2102 Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
2103 Operand::c32(INT32_MAX), cond);
2104 Temp overflow = bld.tmp(s1);
2105 Temp sub =
2106 bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
2107 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
2108 break;
2109 }
2110
2111 src1 = as_vgpr(ctx, src1);
2112
2113 if (dst.regClass() == v2b) {
2114 Instruction* sub_instr =
2115 bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
2116 sub_instr->valu().clamp = 1;
2117 } else if (dst.regClass() == v1) {
2118 Instruction* sub_instr =
2119 bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
2120 sub_instr->valu().clamp = 1;
2121 } else {
2122 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2123 }
2124 break;
2125 }
2126 case nir_op_imul: {
2127 if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
2128 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
2129 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
2130 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
2131 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2132 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
2133 } else if (dst.type() == RegType::vgpr) {
2134 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2135 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2136
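/* If both operands are known to fit in 24 bits, v_mul_u32_u24 produces the same low 32 bits
 * as a full 32-bit multiply. */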
2137 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2138 bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
2139 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
2140 true /* commutative */, false, false, nuw_16bit, 0x3);
2141 } else if (nir_src_is_const(instr->src[0].src)) {
2142 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
2143 nir_src_as_uint(instr->src[0].src), false);
2144 } else if (nir_src_is_const(instr->src[1].src)) {
2145 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
2146 nir_src_as_uint(instr->src[1].src), false);
2147 } else {
2148 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
2149 }
2150 } else if (dst.regClass() == s1) {
2151 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
2152 } else {
2153 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2154 }
2155 break;
2156 }
2157 case nir_op_umul_high: {
2158 if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2159 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
2160 } else if (dst.bytes() == 4) {
2161 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2162 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2163
2164 Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
2165 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2166 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2167 } else {
2168 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2169 }
2170
2171 if (dst.regClass() == s1)
2172 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2173 } else {
2174 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2175 }
2176 break;
2177 }
2178 case nir_op_imul_high: {
2179 if (dst.regClass() == v1) {
2180 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2181 } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2182 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2183 } else if (dst.regClass() == s1) {
2184 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2185 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2186 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2187 } else {
2188 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2189 }
2190 break;
2191 }
2192 case nir_op_fmul: {
2193 if (dst.regClass() == v2b) {
2194 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2195 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2196 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2197 } else if (dst.regClass() == v1) {
2198 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2199 } else if (dst.regClass() == v2) {
2200 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64_e64, dst);
2201 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2202 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f16, dst, false);
2203 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2204 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_f32, dst, false);
2205 } else {
2206 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2207 }
2208 break;
2209 }
2210 case nir_op_fmulz: {
2211 if (dst.regClass() == v1) {
2212 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2213 } else {
2214 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2215 }
2216 break;
2217 }
2218 case nir_op_fadd: {
2219 if (dst.regClass() == v2b) {
2220 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2221 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2222 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2223 } else if (dst.regClass() == v1) {
2224 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2225 } else if (dst.regClass() == v2) {
2226 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64_e64, dst);
2227 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2228 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f16, dst, false);
2229 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2230 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_f32, dst, false);
2231 } else {
2232 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2233 }
2234 break;
2235 }
2236 case nir_op_fsub: {
2237 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2238 Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2239 VALU_instruction& sub = add->valu();
2240 sub.neg_lo[1] = true;
2241 sub.neg_hi[1] = true;
2242 break;
2243 }
2244
2245 Temp src0 = get_alu_src(ctx, instr->src[0]);
2246 Temp src1 = get_alu_src(ctx, instr->src[1]);
2247 if (dst.regClass() == v2b) {
2248 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2249 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2250 else
2251 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2252 } else if (dst.regClass() == v1) {
2253 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2254 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2255 else
2256 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2257 } else if (dst.regClass() == v2) {
2258 Instruction* add = bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), as_vgpr(ctx, src0),
2259 as_vgpr(ctx, src1));
2260 add->valu().neg[1] = true;
2261 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2262 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f16, dst, false);
2263 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2264 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_f32, dst, false);
2265 } else {
2266 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2267 }
2268 break;
2269 }
2270 case nir_op_ffma: {
2271 if (dst.regClass() == v2b) {
2272 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2273 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2274 assert(instr->def.num_components == 2);
2275
2276 Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2277 Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2278 Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2279
2280 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
2281 unsigned opsel_lo = 0, opsel_hi = 0;
2282 for (unsigned i = 0; i < 3; i++) {
2283 opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2284 opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2285 }
2286
2287 bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2288 emit_split_vector(ctx, dst, 2);
2289 } else if (dst.regClass() == v1) {
2290 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2291 ctx->block->fp_mode.must_flush_denorms32, 3);
2292 } else if (dst.regClass() == v2) {
2293 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2294 } else if (dst.regClass() == s1) {
2295 Temp src0 = get_alu_src(ctx, instr->src[0]);
2296 Temp src1 = get_alu_src(ctx, instr->src[1]);
2297 Temp src2 = get_alu_src(ctx, instr->src[2]);
2298 aco_opcode op =
2299 instr->def.bit_size == 16 ? aco_opcode::s_fmac_f16 : aco_opcode::s_fmac_f32;
2300 bld.sop2(op, Definition(dst), src0, src1, src2);
2301 } else {
2302 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2303 }
2304 break;
2305 }
2306 case nir_op_ffmaz: {
2307 if (dst.regClass() == v1) {
2308 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2309 ctx->block->fp_mode.must_flush_denorms32, 3);
2310 } else {
2311 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2312 }
2313 break;
2314 }
2315 case nir_op_fmax: {
2316 if (dst.regClass() == v2b) {
2317 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true, false,
2318 ctx->block->fp_mode.must_flush_denorms16_64);
2319 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2320 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2321 } else if (dst.regClass() == v1) {
2322 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2323 ctx->block->fp_mode.must_flush_denorms32);
2324 } else if (dst.regClass() == v2) {
2325 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64_e64, dst,
2326 ctx->block->fp_mode.must_flush_denorms16_64);
2327 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2328 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f16, dst, false);
2329 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2330 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_f32, dst, false);
2331 } else {
2332 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2333 }
2334 break;
2335 }
2336 case nir_op_fmin: {
2337 if (dst.regClass() == v2b) {
2338 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true, false,
2339 ctx->block->fp_mode.must_flush_denorms16_64);
2340 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2341 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2342 } else if (dst.regClass() == v1) {
2343 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2344 ctx->block->fp_mode.must_flush_denorms32);
2345 } else if (dst.regClass() == v2) {
2346 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64_e64, dst,
2347 ctx->block->fp_mode.must_flush_denorms16_64);
2348 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2349 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f16, dst, false);
2350 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2351 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_f32, dst, false);
2352 } else {
2353 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2354 }
2355 break;
2356 }
2357 case nir_op_sdot_4x8_iadd: {
2358 if (ctx->options->gfx_level >= GFX11)
2359 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2360 else
2361 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2362 break;
2363 }
2364 case nir_op_sdot_4x8_iadd_sat: {
2365 if (ctx->options->gfx_level >= GFX11)
2366 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2367 else
2368 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2369 break;
2370 }
2371 case nir_op_sudot_4x8_iadd: {
2372 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2373 break;
2374 }
2375 case nir_op_sudot_4x8_iadd_sat: {
2376 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2377 break;
2378 }
2379 case nir_op_udot_4x8_uadd: {
2380 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2381 break;
2382 }
2383 case nir_op_udot_4x8_uadd_sat: {
2384 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2385 break;
2386 }
2387 case nir_op_sdot_2x16_iadd: {
2388 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2389 break;
2390 }
2391 case nir_op_sdot_2x16_iadd_sat: {
2392 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2393 break;
2394 }
2395 case nir_op_udot_2x16_uadd: {
2396 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2397 break;
2398 }
2399 case nir_op_udot_2x16_uadd_sat: {
2400 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2401 break;
2402 }
2403 case nir_op_cube_amd: {
2404 Temp in = get_alu_src(ctx, instr->src[0], 3);
2405 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2406 emit_extract_vector(ctx, in, 2, v1)};
2407 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2408 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2409 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2410 Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2411 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2412 break;
2413 }
2414 case nir_op_bcsel: {
2415 emit_bcsel(ctx, instr, dst);
2416 break;
2417 }
2418 case nir_op_frsq: {
2419 if (instr->def.bit_size == 16) {
2420 if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2421 bld.vop3(aco_opcode::v_s_rsq_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2422 else
2423 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2424 } else if (instr->def.bit_size == 32) {
2425 emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2426 } else if (instr->def.bit_size == 64) {
2427 /* Lowered at NIR level for precision reasons. */
2428 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2429 } else {
2430 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2431 }
2432 break;
2433 }
2434 case nir_op_fneg: {
2435 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
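/* Packed fneg: multiply by +1.0 (0x3C00) with the neg modifiers set on the source. */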
2436 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2437 Instruction* vop3p =
2438 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2439 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2440 vop3p->valu().neg_lo[0] = true;
2441 vop3p->valu().neg_hi[0] = true;
2442 emit_split_vector(ctx, dst, 2);
2443 break;
2444 }
2445 Temp src = get_alu_src(ctx, instr->src[0]);
2446 if (dst.regClass() == v2b) {
2447 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2448 } else if (dst.regClass() == v1) {
2449 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2450 as_vgpr(ctx, src));
2451 } else if (dst.regClass() == v2) {
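/* 64-bit fneg: flush denormals with a multiply by 1.0 if required, then flip the sign bit of
 * the high dword. */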
2452 if (ctx->block->fp_mode.must_flush_denorms16_64)
2453 src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2454 as_vgpr(ctx, src));
2455 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2456 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2457 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2458 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2459 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2460 bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0xbc00u), src);
2461 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2462 bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0xbf800000u), src);
2463 } else {
2464 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2465 }
2466 break;
2467 }
2468 case nir_op_fabs: {
2469 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2470 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2471 Instruction* vop3p =
2472 bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2473 instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2474 .instr;
2475 vop3p->valu().neg_lo[1] = true;
2476 vop3p->valu().neg_hi[1] = true;
2477 emit_split_vector(ctx, dst, 2);
2478 break;
2479 }
2480 Temp src = get_alu_src(ctx, instr->src[0]);
2481 if (dst.regClass() == v2b) {
2482 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2483 Operand::c16(0x3c00), as_vgpr(ctx, src))
2484 .instr;
2485 mul->valu().abs[1] = true;
2486 } else if (dst.regClass() == v1) {
2487 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2488 Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2489 .instr;
2490 mul->valu().abs[1] = true;
2491 } else if (dst.regClass() == v2) {
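/* 64-bit fabs: flush denormals with a multiply by 1.0 if required, then clear the sign bit of
 * the high dword. */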
2492 if (ctx->block->fp_mode.must_flush_denorms16_64)
2493 src = bld.vop3(aco_opcode::v_mul_f64_e64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2494 as_vgpr(ctx, src));
2495 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2496 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2497 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2498 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2499 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2500 Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fff));
2501 if (ctx->block->fp_mode.denorm16_64 == fp_denorm_keep) {
2502 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src);
2503 } else {
2504 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src);
2505 bld.sop2(aco_opcode::s_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
2506 }
2507 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2508 Temp mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff));
2509 if (ctx->block->fp_mode.denorm32 == fp_denorm_keep) {
2510 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), mask, src);
2511 } else {
2512 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), mask, src);
2513 bld.sop2(aco_opcode::s_mul_f32, Definition(dst), Operand::c32(0x3f800000), tmp);
2514 }
2515 } else {
2516 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2517 }
2518 break;
2519 }
2520 case nir_op_fsat: {
2521 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2522 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2523 Instruction* vop3p =
2524 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2525 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2526 vop3p->valu().clamp = true;
2527 emit_split_vector(ctx, dst, 2);
2528 break;
2529 }
2530 Temp src = get_alu_src(ctx, instr->src[0]);
2531 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
2532 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2533 src);
2534 } else if (dst.regClass() == v2b) {
2535 bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src)
2536 ->valu()
2537 .clamp = true;
2538 } else if (dst.regClass() == v1) {
2539 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2540 Operand::c32(0x3f800000u), src);
2541 /* Apparently it is not necessary to flush denorms when v_med3_f32 is used with these
2542 * operands.
2543 * TODO: confirm that this holds under all circumstances. */
2544 } else if (dst.regClass() == v2) {
2545 Instruction* add =
2546 bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), src, Operand::zero());
2547 add->valu().clamp = true;
2548 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
2549 Temp low = bld.sop2(aco_opcode::s_max_f16, bld.def(s1), src, Operand::c16(0));
2550 bld.sop2(aco_opcode::s_min_f16, Definition(dst), low, Operand::c16(0x3C00));
2551 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2552 Temp low = bld.sop2(aco_opcode::s_max_f32, bld.def(s1), src, Operand::c32(0));
2553 bld.sop2(aco_opcode::s_min_f32, Definition(dst), low, Operand::c32(0x3f800000));
2554 } else {
2555 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2556 }
2557 break;
2558 }
2559 case nir_op_flog2: {
2560 if (instr->def.bit_size == 16) {
2561 if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2562 bld.vop3(aco_opcode::v_s_log_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2563 else
2564 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2565 } else if (instr->def.bit_size == 32) {
2566 emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2567 } else {
2568 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2569 }
2570 break;
2571 }
2572 case nir_op_frcp: {
2573 if (instr->def.bit_size == 16) {
2574 if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2575 bld.vop3(aco_opcode::v_s_rcp_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2576 else
2577 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2578 } else if (instr->def.bit_size == 32) {
2579 emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2580 } else if (instr->def.bit_size == 64) {
2581 /* Lowered at NIR level for precision reasons. */
2582 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2583 } else {
2584 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2585 }
2586 break;
2587 }
2588 case nir_op_fexp2: {
2589 if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX12) {
2590 aco_opcode opcode =
2591 instr->def.bit_size == 16 ? aco_opcode::v_s_exp_f16 : aco_opcode::v_s_exp_f32;
2592 bld.vop3(opcode, Definition(dst), get_alu_src(ctx, instr->src[0]));
2593 } else if (instr->def.bit_size == 16) {
2594 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2595 } else if (instr->def.bit_size == 32) {
2596 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2597 } else {
2598 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2599 }
2600 break;
2601 }
2602 case nir_op_fsqrt: {
2603 if (instr->def.bit_size == 16) {
2604 if (dst.regClass() == s1 && ctx->program->gfx_level >= GFX12)
2605 bld.vop3(aco_opcode::v_s_sqrt_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2606 else
2607 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2608 } else if (instr->def.bit_size == 32) {
2609 emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
2610 } else if (instr->def.bit_size == 64) {
2611 /* Lowered at NIR level for precision reasons. */
2612 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2613 } else {
2614 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2615 }
2616 break;
2617 }
2618 case nir_op_ffract: {
2619 if (dst.regClass() == v2b) {
2620 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2621 } else if (dst.regClass() == v1) {
2622 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2623 } else if (dst.regClass() == v2) {
2624 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2625 } else if (dst.regClass() == s1) {
2626 Temp src = get_alu_src(ctx, instr->src[0]);
2627 aco_opcode op =
2628 instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32;
2629 Temp floor = bld.sop1(op, bld.def(s1), src);
2630 op = instr->def.bit_size == 16 ? aco_opcode::s_sub_f16 : aco_opcode::s_sub_f32;
2631 bld.sop2(op, Definition(dst), src, floor);
2632 } else {
2633 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2634 }
2635 break;
2636 }
2637 case nir_op_ffloor: {
2638 if (dst.regClass() == v2b) {
2639 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2640 } else if (dst.regClass() == v1) {
2641 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2642 } else if (dst.regClass() == v2) {
2643 Temp src = get_alu_src(ctx, instr->src[0]);
2644 emit_floor_f64(ctx, bld, Definition(dst), src);
2645 } else if (dst.regClass() == s1) {
2646 Temp src = get_alu_src(ctx, instr->src[0]);
2647 aco_opcode op =
2648 instr->def.bit_size == 16 ? aco_opcode::s_floor_f16 : aco_opcode::s_floor_f32;
2649 bld.sop1(op, Definition(dst), src);
2650 } else {
2651 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2652 }
2653 break;
2654 }
2655 case nir_op_fceil: {
2656 if (dst.regClass() == v2b) {
2657 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2658 } else if (dst.regClass() == v1) {
2659 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2660 } else if (dst.regClass() == v2) {
2661 if (ctx->options->gfx_level >= GFX7) {
2662 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2663 } else {
2664 /* GFX6 doesn't support V_CEIL_F64, lower it. */
2665 /* trunc = trunc(src0)
2666 * if (src0 > 0.0 && src0 != trunc)
2667 * trunc += 1.0
2668 */
2669 Temp src0 = get_alu_src(ctx, instr->src[0]);
2670 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2671 Temp tmp0 =
2672 bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2673 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2674 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
2675 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2676 bld.copy(bld.def(v1), Operand::zero()),
2677 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2678 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2679 bld.copy(bld.def(v1), Operand::zero()), add);
2680 bld.vop3(aco_opcode::v_add_f64_e64, Definition(dst), trunc, add);
2681 }
2682 } else if (dst.regClass() == s1) {
2683 Temp src = get_alu_src(ctx, instr->src[0]);
2684 aco_opcode op =
2685 instr->def.bit_size == 16 ? aco_opcode::s_ceil_f16 : aco_opcode::s_ceil_f32;
2686 bld.sop1(op, Definition(dst), src);
2687 } else {
2688 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2689 }
2690 break;
2691 }
2692 case nir_op_ftrunc: {
2693 if (dst.regClass() == v2b) {
2694 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2695 } else if (dst.regClass() == v1) {
2696 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2697 } else if (dst.regClass() == v2) {
2698 Temp src = get_alu_src(ctx, instr->src[0]);
2699 emit_trunc_f64(ctx, bld, Definition(dst), src);
2700 } else if (dst.regClass() == s1) {
2701 Temp src = get_alu_src(ctx, instr->src[0]);
2702 aco_opcode op =
2703 instr->def.bit_size == 16 ? aco_opcode::s_trunc_f16 : aco_opcode::s_trunc_f32;
2704 bld.sop1(op, Definition(dst), src);
2705 } else {
2706 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2707 }
2708 break;
2709 }
2710 case nir_op_fround_even: {
2711 if (dst.regClass() == v2b) {
2712 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2713 } else if (dst.regClass() == v1) {
2714 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2715 } else if (dst.regClass() == v2) {
2716 if (ctx->options->gfx_level >= GFX7) {
2717 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2718 } else {
2719 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
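            /* Lower using the add/subtract 2^52 trick: adding 2^52 with src0's sign (high dword
             * 0x43300000) and subtracting it again rounds src0 to the nearest integer (ties to
             * even); inputs with |src0| > 2^52 - 0.5 are already integral and are passed through. */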
2720 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2721 Temp src0 = get_alu_src(ctx, instr->src[0]);
2722 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2723
2724 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2725 bld.copy(bld.def(s1), Operand::c32(-2u)));
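            /* bitmask is 0x7fffffff (bit-reverse of 0xfffffffe): v_bfi copies the sign bit from
             * src0_hi and the remaining bits from the 2^52 constant. */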
2726 Temp bfi =
2727 bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2728 bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2729 Temp tmp =
2730 bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), src0,
2731 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2732 Instruction* sub =
2733 bld.vop3(aco_opcode::v_add_f64_e64, bld.def(v2), tmp,
2734 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2735 sub->valu().neg[1] = true;
2736 tmp = sub->definitions[0].getTemp();
2737
2738 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2739 Operand::c32(0x432fffffu));
2740 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2741 vop3->valu().abs[0] = true;
2742 Temp cond = vop3->definitions[0].getTemp();
2743
2744 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2745 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2746 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2747 as_vgpr(ctx, src0_lo), cond);
2748 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2749 as_vgpr(ctx, src0_hi), cond);
2750
2751 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2752 }
2753 } else if (dst.regClass() == s1) {
2754 Temp src = get_alu_src(ctx, instr->src[0]);
2755 aco_opcode op =
2756 instr->def.bit_size == 16 ? aco_opcode::s_rndne_f16 : aco_opcode::s_rndne_f32;
2757 bld.sop1(op, Definition(dst), src);
2758 } else {
2759 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2760 }
2761 break;
2762 }
2763 case nir_op_fsin_amd:
2764 case nir_op_fcos_amd: {
2765 if (instr->def.bit_size == 16 || instr->def.bit_size == 32) {
2766 bool is_sin = instr->op == nir_op_fsin_amd;
2767 aco_opcode opcode, fract;
2768 RegClass rc;
2769 if (instr->def.bit_size == 16) {
2770 opcode = is_sin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2771 fract = aco_opcode::v_fract_f16;
2772 rc = v2b;
2773 } else {
2774 opcode = is_sin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2775 fract = aco_opcode::v_fract_f32;
2776 rc = v1;
2777 }
2778
2779 Temp src = get_alu_src(ctx, instr->src[0]);
2780 /* before GFX9, v_sin and v_cos had a valid input domain of [-256, +256] */
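         /* The fsin_amd/fcos_amd input is already multiplied by 1/(2*pi), so the function has
          * period 1 and taking the fractional part does not change the result. */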
2781 if (ctx->options->gfx_level < GFX9)
2782 src = bld.vop1(fract, bld.def(rc), src);
2783
2784 if (dst.regClass() == rc) {
2785 bld.vop1(opcode, Definition(dst), src);
2786 } else {
2787 Temp tmp = bld.vop1(opcode, bld.def(rc), src);
2788 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2789 }
2790 } else {
2791 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2792 }
2793 break;
2794 }
2795 case nir_op_ldexp: {
2796 if (dst.regClass() == v2b) {
2797 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2798 } else if (dst.regClass() == v1) {
2799 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2800 } else if (dst.regClass() == v2) {
2801 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2802 } else {
2803 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2804 }
2805 break;
2806 }
2807 case nir_op_frexp_sig: {
2808 if (dst.regClass() == v2b) {
2809 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2810 } else if (dst.regClass() == v1) {
2811 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2812 } else if (dst.regClass() == v2) {
2813 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2814 } else {
2815 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2816 }
2817 break;
2818 }
2819 case nir_op_frexp_exp: {
2820 if (instr->src[0].src.ssa->bit_size == 16) {
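         /* The f16 exponent returned by v_frexp_exp_i16_f16 fits in the low byte, so extract that
          * byte and sign-extend it to 32 bits. */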
2821 Temp src = get_alu_src(ctx, instr->src[0]);
2822 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2823 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2824 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2825 } else if (instr->src[0].src.ssa->bit_size == 32) {
2826 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2827 } else if (instr->src[0].src.ssa->bit_size == 64) {
2828 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2829 } else {
2830 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2831 }
2832 break;
2833 }
2834 case nir_op_fsign: {
2835 Temp src = get_alu_src(ctx, instr->src[0]);
2836 if (dst.regClass() == v2b) {
2837 /* replace negative zero with positive zero */
2838 src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), as_vgpr(ctx, src));
2839 if (ctx->program->gfx_level >= GFX9) {
2840 src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src,
2841 Operand::c16(1u));
2842 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2843 } else {
2844 src = convert_int(ctx, bld, src, 16, 32, true);
2845 src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src,
2846 Operand::c32(1u));
2847 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2848 }
2849 } else if (dst.regClass() == v1) {
2850          /* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers become
2851           * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
2852           */
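         /* 0x7f800000 is +Inf; the v_med3_f32 below clamps with +1.0 (0x3f800000) and
          * -1.0 (0xbf800000). */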
2853 Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
2854 src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, as_vgpr(ctx, src));
2855 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
2856 Operand::c32(0xbf800000));
2857 } else if (dst.regClass() == v2) {
2858 src = as_vgpr(ctx, src);
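         /* Compute the result in the high dword only (the low dword is always zero): select
          * +1.0 (0x3ff00000) for src > 0, -1.0 (0xbff00000) for src < 0, and pass the high dword
          * of src through for +-0.0. */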
2859 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2860 Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2861 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2862 emit_extract_vector(ctx, src, 1, v1), cond);
2863
2864 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2865 tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2866 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2867
2868 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2869 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
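         /* 0x3c00/0xbc00 are +1.0/-1.0 in binary16: select +1.0 when src > 0, then replace
          * anything that does not compare >= 0 with -1.0, so +-0.0 passes through unchanged.
          * The 32-bit branch below does the same with 0x3f800000/0xbf800000. */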
2870 Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f16, bld.def(s1, scc), Operand::c16(0), src);
2871 src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3c00), src,
2872 bld.scc(cond));
2873 cond = bld.sopc(aco_opcode::s_cmp_ge_f16, bld.def(s1, scc), src, Operand::c16(0));
2874 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbc00),
2875 bld.scc(cond));
2876 } else if (dst.regClass() == s1 && instr->def.bit_size == 32) {
2877 Temp cond = bld.sopc(aco_opcode::s_cmp_lt_f32, bld.def(s1, scc), Operand::c32(0), src);
2878 src = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(0x3f800000), src,
2879 bld.scc(cond));
2880 cond = bld.sopc(aco_opcode::s_cmp_ge_f32, bld.def(s1, scc), src, Operand::c32(0));
2881 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), src, Operand::c32(0xbf800000),
2882 bld.scc(cond));
2883 } else {
2884 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2885 }
2886 break;
2887 }
2888 case nir_op_f2f16:
2889 case nir_op_f2f16_rtne: {
2890 assert(instr->src[0].src.ssa->bit_size == 32);
2891 if (instr->def.num_components == 2) {
2892 /* Vectorizing f2f16 is only possible with rtz. */
2893 assert(instr->op != nir_op_f2f16_rtne);
2894 assert(ctx->block->fp_mode.round16_64 == fp_round_tz ||
2895 !ctx->block->fp_mode.care_about_round16_64);
2896 emit_vec2_f2f16(ctx, instr, dst);
2897 break;
2898 }
2899 Temp src = get_alu_src(ctx, instr->src[0]);
2900 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) {
2901 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2902 * keep value numbering and the scheduler simpler.
2903 */
2904 if (dst.regClass() == v2b)
2905 bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, Definition(dst), src);
2906 else
2907 bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, Definition(dst), src);
2908 } else {
2909 if (dst.regClass() == v2b)
2910 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2911 else
2912 bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
2913 }
2914 break;
2915 }
2916 case nir_op_f2f16_rtz: {
2917 assert(instr->src[0].src.ssa->bit_size == 32);
2918 if (instr->def.num_components == 2) {
2919 emit_vec2_f2f16(ctx, instr, dst);
2920 break;
2921 }
2922 Temp src = get_alu_src(ctx, instr->src[0]);
2923 if (ctx->block->fp_mode.round16_64 == fp_round_tz) {
2924 if (dst.regClass() == v2b)
2925 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2926 else
2927 bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
2928 } else if (dst.regClass() == s1) {
2929 bld.sop2(aco_opcode::s_cvt_pk_rtz_f16_f32, Definition(dst), src, Operand::zero());
2930 } else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9) {
2931 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2932 } else {
2933 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2934 }
2935 break;
2936 }
2937 case nir_op_f2f32: {
2938 if (dst.regClass() == s1) {
2939 assert(instr->src[0].src.ssa->bit_size == 16);
2940 Temp src = get_alu_src(ctx, instr->src[0]);
2941 bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src);
2942 } else if (instr->src[0].src.ssa->bit_size == 16) {
2943 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2944 } else if (instr->src[0].src.ssa->bit_size == 64) {
2945 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2946 } else {
2947 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2948 }
2949 break;
2950 }
2951 case nir_op_f2f64: {
2952 assert(instr->src[0].src.ssa->bit_size == 32);
2953 Temp src = get_alu_src(ctx, instr->src[0]);
2954 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2955 break;
2956 }
2957 case nir_op_i2f16: {
2958 Temp src = get_alu_src(ctx, instr->src[0]);
2959 const unsigned input_size = instr->src[0].src.ssa->bit_size;
2960 if (dst.regClass() == v2b) {
2961 if (input_size <= 16) {
2962             /* Expand the integer to the size expected by the int→float converter used below */
2963 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2964 if (input_size != target_size) {
2965 src = convert_int(ctx, bld, src, input_size, target_size, true);
2966 }
2967 }
2968
2969 if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2970 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2971 } else {
2972             /* Large 32-bit inputs need to return +-inf/FLOAT_MAX.
2973              *
2974              * This is also the fallback path taken on GFX7 and earlier, which
2975              * do not support direct f16⟷i16 conversions.
2976              */
2977 src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2978 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2979 }
2980 } else if (dst.regClass() == s1) {
2981 if (input_size <= 16) {
2982 src = convert_int(ctx, bld, src, input_size, 32, true);
2983 }
2984 src = bld.sop1(aco_opcode::s_cvt_f32_i32, bld.def(s1), src);
2985 bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
2986 } else {
2987 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2988 }
2989 break;
2990 }
2991 case nir_op_i2f32: {
2992 assert(dst.size() == 1);
2993 Temp src = get_alu_src(ctx, instr->src[0]);
2994 const unsigned input_size = instr->src[0].src.ssa->bit_size;
2995 if (input_size <= 32) {
2996 if (input_size <= 16) {
2997 /* Sign-extend to 32-bits */
2998 src = convert_int(ctx, bld, src, input_size, 32, true);
2999 }
3000 if (dst.regClass() == v1)
3001 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
3002 else
3003 bld.sop1(aco_opcode::s_cvt_f32_i32, Definition(dst), src);
3004 } else {
3005 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3006 }
3007 break;
3008 }
3009 case nir_op_i2f64: {
3010 if (instr->src[0].src.ssa->bit_size <= 32) {
3011 Temp src = get_alu_src(ctx, instr->src[0]);
3012 if (instr->src[0].src.ssa->bit_size <= 16)
3013 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
3014 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
3015 } else {
3016 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3017 }
3018 break;
3019 }
3020 case nir_op_u2f16: {
3021 Temp src = get_alu_src(ctx, instr->src[0]);
3022 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3023 if (dst.regClass() == v2b) {
3024 if (input_size <= 16) {
3025 /* Expand integer to the size expected by the uint→float converter used below */
3026 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3027 if (input_size != target_size) {
3028 src = convert_int(ctx, bld, src, input_size, target_size, false);
3029 }
3030 }
3031
3032 if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
3033 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
3034 } else {
3035             /* Large 32-bit inputs need to return inf/FLOAT_MAX.
3036              *
3037              * This is also the fallback path taken on GFX7 and earlier, which
3038              * do not support direct f16⟷u16 conversions.
3039              */
3040 src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
3041 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3042 }
3043 } else if (dst.regClass() == s1) {
3044 if (input_size <= 16) {
3045 src = convert_int(ctx, bld, src, input_size, 32, false);
3046 }
3047 src = bld.sop1(aco_opcode::s_cvt_f32_u32, bld.def(s1), src);
3048 bld.sop1(aco_opcode::s_cvt_f16_f32, Definition(dst), src);
3049 } else {
3050 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3051 }
3052 break;
3053 }
3054 case nir_op_u2f32: {
3055 assert(dst.size() == 1);
3056 Temp src = get_alu_src(ctx, instr->src[0]);
3057 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3058 if (input_size == 8 && dst.regClass() == v1) {
3059 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3060 } else if (input_size <= 32) {
3061 if (input_size <= 16)
3062 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3063 if (dst.regClass() == v1)
3064 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3065 else
3066 bld.sop1(aco_opcode::s_cvt_f32_u32, Definition(dst), src);
3067 } else {
3068 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3069 }
3070 break;
3071 }
3072 case nir_op_u2f64: {
3073 if (instr->src[0].src.ssa->bit_size <= 32) {
3074 Temp src = get_alu_src(ctx, instr->src[0]);
3075 if (instr->src[0].src.ssa->bit_size <= 16)
3076 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3077 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3078 } else {
3079 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3080 }
3081 break;
3082 }
3083 case nir_op_f2i8:
3084 case nir_op_f2i16: {
3085 if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3086 ctx->program->gfx_level >= GFX11_5) {
3087 Temp src = get_alu_src(ctx, instr->src[0]);
3088 Temp tmp = bld.as_uniform(src);
3089 if (instr->src[0].src.ssa->bit_size == 16)
3090 tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3091 bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp);
3092 } else if (instr->src[0].src.ssa->bit_size == 16) {
3093 if (ctx->program->gfx_level >= GFX8) {
3094 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3095 } else {
3096 /* GFX7 and earlier do not support direct f16⟷i16 conversions */
3097 Temp tmp = bld.tmp(v1);
3098 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3099 tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3100 tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3101 (dst.type() == RegType::sgpr) ? Temp() : dst);
3102 if (dst.type() == RegType::sgpr) {
3103 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3104 }
3105 }
3106 } else if (instr->src[0].src.ssa->bit_size == 32) {
3107 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3108 } else {
3109 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3110 }
3111 break;
3112 }
3113 case nir_op_f2u8:
3114 case nir_op_f2u16: {
3115 if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3116 ctx->program->gfx_level >= GFX11_5) {
3117 Temp src = get_alu_src(ctx, instr->src[0]);
3118 Temp tmp = bld.as_uniform(src);
3119 if (instr->src[0].src.ssa->bit_size == 16)
3120 tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3121 bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp);
3122 } else if (instr->src[0].src.ssa->bit_size == 16) {
3123 if (ctx->program->gfx_level >= GFX8) {
3124 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3125 } else {
3126 /* GFX7 and earlier do not support direct f16⟷u16 conversions */
3127 Temp tmp = bld.tmp(v1);
3128 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3129 tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3130 tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3131 (dst.type() == RegType::sgpr) ? Temp() : dst);
3132 if (dst.type() == RegType::sgpr) {
3133 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3134 }
3135 }
3136 } else if (instr->src[0].src.ssa->bit_size == 32) {
3137 if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX11)
3138 bld.vop3(aco_opcode::p_v_cvt_pk_u8_f32, Definition(dst),
3139 get_alu_src(ctx, instr->src[0]));
3140 else
3141 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3142 } else {
3143 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3144 }
3145 break;
3146 }
3147 case nir_op_f2i32: {
3148 Temp src = get_alu_src(ctx, instr->src[0]);
3149 if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3150 ctx->program->gfx_level >= GFX11_5) {
3151 Temp tmp = bld.as_uniform(src);
3152 if (instr->src[0].src.ssa->bit_size == 16)
3153 tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3154 bld.sop1(aco_opcode::s_cvt_i32_f32, Definition(dst), tmp);
3155 } else if (instr->src[0].src.ssa->bit_size == 16) {
3156 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3157 if (dst.type() == RegType::vgpr) {
3158 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3159 } else {
3160 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3161 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3162 }
3163 } else if (instr->src[0].src.ssa->bit_size == 32) {
3164 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3165 } else if (instr->src[0].src.ssa->bit_size == 64) {
3166 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3167 } else {
3168 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3169 }
3170 break;
3171 }
3172 case nir_op_f2u32: {
3173 Temp src = get_alu_src(ctx, instr->src[0]);
3174 if (instr->src[0].src.ssa->bit_size <= 32 && dst.regClass() == s1 &&
3175 ctx->program->gfx_level >= GFX11_5) {
3176 Temp tmp = bld.as_uniform(src);
3177 if (instr->src[0].src.ssa->bit_size == 16)
3178 tmp = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), tmp);
3179 bld.sop1(aco_opcode::s_cvt_u32_f32, Definition(dst), tmp);
3180 } else if (instr->src[0].src.ssa->bit_size == 16) {
3181 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3182 if (dst.type() == RegType::vgpr) {
3183 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3184 } else {
3185 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3186 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3187 }
3188 } else if (instr->src[0].src.ssa->bit_size == 32) {
3189 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3190 } else if (instr->src[0].src.ssa->bit_size == 64) {
3191 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3192 } else {
3193 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3194 }
3195 break;
3196 }
3197 case nir_op_b2f16: {
3198 Temp src = get_alu_src(ctx, instr->src[0]);
3199 assert(src.regClass() == bld.lm);
3200
3201 if (dst.regClass() == s1) {
3202 src = bool_to_scalar_condition(ctx, src);
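         /* src is now 0 or 1, so multiplying by 0x3c00 (1.0 in binary16) yields 0.0 or 1.0. */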
3203 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3204 } else if (dst.regClass() == v2b) {
3205 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3206 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3207 } else {
3208 unreachable("Wrong destination register class for nir_op_b2f16.");
3209 }
3210 break;
3211 }
3212 case nir_op_b2f32: {
3213 Temp src = get_alu_src(ctx, instr->src[0]);
3214 assert(src.regClass() == bld.lm);
3215
3216 if (dst.regClass() == s1) {
3217 src = bool_to_scalar_condition(ctx, src);
3218 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3219 } else if (dst.regClass() == v1) {
3220 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3221 Operand::c32(0x3f800000u), src);
3222 } else {
3223 unreachable("Wrong destination register class for nir_op_b2f32.");
3224 }
3225 break;
3226 }
3227 case nir_op_b2f64: {
3228 Temp src = get_alu_src(ctx, instr->src[0]);
3229 assert(src.regClass() == bld.lm);
3230
3231 if (dst.regClass() == s2) {
3232 src = bool_to_scalar_condition(ctx, src);
3233 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3234 Operand::zero(), bld.scc(src));
3235 } else if (dst.regClass() == v2) {
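         /* Only the high dword is selected: 0x3FF00000 is the high half of the double 1.0, and the
          * low half of both 0.0 and 1.0 is zero. */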
3236 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3237 Temp upper =
3238 bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3239 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3240 } else {
3241 unreachable("Wrong destination register class for nir_op_b2f64.");
3242 }
3243 break;
3244 }
3245 case nir_op_i2i8:
3246 case nir_op_i2i16:
3247 case nir_op_i2i32: {
3248 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3249 /* no need to do the extract in get_alu_src() */
3250 sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3251 ? sgpr_extract_sext
3252 : sgpr_extract_undef;
3253 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3254 } else {
3255 const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3256 const unsigned output_bitsize = instr->def.bit_size;
3257 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3258 output_bitsize > input_bitsize, dst);
3259 }
3260 break;
3261 }
3262 case nir_op_u2u8:
3263 case nir_op_u2u16:
3264 case nir_op_u2u32: {
3265 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3266 /* no need to do the extract in get_alu_src() */
3267 sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3268 ? sgpr_extract_zext
3269 : sgpr_extract_undef;
3270 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3271 } else {
3272 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3273 instr->def.bit_size, false, dst);
3274 }
3275 break;
3276 }
3277 case nir_op_b2b32:
3278 case nir_op_b2i8:
3279 case nir_op_b2i16:
3280 case nir_op_b2i32: {
3281 Temp src = get_alu_src(ctx, instr->src[0]);
3282 assert(src.regClass() == bld.lm);
3283
3284 if (dst.regClass() == s1) {
3285 bool_to_scalar_condition(ctx, src, dst);
3286 } else if (dst.type() == RegType::vgpr) {
3287 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3288 src);
3289 } else {
3290 unreachable("Invalid register class for b2i32");
3291 }
3292 break;
3293 }
3294 case nir_op_b2b1: {
3295 Temp src = get_alu_src(ctx, instr->src[0]);
3296 assert(dst.regClass() == bld.lm);
3297
3298 if (src.type() == RegType::vgpr) {
3299 assert(src.regClass() == v1 || src.regClass() == v2);
3300 assert(dst.regClass() == bld.lm);
3301 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3302 Definition(dst), Operand::zero(), src);
3303 } else {
3304 assert(src.regClass() == s1 || src.regClass() == s2);
3305 Temp tmp;
3306 if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
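            /* GFX7 has no s_cmp_lg_u64, so OR the value with zero and use the SCC result
             * (set when the 64-bit result is non-zero) instead. */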
3307 tmp =
3308 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3309 .def(1)
3310 .getTemp();
3311 } else {
3312 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3313 bld.scc(bld.def(s1)), Operand::zero(), src);
3314 }
3315 bool_to_vector_condition(ctx, tmp, dst);
3316 }
3317 break;
3318 }
3319 case nir_op_unpack_64_2x32:
3320 case nir_op_unpack_32_2x16:
3321 case nir_op_unpack_64_4x16:
3322 case nir_op_unpack_32_4x8:
3323 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3324 emit_split_vector(
3325 ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3326 break;
3327 case nir_op_pack_64_2x32_split: {
3328 Operand src[2];
3329 RegClass elem_rc = dst.regClass() == s2 ? s1 : v1;
3330 for (unsigned i = 0; i < 2; i++) {
3331 if (nir_src_is_undef(instr->src[i].src))
3332 src[i] = Operand(elem_rc);
3333 else
3334 src[i] = Operand(get_alu_src(ctx, instr->src[i]));
3335 }
3336
3337 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src[0], src[1]);
3338 break;
3339 }
3340 case nir_op_unpack_64_2x32_split_x:
3341 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3342 get_alu_src(ctx, instr->src[0]));
3343 break;
3344 case nir_op_unpack_64_2x32_split_y:
3345 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3346 get_alu_src(ctx, instr->src[0]));
3347 break;
3348 case nir_op_unpack_32_2x16_split_x:
3349 if (dst.type() == RegType::vgpr) {
3350 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3351 get_alu_src(ctx, instr->src[0]));
3352 } else {
3353 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3354 }
3355 break;
3356 case nir_op_unpack_32_2x16_split_y:
3357 if (dst.type() == RegType::vgpr) {
3358 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3359 get_alu_src(ctx, instr->src[0]));
3360 } else {
3361 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3362 get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3363 Operand::zero());
3364 }
3365 break;
3366 case nir_op_pack_32_2x16_split: {
3367 Operand src0 = Operand(get_alu_src(ctx, instr->src[0]));
3368 Operand src1 = Operand(get_alu_src(ctx, instr->src[1]));
3369 if (dst.regClass() == v1) {
3370 if (nir_src_is_undef(instr->src[0].src))
3371 src0 = Operand(v2b);
3372 else
3373 src0 = Operand(emit_extract_vector(ctx, src0.getTemp(), 0, v2b));
3374
3375 if (nir_src_is_undef(instr->src[1].src))
3376 src1 = Operand(v2b);
3377 else
3378 src1 = Operand(emit_extract_vector(ctx, src1.getTemp(), 0, v2b));
3379
3380 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3381 } else if (nir_src_is_undef(instr->src[1].src)) {
3382 bld.copy(Definition(dst), src0);
3383 } else if (nir_src_is_undef(instr->src[0].src)) {
3384 bld.pseudo(aco_opcode::p_insert, Definition(dst), bld.def(s1, scc), src1, Operand::c32(1),
3385 Operand::c32(16));
3386 } else if (ctx->program->gfx_level >= GFX9) {
3387 bld.sop2(aco_opcode::s_pack_ll_b32_b16, Definition(dst), src0, src1);
3388 } else {
3389 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3390 Operand::c32(0xFFFFu));
3391 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3392 Operand::c32(16u));
3393 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3394 }
3395 break;
3396 }
3397 case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3398 case nir_op_pack_half_2x16_rtz_split:
3399 case nir_op_pack_half_2x16_split: {
3400 if (dst.regClass() == v1) {
3401 if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3402 emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3403 else
3404 emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3405 } else if (dst.regClass() == s1) {
3406 emit_sop2_instruction(ctx, instr, aco_opcode::s_cvt_pk_rtz_f16_f32, dst, false);
3407 } else {
3408 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3409 }
3410 break;
3411 }
3412 case nir_op_pack_unorm_2x16:
3413 case nir_op_pack_snorm_2x16: {
3414 unsigned bit_size = instr->src[0].src.ssa->bit_size;
3415       /* Only 16-bit and 32-bit sources are supported. */
3416 assert(bit_size == 32 || bit_size == 16);
3417
3418 RegClass src_rc = bit_size == 32 ? v1 : v2b;
3419 Temp src = get_alu_src(ctx, instr->src[0], 2);
3420 Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3421 Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
3422
3423       /* Workaround for pre-GFX9 GPUs, which don't have fp16 pknorm instructions. */
3424 if (bit_size == 16 && ctx->program->gfx_level < GFX9) {
3425 src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3426 src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
3427 bit_size = 32;
3428 }
3429
3430 aco_opcode opcode;
3431 if (bit_size == 32) {
3432 opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3433 : aco_opcode::v_cvt_pknorm_i16_f32;
3434 } else {
3435 opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3436 : aco_opcode::v_cvt_pknorm_i16_f16;
3437 }
3438 bld.vop3(opcode, Definition(dst), src0, src1);
3439 break;
3440 }
3441 case nir_op_pack_uint_2x16:
3442 case nir_op_pack_sint_2x16: {
3443 Temp src = get_alu_src(ctx, instr->src[0], 2);
3444 Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3445 Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3446 aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3447 : aco_opcode::v_cvt_pk_i16_i32;
3448 bld.vop3(opcode, Definition(dst), src0, src1);
3449 break;
3450 }
3451 case nir_op_unpack_half_2x16_split_x: {
3452 Temp src = get_alu_src(ctx, instr->src[0]);
3453 if (dst.regClass() == s1) {
3454 bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), src);
3455 break;
3456 }
3457 if (src.regClass() == v1)
3458 src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3459 if (dst.regClass() == v1) {
3460 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3461 } else {
3462 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3463 }
3464 break;
3465 }
3466 case nir_op_unpack_half_2x16_split_y: {
3467 Temp src = get_alu_src(ctx, instr->src[0]);
3468 if (dst.regClass() == s1) {
3469 bld.sop1(aco_opcode::s_cvt_hi_f32_f16, Definition(dst), src);
3470 break;
3471 }
3472 if (src.regClass() == s1)
3473 src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3474 Operand::c32(1u), Operand::c32(16u), Operand::zero());
3475 else
3476 src =
3477 bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3478 if (dst.regClass() == v1) {
3479 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3480 } else {
3481 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3482 }
3483 break;
3484 }
3485 case nir_op_msad_4x8: {
3486 assert(dst.regClass() == v1);
3487 emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true);
3488 break;
3489 }
3490 case nir_op_mqsad_4x8: {
3491 assert(dst.regClass() == v4);
3492 Temp ref = get_alu_src(ctx, instr->src[0]);
3493 Temp src = get_alu_src(ctx, instr->src[1], 2);
3494 Temp accum = get_alu_src(ctx, instr->src[2], 4);
3495 bld.vop3(aco_opcode::v_mqsad_u32_u8, Definition(dst), as_vgpr(ctx, src), as_vgpr(ctx, ref),
3496 as_vgpr(ctx, accum));
3497 emit_split_vector(ctx, dst, 4);
3498 break;
3499 }
3500 case nir_op_shfr: {
3501 if (dst.regClass() == s1) {
3502 Temp src0 = get_alu_src(ctx, instr->src[0]);
3503 Temp src1 = get_alu_src(ctx, instr->src[1]);
3504
3505 Temp amount;
3506 if (nir_src_is_const(instr->src[2].src)) {
3507 unsigned camount = nir_src_as_uint(instr->src[2].src) & 0x1f;
3508 if (camount == 16 && ctx->program->gfx_level >= GFX11) {
3509 bld.sop2(aco_opcode::s_pack_hl_b32_b16, Definition(dst), src1, src0);
3510 break;
3511 }
3512 amount = bld.copy(bld.def(s1), Operand::c32(camount));
3513 } else if (get_alu_src_ub(ctx, instr, 2) >= 32) {
3514 amount = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3515 get_alu_src(ctx, instr->src[2]), Operand::c32(0x1f));
3516 } else {
3517 amount = get_alu_src(ctx, instr->src[2]);
3518 }
3519
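         /* Funnel shift: build the 64-bit value with src0 in the high dword and src1 in the low
          * dword, shift it right by the amount, and keep the low dword of the result. */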
3520 Temp src = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), src1, src0);
3521
3522 Temp res = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), src, amount);
3523 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), res, Operand::zero());
3524 } else if (dst.regClass() == v1) {
3525 emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbit_b32, dst, false, 3u);
3526 } else {
3527 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3528 }
3529 break;
3530 }
3531 case nir_op_alignbyte_amd: {
3532 if (dst.regClass() == v1) {
3533 emit_vop3a_instruction(ctx, instr, aco_opcode::v_alignbyte_b32, dst, false, 3u);
3534 } else {
3535 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3536 }
3537 break;
3538 }
3539 case nir_op_fquantize2f16: {
3540 Temp src = get_alu_src(ctx, instr->src[0]);
3541 if (dst.regClass() == v1) {
3542 Temp f16;
3543 if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3544 f16 = bld.vop1(aco_opcode::p_v_cvt_f16_f32_rtne, bld.def(v2b), src);
3545 else
3546 f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
3547
3548 if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
3549 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), f16);
3550 break;
3551 }
3552
3553 Temp denorm_zero;
3554 Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3555 if (ctx->program->gfx_level >= GFX8) {
3556 /* value is negative/positive denormal value/zero */
3557 Instruction* tmp0 =
3558 bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, Operand::c32(0x30));
3559 tmp0->valu().abs[0] = true;
3560 tmp0->valu().neg[0] = true;
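            /* v_cmp_class with mask 0x30 tests for -denormal and -0.0; applying abs then neg to
             * the input makes every denormal and zero fall into those two classes. */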
3561 denorm_zero = tmp0->definitions[0].getTemp();
3562 } else {
3563             /* 0x38800000 is the smallest normal half float value (2^-14) as a 32-bit float,
3564              * so compare the result and flush to 0 if it's smaller.
3565              */
3566 Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3567 Instruction* tmp0 =
3568 bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3569 tmp0->valu().abs[0] = true;
3570 denorm_zero = tmp0->definitions[0].getTemp();
3571 }
3572 if (nir_alu_instr_is_signed_zero_preserve(instr)) {
3573 Temp copysign_0 =
3574 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3575 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), f32, copysign_0, denorm_zero);
3576 } else {
3577 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), f32, Operand::zero(),
3578 denorm_zero);
3579 }
3580 } else if (dst.regClass() == s1) {
3581 Temp f16;
3582 if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3583 f16 = bld.sop1(aco_opcode::p_s_cvt_f16_f32_rtne, bld.def(s1), src);
3584 else
3585 f16 = bld.sop1(aco_opcode::s_cvt_f16_f32, bld.def(s1), src);
3586
3587 if (ctx->block->fp_mode.denorm16_64 != fp_denorm_keep) {
3588 bld.sop1(aco_opcode::s_cvt_f32_f16, Definition(dst), f16);
3589 } else {
3590 Temp f32 = bld.sop1(aco_opcode::s_cvt_f32_f16, bld.def(s1), f16);
3591 Temp abs_mask = bld.copy(bld.def(s1), Operand::c32(0x7fffffff));
3592 Temp abs =
3593 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask);
3594 Operand sign;
3595 if (nir_alu_instr_is_signed_zero_preserve(instr)) {
3596 sign =
3597 bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), f32, abs_mask);
3598 } else {
3599 sign = Operand::c32(0);
3600 }
3601 Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
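            /* Non-negative IEEE floats order like unsigned integers, so an unsigned compare of
             * |f32| against 2^-14 (the smallest normal half) detects values that flush to zero. */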
3602 Temp denorm_zero = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc), abs, smallest);
3603 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), sign, f32, bld.scc(denorm_zero));
3604 }
3605 } else {
3606 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3607 }
3608 break;
3609 }
3610 case nir_op_bfm: {
3611 Temp bits = get_alu_src(ctx, instr->src[0]);
3612 Temp offset = get_alu_src(ctx, instr->src[1]);
3613
3614 if (dst.regClass() == s1) {
3615 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3616 } else if (dst.regClass() == v1) {
3617 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3618 } else {
3619 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3620 }
3621 break;
3622 }
3623 case nir_op_bitfield_select: {
3624
3625 /* dst = (insert & bitmask) | (base & ~bitmask) */
3626 if (dst.regClass() == s1) {
3627 Temp bitmask = get_alu_src(ctx, instr->src[0]);
3628 Temp insert = get_alu_src(ctx, instr->src[1]);
3629 Temp base = get_alu_src(ctx, instr->src[2]);
3630 aco_ptr<Instruction> sop2;
3631 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3632 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3633
3634 if (const_bitmask && ctx->program->gfx_level >= GFX9 &&
3635 (const_bitmask->u32 == 0xffff || const_bitmask->u32 == 0xffff0000)) {
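            /* s_pack_lh_b32_b16 packs the low 16 bits of the first operand with the high 16 bits
             * of the second, which is exactly a 16-bit-aligned bitfield insert. */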
3636 if (const_bitmask->u32 == 0xffff) {
3637 bld.sop2(aco_opcode::s_pack_lh_b32_b16, Definition(dst), insert, base);
3638 } else {
3639 bld.sop2(aco_opcode::s_pack_lh_b32_b16, Definition(dst), base, insert);
3640 }
3641 break;
3642 }
3643
3644 Operand lhs;
3645 if (const_insert && const_bitmask) {
3646 lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3647 } else {
3648 insert =
3649 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3650 lhs = Operand(insert);
3651 }
3652
3653 Operand rhs;
3654 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3655 if (const_base && const_bitmask) {
3656 rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3657 } else {
3658 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3659 rhs = Operand(base);
3660 }
3661
3662 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3663
3664 } else if (dst.regClass() == v1) {
3665 emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3666 } else {
3667 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3668 }
3669 break;
3670 }
3671 case nir_op_ubfe:
3672 case nir_op_ibfe: {
3673 if (dst.bytes() != 4)
3674 unreachable("Unsupported BFE bit size");
3675
3676 if (dst.type() == RegType::sgpr) {
3677 Temp base = get_alu_src(ctx, instr->src[0]);
3678
3679 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3680 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3681 aco_opcode opcode =
3682 instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
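         /* s_bfe encodes the field as (width << 16) | offset in its second source. */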
3683 if (const_offset && const_bits) {
3684 uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3685 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3686 break;
3687 }
3688
3689 Temp offset = get_alu_src(ctx, instr->src[1]);
3690 Temp bits = get_alu_src(ctx, instr->src[2]);
3691
3692 if (ctx->program->gfx_level >= GFX9) {
3693 Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3694 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3695 bld.def(s1, scc), bits, Operand::c32(0x1fu));
3696 Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3697 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3698 } else if (instr->op == nir_op_ubfe) {
3699 Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3700 Temp masked =
3701 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3702 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3703 } else {
3704 Operand bits_op = const_bits
3705 ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3706 : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3707 bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3708 bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3709 Operand::c32(16u));
3710 Operand offset_op = const_offset
3711 ? Operand::c32(const_offset->u32 & 0x1fu)
3712 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3713 offset, Operand::c32(0x1fu));
3714
3715 Temp extract =
3716 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3717 bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3718 }
3719
3720 } else {
3721 aco_opcode opcode =
3722 instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3723 emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3724 }
3725 break;
3726 }
3727 case nir_op_extract_u8:
3728 case nir_op_extract_i8:
3729 case nir_op_extract_u16:
3730 case nir_op_extract_i16: {
3731 bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3732 unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3733 uint32_t bits = comp == 4 ? 8 : 16;
3734 unsigned index = nir_src_as_uint(instr->src[1].src);
3735 if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3736 assert(index == 0);
3737 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3738 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3739 Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3740 unsigned swizzle = instr->src[0].swizzle[0];
3741 if (vec.size() > 1) {
3742 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3743 swizzle = swizzle & 1;
3744 }
3745 index += swizzle * instr->def.bit_size / bits;
3746 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3747 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3748 } else if (dst.regClass() == s1) {
3749 Temp src = get_alu_src(ctx, instr->src[0]);
3750 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(src),
3751 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3752 } else if (dst.regClass() == s2) {
3753 Temp src = get_alu_src(ctx, instr->src[0]);
3754 aco_opcode op = is_signed ? aco_opcode::s_bfe_i64 : aco_opcode::s_bfe_u64;
3755 Temp extract = bld.copy(bld.def(s1), Operand::c32((bits << 16) | (index * bits)));
3756 bld.sop2(op, Definition(dst), bld.def(s1, scc), src, extract);
3757 } else {
3758 assert(dst.regClass().type() == RegType::vgpr);
3759 Temp src = get_alu_src(ctx, instr->src[0]);
3760 Definition def(dst);
3761
3762 if (dst.bytes() == 8) {
3763 src = emit_extract_vector(ctx, src, index / comp, v1);
3764 index %= comp;
3765 def = bld.def(v1);
3766 }
3767
3768 assert(def.bytes() <= 4);
3769 src = emit_extract_vector(ctx, src, 0, def.regClass());
3770 bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3771 Operand::c32(bits), Operand::c32(is_signed));
3772
3773 if (dst.size() == 2) {
3774 Temp lo = def.getTemp();
3775 Operand hi = Operand::zero();
3776 if (is_signed)
3777 hi = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31), lo);
3778 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
3779 }
3780 }
3781 break;
3782 }
3783 case nir_op_insert_u8:
3784 case nir_op_insert_u16: {
3785 unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3786 uint32_t bits = comp == 4 ? 8 : 16;
3787 unsigned index = nir_src_as_uint(instr->src[1].src);
3788 if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3789 assert(index == 0);
3790 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3791 } else {
3792 Temp src = get_alu_src(ctx, instr->src[0]);
3793 Definition def(dst);
3794 bool swap = false;
3795 if (dst.bytes() == 8) {
3796 src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3797 swap = index >= comp;
3798 index %= comp;
3799 def = bld.def(src.type(), 1);
3800 }
3801 if (def.regClass() == s1) {
3802 bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3803 Operand::c32(index), Operand::c32(bits));
3804 } else {
3805 src = emit_extract_vector(ctx, src, 0, def.regClass());
3806 bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3807 Operand::c32(bits));
3808 }
3809 if (dst.size() == 2 && swap)
3810 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3811 def.getTemp());
3812 else if (dst.size() == 2)
3813 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3814 Operand::zero());
3815 }
3816 break;
3817 }
3818 case nir_op_bit_count: {
3819 Temp src = get_alu_src(ctx, instr->src[0]);
3820 if (src.regClass() == s1) {
3821 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3822 } else if (src.regClass() == v1) {
3823 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3824 } else if (src.regClass() == v2) {
3825 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3826 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3827 emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3828 } else if (src.regClass() == s2) {
3829 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3830 } else {
3831 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3832 }
3833 break;
3834 }
3835 case nir_op_flt: {
3836 emit_comparison(
3837 ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3838 aco_opcode::v_cmp_lt_f64,
3839 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f16 : aco_opcode::num_opcodes,
3840 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lt_f32 : aco_opcode::num_opcodes);
3841 break;
3842 }
3843 case nir_op_fge: {
3844 emit_comparison(
3845 ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3846 aco_opcode::v_cmp_ge_f64,
3847 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f16 : aco_opcode::num_opcodes,
3848 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_ge_f32 : aco_opcode::num_opcodes);
3849 break;
3850 }
3851 case nir_op_fltu: {
3852 emit_comparison(
3853 ctx, instr, dst, aco_opcode::v_cmp_nge_f16, aco_opcode::v_cmp_nge_f32,
3854 aco_opcode::v_cmp_nge_f64,
3855 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f16 : aco_opcode::num_opcodes,
3856 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nge_f32 : aco_opcode::num_opcodes);
3857 break;
3858 }
3859 case nir_op_fgeu: {
3860 emit_comparison(
3861 ctx, instr, dst, aco_opcode::v_cmp_nlt_f16, aco_opcode::v_cmp_nlt_f32,
3862 aco_opcode::v_cmp_nlt_f64,
3863 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f16 : aco_opcode::num_opcodes,
3864 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlt_f32 : aco_opcode::num_opcodes);
3865 break;
3866 }
3867 case nir_op_feq: {
3868 emit_comparison(
3869 ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3870 aco_opcode::v_cmp_eq_f64,
3871 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f16 : aco_opcode::num_opcodes,
3872 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_eq_f32 : aco_opcode::num_opcodes);
3873 break;
3874 }
3875 case nir_op_fneu: {
3876 emit_comparison(
3877 ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3878 aco_opcode::v_cmp_neq_f64,
3879 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f16 : aco_opcode::num_opcodes,
3880 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_neq_f32 : aco_opcode::num_opcodes);
3881 break;
3882 }
3883 case nir_op_fequ: {
3884 emit_comparison(
3885 ctx, instr, dst, aco_opcode::v_cmp_nlg_f16, aco_opcode::v_cmp_nlg_f32,
3886 aco_opcode::v_cmp_nlg_f64,
3887 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f16 : aco_opcode::num_opcodes,
3888 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_nlg_f32 : aco_opcode::num_opcodes);
3889 break;
3890 }
3891 case nir_op_fneo: {
3892 emit_comparison(
3893 ctx, instr, dst, aco_opcode::v_cmp_lg_f16, aco_opcode::v_cmp_lg_f32,
3894 aco_opcode::v_cmp_lg_f64,
3895 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f16 : aco_opcode::num_opcodes,
3896 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_lg_f32 : aco_opcode::num_opcodes);
3897 break;
3898 }
3899 case nir_op_funord: {
3900 emit_comparison(
3901 ctx, instr, dst, aco_opcode::v_cmp_u_f16, aco_opcode::v_cmp_u_f32, aco_opcode::v_cmp_u_f64,
3902 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f16 : aco_opcode::num_opcodes,
3903 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_u_f32 : aco_opcode::num_opcodes);
3904 break;
3905 }
3906 case nir_op_ford: {
3907 emit_comparison(
3908 ctx, instr, dst, aco_opcode::v_cmp_o_f16, aco_opcode::v_cmp_o_f32, aco_opcode::v_cmp_o_f64,
3909 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f16 : aco_opcode::num_opcodes,
3910 ctx->program->gfx_level >= GFX11_5 ? aco_opcode::s_cmp_o_f32 : aco_opcode::num_opcodes);
3911 break;
3912 }
3913 case nir_op_ilt: {
3914 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3915 aco_opcode::v_cmp_lt_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_i32);
3916 break;
3917 }
3918 case nir_op_ige: {
3919 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3920 aco_opcode::v_cmp_ge_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_i32);
3921 break;
3922 }
3923 case nir_op_ieq: {
3924 if (instr->src[0].src.ssa->bit_size == 1)
3925 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3926 else
3927 emit_comparison(
3928 ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3929 aco_opcode::v_cmp_eq_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_eq_i32,
3930 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3931 break;
3932 }
3933 case nir_op_ine: {
3934 if (instr->src[0].src.ssa->bit_size == 1)
3935 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3936 else
3937 emit_comparison(
3938 ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3939 aco_opcode::v_cmp_lg_i64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lg_i32,
3940 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3941 break;
3942 }
3943 case nir_op_ult: {
3944 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3945 aco_opcode::v_cmp_lt_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_lt_u32);
3946 break;
3947 }
3948 case nir_op_uge: {
3949 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3950 aco_opcode::v_cmp_ge_u64, aco_opcode::num_opcodes, aco_opcode::s_cmp_ge_u32);
3951 break;
3952 }
3953 case nir_op_bitz:
3954 case nir_op_bitnz: {
3955 assert(instr->src[0].src.ssa->bit_size != 1);
3956 bool test0 = instr->op == nir_op_bitz;
3957 Temp src0 = get_alu_src(ctx, instr->src[0]);
3958 Temp src1 = get_alu_src(ctx, instr->src[1]);
3959 bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
3960 if (!use_valu) {
3961 aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
3962 : aco_opcode::s_bitcmp1_b32;
3963 if (test0)
3964 op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
3965 : aco_opcode::s_bitcmp0_b32;
3966 emit_sopc_instruction(ctx, instr, op, dst);
3967 break;
3968 }
3969
3970 /* We do not have a VALU version of s_bitcmp.
3971 * But if the second source is constant, we can use
3972 * v_cmp_class_f32's LUT to check the bit.
3973 * The LUT only has 10 entries, so extract a higher byte if we have to.
3974           * For sign bits, comparison with 0 is better because v_cmp_class
3975           * can't be inverted.
3976 */
3977 if (nir_src_is_const(instr->src[1].src)) {
3978 uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
3979 bit &= instr->src[0].src.ssa->bit_size - 1;
3980 src0 = as_vgpr(ctx, src0);
3981
3982 if (src0.regClass() == v2) {
3983 src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
3984 bit &= 31;
3985 }
3986
3987 if (bit == 31) {
3988 bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3989 Operand::c32(0), src0);
3990 break;
3991 }
3992
3993 if (bit == 15 && ctx->program->gfx_level >= GFX8) {
3994 bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
3995 Operand::c32(0), src0);
3996 break;
3997 }
3998
3999 /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
4000 const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
4001 const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
4002 const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
4003 if (use_opsel) {
4004 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
4005 Operand::c32(16), Operand::c32(0));
4006 bit &= 0xf;
4007 }
4008
4009 /* If we can use sdwa the extract is free, while test0's s_not is not. */
4010 if (bit == 7 && test0 && can_sdwa) {
4011 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
4012 Operand::c32(8), Operand::c32(1));
4013 bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
4014 Operand::c32(0), src0);
4015 break;
4016 }
4017
4018 if (bit > max_bit) {
4019 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
4020 Operand::c32(8), Operand::c32(0));
4021 bit &= 0x7;
4022 }
4023
4024          /* denorm and snan/qnan inputs are preserved regardless of the float control modes. */
4025 static const struct {
4026 uint32_t fp32;
4027 uint32_t fp16;
4028 bool negate;
4029 } float_lut[10] = {
4030 {0x7f800001, 0x7c01, false}, /* snan */
4031 {~0u, ~0u, false}, /* qnan */
4032 {0xff800000, 0xfc00, false}, /* -inf */
4033 {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
4034 {1, 1, true}, /* -denormal */
4035 {0, 0, true}, /* -0.0 */
4036 {0, 0, false}, /* +0.0 */
4037 {1, 1, false}, /* +denormal */
4038 {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
4039 {0x7f800000, 0x7c00, false}, /* +inf */
4040 };
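         /* v_cmp_class checks whether the class of its first operand is enabled in the mask given
          * by the second operand. Passing a constant whose class index equals 'bit' as the value
          * and src0 as the mask therefore tests exactly that bit of src0; the neg modifier turns
          * the +denormal/+0.0 constants into their negative classes. */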
4041
4042 Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
4043 /* fp16 can use s_movk for bit 0. It also supports opsel on gfx11. */
4044 const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
4045 (ctx->program->gfx_level >= GFX11 && use_opsel);
4046 const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
4047 const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
4048
4049 VALU_instruction& res =
4050 bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
4051 if (float_lut[bit].negate) {
4052 res.format = asVOP3(res.format);
4053 res.neg[0] = true;
4054 }
4055
4056 if (test0)
4057 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
4058
4059 break;
4060 }
4061
4062 Temp res;
4063 aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
4064 if (instr->src[0].src.ssa->bit_size == 16) {
4065 op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
4066 if (ctx->program->gfx_level < GFX10)
4067 res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
4068 else
4069 res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
4070
4071 res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
4072 } else if (instr->src[0].src.ssa->bit_size == 32) {
4073 res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
4074 } else if (instr->src[0].src.ssa->bit_size == 64) {
4075 if (ctx->program->gfx_level < GFX8)
4076 res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
4077 else
4078 res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
4079
4080 res = emit_extract_vector(ctx, res, 0, v1);
4081 res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
4082 } else {
4083 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
4084 }
4085 bld.vopc(op, Definition(dst), Operand::c32(0), res);
4086 break;
4087 }
4088 default: isel_err(&instr->instr, "Unknown NIR ALU instr");
4089 }
4090 }
4091
4092 void
4093 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
4094 {
4095 Temp dst = get_ssa_temp(ctx, &instr->def);
4096
4097    // TODO: we really want to know the resulting type, as this would allow for 64bit literals
4098    // which get truncated to the lsb if double and msb if int
4099 // for now, we only use s_mov_b64 with 64bit inline constants
4100 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
4101 assert(dst.type() == RegType::sgpr);
4102
4103 Builder bld(ctx->program, ctx->block);
4104
4105 if (instr->def.bit_size == 1) {
4106 assert(dst.regClass() == bld.lm);
4107 int val = instr->value[0].b ? -1 : 0;
4108 Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
4109 bld.copy(Definition(dst), op);
4110 } else if (instr->def.bit_size == 8) {
4111 bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
4112 } else if (instr->def.bit_size == 16) {
4113 /* sign-extend to use s_movk_i32 instead of a literal */
4114 bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
4115 } else if (dst.size() == 1) {
4116 bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
4117 } else {
4118 assert(dst.size() != 1);
4119 aco_ptr<Instruction> vec{
4120 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4121 if (instr->def.bit_size == 64)
4122 for (unsigned i = 0; i < dst.size(); i++)
4123 vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
4124 else {
4125 for (unsigned i = 0; i < dst.size(); i++)
4126 vec->operands[i] = Operand::c32(instr->value[i].u32);
4127 }
4128 vec->definitions[0] = Definition(dst);
4129 ctx->block->instructions.emplace_back(std::move(vec));
4130 }
4131 }
4132
4133 Temp
4134 emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
4135 {
4136 Builder bld(ctx->program, ctx->block);
4137
4138 if (src.regClass().type() == RegType::sgpr) {
4139 bld.copy(Definition(dst), src);
4140 } else if (src.size() == 1) {
4141 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
4142 } else {
4143 aco_ptr<Instruction> split{
4144 create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())};
4145 split->operands[0] = Operand(src);
4146
4147 for (unsigned i = 0; i < src.size(); i++) {
4148 split->definitions[i] =
4149 bld.def(RegClass::get(RegType::vgpr, MIN2(src.bytes() - i * 4, 4)));
4150 }
4151
4152 Instruction* split_raw = split.get();
4153 ctx->block->instructions.emplace_back(std::move(split));
4154
4155 aco_ptr<Instruction> vec{
4156 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)};
4157 vec->definitions[0] = Definition(dst);
4158 for (unsigned i = 0; i < src.size(); i++) {
4159 vec->operands[i] = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1),
4160 split_raw->definitions[i].getTemp());
4161 }
4162
4163 ctx->block->instructions.emplace_back(std::move(vec));
4164 if (src.bytes() % 4 == 0)
4165 emit_split_vector(ctx, dst, src.size());
4166 }
4167
4168 return dst;
4169 }
4170
4171 struct LoadEmitInfo {
4172 Operand offset;
4173 Temp dst;
4174 unsigned num_components;
4175 unsigned component_size;
4176 Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
4177 Temp idx = Temp(0, v1); /* buffer index */
4178 unsigned component_stride = 0;
4179 unsigned const_offset = 0;
4180 unsigned align_mul = 0;
4181 unsigned align_offset = 0;
4182 pipe_format format;
4183
4184 ac_hw_cache_flags cache = {{0, 0, 0, 0, 0}};
4185 bool split_by_component_stride = true;
4186 bool readfirstlane_for_uniform = false;
4187 unsigned swizzle_component_size = 0;
4188 memory_sync_info sync;
4189 Temp soffset = Temp(0, s1);
4190 };
4191
4192 struct EmitLoadParameters {
4193 using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
4194 unsigned bytes_needed, unsigned align, unsigned const_offset,
4195 Temp dst_hint);
4196
4197 Callback callback;
4198 unsigned max_const_offset_plus_one;
4199 };
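/* Contract between emit_load() and its callbacks: the callback emits one load of roughly
 * bytes_needed bytes (possibly more or fewer depending on alignment and available opcodes)
 * and returns the result. Constant offsets at or above max_const_offset_plus_one cannot be
 * encoded by the callback, so emit_load() folds them into the address first.
 */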
4200
4201 void
4202 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
4203 const EmitLoadParameters& params)
4204 {
4205 unsigned load_size = info.num_components * info.component_size;
4206 unsigned component_size = info.component_size;
4207
4208 unsigned num_vals = 0;
4209 Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
4210
4211 unsigned const_offset = info.const_offset;
4212
4213 const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
4214 unsigned align_offset = info.align_offset % align_mul;
4215
4216 unsigned bytes_read = 0;
4217 while (bytes_read < load_size) {
4218 unsigned bytes_needed = load_size - bytes_read;
4219
4220 if (info.split_by_component_stride) {
4221 if (info.swizzle_component_size)
4222 bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
4223 if (info.component_stride)
4224 bytes_needed = MIN2(bytes_needed, info.component_size);
4225 }
4226
4227 /* reduce constant offset */
4228 Operand offset = info.offset;
4229 unsigned reduced_const_offset = const_offset;
4230 if (const_offset && (const_offset >= params.max_const_offset_plus_one)) {
4231 unsigned to_add =
4232 const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4233 reduced_const_offset %= params.max_const_offset_plus_one;
4234 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4235 if (offset.isConstant()) {
4236 offset = Operand::c32(offset.constantValue() + to_add);
4237 } else if (offset.isUndefined()) {
4238 offset = Operand::c32(to_add);
4239 } else if (offset_tmp.regClass() == s1) {
4240 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4241 Operand::c32(to_add));
4242 } else if (offset_tmp.regClass() == v1) {
4243 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4244 } else {
4245 Temp lo = bld.tmp(offset_tmp.type(), 1);
4246 Temp hi = bld.tmp(offset_tmp.type(), 1);
4247 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4248
4249 if (offset_tmp.regClass() == s2) {
4250 Temp carry = bld.tmp(s1);
4251 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4252 Operand::c32(to_add));
4253 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4254 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4255 } else {
4256 Temp new_lo = bld.tmp(v1);
4257 Temp carry =
4258 bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4259 hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4260 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4261 }
4262 }
4263 }
4264
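      /* The usable alignment for this chunk is the lowest set bit of align_offset, or align_mul
       * itself if the offset is already a multiple of it: e.g. align_mul=16, align_offset=4
       * gives an alignment of 4.
       */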
4265 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4266 Temp offset_tmp = offset.isTemp() ? offset.getTemp()
4267 : offset.isConstant() ? bld.copy(bld.def(s1), offset)
4268 : Temp(0, s1);
4269
4270 Temp val = params.callback(bld, info, offset_tmp, bytes_needed, align, reduced_const_offset,
4271 info.dst);
4272
4273 /* the callback wrote directly to dst */
4274 if (val == info.dst) {
4275 assert(num_vals == 0);
4276 emit_split_vector(ctx, info.dst, info.num_components);
4277 return;
4278 }
4279
4280 /* add result to list and advance */
4281 if (info.component_stride) {
4282 assert(val.bytes() % info.component_size == 0);
4283 unsigned num_loaded_components = val.bytes() / info.component_size;
4284 unsigned advance_bytes = info.component_stride * num_loaded_components;
4285 const_offset += advance_bytes;
4286 align_offset = (align_offset + advance_bytes) % align_mul;
4287 } else {
4288 const_offset += val.bytes();
4289 align_offset = (align_offset + val.bytes()) % align_mul;
4290 }
4291 bytes_read += val.bytes();
4292 vals[num_vals++] = val;
4293 }
4294
4295 /* create array of components */
4296 unsigned components_split = 0;
4297 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4298 bool has_vgprs = false;
4299 for (unsigned i = 0; i < num_vals;) {
4300 Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4301 unsigned num_tmps = 0;
4302 unsigned tmp_size = 0;
4303 RegType reg_type = RegType::sgpr;
4304 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4305 if (vals[i].type() == RegType::vgpr)
4306 reg_type = RegType::vgpr;
4307 tmp_size += vals[i].bytes();
4308 tmp[num_tmps++] = vals[i++];
4309 }
4310 if (num_tmps > 1) {
4311 aco_ptr<Instruction> vec{
4312 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4313 for (unsigned j = 0; j < num_tmps; j++)
4314 vec->operands[j] = Operand(tmp[j]);
4315 tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4316 vec->definitions[0] = Definition(tmp[0]);
4317 bld.insert(std::move(vec));
4318 }
4319
4320 if (tmp[0].bytes() % component_size) {
4321 /* trim tmp[0] */
4322 assert(i == num_vals);
4323 RegClass new_rc =
4324 RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4325 tmp[0] =
4326 bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4327 }
4328
4329 RegClass elem_rc = RegClass::get(reg_type, component_size);
4330
4331 unsigned start = components_split;
4332
4333 if (tmp_size == elem_rc.bytes()) {
4334 allocated_vec[components_split++] = tmp[0];
4335 } else {
4336 assert(tmp_size % elem_rc.bytes() == 0);
4337 aco_ptr<Instruction> split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO,
4338 1, tmp_size / elem_rc.bytes())};
4339 for (auto& def : split->definitions) {
4340 Temp component = bld.tmp(elem_rc);
4341 allocated_vec[components_split++] = component;
4342 def = Definition(component);
4343 }
4344 split->operands[0] = Operand(tmp[0]);
4345 bld.insert(std::move(split));
4346 }
4347
4348 /* try to p_as_uniform early so we can create more optimizable code and
4349 * also update allocated_vec */
4350 for (unsigned j = start; j < components_split; j++) {
4351 if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) {
4352 if (info.readfirstlane_for_uniform) {
4353 allocated_vec[j] = emit_readfirstlane(
4354 ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size())));
4355 } else {
4356 allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4357 }
4358 }
4359 has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4360 }
4361 }
4362
4363 /* concatenate components and p_as_uniform() result if needed */
4364 if (info.dst.type() == RegType::vgpr || !has_vgprs)
4365 ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4366
4367 int padding_bytes =
4368 MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4369
4370 aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
4371 info.num_components + !!padding_bytes, 1)};
4372 for (unsigned i = 0; i < info.num_components; i++)
4373 vec->operands[i] = Operand(allocated_vec[i]);
4374 if (padding_bytes)
4375 vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4376 if (info.dst.type() == RegType::sgpr && has_vgprs) {
4377 Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4378 vec->definitions[0] = Definition(tmp);
4379 bld.insert(std::move(vec));
4380 if (info.readfirstlane_for_uniform)
4381 emit_readfirstlane(ctx, tmp, info.dst);
4382 else
4383 bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4384 } else {
4385 vec->definitions[0] = Definition(info.dst);
4386 bld.insert(std::move(vec));
4387 }
4388 }
4389
4390 Operand
4391 load_lds_size_m0(Builder& bld)
4392 {
4393 /* m0 does not need to be initialized on GFX9+ */
4394 if (bld.program->gfx_level >= GFX9)
4395 return Operand(s1);
4396
4397 return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4398 }
4399
4400 Temp
4401 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4402 unsigned align, unsigned const_offset, Temp dst_hint)
4403 {
4404 offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4405
4406 Operand m = load_lds_size_m0(bld);
4407
4408 bool large_ds_read = bld.program->gfx_level >= GFX7;
4409 bool usable_read2 = bld.program->gfx_level >= GFX7;
4410
4411 bool read2 = false;
4412 unsigned size = 0;
4413 aco_opcode op;
4414 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4415 size = 16;
4416 op = aco_opcode::ds_read_b128;
4417 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4418 size = 16;
4419 read2 = true;
4420 op = aco_opcode::ds_read2_b64;
4421 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4422 size = 12;
4423 op = aco_opcode::ds_read_b96;
4424 } else if (bytes_needed >= 8 && align % 8 == 0) {
4425 size = 8;
4426 op = aco_opcode::ds_read_b64;
4427 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4428 size = 8;
4429 read2 = true;
4430 op = aco_opcode::ds_read2_b32;
4431 } else if (bytes_needed >= 4 && align % 4 == 0) {
4432 size = 4;
4433 op = aco_opcode::ds_read_b32;
4434 } else if (bytes_needed >= 2 && align % 2 == 0) {
4435 size = 2;
4436 op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4437 } else {
4438 size = 1;
4439 op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4440 }
4441
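   /* ds_read2* encodes two 8-bit offsets in units of one element, while the other ds_read*
    * opcodes take a single 16-bit byte offset, so scale the constant offset accordingly and
    * move any excess into the address.
    */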
4442 unsigned const_offset_unit = read2 ? size / 2u : 1u;
4443 unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
4444
4445 if (const_offset > (const_offset_range - const_offset_unit)) {
4446 unsigned excess = const_offset - (const_offset % const_offset_range);
4447 offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4448 const_offset -= excess;
4449 }
4450
4451 const_offset /= const_offset_unit;
4452
4453 RegClass rc = RegClass::get(RegType::vgpr, size);
4454 Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4455 Instruction* instr;
4456 if (read2)
4457 instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4458 else
4459 instr = bld.ds(op, Definition(val), offset, m, const_offset);
4460 instr->ds().sync = info.sync;
4461
4462 if (m.isUndefined())
4463 instr->operands.pop_back();
4464
4465 return val;
4466 }
4467
4468 const EmitLoadParameters lds_load_params{lds_load_callback, UINT32_MAX};
4469
4470 Temp
4471 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4472 unsigned align, unsigned const_offset, Temp dst_hint)
4473 {
4474 assert(align >= 4u);
4475
4476 bld.program->has_smem_buffer_or_global_loads = true;
4477
4478 bool buffer = info.resource.id() && info.resource.bytes() == 16;
4479 Temp addr = info.resource;
4480 if (!buffer && !addr.id()) {
4481 addr = offset;
4482 offset = Temp();
4483 }
4484
4485 bytes_needed = MIN2(bytes_needed, 64);
4486 unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4487 unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4488    /* Only round up global loads if the access is aligned so that it won't cross pages */
4489 bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
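   /* e.g. bytes_needed=24 is rounded up to 32 when sufficiently aligned, otherwise down to 16. */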
4490
4491 aco_opcode op;
4492 if (bytes_needed <= 4) {
4493 op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4494 } else if (bytes_needed <= 8) {
4495 op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4496 } else if (bytes_needed <= 16) {
4497 op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4498 } else if (bytes_needed <= 32) {
4499 op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4500 } else {
4501 assert(bytes_needed == 64);
4502 op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4503 }
4504
4505 aco_ptr<Instruction> load{create_instruction(op, Format::SMEM, 2, 1)};
4506 if (buffer) {
4507 if (const_offset)
4508 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4509 Operand::c32(const_offset));
4510 load->operands[0] = Operand(info.resource);
4511 load->operands[1] = Operand(offset);
4512 } else {
4513 load->operands[0] = Operand(addr);
4514 if (offset.id() && const_offset)
4515 load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4516 Operand::c32(const_offset));
4517 else if (offset.id())
4518 load->operands[1] = Operand(offset);
4519 else
4520 load->operands[1] = Operand::c32(const_offset);
4521 }
4522 RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4523 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4524 load->definitions[0] = Definition(val);
4525 load->smem().cache = info.cache;
4526 load->smem().sync = info.sync;
4527 bld.insert(std::move(load));
4528 return val;
4529 }
4530
4531 const EmitLoadParameters smem_load_params{smem_load_callback, 1024};
4532
4533 Temp
4534 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4535 unsigned align_, unsigned const_offset, Temp dst_hint)
4536 {
4537 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4538 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4539
4540 if (info.soffset.id()) {
4541 if (soffset.isTemp())
4542 vaddr = bld.copy(bld.def(v1), soffset);
4543 soffset = Operand(info.soffset);
4544 }
4545
4546 if (soffset.isUndefined())
4547 soffset = Operand::zero();
4548
4549 bool offen = !vaddr.isUndefined();
4550 bool idxen = info.idx.id();
4551
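   /* With both index and offset enabled, MUBUF expects vaddr to hold the (index, offset) pair. */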
4552 if (offen && idxen)
4553 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4554 else if (idxen)
4555 vaddr = Operand(info.idx);
4556
4557 unsigned bytes_size = 0;
4558 aco_opcode op;
4559 if (bytes_needed == 1 || align_ % 2) {
4560 bytes_size = 1;
4561 op = aco_opcode::buffer_load_ubyte;
4562 } else if (bytes_needed == 2 || align_ % 4) {
4563 bytes_size = 2;
4564 op = aco_opcode::buffer_load_ushort;
4565 } else if (bytes_needed <= 4) {
4566 bytes_size = 4;
4567 op = aco_opcode::buffer_load_dword;
4568 } else if (bytes_needed <= 8) {
4569 bytes_size = 8;
4570 op = aco_opcode::buffer_load_dwordx2;
4571 } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4572 bytes_size = 12;
4573 op = aco_opcode::buffer_load_dwordx3;
4574 } else {
4575 bytes_size = 16;
4576 op = aco_opcode::buffer_load_dwordx4;
4577 }
4578 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4579 mubuf->operands[0] = Operand(info.resource);
4580 mubuf->operands[1] = vaddr;
4581 mubuf->operands[2] = soffset;
4582 mubuf->mubuf().offen = offen;
4583 mubuf->mubuf().idxen = idxen;
4584 mubuf->mubuf().cache = info.cache;
4585 mubuf->mubuf().sync = info.sync;
4586 mubuf->mubuf().offset = const_offset;
4587 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4588 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4589 mubuf->definitions[0] = Definition(val);
4590 bld.insert(std::move(mubuf));
4591
4592 return val;
4593 }
4594
4595 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, 4096};
4596
4597 Temp
4598 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4599 unsigned bytes_needed, unsigned align_, unsigned const_offset,
4600 Temp dst_hint)
4601 {
4602 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4603 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4604
4605 if (info.soffset.id()) {
4606 if (soffset.isTemp())
4607 vaddr = bld.copy(bld.def(v1), soffset);
4608 soffset = Operand(info.soffset);
4609 }
4610
4611 if (soffset.isUndefined())
4612 soffset = Operand::zero();
4613
4614 bool offen = !vaddr.isUndefined();
4615 bool idxen = info.idx.id();
4616
4617 if (offen && idxen)
4618 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4619 else if (idxen)
4620 vaddr = Operand(info.idx);
4621
4622 aco_opcode op = aco_opcode::num_opcodes;
4623 if (info.component_size == 2) {
4624 switch (bytes_needed) {
4625 case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4626 case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4627 case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4628 case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4629 default: unreachable("invalid buffer load format size"); break;
4630 }
4631 } else {
4632 assert(info.component_size == 4);
4633 switch (bytes_needed) {
4634 case 4: op = aco_opcode::buffer_load_format_x; break;
4635 case 8: op = aco_opcode::buffer_load_format_xy; break;
4636 case 12: op = aco_opcode::buffer_load_format_xyz; break;
4637 case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4638 default: unreachable("invalid buffer load format size"); break;
4639 }
4640 }
4641
4642 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4643 mubuf->operands[0] = Operand(info.resource);
4644 mubuf->operands[1] = vaddr;
4645 mubuf->operands[2] = soffset;
4646 mubuf->mubuf().offen = offen;
4647 mubuf->mubuf().idxen = idxen;
4648 mubuf->mubuf().cache = info.cache;
4649 mubuf->mubuf().sync = info.sync;
4650 mubuf->mubuf().offset = const_offset;
4651 RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4652 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4653 mubuf->definitions[0] = Definition(val);
4654 bld.insert(std::move(mubuf));
4655
4656 return val;
4657 }
4658
4659 const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, 4096};
4660
4661 Temp
4662 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4663 unsigned align_, unsigned const_offset, Temp dst_hint)
4664 {
4665 unsigned bytes_size = 0;
4666 aco_opcode op;
4667 if (bytes_needed == 1 || align_ % 2u) {
4668 bytes_size = 1;
4669 op = aco_opcode::scratch_load_ubyte;
4670 } else if (bytes_needed == 2 || align_ % 4u) {
4671 bytes_size = 2;
4672 op = aco_opcode::scratch_load_ushort;
4673 } else if (bytes_needed <= 4) {
4674 bytes_size = 4;
4675 op = aco_opcode::scratch_load_dword;
4676 } else if (bytes_needed <= 8) {
4677 bytes_size = 8;
4678 op = aco_opcode::scratch_load_dwordx2;
4679 } else if (bytes_needed <= 12) {
4680 bytes_size = 12;
4681 op = aco_opcode::scratch_load_dwordx3;
4682 } else {
4683 bytes_size = 16;
4684 op = aco_opcode::scratch_load_dwordx4;
4685 }
4686 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4687 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4688 aco_ptr<Instruction> flat{create_instruction(op, Format::SCRATCH, 2, 1)};
4689 flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4690 flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4691 flat->scratch().sync = info.sync;
4692 flat->scratch().offset = const_offset;
4693 flat->definitions[0] = Definition(val);
4694 bld.insert(std::move(flat));
4695
4696 return val;
4697 }
4698
4699 const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, 4096};
4700 const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, 2048};
4701
4702 Temp
4703 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4704 {
4705 uint32_t desc[4];
4706 ac_build_raw_buffer_descriptor(bld.program->gfx_level, 0, 0xffffffff, desc);
4707
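   /* For VGPR addresses the descriptor base stays zero and the 64-bit address is passed via
    * vaddr with addr64 set (see global_load_callback); for SGPR addresses the address itself
    * becomes the descriptor base.
    */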
4708 if (addr.type() == RegType::vgpr)
4709 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4710 Operand::c32(desc[2]), Operand::c32(desc[3]));
4711 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(desc[2]),
4712 Operand::c32(desc[3]));
4713 }
4714
4715 Temp
4716 add64_32(Builder& bld, Temp src0, Temp src1)
4717 {
4718 Temp src00 = bld.tmp(src0.type(), 1);
4719 Temp src01 = bld.tmp(src0.type(), 1);
4720 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4721
4722 if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4723 Temp dst0 = bld.tmp(v1);
4724 Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4725 Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4726 return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4727 } else {
4728 Temp carry = bld.tmp(s1);
4729 Temp dst0 =
4730 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4731 Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4732 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4733 }
4734 }
4735
4736 void
4737 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4738 uint32_t* const_offset_inout, Temp* offset_inout)
4739 {
4740 Temp address = *address_inout;
4741 uint64_t const_offset = *const_offset_inout + offset_in;
4742 Temp offset = *offset_inout;
4743
4744 uint64_t max_const_offset_plus_one =
4745 1; /* GFX7/8/9: FLAT loads do not support constant offsets */
4746 if (bld.program->gfx_level >= GFX9)
4747 max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4748 else if (bld.program->gfx_level == GFX6)
4749 max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4750 uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4751 const_offset %= max_const_offset_plus_one;
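   /* e.g. with a 4096 limit, a constant offset of 5000 leaves const_offset=904 and moves
    * excess_offset=4096 into the address/offset handling below.
    */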
4752
4753 if (!offset.id()) {
4754 while (unlikely(excess_offset > UINT32_MAX)) {
4755 address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4756 excess_offset -= UINT32_MAX;
4757 }
4758 if (excess_offset)
4759 offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4760 } else {
4761       /* If we add to "offset", we would transform the intended
4762 * "address + u2u64(offset) + u2u64(const_offset)" into
4763 * "address + u2u64(offset + const_offset)", so add to the address.
4764 * This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition,
4765 * but that should be really rare.
4766 */
4767 while (excess_offset) {
4768 uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4769 address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4770 excess_offset -= src2;
4771 }
4772 }
4773
4774 if (bld.program->gfx_level == GFX6) {
4775 /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4776 if (offset.type() != RegType::sgpr) {
4777 address = add64_32(bld, address, offset);
4778 offset = Temp();
4779 }
4780 offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4781 } else if (bld.program->gfx_level <= GFX8) {
4782 /* GFX7,8 (FLAT): VGPR address */
4783 if (offset.id()) {
4784 address = add64_32(bld, address, offset);
4785 offset = Temp();
4786 }
4787 address = as_vgpr(bld, address);
4788 } else {
4789 /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4790 if (address.type() == RegType::vgpr && offset.id()) {
4791 address = add64_32(bld, address, offset);
4792 offset = Temp();
4793 } else if (address.type() == RegType::sgpr && offset.id()) {
4794 offset = as_vgpr(bld, offset);
4795 }
4796 if (address.type() == RegType::sgpr && !offset.id())
4797 offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4798 }
4799
4800 *address_inout = address;
4801 *const_offset_inout = const_offset;
4802 *offset_inout = offset;
4803 }
4804
4805 Temp
4806 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4807 unsigned align_, unsigned const_offset, Temp dst_hint)
4808 {
4809 Temp addr = info.resource;
4810 if (!addr.id()) {
4811 addr = offset;
4812 offset = Temp();
4813 }
4814 lower_global_address(bld, 0, &addr, &const_offset, &offset);
4815
4816 unsigned bytes_size = 0;
4817 bool use_mubuf = bld.program->gfx_level == GFX6;
4818 bool global = bld.program->gfx_level >= GFX9;
4819 aco_opcode op;
4820 if (bytes_needed == 1 || align_ % 2u) {
4821 bytes_size = 1;
4822 op = use_mubuf ? aco_opcode::buffer_load_ubyte
4823 : global ? aco_opcode::global_load_ubyte
4824 : aco_opcode::flat_load_ubyte;
4825 } else if (bytes_needed == 2 || align_ % 4u) {
4826 bytes_size = 2;
4827 op = use_mubuf ? aco_opcode::buffer_load_ushort
4828 : global ? aco_opcode::global_load_ushort
4829 : aco_opcode::flat_load_ushort;
4830 } else if (bytes_needed <= 4) {
4831 bytes_size = 4;
4832 op = use_mubuf ? aco_opcode::buffer_load_dword
4833 : global ? aco_opcode::global_load_dword
4834 : aco_opcode::flat_load_dword;
4835 } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
4836 bytes_size = 8;
4837 op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4838 : global ? aco_opcode::global_load_dwordx2
4839 : aco_opcode::flat_load_dwordx2;
4840 } else if (bytes_needed <= 12 && !use_mubuf) {
4841 bytes_size = 12;
4842 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4843 } else {
4844 bytes_size = 16;
4845 op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4846 : global ? aco_opcode::global_load_dwordx4
4847 : aco_opcode::flat_load_dwordx4;
4848 }
4849 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4850 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4851 if (use_mubuf) {
4852 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3, 1)};
4853 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
4854 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
4855 mubuf->operands[2] = Operand(offset);
4856 mubuf->mubuf().cache = info.cache;
4857 mubuf->mubuf().offset = const_offset;
4858 mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
4859 mubuf->mubuf().disable_wqm = false;
4860 mubuf->mubuf().sync = info.sync;
4861 mubuf->definitions[0] = Definition(val);
4862 bld.insert(std::move(mubuf));
4863 } else {
4864 aco_ptr<Instruction> flat{
4865 create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4866 if (addr.regClass() == s2) {
4867 assert(global && offset.id() && offset.type() == RegType::vgpr);
4868 flat->operands[0] = Operand(offset);
4869 flat->operands[1] = Operand(addr);
4870 } else {
4871 assert(addr.type() == RegType::vgpr && !offset.id());
4872 flat->operands[0] = Operand(addr);
4873 flat->operands[1] = Operand(s1);
4874 }
4875 flat->flatlike().cache = info.cache;
4876 flat->flatlike().sync = info.sync;
4877 assert(global || !const_offset);
4878 flat->flatlike().offset = const_offset;
4879 flat->definitions[0] = Definition(val);
4880 bld.insert(std::move(flat));
4881 }
4882
4883 return val;
4884 }
4885
4886 const EmitLoadParameters global_load_params{global_load_callback, UINT32_MAX};
4887
4888 Temp
4889 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4890 Temp address, unsigned base_offset, unsigned align)
4891 {
4892 assert(util_is_power_of_two_nonzero(align));
4893
4894 Builder bld(ctx->program, ctx->block);
4895
4896 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4897 info.align_mul = align;
4898 info.align_offset = 0;
4899 info.sync = memory_sync_info(storage_shared);
4900 info.const_offset = base_offset;
4901 /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses,
4902 * if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order
4903 * to avoid copy-propagation.
4904 */
4905 info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
4906 ctx->program->wave_size == 64 &&
4907 ctx->program->workgroup_size > 64;
4908 emit_load(ctx, bld, info, lds_load_params);
4909
4910 return dst;
4911 }
4912
4913 void
4914 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4915 Temp src)
4916 {
4917 if (!count)
4918 return;
4919
4920 Builder bld(ctx->program, ctx->block);
4921
4922 /* count == 1 fast path */
4923 if (count == 1) {
4924 if (dst_type == RegType::sgpr)
4925 dst[0] = bld.as_uniform(src);
4926 else
4927 dst[0] = as_vgpr(ctx, src);
4928 return;
4929 }
4930
4931 /* elem_size_bytes is the greatest common divisor which is a power of 2 */
4932 unsigned elem_size_bytes =
4933 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
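   /* e.g. sizes {4, 12} give elem_size_bytes=4; sizes {8, 16} give 8 (the cap from the initial 8). */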
4934
4935 ASSERTED bool is_subdword = elem_size_bytes < 4;
4936 assert(!is_subdword || dst_type == RegType::vgpr);
4937
4938 for (unsigned i = 0; i < count; i++)
4939 dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4940
4941 std::vector<Temp> temps;
4942 /* use allocated_vec if possible */
4943 auto it = ctx->allocated_vec.find(src.id());
4944 if (it != ctx->allocated_vec.end()) {
4945 if (!it->second[0].id())
4946 goto split;
4947 unsigned elem_size = it->second[0].bytes();
4948 assert(src.bytes() % elem_size == 0);
4949
4950 for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4951 if (!it->second[i].id())
4952 goto split;
4953 }
4954 if (elem_size_bytes % elem_size)
4955 goto split;
4956
4957 temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4958 elem_size_bytes = elem_size;
4959 }
4960
4961 split:
4962 /* split src if necessary */
4963 if (temps.empty()) {
4964 if (is_subdword && src.type() == RegType::sgpr)
4965 src = as_vgpr(ctx, src);
4966 if (dst_type == RegType::sgpr)
4967 src = bld.as_uniform(src);
4968
4969 unsigned num_elems = src.bytes() / elem_size_bytes;
4970 aco_ptr<Instruction> split{
4971 create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4972 split->operands[0] = Operand(src);
4973 for (unsigned i = 0; i < num_elems; i++) {
4974 temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4975 split->definitions[i] = Definition(temps.back());
4976 }
4977 bld.insert(std::move(split));
4978 }
4979
4980 unsigned idx = 0;
4981 for (unsigned i = 0; i < count; i++) {
4982 unsigned op_count = dst[i].bytes() / elem_size_bytes;
4983 if (op_count == 1) {
4984 if (dst_type == RegType::sgpr)
4985 dst[i] = bld.as_uniform(temps[idx++]);
4986 else
4987 dst[i] = as_vgpr(ctx, temps[idx++]);
4988 continue;
4989 }
4990
4991 aco_ptr<Instruction> vec{
4992 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
4993 for (unsigned j = 0; j < op_count; j++) {
4994 Temp tmp = temps[idx++];
4995 if (dst_type == RegType::sgpr)
4996 tmp = bld.as_uniform(tmp);
4997 vec->operands[j] = Operand(tmp);
4998 }
4999 vec->definitions[0] = Definition(dst[i]);
5000 bld.insert(std::move(vec));
5001 }
5002 return;
5003 }
5004
5005 bool
5006 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
5007 {
5008 unsigned start_elem = ffs(todo_mask) - 1;
5009 bool skip = !(mask & (1 << start_elem));
5010 if (skip)
5011 mask = ~mask & todo_mask;
5012
5013 mask &= todo_mask;
5014
5015 u_bit_scan_consecutive_range(&mask, start, count);
5016
5017 return !skip;
5018 }
5019
5020 void
5021 advance_write_mask(uint32_t* todo_mask, int start, int count)
5022 {
5023 *todo_mask &= ~u_bit_consecutive(0, count) << start;
5024 }
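/* Example: with mask=0b1100 and todo_mask=0b1111, scan_write_mask() first reports the skipped
 * range start=0, count=2 (returning false); after advance_write_mask() it reports the written
 * range start=2, count=2 (returning true).
 */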
5025
5026 void
5027 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
5028 unsigned base_offset, unsigned align)
5029 {
5030 assert(util_is_power_of_two_nonzero(align));
5031 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
5032
5033 Builder bld(ctx->program, ctx->block);
5034 bool large_ds_write = ctx->options->gfx_level >= GFX7;
5035 bool usable_write2 = ctx->options->gfx_level >= GFX7;
5036
5037 unsigned write_count = 0;
5038 Temp write_datas[32];
5039 unsigned offsets[32];
5040 unsigned bytes[32];
5041 aco_opcode opcodes[32];
5042
5043 wrmask = util_widen_mask(wrmask, elem_size_bytes);
5044
5045 const unsigned wrmask_bitcnt = util_bitcount(wrmask);
5046 uint32_t todo = u_bit_consecutive(0, data.bytes());
5047
5048 if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
5049 todo = MIN2(todo, wrmask);
5050
5051 while (todo) {
5052 int offset, byte;
5053 if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
5054 offsets[write_count] = offset;
5055 bytes[write_count] = byte;
5056 opcodes[write_count] = aco_opcode::num_opcodes;
5057 write_count++;
5058 advance_write_mask(&todo, offset, byte);
5059 continue;
5060 }
5061
5062 bool aligned2 = offset % 2 == 0 && align % 2 == 0;
5063 bool aligned4 = offset % 4 == 0 && align % 4 == 0;
5064 bool aligned8 = offset % 8 == 0 && align % 8 == 0;
5065 bool aligned16 = offset % 16 == 0 && align % 16 == 0;
5066
5067 // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
5068 aco_opcode op = aco_opcode::num_opcodes;
5069 if (byte >= 16 && aligned16 && large_ds_write) {
5070 op = aco_opcode::ds_write_b128;
5071 byte = 16;
5072 } else if (byte >= 12 && aligned16 && large_ds_write) {
5073 op = aco_opcode::ds_write_b96;
5074 byte = 12;
5075 } else if (byte >= 8 && aligned8) {
5076 op = aco_opcode::ds_write_b64;
5077 byte = 8;
5078 } else if (byte >= 4 && aligned4) {
5079 op = aco_opcode::ds_write_b32;
5080 byte = 4;
5081 } else if (byte >= 2 && aligned2) {
5082 op = aco_opcode::ds_write_b16;
5083 byte = 2;
5084 } else if (byte >= 1) {
5085 op = aco_opcode::ds_write_b8;
5086 byte = 1;
5087 } else {
5088 assert(false);
5089 }
5090
5091 offsets[write_count] = offset;
5092 bytes[write_count] = byte;
5093 opcodes[write_count] = op;
5094 write_count++;
5095 advance_write_mask(&todo, offset, byte);
5096 }
5097
5098 Operand m = load_lds_size_m0(bld);
5099
5100 split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
5101
5102 for (unsigned i = 0; i < write_count; i++) {
5103 aco_opcode op = opcodes[i];
5104 if (op == aco_opcode::num_opcodes)
5105 continue;
5106
5107 Temp split_data = write_datas[i];
5108
5109 unsigned second = write_count;
5110 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
5111 for (second = i + 1; second < write_count; second++) {
5112 if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
5113 op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
5114 opcodes[second] = aco_opcode::num_opcodes;
5115 break;
5116 }
5117 }
5118 }
5119
5120 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
5121 unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
5122
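      /* ds_write2* encodes two 8-bit offsets in element units (the second one write2_off slots
       * further), so the largest usable inline byte offset is (255 - write2_off) * element size;
       * the single-offset ds_write* forms take a 16-bit byte offset.
       */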
5123 unsigned inline_offset = base_offset + offsets[i];
5124 unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
5125 Temp address_offset = address;
5126 if (inline_offset > max_offset) {
5127 address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
5128 inline_offset = offsets[i];
5129 }
5130
5131 /* offsets[i] shouldn't be large enough for this to happen */
5132 assert(inline_offset <= max_offset);
5133
5134 Instruction* instr;
5135 if (write2) {
5136 Temp second_data = write_datas[second];
5137 inline_offset /= split_data.bytes();
5138 instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
5139 inline_offset + write2_off);
5140 } else {
5141 instr = bld.ds(op, address_offset, split_data, m, inline_offset);
5142 }
5143 instr->ds().sync = memory_sync_info(storage_shared);
5144
5145 if (m.isUndefined())
5146 instr->operands.pop_back();
5147 }
5148 }
5149
5150 aco_opcode
5151 get_buffer_store_op(unsigned bytes)
5152 {
5153 switch (bytes) {
5154 case 1: return aco_opcode::buffer_store_byte;
5155 case 2: return aco_opcode::buffer_store_short;
5156 case 4: return aco_opcode::buffer_store_dword;
5157 case 8: return aco_opcode::buffer_store_dwordx2;
5158 case 12: return aco_opcode::buffer_store_dwordx3;
5159 case 16: return aco_opcode::buffer_store_dwordx4;
5160 }
5161 unreachable("Unexpected store size");
5162 return aco_opcode::num_opcodes;
5163 }
5164
5165 void
5166 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
5167 Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
5168 Temp* write_datas, unsigned* offsets)
5169 {
5170 unsigned write_count_with_skips = 0;
5171 bool skips[16];
5172 unsigned bytes[16];
5173
5174 /* determine how to split the data */
5175 unsigned todo = u_bit_consecutive(0, data.bytes());
5176 while (todo) {
5177 int offset, byte;
5178 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
5179 offsets[write_count_with_skips] = offset;
5180 if (skips[write_count_with_skips]) {
5181 bytes[write_count_with_skips] = byte;
5182 advance_write_mask(&todo, offset, byte);
5183 write_count_with_skips++;
5184 continue;
5185 }
5186
5187       /* the only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, and they can't be
5188 * larger than swizzle_element_size */
5189 byte = MIN2(byte, swizzle_element_size);
5190 if (byte % 4)
5191 byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
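      /* e.g. byte=7 becomes 4 and byte=3 becomes 2, keeping sizes in the supported set. */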
5192
5193 /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5194 if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5195 byte = 8;
5196
5197 /* dword or larger stores have to be dword-aligned */
5198 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5199 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5200 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5201 if (!dword_aligned)
5202 byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5203
5204 bytes[write_count_with_skips] = byte;
5205 advance_write_mask(&todo, offset, byte);
5206 write_count_with_skips++;
5207 }
5208
5209 /* actually split data */
5210 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5211
5212 /* remove skips */
5213 for (unsigned i = 0; i < write_count_with_skips; i++) {
5214 if (skips[i])
5215 continue;
5216 write_datas[*write_count] = write_datas[i];
5217 offsets[*write_count] = offsets[i];
5218 (*write_count)++;
5219 }
5220 }
5221
5222 Temp
5223 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5224 unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5225 {
5226 Builder bld(ctx->program, ctx->block);
5227 unsigned dword_size = elem_size_bytes / 4;
5228
5229 if (!dst.id())
5230 dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5231
5232 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5233 aco_ptr<Instruction> instr{
5234 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5235 instr->definitions[0] = Definition(dst);
5236
5237 for (unsigned i = 0; i < cnt; ++i) {
5238 if (arr[i].id()) {
5239 assert(arr[i].size() == dword_size);
5240 allocated_vec[i] = arr[i];
5241 instr->operands[i] = Operand(arr[i]);
5242 } else {
5243 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5244 Operand::zero(dword_size == 2 ? 8 : 4));
5245 allocated_vec[i] = zero;
5246 instr->operands[i] = Operand(zero);
5247 }
5248 }
5249
5250 bld.insert(std::move(instr));
5251
5252 if (split_cnt)
5253 emit_split_vector(ctx, dst, split_cnt);
5254 else
5255 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
5256
5257 return dst;
5258 }
5259
5260 inline unsigned
5261 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5262 {
5263 if (const_offset >= 4096) {
5264 unsigned excess_const_offset = const_offset / 4096u * 4096u;
5265 const_offset %= 4096u;
5266
5267 if (!voffset.id())
5268 voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5269 else if (unlikely(voffset.regClass() == s1))
5270 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5271 Operand::c32(excess_const_offset), Operand(voffset));
5272 else if (likely(voffset.regClass() == v1))
5273 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5274 else
5275 unreachable("Unsupported register class of voffset");
5276 }
5277
5278 return const_offset;
5279 }
5280
5281 bool
5282 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5283 {
5284 unsigned write_mask = nir_intrinsic_write_mask(instr);
5285 unsigned component = nir_intrinsic_component(instr);
5286 nir_src offset = *nir_get_io_offset_src(instr);
5287
5288 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5289 return false;
5290
5291 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5292
5293 if (instr->src[0].ssa->bit_size == 64)
5294 write_mask = util_widen_mask(write_mask, 2);
5295
5296 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5297
5298    /* Use the semantic location as index. radv already uses it as the intrinsic
5299     * base but radeonsi does not. We need the LS output and TCS input indices to
5300     * match each other, so we use the semantic location explicitly. The TCS epilog
5301     * also indexes the tess factor temps by semantic location directly.
5302 */
5303 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5304 unsigned base = sem.location;
5305 if (ctx->stage == fragment_fs) {
5306       /* The color result is a legacy slot which never appears together with a
5307        * data result. Here we just use the data slot for it to simplify code
5308        * handling for both of them.
5309 */
5310 if (base == FRAG_RESULT_COLOR)
5311 base = FRAG_RESULT_DATA0;
5312
5313       /* The second output of dual source blend just uses the data1 slot for simplicity,
5314        * because dual source blend does not support multiple render targets.
5315 */
5316 base += sem.dual_source_blend_index;
5317 }
5318 unsigned idx = base * 4u + component;
5319
5320 for (unsigned i = 0; i < 8; ++i) {
5321 if (write_mask & (1 << i)) {
5322 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5323 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5324 }
5325 idx++;
5326 }
5327
5328 if (ctx->stage == fragment_fs && ctx->program->info.ps.has_epilog && base >= FRAG_RESULT_DATA0) {
5329 unsigned index = base - FRAG_RESULT_DATA0;
5330
5331 if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5332 ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5333 } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5334 ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5335 } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5336 ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5337 }
5338 }
5339
5340 return true;
5341 }
5342
5343 bool
5344 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5345 {
5346 /* Only TCS per-vertex inputs are supported by this function.
5347     * Per-vertex inputs only match between the VS and TCS invocation IDs when the number of
5348     * invocations is the same.
5349 */
5350 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5351 return false;
5352
5353 /* This can only be indexing with invocation_id because all other access has been lowered
5354 * to load_shared.
5355 */
5356 nir_src* off_src = nir_get_io_offset_src(instr);
5357 assert(nir_src_is_const(*off_src));
5358
5359 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5360
5361 unsigned idx =
5362 sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5363 Temp* src = &ctx->inputs.temps[idx];
5364 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5365
5366 return true;
5367 }
5368
5369 void
5370 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5371 {
5372    /* LS passes outputs to TCS via temps if they have the same in/out patch size. */
5373 bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5374 ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5375
5376 bool ps_need_output = ctx->stage == fragment_fs;
5377
5378 if (ls_need_output || ps_need_output) {
5379 bool stored_to_temps = store_output_to_temps(ctx, instr);
5380 if (!stored_to_temps) {
5381 isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5382 abort();
5383 }
5384 } else {
5385 unreachable("Shader stage not implemented");
5386 }
5387 }
5388
5389 bool
5390 in_exec_divergent_or_in_loop(isel_context* ctx)
5391 {
5392 return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5393 ctx->cf_info.had_divergent_discard;
5394 }
5395
5396 void
5397 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5398 Temp prim_mask, bool high_16bits)
5399 {
5400 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5401 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5402
5403 Builder bld(ctx->program, ctx->block);
5404
5405 if (in_exec_divergent_or_in_loop(ctx)) {
5406 bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5407 Operand::c32(idx), Operand::c32(component), Operand::c32(high_16bits), coord1,
5408 coord2, bld.m0(prim_mask));
5409 return;
5410 }
5411
5412 Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5413
5414 Temp res;
5415 if (dst.regClass() == v2b) {
5416 Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1,
5417 p, high_16bits ? 0x5 : 0);
5418 bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p, coord2, p10,
5419 high_16bits ? 0x1 : 0);
5420 } else {
5421 Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5422 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
5423 }
5424 /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5425 set_wqm(ctx, true);
5426 }
5427
5428 void
5429 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5430 Temp prim_mask, bool high_16bits)
5431 {
5432 if (ctx->options->gfx_level >= GFX11) {
5433 emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask, high_16bits);
5434 return;
5435 }
5436
5437 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5438 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5439
5440 Builder bld(ctx->program, ctx->block);
5441
5442 if (dst.regClass() == v2b) {
5443 if (ctx->program->dev.has_16bank_lds) {
5444 assert(ctx->options->gfx_level <= GFX8);
5445 Builder::Result interp_p1 =
5446 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5447 bld.m0(prim_mask), idx, component);
5448 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v1), coord1,
5449 bld.m0(prim_mask), interp_p1, idx, component, high_16bits);
5450 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5451 interp_p1, idx, component, high_16bits);
5452 } else {
5453 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5454
5455 if (ctx->options->gfx_level == GFX8)
5456 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5457
5458 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5459 bld.m0(prim_mask), idx, component, high_16bits);
5460 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5461 component, high_16bits);
5462 }
5463 } else {
5464 assert(!high_16bits);
5465 Temp interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5466 bld.m0(prim_mask), idx, component);
5467
5468 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5469 idx, component);
5470 }
5471 }
5472
5473 void
5474 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5475 Temp dst, Temp prim_mask, bool high_16bits)
5476 {
5477 Builder bld(ctx->program, ctx->block);
5478 Temp tmp = dst.bytes() == 2 ? bld.tmp(v1) : dst;
5479 if (ctx->options->gfx_level >= GFX11) {
5480 uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5481 if (in_exec_divergent_or_in_loop(ctx)) {
5482 bld.pseudo(aco_opcode::p_interp_gfx11, Definition(tmp), Operand(v1.as_linear()),
5483 Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5484 bld.m0(prim_mask));
5485 } else {
5486 Temp p =
5487 bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5488 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(tmp), p, dpp_ctrl);
5489 /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5490 set_wqm(ctx, true);
5491 }
5492 } else {
5493 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(tmp), Operand::c32((vertex_id + 2) % 3),
5494 bld.m0(prim_mask), idx, component);
5495 }
5496
5497 if (dst.id() != tmp.id())
5498 emit_extract_vector(ctx, tmp, high_16bits, dst);
5499 }
5500
5501 void
5502 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5503 {
5504 Temp dst = get_ssa_temp(ctx, &instr->def);
5505 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5506 unsigned idx = nir_intrinsic_base(instr);
5507 unsigned component = nir_intrinsic_component(instr);
5508 bool high_16bits = nir_intrinsic_io_semantics(instr).high_16bits;
5509 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5510
5511 assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5512
5513 if (instr->def.num_components == 1) {
5514 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask, high_16bits);
5515 } else {
5516 aco_ptr<Instruction> vec(create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
5517 instr->def.num_components, 1));
5518 for (unsigned i = 0; i < instr->def.num_components; i++) {
5519 Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5520 emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask, high_16bits);
5521 vec->operands[i] = Operand(tmp);
5522 }
5523 vec->definitions[0] = Definition(dst);
5524 ctx->block->instructions.emplace_back(std::move(vec));
5525 }
5526 }
5527
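/* emit_load() callback that emits a single typed buffer (MTBUF) load, picking
 * the widest tbuffer_load_format_* opcode that the format, alignment and
 * requested byte count allow.
 */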
5528 Temp
5529 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5530 unsigned alignment, unsigned const_offset, Temp dst_hint)
5531 {
5532 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5533 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5534
5535 if (info.soffset.id()) {
5536 if (soffset.isTemp())
5537 vaddr = bld.copy(bld.def(v1), soffset);
5538 soffset = Operand(info.soffset);
5539 }
5540
5541 if (soffset.isUndefined())
5542 soffset = Operand::zero();
5543
5544 const bool offen = !vaddr.isUndefined();
5545 const bool idxen = info.idx.id();
5546
5547 if (offen && idxen)
5548 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5549 else if (idxen)
5550 vaddr = Operand(info.idx);
5551
5552    /* Determine the number of fetched components.
5553     * Note: ACO IR works with the GFX6-8 nfmt + dfmt fields; these are converted for GFX10+ later.
5554     */
5555 const struct ac_vtx_format_info* vtx_info =
5556 ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5557 /* The number of channels in the format determines the memory range. */
5558 const unsigned max_components = vtx_info->num_channels;
5559 /* Calculate maximum number of components loaded according to alignment. */
5560 unsigned max_fetched_components = bytes_needed / info.component_size;
5561 max_fetched_components =
5562 ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5563 alignment, max_fetched_components);
5564 const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5565 /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5566 * If a larger format is selected, it's still OK to load a smaller amount from it.
5567 */
5568 bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5569 unsigned bytes_size = 0;
5570 const unsigned bit_size = info.component_size * 8;
5571 aco_opcode op = aco_opcode::num_opcodes;
5572
5573 if (bytes_needed == 2) {
5574 bytes_size = 2;
5575 op = aco_opcode::tbuffer_load_format_d16_x;
5576 } else if (bytes_needed <= 4) {
5577 bytes_size = 4;
5578 if (bit_size == 16)
5579 op = aco_opcode::tbuffer_load_format_d16_xy;
5580 else
5581 op = aco_opcode::tbuffer_load_format_x;
5582 } else if (bytes_needed <= 6) {
5583 bytes_size = 6;
5584 if (bit_size == 16)
5585 op = aco_opcode::tbuffer_load_format_d16_xyz;
5586 else
5587 op = aco_opcode::tbuffer_load_format_xy;
5588 } else if (bytes_needed <= 8) {
5589 bytes_size = 8;
5590 if (bit_size == 16)
5591 op = aco_opcode::tbuffer_load_format_d16_xyzw;
5592 else
5593 op = aco_opcode::tbuffer_load_format_xy;
5594 } else if (bytes_needed <= 12) {
5595 bytes_size = 12;
5596 op = aco_opcode::tbuffer_load_format_xyz;
5597 } else {
5598 bytes_size = 16;
5599 op = aco_opcode::tbuffer_load_format_xyzw;
5600 }
5601
5602    /* Abort when no suitable opcode was found so we don't compile buggy shaders. */
5603 if (op == aco_opcode::num_opcodes) {
5604 aco_err(bld.program, "unsupported bit size for typed buffer load");
5605 abort();
5606 }
5607
5608 aco_ptr<Instruction> mtbuf{create_instruction(op, Format::MTBUF, 3, 1)};
5609 mtbuf->operands[0] = Operand(info.resource);
5610 mtbuf->operands[1] = vaddr;
5611 mtbuf->operands[2] = soffset;
5612 mtbuf->mtbuf().offen = offen;
5613 mtbuf->mtbuf().idxen = idxen;
5614 mtbuf->mtbuf().cache = info.cache;
5615 mtbuf->mtbuf().sync = info.sync;
5616 mtbuf->mtbuf().offset = const_offset;
5617 mtbuf->mtbuf().dfmt = fetch_fmt & 0xf;
5618 mtbuf->mtbuf().nfmt = fetch_fmt >> 4;
5619 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5620 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5621 mtbuf->definitions[0] = Definition(val);
5622 bld.insert(std::move(mtbuf));
5623
5624 return val;
5625 }
5626
5627 const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, 4096};
5628
5629 void
5630 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5631 {
5632 Builder bld(ctx->program, ctx->block);
5633 Temp dst = get_ssa_temp(ctx, &instr->def);
5634 nir_src offset = *nir_get_io_offset_src(instr);
5635
5636 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5637 isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5638
5639 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5640
5641 unsigned idx = nir_intrinsic_base(instr);
5642 unsigned component = nir_intrinsic_component(instr);
5643 bool high_16bits = nir_intrinsic_io_semantics(instr).high_16bits;
5644 unsigned vertex_id = 0; /* P0 */
5645
5646 if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5647 vertex_id = nir_src_as_uint(instr->src[0]);
5648
5649 if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5650 emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask, high_16bits);
5651 } else {
5652 unsigned num_components = instr->def.num_components;
5653 if (instr->def.bit_size == 64)
5654 num_components *= 2;
5655 aco_ptr<Instruction> vec{
5656 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5657 for (unsigned i = 0; i < num_components; i++) {
5658 unsigned chan_component = (component + i) % 4;
5659 unsigned chan_idx = idx + (component + i) / 4;
5660 vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5661 emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5662 prim_mask, high_16bits);
5663 }
5664 vec->definitions[0] = Definition(dst);
5665 bld.insert(std::move(vec));
5666 }
5667 }
5668
5669 void
5670 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5671 {
5672 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5673
5674 Builder bld(ctx->program, ctx->block);
5675 Temp dst = get_ssa_temp(ctx, &instr->def);
5676
5677 if (load_input_from_temps(ctx, instr, dst))
5678 return;
5679
5680 unreachable("LDS-based TCS input should have been lowered in NIR.");
5681 }
5682
5683 void
5684 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5685 {
5686 switch (ctx->shader->info.stage) {
5687 case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5688 default: unreachable("Unimplemented shader stage");
5689 }
5690 }
5691
5692 ac_hw_cache_flags
5693 get_cache_flags(isel_context* ctx, unsigned access)
5694 {
5695 return ac_get_hw_cache_flags(ctx->program->gfx_level, (gl_access_qualifier)access);
5696 }
5697
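/* Cache flags for atomics: when the previous value is needed, it is requested
 * through the GLC bit (pre-GFX12) or the GFX12 atomic-return temporal hint.
 */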
5698 ac_hw_cache_flags
5699 get_atomic_cache_flags(isel_context* ctx, bool return_previous)
5700 {
5701 ac_hw_cache_flags cache = get_cache_flags(ctx, ACCESS_TYPE_ATOMIC);
5702 if (return_previous && ctx->program->gfx_level >= GFX12)
5703 cache.gfx12.temporal_hint |= gfx12_atomic_return;
5704 else if (return_previous)
5705 cache.value |= ac_glc;
5706 return cache;
5707 }
5708
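/* Loads from a buffer resource descriptor, using SMEM if ACCESS_SMEM_AMD is set
 * and MUBUF otherwise.
 */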
5709 void
5710 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5711 Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
5712 unsigned access = ACCESS_CAN_REORDER, memory_sync_info sync = memory_sync_info())
5713 {
5714 assert(!(access & ACCESS_SMEM_AMD) || (component_size >= 4));
5715
5716 Builder bld(ctx->program, ctx->block);
5717
5718 bool use_smem = access & ACCESS_SMEM_AMD;
5719 if (use_smem) {
5720 offset = bld.as_uniform(offset);
5721 } else {
5722       /* GFX6-7 are affected by a hw bug that prevents address clamping from
5723        * working correctly when the SGPR offset is used.
5724        */
5725 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5726 offset = as_vgpr(ctx, offset);
5727 }
5728
5729 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5730 info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD | (use_smem ? ACCESS_TYPE_SMEM : 0));
5731 info.sync = sync;
5732 info.align_mul = align_mul;
5733 info.align_offset = align_offset;
5734 if (use_smem)
5735 emit_load(ctx, bld, info, smem_load_params);
5736 else
5737 emit_load(ctx, bld, info, mubuf_load_params);
5738 }
5739
5740 void
5741 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5742 {
5743 Temp dst = get_ssa_temp(ctx, &instr->def);
5744 Builder bld(ctx->program, ctx->block);
5745 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5746
5747 unsigned size = instr->def.bit_size / 8;
5748 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5749 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr),
5750 nir_intrinsic_access(instr) | ACCESS_CAN_REORDER);
5751 }
5752
5753 void
5754 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5755 {
5756 Temp dst = get_ssa_temp(ctx, &instr->def);
5757
5758 Builder bld(ctx->program, ctx->block);
5759
5760 uint32_t desc[4];
5761 ac_build_raw_buffer_descriptor(ctx->options->gfx_level, 0, 0, desc);
5762
5763 unsigned base = nir_intrinsic_base(instr);
5764 unsigned range = nir_intrinsic_range(instr);
5765
5766 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5767 if (base && offset.type() == RegType::sgpr)
5768 offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5769 Operand::c32(base));
5770 else if (base && offset.type() == RegType::vgpr)
5771 offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5772
5773 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5774 bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5775 Operand::c32(ctx->constant_data_offset)),
5776 Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5777 Operand::c32(desc[3]));
5778 unsigned size = instr->def.bit_size / 8;
5779 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, nir_intrinsic_align_mul(instr),
5780 nir_intrinsic_align_offset(instr), nir_intrinsic_access(instr) | ACCESS_CAN_REORDER);
5781 }
5782
5783 /* Packs multiple Temps of different sizes into a vector of v1 Temps.
5784  * The byte count of each input Temp must be a multiple of 2.
5785  */
5786 static std::vector<Temp>
5787 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
5788 {
5789 Builder bld(ctx->program, ctx->block);
5790 std::vector<Temp> packed;
5791 Temp low = Temp();
5792 for (Temp tmp : unpacked) {
5793 assert(tmp.bytes() % 2 == 0);
5794 unsigned byte_idx = 0;
5795 while (byte_idx < tmp.bytes()) {
5796 if (low != Temp()) {
5797 Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5798 Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
5799 low = Temp();
5800 packed.push_back(dword);
5801 byte_idx += 2;
5802 } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
5803 packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
5804 byte_idx += 4;
5805 } else {
5806 low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
5807 byte_idx += 2;
5808 }
5809 }
5810 }
5811 if (low != Temp()) {
5812 Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
5813 packed.push_back(dword);
5814 }
5815 return packed;
5816 }
5817
5818 static bool
5819 should_declare_array(ac_image_dim dim)
5820 {
5821 return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5822 dim == ac_image_2darraymsaa;
5823 }
5824
5825 static int
5826 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5827 {
5828 switch (dim) {
5829 case GLSL_SAMPLER_DIM_BUF: return 1;
5830 case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5831 case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5832 case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
5833 case GLSL_SAMPLER_DIM_3D:
5834 case GLSL_SAMPLER_DIM_CUBE: return 3;
5835 case GLSL_SAMPLER_DIM_RECT:
5836 case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5837 case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
5838 default: break;
5839 }
5840 return 0;
5841 }
5842
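/* Builds a MIMG instruction. Coordinates beyond the NSA limit are packed into
 * one contiguous VGPR vector; the remaining ones are passed as separate NSA
 * address operands.
 */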
5843 static MIMG_instruction*
5844 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
5845 Operand vdata = Operand(v1))
5846 {
5847 bool is_vsample = !samp.isUndefined() || op == aco_opcode::image_msaa_load;
5848
5849 size_t nsa_size = bld.program->dev.max_nsa_vgprs;
5850 if (!is_vsample && bld.program->gfx_level >= GFX12)
5851 nsa_size++; /* VIMAGE can encode one more VADDR */
5852 nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
5853
5854 const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
5855 if (strict_wqm)
5856 nsa_size = coords.size();
5857
5858 for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
5859 if (!coords[i].id())
5860 continue;
5861
5862 coords[i] = as_vgpr(bld, coords[i]);
5863 }
5864
5865 if (nsa_size < coords.size()) {
5866 Temp coord = coords[nsa_size];
5867 if (coords.size() - nsa_size > 1) {
5868 aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO,
5869 coords.size() - nsa_size, 1)};
5870
5871 unsigned coord_size = 0;
5872 for (unsigned i = nsa_size; i < coords.size(); i++) {
5873 vec->operands[i - nsa_size] = Operand(coords[i]);
5874 coord_size += coords[i].size();
5875 }
5876
5877 coord = bld.tmp(RegType::vgpr, coord_size);
5878 vec->definitions[0] = Definition(coord);
5879 bld.insert(std::move(vec));
5880 } else {
5881 coord = as_vgpr(bld, coord);
5882 }
5883
5884 coords[nsa_size] = coord;
5885 coords.resize(nsa_size + 1);
5886 }
5887
5888 bool has_dst = dst.id() != 0;
5889
5890 aco_ptr<Instruction> mimg{create_instruction(op, Format::MIMG, 3 + coords.size(), has_dst)};
5891 if (has_dst)
5892 mimg->definitions[0] = Definition(dst);
5893 mimg->operands[0] = Operand(rsrc);
5894 mimg->operands[1] = samp;
5895 mimg->operands[2] = vdata;
5896 for (unsigned i = 0; i < coords.size(); i++)
5897 mimg->operands[3 + i] = Operand(coords[i]);
5898 mimg->mimg().strict_wqm = strict_wqm;
5899
5900 return &bld.insert(std::move(mimg))->mimg();
5901 }
5902
5903 void
5904 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5905 {
5906 Builder bld(ctx->program, ctx->block);
5907 Temp dst = get_ssa_temp(ctx, &instr->def);
5908 Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
5909 Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
5910 Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
5911 Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
5912 Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
5913 Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
5914
5915 /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
5916 * There are five smaller vector groups:
5917 * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
5918 * These directly match the NIR intrinsic sources.
5919 */
5920 std::vector<Temp> args = {
5921 node, tmax, origin, dir, inv_dir,
5922 };
5923
5924 if (bld.program->gfx_level == GFX10_3) {
5925 std::vector<Temp> scalar_args;
5926 for (Temp tmp : args) {
5927 for (unsigned i = 0; i < tmp.size(); i++)
5928 scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
5929 }
5930 args = std::move(scalar_args);
5931 }
5932
5933 MIMG_instruction* mimg =
5934 emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
5935 mimg->dim = ac_image_1d;
5936 mimg->dmask = 0xf;
5937 mimg->unrm = true;
5938 mimg->r128 = true;
5939
5940 emit_split_vector(ctx, dst, instr->def.num_components);
5941 }
5942
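/* Collects the VGPR coordinates for an image instruction: the base coordinates,
 * the GFX9 1D workaround, the 2D-view-of-3D layer workaround, the sample index
 * for MS images and the LOD, packed into dword-sized temps.
 */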
5943 static std::vector<Temp>
5944 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
5945 {
5946
5947 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
5948 bool a16 = instr->src[1].ssa->bit_size == 16;
5949 RegClass rc = a16 ? v2b : v1;
5950 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
5951 bool is_array = nir_intrinsic_image_array(instr);
5952 ASSERTED bool add_frag_pos =
5953 (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5954 assert(!add_frag_pos && "Input attachments should be lowered.");
5955 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5956 bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
5957 int count = image_type_to_components_count(dim, is_array);
5958 std::vector<Temp> coords;
5959 Builder bld(ctx->program, ctx->block);
5960
5961 if (gfx9_1d) {
5962 coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
5963 coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
5964 if (is_array)
5965 coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
5966 } else {
5967 for (int i = 0; i < count; i++)
5968 coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
5969 }
5970
5971 bool has_lod = false;
5972 Temp lod;
5973
5974 if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
5975 instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
5976 instr->intrinsic == nir_intrinsic_bindless_image_store) {
5977 int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
5978 assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
5979 has_lod =
5980 !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;
5981
5982 if (has_lod)
5983 lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
5984 }
5985
5986 if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
5987 /* The hw can't bind a slice of a 3D image as a 2D image, because it
5988 * ignores BASE_ARRAY if the target is 3D. The workaround is to read
5989 * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
5990 */
5991 assert(ctx->options->gfx_level == GFX9);
5992 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5993 Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
5994 /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
5995 Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
5996 Operand::c32(13u));
5997
5998 if (has_lod) {
5999          /* If there's a lod parameter, it matters whether the image is 3D or 2D,
6000           * because the hw reads the lod from either the fourth or the third
6001           * component. So detect 3D images and otherwise place the lod at the third
6002           * component. For non-3D descriptors we effectively add the lod twice to
6003           * coords, but the hw only reads the first one; the second is ignored.
6004           */
6005 Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
6006 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
6007 Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
6008 Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
6009 Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
6010 first_layer =
6011 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
6012 }
6013
6014 if (a16)
6015 coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
6016 else
6017 coords.emplace_back(first_layer);
6018 }
6019
6020 if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6021 assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
6022 coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
6023 }
6024
6025 if (has_lod)
6026 coords.emplace_back(lod);
6027
6028 return emit_pack_v1(ctx, coords);
6029 }
6030
6031 memory_sync_info
6032 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6033 {
6034 /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6035 if (semantics & semantic_atomicrmw)
6036 return memory_sync_info(storage, semantics);
6037
6038 unsigned access = nir_intrinsic_access(instr);
6039
6040 if (access & ACCESS_VOLATILE)
6041 semantics |= semantic_volatile;
6042 if (access & ACCESS_CAN_REORDER)
6043 semantics |= semantic_can_reorder | semantic_private;
6044
6045 return memory_sync_info(storage, semantics);
6046 }
6047
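/* Zero-initializes the destination of a TFE (sparse) load so that components
 * the instruction doesn't write have a defined value.
 */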
6048 Operand
6049 emit_tfe_init(Builder& bld, Temp dst)
6050 {
6051 Temp tmp = bld.tmp(dst.regClass());
6052
6053 aco_ptr<Instruction> vec{
6054 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6055 for (unsigned i = 0; i < dst.size(); i++)
6056 vec->operands[i] = Operand::zero();
6057 vec->definitions[0] = Definition(tmp);
6058 /* Since this is fixed to an instruction's definition register, any CSE will
6059 * just create copies. Copying costs about the same as zero-initialization,
6060 * but these copies can break up clauses.
6061 */
6062 vec->definitions[0].setNoCSE(true);
6063 bld.insert(std::move(vec));
6064
6065 return Operand(tmp);
6066 }
6067
6068 void
6069 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6070 {
6071 Builder bld(ctx->program, ctx->block);
6072 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6073 bool is_array = nir_intrinsic_image_array(instr);
6074 bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
6075 Temp dst = get_ssa_temp(ctx, &instr->def);
6076
6077 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6078
6079 unsigned result_size = instr->def.num_components - is_sparse;
6080 unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
6081 expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6082 if (dim == GLSL_SAMPLER_DIM_BUF)
6083 expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6084 unsigned dmask = expand_mask;
6085 if (instr->def.bit_size == 64) {
6086 expand_mask &= 0x9;
6087 /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6088 dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6089 }
6090 if (is_sparse)
6091 expand_mask |= 1 << result_size;
6092
6093 bool d16 = instr->def.bit_size == 16;
6094 assert(!d16 || !is_sparse);
6095
6096 unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;
6097
6098 Temp tmp;
6099 if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
6100 tmp = dst;
6101 else
6102 tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));
6103
6104 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6105
6106 if (dim == GLSL_SAMPLER_DIM_BUF) {
6107 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6108
6109 aco_opcode opcode;
6110 if (!d16) {
6111 switch (util_bitcount(dmask)) {
6112 case 1: opcode = aco_opcode::buffer_load_format_x; break;
6113 case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6114 case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6115 case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6116 default: unreachable(">4 channel buffer image load");
6117 }
6118 } else {
6119 switch (util_bitcount(dmask)) {
6120 case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
6121 case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
6122 case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
6123 case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
6124 default: unreachable(">4 channel buffer image load");
6125 }
6126 }
6127 aco_ptr<Instruction> load{create_instruction(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6128 load->operands[0] = Operand(resource);
6129 load->operands[1] = Operand(vindex);
6130 load->operands[2] = Operand::c32(0);
6131 load->definitions[0] = Definition(tmp);
6132 load->mubuf().idxen = true;
6133 load->mubuf().cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
6134 load->mubuf().sync = sync;
6135 load->mubuf().tfe = is_sparse;
6136 if (load->mubuf().tfe)
6137 load->operands[3] = emit_tfe_init(bld, tmp);
6138 ctx->block->instructions.emplace_back(std::move(load));
6139 } else {
6140 std::vector<Temp> coords = get_image_coords(ctx, instr);
6141
6142 aco_opcode opcode;
6143 if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6144 opcode = aco_opcode::image_load;
6145 } else {
6146 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6147 opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6148 }
6149
6150 Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6151 MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
6152 load->cache = get_cache_flags(ctx, nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD);
6153 load->a16 = instr->src[1].ssa->bit_size == 16;
6154 load->d16 = d16;
6155 load->dmask = dmask;
6156 load->unrm = true;
6157 load->tfe = is_sparse;
6158
6159 if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6160 load->dim = is_array ? ac_image_2darray : ac_image_2d;
6161 load->da = is_array;
6162 load->sync = memory_sync_info();
6163 } else {
6164 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6165 load->dim = sdim;
6166 load->da = should_declare_array(sdim);
6167 load->sync = sync;
6168 }
6169 }
6170
6171 if (is_sparse && instr->def.bit_size == 64) {
6172 /* The result components are 64-bit but the sparse residency code is
6173 * 32-bit. So add a zero to the end so expand_vector() works correctly.
6174 */
6175 tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6176 Operand::zero());
6177 }
6178
6179 expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
6180 }
6181
6182 void
6183 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6184 {
6185 Builder bld(ctx->program, ctx->block);
6186 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6187 bool is_array = nir_intrinsic_image_array(instr);
6188 Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6189 bool d16 = instr->src[3].ssa->bit_size == 16;
6190
6191 /* only R64_UINT and R64_SINT supported */
6192 if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6193 data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6194 data = as_vgpr(ctx, data);
6195
6196 uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6197
6198 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6199 unsigned access = nir_intrinsic_access(instr);
6200 ac_hw_cache_flags cache =
6201 get_cache_flags(ctx, access | ACCESS_TYPE_STORE | ACCESS_MAY_STORE_SUBDWORD);
6202
6203 uint32_t dmask = BITFIELD_MASK(num_components);
6204 if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6205 for (uint32_t i = 0; i < instr->num_components; i++) {
6206 /* components not in dmask receive:
6207 * GFX6-11.5: zero
6208 * GFX12+: first component in dmask
6209 */
6210 nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6211 if (nir_scalar_is_undef(comp)) {
6212 dmask &= ~BITFIELD_BIT(i);
6213 } else if (ctx->options->gfx_level <= GFX11_5) {
6214 if (nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0)
6215 dmask &= ~BITFIELD_BIT(i);
6216 } else {
6217 unsigned first = dim == GLSL_SAMPLER_DIM_BUF ? 0 : ffs(dmask) - 1;
6218 if (i != first && nir_scalar_equal(nir_scalar_resolved(instr->src[3].ssa, first), comp))
6219 dmask &= ~BITFIELD_BIT(i);
6220 }
6221 }
6222
6223       /* dmask cannot be 0; at least one VGPR is always read */
6224 if (dmask == 0)
6225 dmask = 1;
6226 /* buffer store only supports consecutive components. */
6227 if (dim == GLSL_SAMPLER_DIM_BUF)
6228 dmask = BITFIELD_MASK(util_last_bit(dmask));
6229
6230 if (dmask != BITFIELD_MASK(num_components)) {
6231 uint32_t dmask_count = util_bitcount(dmask);
6232 RegClass rc = d16 ? v2b : v1;
6233 if (dmask_count == 1) {
6234 data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6235 } else {
6236 aco_ptr<Instruction> vec{
6237 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6238 uint32_t index = 0;
6239 u_foreach_bit (bit, dmask) {
6240 vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6241 }
6242 data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6243 vec->definitions[0] = Definition(data);
6244 bld.insert(std::move(vec));
6245 }
6246 }
6247 }
6248
6249 if (dim == GLSL_SAMPLER_DIM_BUF) {
6250 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6251 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6252 aco_opcode opcode;
6253 if (!d16) {
6254 switch (dmask) {
6255 case 0x1: opcode = aco_opcode::buffer_store_format_x; break;
6256 case 0x3: opcode = aco_opcode::buffer_store_format_xy; break;
6257 case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break;
6258 case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break;
6259 default: unreachable(">4 channel buffer image store");
6260 }
6261 } else {
6262 switch (dmask) {
6263 case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6264 case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6265 case 0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6266 case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6267 default: unreachable(">4 channel buffer image store");
6268 }
6269 }
6270 aco_ptr<Instruction> store{create_instruction(opcode, Format::MUBUF, 4, 0)};
6271 store->operands[0] = Operand(rsrc);
6272 store->operands[1] = Operand(vindex);
6273 store->operands[2] = Operand::c32(0);
6274 store->operands[3] = Operand(data);
6275 store->mubuf().idxen = true;
6276 store->mubuf().cache = cache;
6277 store->mubuf().disable_wqm = true;
6278 store->mubuf().sync = sync;
6279 ctx->program->needs_exact = true;
6280 ctx->block->instructions.emplace_back(std::move(store));
6281 return;
6282 }
6283
6284 assert(data.type() == RegType::vgpr);
6285 std::vector<Temp> coords = get_image_coords(ctx, instr);
6286 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6287
6288 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6289 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6290
6291 MIMG_instruction* store =
6292 emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
6293 store->cache = cache;
6294 store->a16 = instr->src[1].ssa->bit_size == 16;
6295 store->d16 = d16;
6296 store->dmask = dmask;
6297 store->unrm = true;
6298 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6299 store->dim = sdim;
6300 store->da = should_declare_array(sdim);
6301 store->disable_wqm = true;
6302 store->sync = sync;
6303 ctx->program->needs_exact = true;
6304 return;
6305 }
6306
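/* Maps a NIR atomic op to the matching 32-bit buffer, 64-bit buffer and image
 * atomic opcodes.
 */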
6307 void
6308 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6309 aco_opcode* image_op)
6310 {
6311 switch (op) {
6312 case nir_atomic_op_iadd:
6313 *buf_op = aco_opcode::buffer_atomic_add;
6314 *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6315 *image_op = aco_opcode::image_atomic_add;
6316 break;
6317 case nir_atomic_op_umin:
6318 *buf_op = aco_opcode::buffer_atomic_umin;
6319 *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6320 *image_op = aco_opcode::image_atomic_umin;
6321 break;
6322 case nir_atomic_op_imin:
6323 *buf_op = aco_opcode::buffer_atomic_smin;
6324 *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6325 *image_op = aco_opcode::image_atomic_smin;
6326 break;
6327 case nir_atomic_op_umax:
6328 *buf_op = aco_opcode::buffer_atomic_umax;
6329 *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6330 *image_op = aco_opcode::image_atomic_umax;
6331 break;
6332 case nir_atomic_op_imax:
6333 *buf_op = aco_opcode::buffer_atomic_smax;
6334 *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6335 *image_op = aco_opcode::image_atomic_smax;
6336 break;
6337 case nir_atomic_op_iand:
6338 *buf_op = aco_opcode::buffer_atomic_and;
6339 *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6340 *image_op = aco_opcode::image_atomic_and;
6341 break;
6342 case nir_atomic_op_ior:
6343 *buf_op = aco_opcode::buffer_atomic_or;
6344 *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6345 *image_op = aco_opcode::image_atomic_or;
6346 break;
6347 case nir_atomic_op_ixor:
6348 *buf_op = aco_opcode::buffer_atomic_xor;
6349 *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6350 *image_op = aco_opcode::image_atomic_xor;
6351 break;
6352 case nir_atomic_op_xchg:
6353 *buf_op = aco_opcode::buffer_atomic_swap;
6354 *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6355 *image_op = aco_opcode::image_atomic_swap;
6356 break;
6357 case nir_atomic_op_cmpxchg:
6358 *buf_op = aco_opcode::buffer_atomic_cmpswap;
6359 *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6360 *image_op = aco_opcode::image_atomic_cmpswap;
6361 break;
6362 case nir_atomic_op_inc_wrap:
6363 *buf_op = aco_opcode::buffer_atomic_inc;
6364 *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6365 *image_op = aco_opcode::image_atomic_inc;
6366 break;
6367 case nir_atomic_op_dec_wrap:
6368 *buf_op = aco_opcode::buffer_atomic_dec;
6369 *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6370 *image_op = aco_opcode::image_atomic_dec;
6371 break;
6372 case nir_atomic_op_fadd:
6373 *buf_op = aco_opcode::buffer_atomic_add_f32;
6374 *buf_op64 = aco_opcode::num_opcodes;
6375 *image_op = aco_opcode::num_opcodes;
6376 break;
6377 case nir_atomic_op_fmin:
6378 *buf_op = aco_opcode::buffer_atomic_fmin;
6379 *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6380 *image_op = aco_opcode::image_atomic_fmin;
6381 break;
6382 case nir_atomic_op_fmax:
6383 *buf_op = aco_opcode::buffer_atomic_fmax;
6384 *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6385 *image_op = aco_opcode::image_atomic_fmax;
6386 break;
6387 default: unreachable("unsupported atomic operation");
6388 }
6389 }
6390
6391 void
6392 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6393 {
6394 bool return_previous = !nir_def_is_unused(&instr->def);
6395 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6396 bool is_array = nir_intrinsic_image_array(instr);
6397 Builder bld(ctx->program, ctx->block);
6398
6399 const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6400 const bool cmpswap = op == nir_atomic_op_cmpxchg;
6401
6402 aco_opcode buf_op, buf_op64, image_op;
6403 translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6404
6405 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6406 bool is_64bit = data.bytes() == 8;
6407 assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6408
6409 if (cmpswap)
6410 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6411 get_ssa_temp(ctx, instr->src[4].ssa), data);
6412
6413 Temp dst = get_ssa_temp(ctx, &instr->def);
6414 memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6415
6416 if (dim == GLSL_SAMPLER_DIM_BUF) {
6417 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6418 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6419 // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6420 // implemented.");
6421 aco_ptr<Instruction> mubuf{create_instruction(is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4,
6422 return_previous ? 1 : 0)};
6423 mubuf->operands[0] = Operand(resource);
6424 mubuf->operands[1] = Operand(vindex);
6425 mubuf->operands[2] = Operand::c32(0);
6426 mubuf->operands[3] = Operand(data);
6427 Definition def =
6428 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6429 if (return_previous)
6430 mubuf->definitions[0] = def;
6431 mubuf->mubuf().offset = 0;
6432 mubuf->mubuf().idxen = true;
6433 mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6434 mubuf->mubuf().disable_wqm = true;
6435 mubuf->mubuf().sync = sync;
6436 ctx->program->needs_exact = true;
6437 ctx->block->instructions.emplace_back(std::move(mubuf));
6438 if (return_previous && cmpswap)
6439 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6440 return;
6441 }
6442
6443 std::vector<Temp> coords = get_image_coords(ctx, instr);
6444 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6445 Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6446 MIMG_instruction* mimg =
6447 emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
6448 mimg->cache = get_atomic_cache_flags(ctx, return_previous);
6449 mimg->dmask = (1 << data.size()) - 1;
6450 mimg->a16 = instr->src[1].ssa->bit_size == 16;
6451 mimg->unrm = true;
6452 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6453 mimg->dim = sdim;
6454 mimg->da = should_declare_array(sdim);
6455 mimg->disable_wqm = true;
6456 mimg->sync = sync;
6457 ctx->program->needs_exact = true;
6458 if (return_previous && cmpswap)
6459 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6460 return;
6461 }
6462
6463 void
6464 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6465 {
6466 Builder bld(ctx->program, ctx->block);
6467 unsigned num_components = instr->num_components;
6468
6469 Temp dst = get_ssa_temp(ctx, &instr->def);
6470 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6471
6472 unsigned access = nir_intrinsic_access(instr);
6473 unsigned size = instr->def.bit_size / 8;
6474
6475 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6476 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), access,
6477 get_memory_sync_info(instr, storage_buffer, 0));
6478 }
6479
6480 void
6481 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6482 {
6483 Builder bld(ctx->program, ctx->block);
6484 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6485 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6486 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6487 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6488
6489 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6490
6491 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6492
6493 unsigned write_count = 0;
6494 Temp write_datas[32];
6495 unsigned offsets[32];
6496 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6497 write_datas, offsets);
6498
6499    /* GFX6-7 are affected by a hw bug that prevents address clamping from working
6500     * correctly when the SGPR offset is used.
6501     */
6502 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6503 offset = as_vgpr(ctx, offset);
6504
6505 for (unsigned i = 0; i < write_count; i++) {
6506 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6507 unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE;
6508 if (write_datas[i].bytes() < 4)
6509 access |= ACCESS_MAY_STORE_SUBDWORD;
6510
6511 aco_ptr<Instruction> store{create_instruction(op, Format::MUBUF, 4, 0)};
6512 store->operands[0] = Operand(rsrc);
6513 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6514 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6515 store->operands[3] = Operand(write_datas[i]);
6516 store->mubuf().offset = offsets[i];
6517 store->mubuf().offen = (offset.type() == RegType::vgpr);
6518 store->mubuf().cache = get_cache_flags(ctx, access);
6519 store->mubuf().disable_wqm = true;
6520 store->mubuf().sync = sync;
6521 ctx->program->needs_exact = true;
6522 ctx->block->instructions.emplace_back(std::move(store));
6523 }
6524 }
6525
6526 void
6527 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6528 {
6529 Builder bld(ctx->program, ctx->block);
6530 bool return_previous = !nir_def_is_unused(&instr->def);
6531 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6532
6533 const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6534 const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6535
6536 aco_opcode op32, op64, image_op;
6537 translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6538
6539 if (cmpswap)
6540 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6541 get_ssa_temp(ctx, instr->src[3].ssa), data);
6542
6543 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6544 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6545 Temp dst = get_ssa_temp(ctx, &instr->def);
6546
6547 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6548 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6549 mubuf->operands[0] = Operand(rsrc);
6550 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6551 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6552 mubuf->operands[3] = Operand(data);
6553 Definition def =
6554 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6555 if (return_previous)
6556 mubuf->definitions[0] = def;
6557 mubuf->mubuf().offset = 0;
6558 mubuf->mubuf().offen = (offset.type() == RegType::vgpr);
6559 mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6560 mubuf->mubuf().disable_wqm = true;
6561 mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6562 ctx->program->needs_exact = true;
6563 ctx->block->instructions.emplace_back(std::move(mubuf));
6564 if (return_previous && cmpswap)
6565 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6566 }
6567
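/* Splits a global load/store/atomic intrinsic into its address, constant offset
 * and optional variable offset.
 */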
6568 void
6569 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6570 Temp* offset)
6571 {
6572 bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6573 *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6574
6575 *const_offset = nir_intrinsic_base(intrin);
6576
6577 unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6578 nir_src offset_src = intrin->src[num_src - 1];
6579 if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6580 *offset = get_ssa_temp(ctx, offset_src.ssa);
6581 else
6582 *offset = Temp();
6583 }
6584
6585 void
6586 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6587 {
6588 Builder bld(ctx->program, ctx->block);
6589 unsigned num_components = instr->num_components;
6590 unsigned component_size = instr->def.bit_size / 8;
6591
6592 Temp addr, offset;
6593 uint32_t const_offset;
6594 parse_global(ctx, instr, &addr, &const_offset, &offset);
6595
6596 LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6597 component_size};
6598 if (offset.id()) {
6599 info.resource = addr;
6600 info.offset = Operand(offset);
6601 }
6602 info.const_offset = const_offset;
6603 info.align_mul = nir_intrinsic_align_mul(instr);
6604 info.align_offset = nir_intrinsic_align_offset(instr);
6605 info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6606
6607 unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_LOAD;
6608 if (access & ACCESS_SMEM_AMD) {
6609 assert(component_size >= 4);
6610 if (info.resource.id())
6611 info.resource = bld.as_uniform(info.resource);
6612 info.offset = Operand(bld.as_uniform(info.offset));
6613 info.cache = get_cache_flags(ctx, access | ACCESS_TYPE_SMEM);
6614 emit_load(ctx, bld, info, smem_load_params);
6615 } else {
6616 EmitLoadParameters params = global_load_params;
6617 info.cache = get_cache_flags(ctx, access);
6618 emit_load(ctx, bld, info, params);
6619 }
6620 }
6621
6622 void
6623 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6624 {
6625 Builder bld(ctx->program, ctx->block);
6626 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6627 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6628
6629 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6630 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6631
6632 unsigned write_count = 0;
6633 Temp write_datas[32];
6634 unsigned offsets[32];
6635 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6636 write_datas, offsets);
6637
6638 Temp addr, offset;
6639 uint32_t const_offset;
6640 parse_global(ctx, instr, &addr, &const_offset, &offset);
6641
6642 for (unsigned i = 0; i < write_count; i++) {
6643 Temp write_address = addr;
6644 uint32_t write_const_offset = const_offset;
6645 Temp write_offset = offset;
6646 lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6647
6648 unsigned access = nir_intrinsic_access(instr) | ACCESS_TYPE_STORE;
6649 if (write_datas[i].bytes() < 4)
6650 access |= ACCESS_MAY_STORE_SUBDWORD;
6651
6652 if (ctx->options->gfx_level >= GFX7) {
6653 bool global = ctx->options->gfx_level >= GFX9;
6654 aco_opcode op;
6655 switch (write_datas[i].bytes()) {
6656 case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6657 case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6658 case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6659 case 8:
6660 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6661 break;
6662 case 12:
6663 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6664 break;
6665 case 16:
6666 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6667 break;
6668 default: unreachable("store_global not implemented for this size.");
6669 }
6670
6671 aco_ptr<Instruction> flat{
6672 create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6673 if (write_address.regClass() == s2) {
6674 assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6675 flat->operands[0] = Operand(write_offset);
6676 flat->operands[1] = Operand(write_address);
6677 } else {
6678 assert(write_address.type() == RegType::vgpr && !write_offset.id());
6679 flat->operands[0] = Operand(write_address);
6680 flat->operands[1] = Operand(s1);
6681 }
6682 flat->operands[2] = Operand(write_datas[i]);
6683 flat->flatlike().cache = get_cache_flags(ctx, access);
6684 assert(global || !write_const_offset);
6685 flat->flatlike().offset = write_const_offset;
6686 flat->flatlike().disable_wqm = true;
6687 flat->flatlike().sync = sync;
6688 ctx->program->needs_exact = true;
6689 ctx->block->instructions.emplace_back(std::move(flat));
6690 } else {
6691 assert(ctx->options->gfx_level == GFX6);
6692
6693 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6694
6695 Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6696
6697 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, 0)};
6698 mubuf->operands[0] = Operand(rsrc);
6699 mubuf->operands[1] =
6700 write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6701 mubuf->operands[2] = Operand(write_offset);
6702 mubuf->operands[3] = Operand(write_datas[i]);
6703 mubuf->mubuf().cache = get_cache_flags(ctx, access);
6704 mubuf->mubuf().offset = write_const_offset;
6705 mubuf->mubuf().addr64 = write_address.type() == RegType::vgpr;
6706 mubuf->mubuf().disable_wqm = true;
6707 mubuf->mubuf().sync = sync;
6708 ctx->program->needs_exact = true;
6709 ctx->block->instructions.emplace_back(std::move(mubuf));
6710 }
6711 }
6712 }
6713
6714 void
6715 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6716 {
6717 Builder bld(ctx->program, ctx->block);
6718 bool return_previous = !nir_def_is_unused(&instr->def);
6719 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6720
6721 const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6722 const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6723
6724 if (cmpswap)
6725 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6726 get_ssa_temp(ctx, instr->src[2].ssa), data);
6727
6728 Temp dst = get_ssa_temp(ctx, &instr->def);
6729
6730 aco_opcode op32, op64;
6731
6732 Temp addr, offset;
6733 uint32_t const_offset;
6734 parse_global(ctx, instr, &addr, &const_offset, &offset);
6735 lower_global_address(bld, 0, &addr, &const_offset, &offset);
6736
6737 if (ctx->options->gfx_level >= GFX7) {
6738 bool global = ctx->options->gfx_level >= GFX9;
6739 switch (nir_op) {
6740 case nir_atomic_op_iadd:
6741 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6742 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6743 break;
6744 case nir_atomic_op_imin:
6745 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6746 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6747 break;
6748 case nir_atomic_op_umin:
6749 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6750 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6751 break;
6752 case nir_atomic_op_imax:
6753 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6754 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6755 break;
6756 case nir_atomic_op_umax:
6757 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6758 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6759 break;
6760 case nir_atomic_op_iand:
6761 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6762 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6763 break;
6764 case nir_atomic_op_ior:
6765 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6766 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6767 break;
6768 case nir_atomic_op_ixor:
6769 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6770 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6771 break;
6772 case nir_atomic_op_xchg:
6773 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6774 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6775 break;
6776 case nir_atomic_op_cmpxchg:
6777 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6778 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6779 break;
6780 case nir_atomic_op_fadd:
6781 op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
6782 op64 = aco_opcode::num_opcodes;
6783 break;
6784 case nir_atomic_op_fmin:
6785 op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6786 op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6787 break;
6788 case nir_atomic_op_fmax:
6789 op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6790 op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6791 break;
6792 case nir_atomic_op_ordered_add_gfx12_amd:
6793 assert(ctx->options->gfx_level >= GFX12 && instr->def.bit_size == 64);
6794 op32 = aco_opcode::num_opcodes;
6795 op64 = aco_opcode::global_atomic_ordered_add_b64;
6796 break;
6797 default: unreachable("unsupported atomic operation");
6798 }
6799
6800 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6801 aco_ptr<Instruction> flat{create_instruction(op, global ? Format::GLOBAL : Format::FLAT, 3,
6802 return_previous ? 1 : 0)};
6803 if (addr.regClass() == s2) {
6804 assert(global && offset.id() && offset.type() == RegType::vgpr);
6805 flat->operands[0] = Operand(offset);
6806 flat->operands[1] = Operand(addr);
6807 } else {
6808 assert(addr.type() == RegType::vgpr && !offset.id());
6809 flat->operands[0] = Operand(addr);
6810 flat->operands[1] = Operand(s1);
6811 }
6812 flat->operands[2] = Operand(data);
6813 if (return_previous)
6814 flat->definitions[0] = Definition(dst);
6815 flat->flatlike().cache = get_atomic_cache_flags(ctx, return_previous);
6816 assert(global || !const_offset);
6817 flat->flatlike().offset = const_offset;
6818 flat->flatlike().disable_wqm = true;
6819 flat->flatlike().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6820 ctx->program->needs_exact = true;
6821 ctx->block->instructions.emplace_back(std::move(flat));
6822 } else {
6823 assert(ctx->options->gfx_level == GFX6);
6824
6825 UNUSED aco_opcode image_op;
6826 translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6827
6828 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6829
6830 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6831
6832 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6833 mubuf->operands[0] = Operand(rsrc);
6834 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6835 mubuf->operands[2] = Operand(offset);
6836 mubuf->operands[3] = Operand(data);
6837 Definition def =
6838 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6839 if (return_previous)
6840 mubuf->definitions[0] = def;
6841 mubuf->mubuf().cache = get_atomic_cache_flags(ctx, return_previous);
6842 mubuf->mubuf().offset = const_offset;
6843 mubuf->mubuf().addr64 = addr.type() == RegType::vgpr;
6844 mubuf->mubuf().disable_wqm = true;
6845 mubuf->mubuf().sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6846 ctx->program->needs_exact = true;
6847 ctx->block->instructions.emplace_back(std::move(mubuf));
6848 if (return_previous && cmpswap)
6849 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6850 }
6851 }
6852
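/* Translates NIR variable modes into the ACO storage classes used by
 * memory_sync_info.
 */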
6853 unsigned
6854 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
6855 {
6856 unsigned storage = storage_none;
6857
6858 if (mem_mode & nir_var_shader_out)
6859 storage |= storage_vmem_output;
6860 if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
6861 storage |= storage_buffer;
6862 if (mem_mode & nir_var_mem_task_payload)
6863 storage |= storage_task_payload;
6864 if (mem_mode & nir_var_mem_shared)
6865 storage |= storage_shared;
6866 if (mem_mode & nir_var_image)
6867 storage |= storage_image;
6868
6869 return storage;
6870 }
6871
6872 void
6873 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6874 {
6875 Builder bld(ctx->program, ctx->block);
6876
6877 /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
6878 bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
6879 bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
6880 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
6881 bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
6882 bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
6883
6884 Temp dst = get_ssa_temp(ctx, &intrin->def);
6885 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
6886 Temp v_offset =
6887 v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
6888 Temp s_offset =
6889 s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
6890 Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
6891
6892 ac_hw_cache_flags cache = get_cache_flags(ctx, nir_intrinsic_access(intrin) | ACCESS_TYPE_LOAD);
6893
6894 unsigned const_offset = nir_intrinsic_base(intrin);
6895 unsigned elem_size_bytes = intrin->def.bit_size / 8u;
6896 unsigned num_components = intrin->def.num_components;
6897
6898 nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
6899 memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
6900
6901 LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
6902 info.idx = idx;
6903 info.cache = cache;
6904 info.soffset = s_offset;
6905 info.const_offset = const_offset;
6906 info.sync = sync;
6907
6908 if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
6909 const pipe_format format = nir_intrinsic_format(intrin);
6910 const struct ac_vtx_format_info* vtx_info =
6911 ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
6912 const struct util_format_description* f = util_format_description(format);
6913 const unsigned align_mul = nir_intrinsic_align_mul(intrin);
6914 const unsigned align_offset = nir_intrinsic_align_offset(intrin);
6915
6916       /* Avoid splitting:
6917        * - for non-array formats, because that would result in incorrect code
6918        * - when the element size equals the component size, to reduce the instruction count
6919        */
6920 const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
6921
6922 info.align_mul = align_mul;
6923 info.align_offset = align_offset;
6924 info.format = format;
6925 info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
6926 info.split_by_component_stride = false;
6927
6928 emit_load(ctx, bld, info, mtbuf_load_params);
6929 } else {
6930 assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
6931
6932 if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
6933 assert(!swizzled);
6934
6935 emit_load(ctx, bld, info, mubuf_load_format_params);
6936 } else {
6937 const unsigned swizzle_element_size =
6938 swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
6939
6940 info.component_stride = swizzle_element_size;
6941 info.swizzle_component_size = swizzle_element_size ? 4 : 0;
6942 info.align_mul = MIN2(elem_size_bytes, 4);
6943 info.align_offset = 0;
6944
6945 emit_load(ctx, bld, info, mubuf_load_params);
6946 }
6947 }
6948 }
6949
6950 void
6951 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
6952 {
6953 Builder bld(ctx->program, ctx->block);
6954
6955 /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
6956 bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
6957 bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
6958 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
6959 bool offen = !nir_src_is_const(intrin->src[2]) || nir_src_as_uint(intrin->src[2]);
6960
6961 Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
6962 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
6963 Temp v_offset = offen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa)) : Temp();
6964 Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
6965 Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
6966
6967 unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
6968 assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
6969 elem_size_bytes == 8);
6970
6971 unsigned write_mask = nir_intrinsic_write_mask(intrin);
6972 write_mask = util_widen_mask(write_mask, elem_size_bytes);
6973
6974 nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
6975 /* GS outputs are only written once. */
6976 const bool written_once =
6977 mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
6978 memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
6979 written_once ? semantic_can_reorder : semantic_none);
6980
6981 unsigned write_count = 0;
6982 Temp write_datas[32];
6983 unsigned offsets[32];
6984 split_buffer_store(ctx, NULL, false, RegType::vgpr, store_src, write_mask,
6985 swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
6986 write_datas, offsets);
6987
6988 for (unsigned i = 0; i < write_count; i++) {
6989 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6990 Temp write_voffset = v_offset;
6991 unsigned const_offset = resolve_excess_vmem_const_offset(
6992 bld, write_voffset, offsets[i] + nir_intrinsic_base(intrin));
6993
6994 /* write_voffset may be updated in resolve_excess_vmem_const_offset(). */
6995 offen = write_voffset.id();
6996
6997 Operand vaddr_op(v1);
6998 if (offen && idxen)
6999 vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, write_voffset);
7000 else if (offen)
7001 vaddr_op = Operand(write_voffset);
7002 else if (idxen)
7003 vaddr_op = Operand(idx);
7004
7005 unsigned access = nir_intrinsic_access(intrin);
7006 if (write_datas[i].bytes() < 4)
7007 access |= ACCESS_MAY_STORE_SUBDWORD;
7008 ac_hw_cache_flags cache = get_cache_flags(ctx, access | ACCESS_TYPE_STORE);
7009
7010 Instruction* mubuf = bld.mubuf(op, Operand(descriptor), vaddr_op, s_offset,
7011 Operand(write_datas[i]), const_offset, offen, idxen,
7012 /* addr64 */ false, /* disable_wqm */ false, cache)
7013 .instr;
7014 mubuf->mubuf().sync = sync;
7015 }
7016 }
7017
7018 void
7019 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7020 {
7021 Builder bld(ctx->program, ctx->block);
7022 Temp dst = get_ssa_temp(ctx, &instr->def);
7023 Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7024 Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7025
7026    /* If the base address is 32-bit, extend it to a 64-bit address by appending the known high 32 bits. */
7027 if (base.bytes() == 4) {
7028 base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
7029 Operand::c32(ctx->options->address32_hi));
7030 }
7031
7032 aco_opcode opcode = aco_opcode::s_load_dword;
7033 unsigned size = 1;
7034
7035 assert(dst.bytes() <= 64);
7036
7037 if (dst.bytes() > 32) {
7038 opcode = aco_opcode::s_load_dwordx16;
7039 size = 16;
7040 } else if (dst.bytes() > 16) {
7041 opcode = aco_opcode::s_load_dwordx8;
7042 size = 8;
7043 } else if (dst.bytes() > 8) {
7044 opcode = aco_opcode::s_load_dwordx4;
7045 size = 4;
7046 } else if (dst.bytes() > 4) {
7047 opcode = aco_opcode::s_load_dwordx2;
7048 size = 2;
7049 }
7050
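   /* The chosen SMEM opcode may load more dwords than the destination needs (the sizes are powers
    * of two), so extract only the part we care about in that case. */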
7051 if (dst.size() != size) {
7052 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
7053 bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
7054 } else {
7055 bld.smem(opcode, Definition(dst), base, offset);
7056 }
7057 emit_split_vector(ctx, dst, instr->def.num_components);
7058 }
7059
7060 sync_scope
7061 translate_nir_scope(mesa_scope scope)
7062 {
7063 switch (scope) {
7064 case SCOPE_NONE:
7065 case SCOPE_INVOCATION: return scope_invocation;
7066 case SCOPE_SUBGROUP: return scope_subgroup;
7067 case SCOPE_WORKGROUP: return scope_workgroup;
7068 case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7069 case SCOPE_DEVICE: return scope_device;
7070 case SCOPE_SHADER_CALL: return scope_invocation;
7071 }
7072 unreachable("invalid scope");
7073 }
7074
7075 void
7076 emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7077 {
7078 Builder bld(ctx->program, ctx->block);
7079
7080 unsigned storage_allowed = storage_buffer | storage_image;
7081 unsigned semantics = 0;
7082 sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7083 sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7084
7085 /* We use shared storage for the following:
7086 * - compute shaders expose it in their API
7087 * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7088 * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7089 * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7090 */
7091 bool shared_storage_used =
7092 ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
7093 ctx->stage.hw == AC_HW_HULL_SHADER ||
7094 (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
7095 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7096
7097 if (shared_storage_used)
7098 storage_allowed |= storage_shared;
7099
7100 /* Task payload: Task Shader output, Mesh Shader input */
7101 if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7102 storage_allowed |= storage_task_payload;
7103
7104 /* Allow VMEM output for all stages that can have outputs. */
7105 if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
7106 ctx->stage.has(SWStage::TS))
7107 storage_allowed |= storage_vmem_output;
7108
7109    /* Workgroup barriers can hang merged shaders when either half might have 0 threads.
7110     * They are allowed in CS, TCS, and in any NGG shader.
7111     */
7112 ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
7113 ctx->stage.hw == AC_HW_HULL_SHADER ||
7114 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7115
7116 unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7117 unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
7118 storage &= storage_allowed;
7119
7120 unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7121 if (nir_semantics & NIR_MEMORY_ACQUIRE)
7122 semantics |= semantic_acquire | semantic_release;
7123 if (nir_semantics & NIR_MEMORY_RELEASE)
7124 semantics |= semantic_acquire | semantic_release;
7125
7126 assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7127 assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7128
7129 bld.barrier(aco_opcode::p_barrier,
7130 memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7131 exec_scope);
7132 }
7133
7134 void
7135 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7136 {
7137 // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
7138 Temp dst = get_ssa_temp(ctx, &instr->def);
7139 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7140 Builder bld(ctx->program, ctx->block);
7141
7142 unsigned elem_size_bytes = instr->def.bit_size / 8;
7143 unsigned num_components = instr->def.num_components;
7144 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7145 load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7146 }
7147
7148 void
7149 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7150 {
7151 unsigned writemask = nir_intrinsic_write_mask(instr);
7152 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7153 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7154 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7155
7156 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7157 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7158 }
7159
7160 void
7161 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7162 {
7163 unsigned offset = nir_intrinsic_base(instr);
7164 Builder bld(ctx->program, ctx->block);
7165 Operand m = load_lds_size_m0(bld);
7166 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7167 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7168
7169 unsigned num_operands = 3;
7170 aco_opcode op32, op64, op32_rtn, op64_rtn;
7171 switch (nir_intrinsic_atomic_op(instr)) {
7172 case nir_atomic_op_iadd:
7173 op32 = aco_opcode::ds_add_u32;
7174 op64 = aco_opcode::ds_add_u64;
7175 op32_rtn = aco_opcode::ds_add_rtn_u32;
7176 op64_rtn = aco_opcode::ds_add_rtn_u64;
7177 break;
7178 case nir_atomic_op_imin:
7179 op32 = aco_opcode::ds_min_i32;
7180 op64 = aco_opcode::ds_min_i64;
7181 op32_rtn = aco_opcode::ds_min_rtn_i32;
7182 op64_rtn = aco_opcode::ds_min_rtn_i64;
7183 break;
7184 case nir_atomic_op_umin:
7185 op32 = aco_opcode::ds_min_u32;
7186 op64 = aco_opcode::ds_min_u64;
7187 op32_rtn = aco_opcode::ds_min_rtn_u32;
7188 op64_rtn = aco_opcode::ds_min_rtn_u64;
7189 break;
7190 case nir_atomic_op_imax:
7191 op32 = aco_opcode::ds_max_i32;
7192 op64 = aco_opcode::ds_max_i64;
7193 op32_rtn = aco_opcode::ds_max_rtn_i32;
7194 op64_rtn = aco_opcode::ds_max_rtn_i64;
7195 break;
7196 case nir_atomic_op_umax:
7197 op32 = aco_opcode::ds_max_u32;
7198 op64 = aco_opcode::ds_max_u64;
7199 op32_rtn = aco_opcode::ds_max_rtn_u32;
7200 op64_rtn = aco_opcode::ds_max_rtn_u64;
7201 break;
7202 case nir_atomic_op_iand:
7203 op32 = aco_opcode::ds_and_b32;
7204 op64 = aco_opcode::ds_and_b64;
7205 op32_rtn = aco_opcode::ds_and_rtn_b32;
7206 op64_rtn = aco_opcode::ds_and_rtn_b64;
7207 break;
7208 case nir_atomic_op_ior:
7209 op32 = aco_opcode::ds_or_b32;
7210 op64 = aco_opcode::ds_or_b64;
7211 op32_rtn = aco_opcode::ds_or_rtn_b32;
7212 op64_rtn = aco_opcode::ds_or_rtn_b64;
7213 break;
7214 case nir_atomic_op_ixor:
7215 op32 = aco_opcode::ds_xor_b32;
7216 op64 = aco_opcode::ds_xor_b64;
7217 op32_rtn = aco_opcode::ds_xor_rtn_b32;
7218 op64_rtn = aco_opcode::ds_xor_rtn_b64;
7219 break;
7220 case nir_atomic_op_xchg:
7221 op32 = aco_opcode::ds_write_b32;
7222 op64 = aco_opcode::ds_write_b64;
7223 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7224 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7225 break;
7226 case nir_atomic_op_cmpxchg:
7227 op32 = aco_opcode::ds_cmpst_b32;
7228 op64 = aco_opcode::ds_cmpst_b64;
7229 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7230 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7231 num_operands = 4;
7232 break;
7233 case nir_atomic_op_fadd:
7234 op32 = aco_opcode::ds_add_f32;
7235 op32_rtn = aco_opcode::ds_add_rtn_f32;
7236 op64 = aco_opcode::num_opcodes;
7237 op64_rtn = aco_opcode::num_opcodes;
7238 break;
7239 case nir_atomic_op_fmin:
7240 op32 = aco_opcode::ds_min_f32;
7241 op32_rtn = aco_opcode::ds_min_rtn_f32;
7242 op64 = aco_opcode::ds_min_f64;
7243 op64_rtn = aco_opcode::ds_min_rtn_f64;
7244 break;
7245 case nir_atomic_op_fmax:
7246 op32 = aco_opcode::ds_max_f32;
7247 op32_rtn = aco_opcode::ds_max_rtn_f32;
7248 op64 = aco_opcode::ds_max_f64;
7249 op64_rtn = aco_opcode::ds_max_rtn_f64;
7250 break;
7251 default: unreachable("Unhandled shared atomic intrinsic");
7252 }
7253
7254 bool return_previous = !nir_def_is_unused(&instr->def);
7255
7256 aco_opcode op;
7257 if (data.size() == 1) {
7258 assert(instr->def.bit_size == 32);
7259 op = return_previous ? op32_rtn : op32;
7260 } else {
7261 assert(instr->def.bit_size == 64);
7262 op = return_previous ? op64_rtn : op64;
7263 }
7264
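   /* The DS instruction offset field is only 16 bits wide, so fold larger constant offsets into
    * the address. */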
7265 if (offset > 65535) {
7266 address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7267 offset = 0;
7268 }
7269
7270 aco_ptr<Instruction> ds;
7271 ds.reset(create_instruction(op, Format::DS, num_operands, return_previous ? 1 : 0));
7272 ds->operands[0] = Operand(address);
7273 ds->operands[1] = Operand(data);
7274 if (num_operands == 4) {
7275 Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7276 ds->operands[2] = Operand(data2);
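      /* GFX11 renamed ds_cmpst to ds_cmpstore and appears to expect the compare value and the new
       * data in the opposite operand order, hence the swap below. */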
7277 if (bld.program->gfx_level >= GFX11)
7278 std::swap(ds->operands[1], ds->operands[2]);
7279 }
7280 ds->operands[num_operands - 1] = m;
7281 ds->ds().offset0 = offset;
7282 if (return_previous)
7283 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
7284 ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7285
7286 if (m.isUndefined())
7287 ds->operands.pop_back();
7288
7289 ctx->block->instructions.emplace_back(std::move(ds));
7290 }
7291
7292 void
7293 visit_shared_append(isel_context* ctx, nir_intrinsic_instr* instr)
7294 {
7295 Builder bld(ctx->program, ctx->block);
7296 unsigned address = nir_intrinsic_base(instr);
7297 assert(address <= 65535 && (address % 4 == 0));
7298
7299 aco_opcode op;
7300 switch (instr->intrinsic) {
7301 case nir_intrinsic_shared_append_amd: op = aco_opcode::ds_append; break;
7302 case nir_intrinsic_shared_consume_amd: op = aco_opcode::ds_consume; break;
7303 default: unreachable("not shared_append/consume");
7304 }
7305
7306 Temp tmp = bld.tmp(v1);
7307 Instruction *ds;
7308 Operand m = load_lds_size_m0(bld);
7309 if (m.isUndefined())
7310 ds = bld.ds(op, Definition(tmp), address);
7311 else
7312 ds = bld.ds(op, Definition(tmp), m, address);
7313 ds->ds().sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7314
7315    /* In wave64 on hardware with native wave32, ds_append seems to be split into a load for the low
7316     * half and an atomic for the high half, and other LDS instructions can be scheduled between the
7317     * two, which means the result in the low half might be out of date and is therefore unusable.
7318     */
7319 if (ctx->program->gfx_level >= GFX10 && ctx->program->wave_size == 64 &&
7320 ctx->program->workgroup_size > 64) {
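      /* Only the high half's result is reliable here, so broadcast the value from the last active
       * lane: s_flbit_i32_b64 counts the leading zeros in exec, so 63 minus that value is the index
       * of the highest active lane. */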
7321 Temp last_lane = bld.sop1(aco_opcode::s_flbit_i32_b64, bld.def(s1), Operand(exec, s2));
7322 last_lane = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(63),
7323 last_lane);
7324 bld.readlane(Definition(get_ssa_temp(ctx, &instr->def)), tmp, last_lane);
7325 } else {
7326 bld.pseudo(aco_opcode::p_as_uniform, Definition(get_ssa_temp(ctx, &instr->def)), tmp);
7327 }
7328 }
7329
7330 void
7331 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7332 {
7333 bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
7334 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7335 Builder bld(ctx->program, ctx->block);
7336
7337 assert(bld.program->gfx_level >= GFX7);
7338
7339 bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
7340 uint8_t offset0 = nir_intrinsic_offset0(instr);
7341 uint8_t offset1 = nir_intrinsic_offset1(instr);
7342 bool st64 = nir_intrinsic_st64(instr);
7343
7344 Operand m = load_lds_size_m0(bld);
7345 Instruction* ds;
7346 if (is_store) {
7347 aco_opcode op = st64
7348 ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
7349 : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
7350 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7351 RegClass comp_rc = is64bit ? v2 : v1;
7352 Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7353 Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7354 ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
7355 } else {
7356 Temp dst = get_ssa_temp(ctx, &instr->def);
7357 Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
7358 aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
7359 : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
7360 ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
7361 }
7362 ds->ds().sync = memory_sync_info(storage_shared);
7363 if (m.isUndefined())
7364 ds->operands.pop_back();
7365
7366 if (!is_store) {
7367 Temp dst = get_ssa_temp(ctx, &instr->def);
7368 if (dst.type() == RegType::sgpr) {
7369 emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7370 Temp comp[4];
7371 /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
7372 for (unsigned i = 0; i < dst.size(); i++)
7373 comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7374 if (is64bit) {
7375 Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
7376 Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
7377 ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7378 ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7379 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
7380 ctx->allocated_vec[dst.id()] = {comp0, comp1};
7381 } else {
7382 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
7383 }
7384 }
7385
7386 emit_split_vector(ctx, dst, 2);
7387 }
7388 }
7389
7390 Temp
7391 get_scratch_resource(isel_context* ctx)
7392 {
7393 Builder bld(ctx->program, ctx->block);
7394 Temp scratch_addr = ctx->program->private_segment_buffer;
7395 if (!scratch_addr.bytes()) {
7396 Temp addr_lo =
7397 bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
7398 Temp addr_hi =
7399 bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
7400 scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
7401 } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
7402 scratch_addr =
7403 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7404 }
7405
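   /* Build a buffer descriptor for swizzled scratch access: add_tid makes the hardware add the lane
    * index, and index_stride selects the per-lane stride (2 = 32 elements, 3 = 64 elements),
    * presumably so each lane addresses its own interleaved scratch slots. */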
7406 struct ac_buffer_state ac_state = {0};
7407 uint32_t desc[4];
7408
7409 ac_state.size = 0xffffffff;
7410 ac_state.format = PIPE_FORMAT_R32_FLOAT;
7411 for (int i = 0; i < 4; i++)
7412 ac_state.swizzle[i] = PIPE_SWIZZLE_0;
7413    /* Older generations need an element size of 4 bytes; the element size field was removed in GFX9. */
7414 ac_state.element_size = ctx->program->gfx_level <= GFX8 ? 1u : 0u;
7415 ac_state.index_stride = ctx->program->wave_size == 64 ? 3u : 2u;
7416 ac_state.add_tid = true;
7417 ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
7418
7419 ac_build_buffer_descriptor(ctx->program->gfx_level, &ac_state, desc);
7420
7421 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(desc[2]),
7422 Operand::c32(desc[3]));
7423 }
7424
7425 void
7426 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7427 {
7428 Builder bld(ctx->program, ctx->block);
7429 Temp dst = get_ssa_temp(ctx, &instr->def);
7430
7431 LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
7432 info.align_mul = nir_intrinsic_align_mul(instr);
7433 info.align_offset = nir_intrinsic_align_offset(instr);
7434 info.cache = get_cache_flags(ctx, ACCESS_TYPE_LOAD | ACCESS_IS_SWIZZLED_AMD);
7435 info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7436 info.sync = memory_sync_info(storage_scratch, semantic_private);
7437 if (ctx->program->gfx_level >= GFX9) {
7438 if (nir_src_is_const(instr->src[0])) {
7439 uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7440 info.offset =
7441 bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
7442 info.const_offset = nir_src_as_uint(instr->src[0]) % max;
7443 } else {
7444 info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7445 }
7446 EmitLoadParameters params = scratch_flat_load_params;
7447 params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7448 emit_load(ctx, bld, info, params);
7449 } else {
7450 info.resource = get_scratch_resource(ctx);
7451 info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7452 info.soffset = ctx->program->scratch_offset;
7453 emit_load(ctx, bld, info, scratch_mubuf_load_params);
7454 }
7455 }
7456
7457 void
7458 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7459 {
7460 Builder bld(ctx->program, ctx->block);
7461 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7462 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7463
7464 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7465 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7466
7467 unsigned write_count = 0;
7468 Temp write_datas[32];
7469 unsigned offsets[32];
7470 unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7471 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7472 &write_count, write_datas, offsets);
7473
7474 if (ctx->program->gfx_level >= GFX9) {
7475 uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7476 offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
7477 uint32_t base_const_offset =
7478 nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
7479
7480 for (unsigned i = 0; i < write_count; i++) {
7481 aco_opcode op;
7482 switch (write_datas[i].bytes()) {
7483 case 1: op = aco_opcode::scratch_store_byte; break;
7484 case 2: op = aco_opcode::scratch_store_short; break;
7485 case 4: op = aco_opcode::scratch_store_dword; break;
7486 case 8: op = aco_opcode::scratch_store_dwordx2; break;
7487 case 12: op = aco_opcode::scratch_store_dwordx3; break;
7488 case 16: op = aco_opcode::scratch_store_dwordx4; break;
7489 default: unreachable("Unexpected store size");
7490 }
7491
7492 uint32_t const_offset = base_const_offset + offsets[i];
7493 assert(const_offset < max || offset.id() == 0);
7494
7495 Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
7496 Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
7497 if (offset.id() == 0)
7498 saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
7499
7500 bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
7501 memory_sync_info(storage_scratch, semantic_private));
7502 }
7503 } else {
7504 Temp rsrc = get_scratch_resource(ctx);
7505 offset = as_vgpr(ctx, offset);
7506 for (unsigned i = 0; i < write_count; i++) {
7507 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7508 Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7509 write_datas[i], offsets[i], true);
7510 mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7511 unsigned access = ACCESS_TYPE_STORE | ACCESS_IS_SWIZZLED_AMD |
7512 (write_datas[i].bytes() < 4 ? ACCESS_MAY_STORE_SUBDWORD : 0);
7513 mubuf->mubuf().cache = get_cache_flags(ctx, access);
7514 }
7515 }
7516 }
7517
7518 ReduceOp
7519 get_reduce_op(nir_op op, unsigned bit_size)
7520 {
7521 switch (op) {
7522 #define CASEI(name) \
7523 case nir_op_##name: \
7524 return (bit_size == 32) ? name##32 \
7525 : (bit_size == 16) ? name##16 \
7526 : (bit_size == 8) ? name##8 \
7527 : name##64;
7528 #define CASEF(name) \
7529 case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7530 CASEI(iadd)
7531 CASEI(imul)
7532 CASEI(imin)
7533 CASEI(umin)
7534 CASEI(imax)
7535 CASEI(umax)
7536 CASEI(iand)
7537 CASEI(ior)
7538 CASEI(ixor)
7539 CASEF(fadd)
7540 CASEF(fmul)
7541 CASEF(fmin)
7542 CASEF(fmax)
7543 default: unreachable("unknown reduction op");
7544 #undef CASEI
7545 #undef CASEF
7546 }
7547 }
7548
7549 void
7550 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7551 {
7552 Builder bld(ctx->program, ctx->block);
7553 Definition dst(get_ssa_temp(ctx, &instr->def));
7554 assert(dst.regClass().type() != RegType::vgpr);
7555 if (src.regClass().type() == RegType::vgpr)
7556 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7557 else
7558 bld.copy(dst, src);
7559 }
7560
7561 void
7562 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7563 {
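   /* For a uniform source, an iadd/fadd reduction is simply src multiplied by the number of active
    * lanes, and an ixor reduction is src multiplied by (count & 1). */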
7564 Builder bld(ctx->program, ctx->block);
7565 Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7566
7567 if (op == nir_op_fadd) {
7568 src_tmp = as_vgpr(ctx, src_tmp);
7569 Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
7570 : dst.getTemp();
7571
7572 if (src.ssa->bit_size == 16) {
7573 count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7574 bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7575 } else {
7576 assert(src.ssa->bit_size == 32);
7577 count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7578 bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7579 }
7580
7581 if (tmp != dst.getTemp())
7582 bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7583
7584 return;
7585 }
7586
7587 if (dst.regClass() == s1)
7588 src_tmp = bld.as_uniform(src_tmp);
7589
7590 if (op == nir_op_ixor && count.type() == RegType::sgpr)
7591 count =
7592 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7593 else if (op == nir_op_ixor)
7594 count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7595
7596 assert(dst.getTemp().type() == count.type());
7597
7598 if (nir_src_is_const(src)) {
7599 uint32_t imm = nir_src_as_uint(src);
7600 if (imm == 1 && dst.bytes() <= 2)
7601 bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7602 else if (imm == 1)
7603 bld.copy(dst, count);
7604 else if (imm == 0)
7605 bld.copy(dst, Operand::zero(dst.bytes()));
7606 else if (count.type() == RegType::vgpr)
7607 bld.v_mul_imm(dst, count, imm, true, true);
7608 else if (imm == 0xffffffff)
7609 bld.sop2(aco_opcode::s_sub_i32, dst, bld.def(s1, scc), Operand::zero(), count);
7610 else if (util_is_power_of_two_or_zero(imm))
7611 bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), count,
7612 Operand::c32(ffs(imm) - 1u));
7613 else
7614 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7615 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
7616 bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7617 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
7618 bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7619 } else if (dst.getTemp().type() == RegType::vgpr) {
7620 bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7621 } else {
7622 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7623 }
7624 }
7625
7626 bool
7627 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7628 {
7629 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7630 if (op == nir_op_imul || op == nir_op_fmul)
7631 return false;
7632
7633 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7634 Builder bld(ctx->program, ctx->block);
7635 Definition dst(get_ssa_temp(ctx, &instr->def));
7636 unsigned bit_size = instr->src[0].ssa->bit_size;
7637 if (bit_size > 32)
7638 return false;
7639
7640 Temp thread_count =
7641 bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7642 set_wqm(ctx);
7643
7644 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7645 } else {
7646 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7647 }
7648
7649 return true;
7650 }
7651
7652 bool
7653 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7654 {
7655 Builder bld(ctx->program, ctx->block);
7656 Definition dst(get_ssa_temp(ctx, &instr->def));
7657 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7658 bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7659
7660 if (op == nir_op_imul || op == nir_op_fmul)
7661 return false;
7662
7663 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7664 if (instr->src[0].ssa->bit_size > 32)
7665 return false;
7666
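      /* For a uniform source, the scan result in each lane is src multiplied by the number of
       * active lanes below it (including itself for the inclusive variant), which mbcnt provides. */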
7667 Temp packed_tid;
7668 if (inc)
7669 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7670 else
7671 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7672 set_wqm(ctx);
7673
7674 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7675 return true;
7676 }
7677
7678 assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7679 op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7680
7681 if (inc) {
7682 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7683 return true;
7684 }
7685
7686    /* Copy the source and write the reduction operation's identity to the first active lane.
         * These ops are idempotent, so the exclusive scan of a uniform value equals the value in
         * every lane except the first active one, which must receive the identity instead. */
7687 Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7688 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7689 ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7690 if (dst.bytes() == 8) {
7691 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7692 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7693 uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7694 uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7695
7696 lo =
7697 bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
7698 hi =
7699 bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
7700 bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7701 } else {
7702 uint32_t identity = get_reduction_identity(reduce_op, 0);
7703 bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
7704 as_vgpr(ctx, src));
7705 }
7706
7707 set_wqm(ctx);
7708 return true;
7709 }
7710
7711 Temp
7712 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7713 Definition dst, Temp src)
7714 {
7715 assert(src.bytes() <= 8);
7716 assert(src.type() == RegType::vgpr);
7717
7718 Builder bld(ctx->program, ctx->block);
7719
7720 unsigned num_defs = 0;
7721 Definition defs[5];
7722 defs[num_defs++] = dst;
7723 defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7724
7725 /* scalar identity temporary */
7726 bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
7727 aco_op != aco_opcode::p_reduce;
7728 if (aco_op == aco_opcode::p_exclusive_scan) {
7729 need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7730 op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7731 op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7732 op == fmul64);
7733 }
7734 if (need_sitmp)
7735 defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7736
7737 /* scc clobber */
7738 defs[num_defs++] = bld.def(s1, scc);
7739
7740 /* vcc clobber */
7741 bool clobber_vcc = false;
7742 if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
7743 clobber_vcc = true;
7744 if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
7745 clobber_vcc = true;
7746 if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7747 clobber_vcc = true;
7748
7749 if (clobber_vcc)
7750 defs[num_defs++] = bld.def(bld.lm, vcc);
7751
7752 Instruction* reduce = create_instruction(aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7753 reduce->operands[0] = Operand(src);
7754 /* setup_reduce_temp will update these undef operands if needed */
7755 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7756 reduce->operands[2] = Operand(v1.as_linear());
7757 std::copy(defs, defs + num_defs, reduce->definitions.begin());
7758
7759 reduce->reduction().reduce_op = op;
7760 reduce->reduction().cluster_size = cluster_size;
7761 bld.insert(std::move(reduce));
7762
7763 return dst.getTemp();
7764 }
7765
7766 Temp
7767 inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Definition dst, Temp src)
7768 {
7769 Builder bld(ctx->program, ctx->block);
7770
7771 Temp scan = emit_reduction_instr(ctx, aco_opcode::p_inclusive_scan, op, ctx->program->wave_size,
7772 bld.def(dst.regClass()), src);
7773
7774 switch (op) {
7775 case iadd8:
7776 case iadd16:
7777 case iadd32: return bld.vsub32(dst, scan, src);
7778 case ixor64:
7779 case iadd64: {
7780 Temp src00 = bld.tmp(v1);
7781 Temp src01 = bld.tmp(v1);
7782 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
7783 Temp src10 = bld.tmp(v1);
7784 Temp src11 = bld.tmp(v1);
7785 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);
7786
7787 Temp lower = bld.tmp(v1);
7788 Temp upper = bld.tmp(v1);
7789 if (op == iadd64) {
7790 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
7791 bld.vsub32(Definition(upper), src01, src11, false, borrow);
7792 } else {
7793 bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
7794 bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
7795 }
7796 return bld.pseudo(aco_opcode::p_create_vector, dst, lower, upper);
7797 }
7798 case ixor8:
7799 case ixor16:
7800 case ixor32: return bld.vop2(aco_opcode::v_xor_b32, dst, scan, src);
7801 default: unreachable("Unsupported op");
7802 }
7803 }
7804
7805 bool
7806 emit_rotate_by_constant(isel_context* ctx, Temp& dst, Temp src, unsigned cluster_size,
7807 uint64_t delta)
7808 {
7809 Builder bld(ctx->program, ctx->block);
7810 RegClass rc = src.regClass();
7811 dst = Temp(0, rc);
7812 delta %= cluster_size;
7813
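   /* Try cheap lane-shuffle instructions (quad-perm DPP, DPP8, row rotate, ds_swizzle, permlane64)
    * for the cluster sizes they support; if none applies, dst stays unset and the caller falls back
    * to a generic path. */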
7814 if (delta == 0) {
7815 dst = bld.copy(bld.def(rc), src);
7816 } else if (delta * 2 == cluster_size && cluster_size <= 32) {
7817 dst = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, delta), true);
7818 } else if (cluster_size == 4) {
7819 unsigned res[4];
7820 for (unsigned i = 0; i < 4; i++)
7821 res[i] = (i + delta) & 0x3;
7822 uint32_t dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
7823 if (ctx->program->gfx_level >= GFX8)
7824 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_ctrl);
7825 else
7826 dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl);
7827 } else if (cluster_size == 8 && ctx->program->gfx_level >= GFX10) {
7828 uint32_t lane_sel = 0;
7829 for (unsigned i = 0; i < 8; i++)
7830 lane_sel |= ((i + delta) & 0x7) << (i * 3);
7831 dst = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(rc), src, lane_sel);
7832 } else if (cluster_size == 16 && ctx->program->gfx_level >= GFX8) {
7833 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_row_rr(16 - delta));
7834 } else if (cluster_size <= 32 && ctx->program->gfx_level >= GFX8) {
7835 uint32_t ctrl = ds_pattern_rotate(delta, ~(cluster_size - 1) & 0x1f);
7836 dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, ctrl);
7837 } else if (cluster_size == 64) {
7838 bool has_wf_dpp = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX10;
7839 if (delta == 32 && ctx->program->gfx_level >= GFX11) {
7840 dst = bld.vop1(aco_opcode::v_permlane64_b32, bld.def(rc), src);
7841 } else if (delta == 1 && has_wf_dpp) {
7842 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rl1);
7843 } else if (delta == 63 && has_wf_dpp) {
7844 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rr1);
7845 }
7846 }
7847
7848 return dst.id() != 0;
7849 }
7850
7851 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7852 Temp lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset);
7853 void pops_await_overlapped_waves(isel_context* ctx);
7854
7855 void
7856 ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
7857 unsigned wave_done, unsigned* offset0, unsigned* offset1)
7858 {
7859 unsigned ordered_count_index = index_operand & 0x3f;
7860 unsigned count_dword = (index_operand >> 24) & 0xf;
7861
7862 assert(ctx->options->gfx_level >= GFX10);
7863 assert(count_dword >= 1 && count_dword <= 4);
7864
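   /* offset1 layout (as encoded below): bit 0 = wave_release, bit 1 = wave_done,
    * bits 6-7 = count_dword - 1, and before GFX11 the GS shader type is placed at bit 2. */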
7865 *offset0 = ordered_count_index << 2;
7866 *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);
7867
7868 if (ctx->options->gfx_level < GFX11)
7869 *offset1 |= 3 /* GS shader type */ << 2;
7870 }
7871
7872 struct aco_export_mrt {
7873 Operand out[4];
7874 unsigned enabled_channels;
7875 unsigned target;
7876 bool compr;
7877 };
7878
7879 static void
7880 create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
7881 const struct aco_export_mrt* mrt1)
7882 {
7883 Builder bld(ctx->program, ctx->block);
7884
7885 aco_ptr<Instruction> exp{
7886 create_instruction(aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
7887 for (unsigned i = 0; i < 4; i++) {
7888 exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
7889 exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
7890 }
7891
7892 RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
7893 exp->definitions[0] = bld.def(type); /* mrt0 */
7894 exp->definitions[1] = bld.def(type); /* mrt1 */
7895 exp->definitions[2] = bld.def(bld.lm);
7896 exp->definitions[3] = bld.def(bld.lm);
7897 exp->definitions[4] = bld.def(bld.lm, vcc);
7898 exp->definitions[5] = bld.def(s1, scc);
7899 ctx->block->instructions.emplace_back(std::move(exp));
7900
7901 ctx->program->has_color_exports = true;
7902 }
7903
7904 static void
7905 visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
7906 {
7907 aco_opcode opcode = aco_opcode::num_opcodes;
7908 unsigned signed_mask = 0;
7909 bool clamp = false;
7910
7911 switch (instr->src[0].ssa->bit_size) {
7912 case 16:
7913 switch (instr->def.bit_size) {
7914 case 32: opcode = aco_opcode::v_wmma_f32_16x16x16_f16; break;
7915 case 16: opcode = aco_opcode::v_wmma_f16_16x16x16_f16; break;
7916 }
7917 break;
7918 case 8:
7919 opcode = aco_opcode::v_wmma_i32_16x16x16_iu8;
7920 signed_mask = nir_intrinsic_cmat_signed_mask(instr);
7921 clamp = nir_intrinsic_saturate(instr);
7922 break;
7923 }
7924
7925 if (opcode == aco_opcode::num_opcodes)
7926 unreachable("visit_cmat_muladd: invalid bit size combination");
7927
7928 Builder bld(ctx->program, ctx->block);
7929
7930 Temp dst = get_ssa_temp(ctx, &instr->def);
7931 Operand A(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7932 Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
7933 Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
7934
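   /* For the iu8 variant, the signedness of the A/B matrices is conveyed through the neg_lo bits
    * set below; the float variants leave them cleared. */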
7935 VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
7936 vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
7937 vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
7938 vop3p.clamp = clamp;
7939
7940 emit_split_vector(ctx, dst, instr->def.num_components);
7941 }
7942
7943 static void begin_empty_exec_skip(isel_context* ctx, nir_instr* instr, nir_block* block);
7944
7945 static void end_empty_exec_skip(isel_context* ctx);
7946
7947 void
7948 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
7949 {
7950 Builder bld(ctx->program, ctx->block);
7951 switch (instr->intrinsic) {
7952 case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
7953 case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
7954 case nir_intrinsic_load_input:
7955 case nir_intrinsic_load_per_primitive_input:
7956 case nir_intrinsic_load_input_vertex:
7957 if (ctx->program->stage == fragment_fs)
7958 visit_load_fs_input(ctx, instr);
7959 else
7960 isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
7961 break;
7962 case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
7963 case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
7964 case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
7965 case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
7966 case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
7967 case nir_intrinsic_shared_atomic:
7968 case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
7969 case nir_intrinsic_shared_append_amd:
7970 case nir_intrinsic_shared_consume_amd: visit_shared_append(ctx, instr); break;
7971 case nir_intrinsic_load_shared2_amd:
7972 case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
7973 case nir_intrinsic_bindless_image_load:
7974 case nir_intrinsic_bindless_image_fragment_mask_load_amd:
7975 case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
7976 case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
7977 case nir_intrinsic_bindless_image_atomic:
7978 case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
7979 case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
7980 case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
7981 case nir_intrinsic_load_typed_buffer_amd:
7982 case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
7983 case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
7984 case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
7985 case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
7986 case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
7987 case nir_intrinsic_global_atomic_amd:
7988 case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
7989 case nir_intrinsic_ssbo_atomic:
7990 case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
7991 case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
7992 case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
7993 case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
7994 case nir_intrinsic_load_num_workgroups: {
7995 Temp dst = get_ssa_temp(ctx, &instr->def);
7996 if (ctx->options->load_grid_size_from_user_sgpr) {
7997 bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
7998 } else {
7999 Temp addr = get_arg(ctx, ctx->args->num_work_groups);
8000 assert(addr.regClass() == s2);
8001 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8002 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
8003 bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
8004 }
8005 emit_split_vector(ctx, dst, 3);
8006 break;
8007 }
8008 case nir_intrinsic_load_workgroup_id: {
8009 Temp dst = get_ssa_temp(ctx, &instr->def);
8010 if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
8011 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), ctx->workgroup_id[0],
8012 ctx->workgroup_id[1], ctx->workgroup_id[2]);
8013 emit_split_vector(ctx, dst, 3);
8014 } else {
8015 isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
8016 }
8017 break;
8018 }
8019 case nir_intrinsic_load_subgroup_id: {
8020 assert(ctx->options->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER);
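      /* Extract the 5-bit field starting at bit 25 of ttmp8, which holds the wave ID within the
       * workgroup on GFX12. */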
8021 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8022 ctx->ttmp8, Operand::c32(25 | (5 << 16)));
8023 break;
8024 }
8025 case nir_intrinsic_ddx:
8026 case nir_intrinsic_ddy:
8027 case nir_intrinsic_ddx_fine:
8028 case nir_intrinsic_ddy_fine:
8029 case nir_intrinsic_ddx_coarse:
8030 case nir_intrinsic_ddy_coarse: {
8031 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8032 Temp dst = get_ssa_temp(ctx, &instr->def);
8033
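      /* Derivatives are computed per 2x2 quad: dpp_ctrl1 selects the base lane(s) and dpp_ctrl2 the
       * neighbouring lane(s) whose difference is taken. The coarse variants broadcast lane 0 and use
       * lane 1 (ddx) or lane 2 (ddy). */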
8034 uint16_t dpp_ctrl1, dpp_ctrl2;
8035 if (instr->intrinsic == nir_intrinsic_ddx_fine) {
8036 if (nir_def_all_uses_ignore_sign_bit(&instr->def)) {
8037 dpp_ctrl1 = dpp_quad_perm(1, 0, 3, 2);
8038 dpp_ctrl2 = dpp_quad_perm(0, 1, 2, 3);
8039 } else {
8040 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
8041 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
8042 }
8043 } else if (instr->intrinsic == nir_intrinsic_ddy_fine) {
8044 if (nir_def_all_uses_ignore_sign_bit(&instr->def)) {
8045 dpp_ctrl1 = dpp_quad_perm(2, 3, 0, 1);
8046 dpp_ctrl2 = dpp_quad_perm(0, 1, 2, 3);
8047 } else {
8048 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
8049 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
8050 }
8051 } else {
8052 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
8053 if (instr->intrinsic == nir_intrinsic_ddx ||
8054 instr->intrinsic == nir_intrinsic_ddx_coarse)
8055 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
8056 else
8057 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
8058 }
8059
8060 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
8061 assert(instr->def.num_components == 2);
8062
8063       /* identity swizzle via opsel */
8064 unsigned opsel_lo = 0b00;
8065 unsigned opsel_hi = 0b11;
8066
8067 Temp tl = src;
8068 if (nir_src_is_divergent(&instr->src[0]))
8069 tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
8070
8071 Builder::Result sub =
8072 bld.vop3p(aco_opcode::v_pk_add_f16, bld.def(v1), src, tl, opsel_lo, opsel_hi);
8073 sub->valu().neg_lo[1] = true;
8074 sub->valu().neg_hi[1] = true;
8075
8076 if (nir_src_is_divergent(&instr->src[0]) && dpp_ctrl2 != dpp_quad_perm(0, 1, 2, 3))
8077 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), sub, dpp_ctrl2);
8078 else
8079 bld.copy(Definition(dst), sub);
8080 emit_split_vector(ctx, dst, 2);
8081 } else {
8082 aco_opcode subrev =
8083 instr->def.bit_size == 16 ? aco_opcode::v_subrev_f16 : aco_opcode::v_subrev_f32;
8084
8085 /* v_interp with constant sources only works on GFX11/11.5,
8086 * and it's only faster on GFX11.5.
8087 */
8088 bool use_interp = dpp_ctrl1 == dpp_quad_perm(0, 0, 0, 0) && instr->def.bit_size == 32 &&
8089 ctx->program->gfx_level == GFX11_5;
8090 if (!nir_src_is_divergent(&instr->src[0])) {
8091 bld.vop2(subrev, Definition(dst), src, src);
8092 } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(1, 1, 1, 1)) {
8093 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), src,
8094 Operand::c32(0x3f800000), src)
8095 ->valu()
8096 .neg[2] = true;
8097 } else if (use_interp && dpp_ctrl2 == dpp_quad_perm(2, 2, 2, 2)) {
8098 Builder::Result tmp = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1),
8099 Operand::c32(0), Operand::c32(0), src);
8100 tmp->valu().neg = 0x6;
8101 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), src,
8102 Operand::c32(0x3f800000), tmp);
8103 } else if (ctx->program->gfx_level >= GFX8 && dpp_ctrl2 == dpp_quad_perm(0, 1, 2, 3)) {
8104 bld.vop2_dpp(subrev, Definition(dst), src, src, dpp_ctrl1);
8105 } else if (ctx->program->gfx_level >= GFX8) {
8106 Temp tmp = bld.vop2_dpp(subrev, bld.def(v1), src, src, dpp_ctrl1);
8107 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), tmp, dpp_ctrl2);
8108 } else {
8109 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
8110 Temp tr = src;
8111 if (dpp_ctrl2 != dpp_quad_perm(0, 1, 2, 3))
8112 tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
8113 bld.vop2(subrev, Definition(dst), tl, tr);
8114 }
8115 }
8116 set_wqm(ctx, true);
8117 break;
8118 }
8119
8120 case nir_intrinsic_ballot_relaxed:
8121 case nir_intrinsic_ballot: {
8122 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8123 Temp dst = get_ssa_temp(ctx, &instr->def);
8124
8125 if (instr->src[0].ssa->bit_size == 1) {
8126 assert(src.regClass() == bld.lm);
8127 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8128 src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8129 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8130 src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8131 } else {
8132 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8133 }
8134
8135       /* Make sure that all inactive lanes return zero.
8136        * Value-numbering might otherwise remove the comparison above. */
8137 Definition def = dst.size() == bld.lm.size() ? Definition(dst) : bld.def(bld.lm);
8138 if (instr->intrinsic == nir_intrinsic_ballot_relaxed)
8139 src = bld.copy(def, src);
8140 else
8141 src = bld.sop2(Builder::s_and, def, bld.def(s1, scc), src, Operand(exec, bld.lm));
8142 if (dst.size() != bld.lm.size()) {
8143 /* Wave32 with ballot size set to 64 */
8144 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8145 }
8146
8147 set_wqm(ctx);
8148 break;
8149 }
8150 case nir_intrinsic_inverse_ballot: {
8151 Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8152 Temp dst = get_ssa_temp(ctx, &instr->def);
8153
8154 assert(dst.size() == bld.lm.size());
8155 if (src.size() > dst.size()) {
8156 emit_extract_vector(ctx, src, 0, dst);
8157 } else if (src.size() < dst.size()) {
8158 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8159 } else {
8160 bld.copy(Definition(dst), src);
8161 }
8162 break;
8163 }
8164 case nir_intrinsic_shuffle:
8165 case nir_intrinsic_read_invocation: {
8166 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8167 assert(instr->def.bit_size != 1);
8168 if (!nir_src_is_divergent(&instr->src[0])) {
8169 emit_uniform_subgroup(ctx, instr, src);
8170 } else {
8171 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8172 if (instr->intrinsic == nir_intrinsic_read_invocation ||
8173 !nir_src_is_divergent(&instr->src[1]))
8174 tid = bld.as_uniform(tid);
8175 Temp dst = get_ssa_temp(ctx, &instr->def);
8176
8177 src = as_vgpr(ctx, src);
8178
8179 if (src.regClass() == v1b || src.regClass() == v2b) {
8180 Temp tmp = bld.tmp(v1);
8181 tmp = emit_bpermute(ctx, bld, tid, src);
8182 if (dst.type() == RegType::vgpr)
8183 bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8184 bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8185 else
8186 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8187 } else if (src.regClass() == v1) {
8188 Temp tmp = emit_bpermute(ctx, bld, tid, src);
8189 bld.copy(Definition(dst), tmp);
8190 } else if (src.regClass() == v2) {
8191 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8192 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8193 lo = emit_bpermute(ctx, bld, tid, lo);
8194 hi = emit_bpermute(ctx, bld, tid, hi);
8195 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8196 emit_split_vector(ctx, dst, 2);
8197 } else {
8198 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8199 }
8200 set_wqm(ctx);
8201 }
8202 break;
8203 }
8204 case nir_intrinsic_rotate: {
8205 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8206 Temp delta = get_ssa_temp(ctx, instr->src[1].ssa);
8207 Temp dst = get_ssa_temp(ctx, &instr->def);
8208 assert(instr->def.bit_size > 1 && instr->def.bit_size <= 32);
8209
8210 if (!nir_src_is_divergent(&instr->src[0])) {
8211 emit_uniform_subgroup(ctx, instr, src);
8212 break;
8213 }
8214
8215 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
8216 cluster_size = util_next_power_of_two(
8217 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8218
8219 if (cluster_size == 1) {
8220 bld.copy(Definition(dst), src);
8221 break;
8222 }
8223
8224 delta = bld.as_uniform(delta);
8225 src = as_vgpr(ctx, src);
8226
8227 Temp tmp;
8228 if (nir_src_is_const(instr->src[1]) &&
8229 emit_rotate_by_constant(ctx, tmp, src, cluster_size, nir_src_as_uint(instr->src[1]))) {
8230 } else if (cluster_size == 2) {
8231 Temp noswap =
8232 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), delta, Operand::c32(0));
8233 noswap = bool_to_vector_condition(ctx, noswap);
8234 Temp swapped = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, 0x1), true);
8235 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(src.regClass()), swapped, src, noswap);
8236 } else if (ctx->program->gfx_level >= GFX10 && cluster_size <= 16) {
8237 if (cluster_size == 4) /* shift mask already does this for 8/16. */
8238 delta = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), delta,
8239 Operand::c32(0x3));
8240 delta =
8241 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), delta, Operand::c32(2));
8242
8243 Temp lo = bld.copy(bld.def(s1), Operand::c32(cluster_size == 4 ? 0x32103210 : 0x76543210));
8244 Temp hi;
8245
8246 if (cluster_size <= 8) {
8247 Temp shr = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8248 if (cluster_size == 4) {
8249 Temp lotolohi = bld.copy(bld.def(s1), Operand::c32(0x4444));
8250 Temp lohi =
8251 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, lotolohi);
8252 lo = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), shr, lohi);
8253 } else {
8254 delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
8255 Operand::c32(32), delta);
8256 Temp shl =
8257 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8258 lo = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, shl);
8259 }
8260 Temp lotohi = bld.copy(bld.def(s1), Operand::c32(0x88888888));
8261 hi = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), lo, lotohi);
8262 } else {
8263 hi = bld.copy(bld.def(s1), Operand::c32(0xfedcba98));
8264
8265 Temp lohi = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
8266
8267 Temp shr = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8268 delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(64),
8269 delta);
8270 Temp shl = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8271
8272 lohi = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), shr, shl);
8273 lo = bld.tmp(s1);
8274 hi = bld.tmp(s1);
8275 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), lohi);
8276 }
8277
8278 Builder::Result ret =
8279 bld.vop3(aco_opcode::v_permlane16_b32, bld.def(src.regClass()), src, lo, hi);
8280 ret->valu().opsel[0] = true; /* set FETCH_INACTIVE */
8281 ret->valu().opsel[1] = true; /* set BOUND_CTRL */
8282 tmp = ret;
8283 } else {
8284 /* Fallback to ds_bpermute if we can't find a special instruction. */
8285 Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
8286 Temp src_lane = bld.vadd32(bld.def(v1), tid, delta);
8287
8288 if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level <= GFX11_5 &&
8289 cluster_size == 32) {
8290 /* ds_bpermute is restricted to 32 lanes on GFX10-GFX11.5. */
8291 Temp index_x4 =
8292 bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), src_lane);
8293 tmp = bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, src);
8294 } else {
8295 /* Technically, full wave rotate doesn't need this, but it breaks the pseudo ops. */
8296 src_lane = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), Operand::c32(cluster_size - 1),
8297 src_lane, tid);
8298 tmp = emit_bpermute(ctx, bld, src_lane, src);
8299 }
8300 }
8301
8302 tmp = emit_extract_vector(ctx, tmp, 0, dst.regClass());
8303 bld.copy(Definition(dst), tmp);
8304 set_wqm(ctx);
8305 break;
8306 }
8307 case nir_intrinsic_read_first_invocation: {
8308 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8309 Temp dst = get_ssa_temp(ctx, &instr->def);
8310 if (instr->def.bit_size == 1) {
8311 assert(src.regClass() == bld.lm);
8312 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8313 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8314 bool_to_vector_condition(ctx, tmp, dst);
8315 } else {
8316 emit_readfirstlane(ctx, src, dst);
8317 }
8318 set_wqm(ctx);
8319 break;
8320 }
8321 case nir_intrinsic_as_uniform: {
8322 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8323 Temp dst = get_ssa_temp(ctx, &instr->def);
8324 if (src.type() == RegType::vgpr)
8325 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
8326 else
8327 bld.copy(Definition(dst), src);
8328 break;
8329 }
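   /* vote_all(x) is computed as !anyInvocation(!x): negate the source, AND it with exec
    * (the scc result tells whether any active lane was false), then expand that scalar
    * bool back into a lane mask and invert it. */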
8330 case nir_intrinsic_vote_all: {
8331 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8332 Temp dst = get_ssa_temp(ctx, &instr->def);
8333 assert(src.regClass() == bld.lm);
8334 assert(dst.regClass() == bld.lm);
8335
8336 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8337 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8338 .def(1)
8339 .getTemp();
8340 Temp cond = bool_to_vector_condition(ctx, tmp);
8341 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8342 set_wqm(ctx);
8343 break;
8344 }
8345 case nir_intrinsic_vote_any: {
8346 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8347 Temp dst = get_ssa_temp(ctx, &instr->def);
8348 assert(src.regClass() == bld.lm);
8349 assert(dst.regClass() == bld.lm);
8350
8351 Temp tmp = bool_to_scalar_condition(ctx, src);
8352 bool_to_vector_condition(ctx, tmp, dst);
8353 set_wqm(ctx);
8354 break;
8355 }
8356 case nir_intrinsic_quad_vote_any: {
8357 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8358 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8359 bld.sop1(Builder::s_wqm, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8360 set_wqm(ctx);
8361 break;
8362 }
8363 case nir_intrinsic_quad_vote_all: {
8364 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8365 src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8366 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8367 src = bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), src);
8368 bld.sop1(Builder::s_not, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8369 set_wqm(ctx);
8370 break;
8371 }
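   /* Reductions and scans: uniform sources can often be handled entirely with scalar code
    * (emit_uniform_reduce()/emit_uniform_scan()); the divergent path emits the
    * p_reduce/p_inclusive_scan/p_exclusive_scan pseudo-instructions, which are expanded
    * later in the backend. For iadd/ixor exclusive scans with a VGPR result,
    * inclusive_scan_to_exclusive() is used to avoid shifting the whole wave. */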
8372 case nir_intrinsic_reduce:
8373 case nir_intrinsic_inclusive_scan:
8374 case nir_intrinsic_exclusive_scan: {
8375 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8376 Temp dst = get_ssa_temp(ctx, &instr->def);
8377 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8378 unsigned cluster_size =
8379 instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8380 cluster_size = util_next_power_of_two(
8381 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8382 const unsigned bit_size = instr->src[0].ssa->bit_size;
8383 assert(bit_size != 1);
8384
8385 if (!nir_src_is_divergent(&instr->src[0])) {
8386 /* We use divergence analysis to assign the regclass, so check if it's
8387 * working as expected */
8388 ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8389 if (instr->intrinsic == nir_intrinsic_inclusive_scan ||
8390 cluster_size != ctx->program->wave_size)
8391 expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor ||
8392 op == nir_op_imul || op == nir_op_fmul;
8393 assert(instr->def.divergent == expected_divergent);
8394
8395 if (instr->intrinsic == nir_intrinsic_reduce) {
8396 if (!instr->def.divergent && emit_uniform_reduce(ctx, instr))
8397 break;
8398 } else if (emit_uniform_scan(ctx, instr)) {
8399 break;
8400 }
8401 }
8402
8403 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8404 ReduceOp reduce_op = get_reduce_op(op, bit_size);
8405
8406 aco_opcode aco_op;
8407 switch (instr->intrinsic) {
8408 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8409 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8410 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8411 default: unreachable("unknown reduce intrinsic");
8412 }
8413
8414 /* Avoid whole wave shift. */
8415 const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
8416 (op == nir_op_iadd || op == nir_op_ixor) &&
8417 dst.type() == RegType::vgpr;
8418 if (use_inclusive_for_exclusive)
8419 inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
8420 else
8421 emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
8422
8423 set_wqm(ctx);
8424 break;
8425 }
8426 case nir_intrinsic_dpp16_shift_amd: {
8427 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8428 Temp dst = get_ssa_temp(ctx, &instr->def);
8429 int delta = nir_intrinsic_base(instr);
8430 assert(delta >= -15 && delta <= 15 && delta != 0);
8431 assert(instr->def.bit_size != 1 && instr->def.bit_size < 64);
8432 assert(ctx->options->gfx_level >= GFX8);
8433
8434 uint16_t dpp_ctrl = delta < 0 ? dpp_row_sr(-delta) : dpp_row_sl(delta);
8435 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), src, dpp_ctrl);
8436
8437 set_wqm(ctx);
8438 break;
8439 }
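   /* Quad operations: lowered to DPP quad_perm on GFX8+ or ds_swizzle on older GPUs.
    * Booleans either use a SALU-only path (quad_broadcast) or are first converted to a
    * 0/-1 VGPR value and compared back afterwards; 64-bit values are processed as two
    * dwords. */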
8440 case nir_intrinsic_quad_broadcast:
8441 case nir_intrinsic_quad_swap_horizontal:
8442 case nir_intrinsic_quad_swap_vertical:
8443 case nir_intrinsic_quad_swap_diagonal:
8444 case nir_intrinsic_quad_swizzle_amd: {
8445 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8446
8447 if (!instr->def.divergent) {
8448 emit_uniform_subgroup(ctx, instr, src);
8449 break;
8450 }
8451
8452 /* Quad broadcast lane. */
8453 unsigned lane = 0;
8454 /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8455 bool bool_use_valu = instr->def.bit_size == 1;
8456
8457 uint16_t dpp_ctrl = 0;
8458
8459 bool allow_fi = true;
8460 switch (instr->intrinsic) {
8461 case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8462 case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8463 case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8464 case nir_intrinsic_quad_swizzle_amd:
8465 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
8466 allow_fi &= nir_intrinsic_fetch_inactive(instr);
8467 break;
8468 case nir_intrinsic_quad_broadcast:
8469 lane = nir_src_as_const_value(instr->src[1])->u32;
8470 dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8471 bool_use_valu = false;
8472 break;
8473 default: break;
8474 }
8475
8476 Temp dst = get_ssa_temp(ctx, &instr->def);
8477
8478 /* Setup source. */
8479 if (bool_use_valu)
8480 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8481 Operand::c32(-1), src);
8482 else if (instr->def.bit_size != 1)
8483 src = as_vgpr(ctx, src);
8484
8485 if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8486 /* Special case for quad broadcast using SALU only. */
8487 assert(src.regClass() == bld.lm && dst.regClass() == bld.lm);
8488
8489 uint32_t half_mask = 0x11111111u << lane;
8490 Operand mask_tmp = bld.lm.bytes() == 4
8491 ? Operand::c32(half_mask)
8492 : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8493 Operand::c32(half_mask), Operand::c32(half_mask));
8494
8495 src =
8496 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8497 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8498 bld.sop1(Builder::s_wqm, Definition(dst), bld.def(s1, scc), src);
8499 } else if (instr->def.bit_size <= 32 || bool_use_valu) {
8500 unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8501 Definition def = (excess_bytes || bool_use_valu) ? bld.def(v1) : Definition(dst);
8502
8503 if (ctx->program->gfx_level >= GFX8)
8504 bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl, 0xf, 0xf, true, allow_fi);
8505 else
8506 bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8507
8508 if (excess_bytes)
8509 bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8510 bld.def(RegClass::get(dst.type(), excess_bytes)), def.getTemp());
8511 if (bool_use_valu)
8512 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), def.getTemp());
8513 } else if (instr->def.bit_size == 64) {
8514 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8515 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8516
8517 if (ctx->program->gfx_level >= GFX8) {
8518 lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl, 0xf, 0xf, true,
8519 allow_fi);
8520 hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl, 0xf, 0xf, true,
8521 allow_fi);
8522 } else {
8523 lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8524 hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8525 }
8526
8527 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8528 emit_split_vector(ctx, dst, 2);
8529 } else {
8530 isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8531 }
8532
8533 set_wqm(ctx);
8534 break;
8535 }
8536 case nir_intrinsic_masked_swizzle_amd: {
8537 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8538 if (!instr->def.divergent) {
8539 emit_uniform_subgroup(ctx, instr, src);
8540 break;
8541 }
8542 Temp dst = get_ssa_temp(ctx, &instr->def);
8543 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8544 bool allow_fi = nir_intrinsic_fetch_inactive(instr);
8545
8546 if (instr->def.bit_size != 1)
8547 src = as_vgpr(ctx, src);
8548
8549 if (instr->def.bit_size == 1) {
8550 assert(src.regClass() == bld.lm);
8551 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8552 Operand::c32(-1), src);
8553 src = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8554 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), src);
8555 } else if (dst.regClass() == v1b) {
8556 Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8557 emit_extract_vector(ctx, tmp, 0, dst);
8558 } else if (dst.regClass() == v2b) {
8559 Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8560 emit_extract_vector(ctx, tmp, 0, dst);
8561 } else if (dst.regClass() == v1) {
8562 bld.copy(Definition(dst), emit_masked_swizzle(ctx, bld, src, mask, allow_fi));
8563 } else if (dst.regClass() == v2) {
8564 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8565 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8566 lo = emit_masked_swizzle(ctx, bld, lo, mask, allow_fi);
8567 hi = emit_masked_swizzle(ctx, bld, hi, mask, allow_fi);
8568 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8569 emit_split_vector(ctx, dst, 2);
8570 } else {
8571 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8572 }
8573 set_wqm(ctx);
8574 break;
8575 }
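   /* write_invocation_amd: v_writelane_b32 overwrites a single lane of the source VGPR
    * with a scalar value; 64-bit values are split and written as two dwords. */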
8576 case nir_intrinsic_write_invocation_amd: {
8577 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8578 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8579 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8580 Temp dst = get_ssa_temp(ctx, &instr->def);
8581 if (dst.regClass() == v1) {
8582 /* src2 is ignored for writelane. RA assigns the same reg for dst */
8583 bld.writelane(Definition(dst), val, lane, src);
8584 } else if (dst.regClass() == v2) {
8585 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8586 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8587 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8588 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8589 Temp lo = bld.writelane(bld.def(v1), val_lo, lane, src_lo);
8590 Temp hi = bld.writelane(bld.def(v1), val_hi, lane, src_hi);
8591 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8592 emit_split_vector(ctx, dst, 2);
8593 } else {
8594 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8595 }
8596 break;
8597 }
8598 case nir_intrinsic_mbcnt_amd: {
8599 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8600 Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8601 Temp dst = get_ssa_temp(ctx, &instr->def);
8602 /* Fit 64-bit mask for wave32 */
8603 src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8604 emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
8605 set_wqm(ctx);
8606 break;
8607 }
8608 case nir_intrinsic_lane_permute_16_amd: {
8609 /* NOTE: If we use divergence analysis information here instead of the src regclass,
8610 * skip_uniformize_merge_phi() should be updated.
8611 */
8612 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8613 Temp dst = get_ssa_temp(ctx, &instr->def);
8614 assert(ctx->program->gfx_level >= GFX10);
8615
8616 if (src.regClass() == s1) {
8617 bld.copy(Definition(dst), src);
8618 } else if (dst.regClass() == v1 && src.regClass() == v1) {
8619 bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8620 bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8621 bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8622 } else {
8623 isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8624 }
8625 break;
8626 }
8627 case nir_intrinsic_load_helper_invocation:
8628 case nir_intrinsic_is_helper_invocation: {
8629 /* load_helper() after demote() gets lowered to is_helper().
8630 * Otherwise, these two behave the same. */
8631 Temp dst = get_ssa_temp(ctx, &instr->def);
8632 bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8633 ctx->program->needs_exact = true;
8634 break;
8635 }
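   /* demote keeps the invocation alive as a helper (p_demote_to_helper), while
    * terminate/discard ends it (p_discard_if); both pseudo-instructions are lowered to
    * exec-mask updates later, and both require the program to be marked needs_exact. */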
8636 case nir_intrinsic_demote:
8637 case nir_intrinsic_demote_if: {
8638 Operand cond = Operand::c32(-1u);
8639 if (instr->intrinsic == nir_intrinsic_demote_if) {
8640 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8641 assert(src.regClass() == bld.lm);
8642 if (in_exec_divergent_or_in_loop(ctx)) {
8643 cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src,
8644 Operand(exec, bld.lm));
8645 } else {
8646 cond = Operand(src);
8647 }
8648 }
8649
8650 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8651
8652 /* Perform the demote in WQM so that it doesn't make exec empty. WQM should last until at
8653 * least the next top-level block.
8654 */
8655 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8656 set_wqm(ctx);
8657
8658 ctx->block->kind |= block_kind_uses_discard;
8659 ctx->program->needs_exact = true;
8660
8661 /* Enable WQM in order to prevent helper lanes from getting terminated. */
8662 if (ctx->shader->info.maximally_reconverges)
8663 ctx->program->needs_wqm = true;
8664
8665 break;
8666 }
8667 case nir_intrinsic_terminate:
8668 case nir_intrinsic_terminate_if: {
8669 Operand cond = Operand::c32(-1u);
8670 if (instr->intrinsic == nir_intrinsic_terminate_if) {
8671 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8672 assert(src.regClass() == bld.lm);
8673 if (in_exec_divergent_or_in_loop(ctx)) {
8674 cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src,
8675 Operand(exec, bld.lm));
8676 } else {
8677 cond = Operand(src);
8678 }
8679
8680 ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(&instr->src[0]);
8681 }
8682
8683 bld.pseudo(aco_opcode::p_discard_if, cond);
8684 ctx->block->kind |= block_kind_uses_discard;
8685
8686 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) {
8687 ctx->cf_info.exec.potentially_empty_discard = true;
8688 begin_empty_exec_skip(ctx, &instr->instr, instr->instr.block);
8689 }
8690 ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
8691 ctx->program->needs_exact = true;
8692 break;
8693 }
8694 case nir_intrinsic_debug_break: {
8695 bld.sopp(aco_opcode::s_trap, 1u);
8696 break;
8697 }
8698 case nir_intrinsic_first_invocation: {
8699 bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
8700 Operand(exec, bld.lm));
8701 set_wqm(ctx);
8702 break;
8703 }
8704 case nir_intrinsic_last_invocation: {
8705 Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8706 bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8707 Operand::c32(ctx->program->wave_size - 1u), flbit);
8708 set_wqm(ctx);
8709 break;
8710 }
8711 case nir_intrinsic_elect: {
8712 /* p_elect is lowered in aco_insert_exec_mask.
8713 * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8714 * two p_elect with different exec masks as the same.
8715 */
8716 bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
8717 Operand(exec, bld.lm));
8718 set_wqm(ctx);
8719 break;
8720 }
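   /* shader_clock: subgroup scope uses the free-running SHADER_CYCLES counter (a
    * hi/lo/hi read sequence on GFX12, where lo is zeroed if the high half changed in
    * between, or s_getreg on GFX10.3+ with the high dword set to zero); device scope uses
    * s_sendmsg_rtn(REALTIME) on GFX11+ or s_memrealtime, otherwise s_memtime. */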
8721 case nir_intrinsic_shader_clock: {
8722 Temp dst = get_ssa_temp(ctx, &instr->def);
8723 if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8724 ctx->options->gfx_level >= GFX12) {
8725 Temp hi0 = bld.tmp(s1);
8726 Temp hi1 = bld.tmp(s1);
8727 Temp lo = bld.tmp(s1);
8728 bld.pseudo(aco_opcode::p_shader_cycles_hi_lo_hi, Definition(hi0), Definition(lo), Definition(hi1));
8729 Temp hi_eq = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), hi0, hi1);
8730 lo = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), lo, Operand::zero(), bld.scc(hi_eq));
8731 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi1);
8732 } else if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8733 ctx->options->gfx_level >= GFX10_3) {
8734 /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8735 Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8736 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8737 } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
8738 ctx->options->gfx_level >= GFX11) {
8739 bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
8740 Operand::c32(sendmsg_rtn_get_realtime));
8741 } else {
8742 aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
8743 ? aco_opcode::s_memrealtime
8744 : aco_opcode::s_memtime;
8745 bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8746 }
8747 emit_split_vector(ctx, dst, 2);
8748 break;
8749 }
8750 case nir_intrinsic_sendmsg_amd: {
8751 unsigned imm = nir_intrinsic_base(instr);
8752 Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8753 bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), imm);
8754 break;
8755 }
8756 case nir_intrinsic_is_subgroup_invocation_lt_amd: {
8757 Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8758 unsigned offset = nir_intrinsic_base(instr);
8759 bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src, offset));
8760 break;
8761 }
8762 case nir_intrinsic_gds_atomic_add_amd: {
8763 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8764 Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8765 Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8766 Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
8767 bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8768 true);
8769 break;
8770 }
8771 case nir_intrinsic_load_sbt_base_amd: {
8772 Temp dst = get_ssa_temp(ctx, &instr->def);
8773 Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
8774 assert(addr.regClass() == s2);
8775 bld.copy(Definition(dst), Operand(addr));
8776 break;
8777 }
8778 case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8779 case nir_intrinsic_load_resume_shader_address_amd: {
8780 bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
8781 bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
8782 break;
8783 }
8784 case nir_intrinsic_load_scalar_arg_amd:
8785 case nir_intrinsic_load_vector_arg_amd: {
8786 assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
8787 Temp dst = get_ssa_temp(ctx, &instr->def);
8788 Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
8789 assert(src.id());
8790 assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
8791 : RegType::vgpr));
8792 bld.copy(Definition(dst), src);
8793 emit_split_vector(ctx, dst, dst.size());
8794 break;
8795 }
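   /* GFX11 ordered streamout: ds_ordered_count serves as a mutex so waves update the
    * transform-feedback counters in GDS in API order; the per-buffer counters themselves
    * are updated with ds_add_gs_reg_rtn, which also returns the previous values. */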
8796 case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: {
8797 Temp dst = get_ssa_temp(ctx, &instr->def);
8798 Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
8799 Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
8800
8801 Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
8802 unsigned offset0, offset1;
8803 Instruction* ds_instr;
8804 Operand m;
8805
8806 /* Lock a GDS mutex. */
8807 ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
8808 m = bld.m0(bld.as_uniform(ordered_id));
8809 ds_instr =
8810 bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8811 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8812
8813 aco_ptr<Instruction> vec{
8814 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
8815 unsigned write_mask = nir_intrinsic_write_mask(instr);
8816
8817 for (unsigned i = 0; i < instr->num_components; i++) {
8818 if (write_mask & (1 << i)) {
8819 Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
8820
8821 ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
8822 i * 4, 0u, true);
8823 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
8824
8825 vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
8826 } else {
8827 vec->operands[i] = Operand::zero();
8828 }
8829 }
8830
8831 vec->definitions[0] = Definition(dst);
8832 ctx->block->instructions.emplace_back(std::move(vec));
8833
8834 /* Unlock a GDS mutex. */
8835 ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
8836 m = bld.m0(bld.as_uniform(ordered_id));
8837 ds_instr =
8838 bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
8839 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
8840
8841 emit_split_vector(ctx, dst, instr->num_components);
8842 break;
8843 }
8844 case nir_intrinsic_xfb_counter_sub_gfx11_amd: {
8845 unsigned write_mask = nir_intrinsic_write_mask(instr);
8846 Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
8847
8848 u_foreach_bit (i, write_mask) {
8849 Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
8850 Instruction* ds_instr;
8851
8852 ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
8853 i * 4, 0u, true);
8854 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
8855 }
8856 break;
8857 }
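   /* Exports: the exp instruction is assembled manually. The done and valid_mask bits for
    * position/MRT exports are deliberately left false here because ACO may reorder exports
    * and sets them on the last one (see the comments below). */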
8858 case nir_intrinsic_export_amd:
8859 case nir_intrinsic_export_row_amd: {
8860 unsigned flags = nir_intrinsic_flags(instr);
8861 unsigned target = nir_intrinsic_base(instr);
8862 unsigned write_mask = nir_intrinsic_write_mask(instr);
8863
8864 /* Mark vertex export block. */
8865 if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
8866 ctx->block->kind |= block_kind_export_end;
8867
8868 if (target < V_008DFC_SQ_EXP_MRTZ)
8869 ctx->program->has_color_exports = true;
8870
8871 const bool row_en = instr->intrinsic == nir_intrinsic_export_row_amd;
8872
8873 aco_ptr<Instruction> exp{create_instruction(aco_opcode::exp, Format::EXP, 4 + row_en, 0)};
8874
8875 exp->exp().dest = target;
8876 exp->exp().enabled_mask = write_mask;
8877 exp->exp().compressed = flags & AC_EXP_FLAG_COMPRESSED;
8878
8879 /* ACO may reorder position/mrt export instructions, then mark done for last
8880 * export instruction. So don't respect the nir AC_EXP_FLAG_DONE for position/mrt
8881 * exports here and leave it to ACO.
8882 */
8883 if (target == V_008DFC_SQ_EXP_PRIM)
8884 exp->exp().done = flags & AC_EXP_FLAG_DONE;
8885 else
8886 exp->exp().done = false;
8887
8888 /* ACO may reorder mrt export instructions, then mark valid mask for last
8889 * export instruction. So don't respect the nir AC_EXP_FLAG_VALID_MASK for mrt
8890 * exports here and leave it to ACO.
8891 */
8892 if (target > V_008DFC_SQ_EXP_NULL)
8893 exp->exp().valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
8894 else
8895 exp->exp().valid_mask = false;
8896
8897 exp->exp().row_en = row_en;
8898
8899 /* Compressed export uses two bits for a channel. */
8900 uint32_t channel_mask = exp->exp().compressed
8901 ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0)
8902 : write_mask;
8903
8904 Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
8905 for (unsigned i = 0; i < 4; i++) {
8906 exp->operands[i] = channel_mask & BITFIELD_BIT(i)
8907 ? Operand(emit_extract_vector(ctx, value, i, v1))
8908 : Operand(v1);
8909 }
8910
8911 if (row_en) {
8912 Temp row = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8913 /* Hack to prevent the RA from moving the source into m0 and then back to a normal SGPR. */
8914 row = bld.copy(bld.def(s1, m0), row);
8915 exp->operands[4] = bld.m0(row);
8916 }
8917
8918 ctx->block->instructions.emplace_back(std::move(exp));
8919 break;
8920 }
8921 case nir_intrinsic_export_dual_src_blend_amd: {
8922 Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
8923 Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
8924 unsigned write_mask = nir_intrinsic_write_mask(instr);
8925
8926 struct aco_export_mrt mrt0, mrt1;
8927 for (unsigned i = 0; i < 4; i++) {
8928 mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
8929 : Operand(v1);
8930
8931 mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
8932 : Operand(v1);
8933 }
8934 mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
8935
8936 create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
8937
8938 ctx->block->kind |= block_kind_export_end;
8939 break;
8940 }
8941 case nir_intrinsic_strict_wqm_coord_amd: {
8942 Temp dst = get_ssa_temp(ctx, &instr->def);
8943 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8944 unsigned begin_size = nir_intrinsic_base(instr);
8945
8946 unsigned num_src = 1;
8947 auto it = ctx->allocated_vec.find(src.id());
8948 if (it != ctx->allocated_vec.end())
8949 num_src = src.bytes() / it->second[0].bytes();
8950
8951 aco_ptr<Instruction> vec{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO,
8952 num_src + !!begin_size, 1)};
8953
8954 if (begin_size)
8955 vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
8956 for (unsigned i = 0; i < num_src; i++) {
8957 Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
8958 vec->operands[i + !!begin_size] = Operand(comp);
8959 }
8960
8961 vec->definitions[0] = Definition(dst);
8962 ctx->block->instructions.emplace_back(std::move(vec));
8963 break;
8964 }
8965 case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
8966 Temp dst = get_ssa_temp(ctx, &instr->def);
8967 bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
8968 Operand::c32(aco_symbol_lds_ngg_scratch_base));
8969 break;
8970 }
8971 case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
8972 Temp dst = get_ssa_temp(ctx, &instr->def);
8973 bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
8974 Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
8975 break;
8976 }
8977 case nir_intrinsic_store_scalar_arg_amd: {
8978 BITSET_SET(ctx->output_args, nir_intrinsic_base(instr));
8979 ctx->arg_temps[nir_intrinsic_base(instr)] =
8980 bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8981 break;
8982 }
8983 case nir_intrinsic_store_vector_arg_amd: {
8984 BITSET_SET(ctx->output_args, nir_intrinsic_base(instr));
8985 ctx->arg_temps[nir_intrinsic_base(instr)] =
8986 as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8987 break;
8988 }
8989 case nir_intrinsic_begin_invocation_interlock: {
8990 pops_await_overlapped_waves(ctx);
8991 break;
8992 }
8993 case nir_intrinsic_end_invocation_interlock: {
8994 if (ctx->options->gfx_level < GFX11)
8995 bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
8996 break;
8997 }
8998 case nir_intrinsic_cmat_muladd_amd: visit_cmat_muladd(ctx, instr); break;
8999 case nir_intrinsic_nop_amd: bld.sopp(aco_opcode::s_nop, nir_intrinsic_base(instr)); break;
9000 case nir_intrinsic_sleep_amd: bld.sopp(aco_opcode::s_sleep, nir_intrinsic_base(instr)); break;
9001 case nir_intrinsic_unit_test_amd:
9002 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(nir_intrinsic_base(instr)),
9003 get_ssa_temp(ctx, instr->src[0].ssa));
9004 break;
9005 case nir_intrinsic_unit_test_uniform_amd:
9006 case nir_intrinsic_unit_test_divergent_amd:
9007 bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)),
9008 Operand::c32(nir_intrinsic_base(instr)));
9009 break;
9010 default:
9011 isel_err(&instr->instr, "Unimplemented intrinsic instr");
9012 abort();
9013
9014 break;
9015 }
9016 }
9017
9018 void
9019 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9020 {
9021 if (vec->parent_instr->type != nir_instr_type_alu)
9022 return;
9023 nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9024 if (vec_instr->op != nir_op_vec(vec->num_components))
9025 return;
9026
9027 for (unsigned i = 0; i < vec->num_components; i++) {
9028 cv[i] =
9029 vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9030 }
9031 }
9032
9033 void
9034 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9035 {
9036 assert(instr->op != nir_texop_samples_identical);
9037
9038 Builder bld(ctx->program, ctx->block);
9039 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9040 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9041 has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9042 Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9043 offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9044 coord = Temp(), wqm_coord = Temp();
9045 std::vector<Temp> coords;
9046 std::vector<Temp> derivs;
9047 nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9048
9049 for (unsigned i = 0; i < instr->num_srcs; i++) {
9050 switch (instr->src[i].src_type) {
9051 case nir_tex_src_texture_handle:
9052 resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9053 break;
9054 case nir_tex_src_sampler_handle:
9055 sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9056 break;
9057 default: break;
9058 }
9059 }
9060
9061 bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9062 (instr->dest_type & (nir_type_int | nir_type_uint));
9063 bool tg4_integer_cube_workaround =
9064 tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9065
9066 bool a16 = false, g16 = false;
9067
9068 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9069 if (coord_idx > 0)
9070 a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9071
9072 int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9073 if (ddx_idx > 0)
9074 g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9075
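   /* Gather the texture sources. With a16/g16, coordinates and/or derivatives are 16-bit
    * and get packed two per dword later (emit_pack_v1) so the MIMG instruction can use
    * packed 16-bit addressing. */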
9076 for (unsigned i = 0; i < instr->num_srcs; i++) {
9077 switch (instr->src[i].src_type) {
9078 case nir_tex_src_coord: {
9079 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9080 coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9081 break;
9082 }
9083 case nir_tex_src_backend1: {
9084 assert(instr->src[i].src.ssa->bit_size == 32);
9085 wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9086 has_wqm_coord = true;
9087 break;
9088 }
9089 case nir_tex_src_bias:
9090 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9091 /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9092 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9093 has_bias = true;
9094 break;
9095 case nir_tex_src_lod: {
9096 if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9097 level_zero = true;
9098 } else {
9099 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9100 lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9101 has_lod = true;
9102 }
9103 break;
9104 }
9105 case nir_tex_src_min_lod:
9106 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9107 clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9108 has_clamped_lod = true;
9109 break;
9110 case nir_tex_src_comparator:
9111 if (instr->is_shadow) {
9112 assert(instr->src[i].src.ssa->bit_size == 32);
9113 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9114 has_compare = true;
9115 }
9116 break;
9117 case nir_tex_src_offset:
9118 case nir_tex_src_backend2:
9119 assert(instr->src[i].src.ssa->bit_size == 32);
9120 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9121 get_const_vec(instr->src[i].src.ssa, const_offset);
9122 has_offset = true;
9123 break;
9124 case nir_tex_src_ddx:
9125 assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9126 ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9127 has_ddx = true;
9128 break;
9129 case nir_tex_src_ddy:
9130 assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9131 ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9132 has_ddy = true;
9133 break;
9134 case nir_tex_src_ms_index:
9135 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9136 sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9137 has_sample_index = true;
9138 break;
9139 case nir_tex_src_texture_offset:
9140 case nir_tex_src_sampler_offset:
9141 default: break;
9142 }
9143 }
9144
9145 if (has_wqm_coord) {
9146 assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9147 instr->op == nir_texop_lod);
9148 assert(wqm_coord.regClass().is_linear_vgpr());
9149 assert(!a16 && !g16);
9150 }
9151
9152 if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9153 level_zero = true;
9154
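   /* Immediate texel offsets are packed into a single dword, 6 bits per component;
    * components that are compile-time constants are folded into pack_const so they need
    * no ALU instructions. */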
9155 if (has_offset) {
9156 assert(instr->op != nir_texop_txf);
9157
9158 aco_ptr<Instruction> tmp_instr;
9159 Temp acc, pack = Temp();
9160
9161 uint32_t pack_const = 0;
9162 for (unsigned i = 0; i < offset.size(); i++) {
9163 if (!const_offset[i])
9164 continue;
9165 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9166 }
9167
9168 if (offset.type() == RegType::sgpr) {
9169 for (unsigned i = 0; i < offset.size(); i++) {
9170 if (const_offset[i])
9171 continue;
9172
9173 acc = emit_extract_vector(ctx, offset, i, s1);
9174 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9175 Operand::c32(0x3Fu));
9176
9177 if (i) {
9178 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9179 Operand::c32(8u * i));
9180 }
9181
9182 if (pack == Temp()) {
9183 pack = acc;
9184 } else {
9185 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9186 }
9187 }
9188
9189 if (pack_const && pack != Temp())
9190 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9191 Operand::c32(pack_const), pack);
9192 } else {
9193 for (unsigned i = 0; i < offset.size(); i++) {
9194 if (const_offset[i])
9195 continue;
9196
9197 acc = emit_extract_vector(ctx, offset, i, v1);
9198 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9199
9200 if (i) {
9201 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9202 }
9203
9204 if (pack == Temp()) {
9205 pack = acc;
9206 } else {
9207 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9208 }
9209 }
9210
9211 if (pack_const && pack != Temp())
9212 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9213 }
9214 if (pack == Temp())
9215 offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9216 else
9217 offset = pack;
9218 }
9219
9220 std::vector<Temp> unpacked_coord;
9221 if (coord != Temp())
9222 unpacked_coord.push_back(coord);
9223 if (has_sample_index)
9224 unpacked_coord.push_back(sample_index);
9225 if (has_lod)
9226 unpacked_coord.push_back(lod);
9227 if (has_clamped_lod)
9228 unpacked_coord.push_back(clamped_lod);
9229
9230 coords = emit_pack_v1(ctx, unpacked_coord);
9231
9232 /* pack derivatives */
9233 if (has_ddx || has_ddy) {
9234 assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9235 std::array<Temp, 2> ddxddy = {ddx, ddy};
9236 for (Temp tmp : ddxddy) {
9237 if (tmp == Temp())
9238 continue;
9239 std::vector<Temp> unpacked = {tmp};
9240 for (Temp derv : emit_pack_v1(ctx, unpacked))
9241 derivs.push_back(derv);
9242 }
9243 has_derivs = true;
9244 }
9245
9246 unsigned dim = 0;
9247 bool da = false;
9248 if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9249 dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9250 da = should_declare_array((ac_image_dim)dim);
9251 }
9252
9253 /* Build tex instruction */
9254 unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9255 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9256 dmask = u_bit_consecutive(0, util_last_bit(dmask));
9257 if (instr->is_sparse)
9258 dmask = MAX2(dmask, 1) | 0x10;
9259 bool d16 = instr->def.bit_size == 16;
9260 Temp dst = get_ssa_temp(ctx, &instr->def);
9261 Temp tmp_dst = dst;
9262
9263 /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9264 if (instr->op == nir_texop_tg4) {
9265 assert(instr->def.num_components == (4 + instr->is_sparse));
9266 if (instr->is_shadow)
9267 dmask = 1;
9268 else
9269 dmask = 1 << instr->component;
9270 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9271 tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9272 } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9273 tmp_dst = bld.tmp(v1);
9274 } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9275 unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9276 tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9277 }
9278
9279 Temp tg4_compare_cube_wa64 = Temp();
9280
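   /* GFX8 and earlier return incorrect results for gather4 on integer formats. Work around
    * it by offsetting the coordinates by -0.5 texels (skipped at runtime for unnormalized
    * samplers) and, for cube maps, by rewriting the descriptor's number format to *SCALED
    * and converting the result back to integer afterwards. */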
9281 if (tg4_integer_workarounds) {
9282 Temp half_texel[2];
9283 if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
9284 half_texel[0] = half_texel[1] = bld.copy(bld.def(v1), Operand::c32(0xbf000000 /*-0.5*/));
9285 } else {
9286 Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9287 Temp size = bld.tmp(v2);
9288 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9289 Operand(s4), std::vector<Temp>{tg4_lod});
9290 tex->dim = dim;
9291 tex->dmask = 0x3;
9292 tex->da = da;
9293 emit_split_vector(ctx, size, size.size());
9294
9295 for (unsigned i = 0; i < 2; i++) {
9296 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9297 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9298 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9299 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9300 Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9301 }
9302
9303 if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9304 /* In Vulkan, whether the sampler uses unnormalized
9305 * coordinates or not is a dynamic property of the
9306 * sampler. Hence, to figure out whether or not we
9307 * need to divide by the texture size, we need to test
9308 * the sampler at runtime. This tests the bit set by
9309 * radv_init_sampler().
9310 */
9311 unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9312 Temp dword0 = emit_extract_vector(ctx, sampler, 0, s1);
9313 Temp not_needed =
9314 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), dword0, Operand::c32(bit_idx));
9315
9316 not_needed = bool_to_vector_condition(ctx, not_needed);
9317 half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9318 Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9319 half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9320 Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9321 }
9322 }
9323
9324 Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9325 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9326
9327 if (tg4_integer_cube_workaround) {
9328 /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9329 Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9330 aco_ptr<Instruction> split{
9331 create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9332 split->operands[0] = Operand(resource);
9333 for (unsigned i = 0; i < resource.size(); i++) {
9334 desc[i] = bld.tmp(s1);
9335 split->definitions[i] = Definition(desc[i]);
9336 }
9337 ctx->block->instructions.emplace_back(std::move(split));
9338
9339 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9340 Operand::c32(20u | (6u << 16)));
9341 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9342 Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9343
9344 Temp nfmt;
9345 if (instr->dest_type & nir_type_uint) {
9346 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9347 Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9348 Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9349 } else {
9350 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9351 Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9352 Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9353 }
9354 tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9355 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9356
9357 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9358 Operand::c32(26u));
9359
9360 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9361 Operand::c32(C_008F14_NUM_FORMAT));
9362 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9363
9364 aco_ptr<Instruction> vec{
9365 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9366 for (unsigned i = 0; i < resource.size(); i++)
9367 vec->operands[i] = Operand(desc[i]);
9368 resource = bld.tmp(resource.regClass());
9369 vec->definitions[0] = Definition(resource);
9370 ctx->block->instructions.emplace_back(std::move(vec));
9371
9372 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9373 tg4_compare_cube_wa64);
9374 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9375 tg4_compare_cube_wa64);
9376 }
9377 coords[0] = new_coords[0];
9378 coords[1] = new_coords[1];
9379 }
9380
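   /* Buffer textures don't use MIMG; they are lowered to buffer_load_format_* MUBUF
    * instructions with the coordinate used as the buffer index (idxen). */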
9381 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9382 // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9383 // ac_build_buffer_load_format_gfx9_safe()
9384
9385 assert(coords.size() == 1);
9386 aco_opcode op;
9387 if (d16) {
9388 switch (util_last_bit(dmask & 0xf)) {
9389 case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9390 case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9391 case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9392 case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9393 default: unreachable("Tex instruction loads more than 4 components.");
9394 }
9395 } else {
9396 switch (util_last_bit(dmask & 0xf)) {
9397 case 1: op = aco_opcode::buffer_load_format_x; break;
9398 case 2: op = aco_opcode::buffer_load_format_xy; break;
9399 case 3: op = aco_opcode::buffer_load_format_xyz; break;
9400 case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9401 default: unreachable("Tex instruction loads more than 4 components.");
9402 }
9403 }
9404
9405 aco_ptr<Instruction> mubuf{create_instruction(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9406 mubuf->operands[0] = Operand(resource);
9407 mubuf->operands[1] = Operand(coords[0]);
9408 mubuf->operands[2] = Operand::c32(0);
9409 mubuf->definitions[0] = Definition(tmp_dst);
9410 mubuf->mubuf().idxen = true;
9411 mubuf->mubuf().tfe = instr->is_sparse;
9412 if (mubuf->mubuf().tfe)
9413 mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9414 ctx->block->instructions.emplace_back(std::move(mubuf));
9415
9416 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9417 return;
9418 }
9419
9420 /* gather MIMG address components */
9421 std::vector<Temp> args;
9422 if (has_wqm_coord) {
9423 args.emplace_back(wqm_coord);
9424 if (!(ctx->block->kind & block_kind_top_level))
9425 ctx->unended_linear_vgprs.push_back(wqm_coord);
9426 }
9427 if (has_offset)
9428 args.emplace_back(offset);
9429 if (has_bias)
9430 args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9431 if (has_compare)
9432 args.emplace_back(compare);
9433 if (has_derivs)
9434 args.insert(args.end(), derivs.begin(), derivs.end());
9435
9436 args.insert(args.end(), coords.begin(), coords.end());
9437
9438 if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9439 instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
9440 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9441 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9442 ? aco_opcode::image_load
9443 : aco_opcode::image_load_mip;
9444 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9445 MIMG_instruction* tex = emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, vdata);
9446 if (instr->op == nir_texop_fragment_mask_fetch_amd)
9447 tex->dim = da ? ac_image_2darray : ac_image_2d;
9448 else
9449 tex->dim = dim;
9450 tex->dmask = dmask & 0xf;
9451 tex->unrm = true;
9452 tex->da = da;
9453 tex->tfe = instr->is_sparse;
9454 tex->d16 = d16;
9455 tex->a16 = a16;
9456
9457 if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9458 /* Use 0x76543210 if the image doesn't have FMASK. */
9459 assert(dmask == 1 && dst.bytes() == 4);
9460 assert(dst.id() != tmp_dst.id());
9461
9462 if (dst.regClass() == s1) {
9463 Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9464 emit_extract_vector(ctx, resource, 1, s1));
9465 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
9466 Operand::c32(0x76543210), bld.scc(is_not_null));
9467 } else {
9468 Temp is_not_null = bld.tmp(bld.lm);
9469 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9470 emit_extract_vector(ctx, resource, 1, s1));
9471 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9472 bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9473 }
9474 } else {
9475 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9476 }
9477 return;
9478 }
9479
9480 bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
9481
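   /* Pick the image_sample_* variant: the suffix encodes which operands are present
    * (_c compare, _d derivatives, _b bias, _l LOD, _lz LOD zero, _cl LOD clamp,
    * _o offsets, _g16 16-bit derivatives). */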
9482 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
9483 aco_opcode opcode = aco_opcode::image_sample;
9484 if (has_offset) { /* image_sample_*_o */
9485 if (has_clamped_lod) {
9486 if (has_compare) {
9487 opcode = aco_opcode::image_sample_c_cl_o;
9488 if (separate_g16)
9489 opcode = aco_opcode::image_sample_c_d_cl_o_g16;
9490 else if (has_derivs)
9491 opcode = aco_opcode::image_sample_c_d_cl_o;
9492 if (has_bias)
9493 opcode = aco_opcode::image_sample_c_b_cl_o;
9494 } else {
9495 opcode = aco_opcode::image_sample_cl_o;
9496 if (separate_g16)
9497 opcode = aco_opcode::image_sample_d_cl_o_g16;
9498 else if (has_derivs)
9499 opcode = aco_opcode::image_sample_d_cl_o;
9500 if (has_bias)
9501 opcode = aco_opcode::image_sample_b_cl_o;
9502 }
9503 } else if (has_compare) {
9504 opcode = aco_opcode::image_sample_c_o;
9505 if (separate_g16)
9506 opcode = aco_opcode::image_sample_c_d_o_g16;
9507 else if (has_derivs)
9508 opcode = aco_opcode::image_sample_c_d_o;
9509 if (has_bias)
9510 opcode = aco_opcode::image_sample_c_b_o;
9511 if (level_zero)
9512 opcode = aco_opcode::image_sample_c_lz_o;
9513 if (has_lod)
9514 opcode = aco_opcode::image_sample_c_l_o;
9515 } else {
9516 opcode = aco_opcode::image_sample_o;
9517 if (separate_g16)
9518 opcode = aco_opcode::image_sample_d_o_g16;
9519 else if (has_derivs)
9520 opcode = aco_opcode::image_sample_d_o;
9521 if (has_bias)
9522 opcode = aco_opcode::image_sample_b_o;
9523 if (level_zero)
9524 opcode = aco_opcode::image_sample_lz_o;
9525 if (has_lod)
9526 opcode = aco_opcode::image_sample_l_o;
9527 }
9528 } else if (has_clamped_lod) { /* image_sample_*_cl */
9529 if (has_compare) {
9530 opcode = aco_opcode::image_sample_c_cl;
9531 if (separate_g16)
9532 opcode = aco_opcode::image_sample_c_d_cl_g16;
9533 else if (has_derivs)
9534 opcode = aco_opcode::image_sample_c_d_cl;
9535 if (has_bias)
9536 opcode = aco_opcode::image_sample_c_b_cl;
9537 } else {
9538 opcode = aco_opcode::image_sample_cl;
9539 if (separate_g16)
9540 opcode = aco_opcode::image_sample_d_cl_g16;
9541 else if (has_derivs)
9542 opcode = aco_opcode::image_sample_d_cl;
9543 if (has_bias)
9544 opcode = aco_opcode::image_sample_b_cl;
9545 }
9546 } else { /* no offset */
9547 if (has_compare) {
9548 opcode = aco_opcode::image_sample_c;
9549 if (separate_g16)
9550 opcode = aco_opcode::image_sample_c_d_g16;
9551 else if (has_derivs)
9552 opcode = aco_opcode::image_sample_c_d;
9553 if (has_bias)
9554 opcode = aco_opcode::image_sample_c_b;
9555 if (level_zero)
9556 opcode = aco_opcode::image_sample_c_lz;
9557 if (has_lod)
9558 opcode = aco_opcode::image_sample_c_l;
9559 } else {
9560 opcode = aco_opcode::image_sample;
9561 if (separate_g16)
9562 opcode = aco_opcode::image_sample_d_g16;
9563 else if (has_derivs)
9564 opcode = aco_opcode::image_sample_d;
9565 if (has_bias)
9566 opcode = aco_opcode::image_sample_b;
9567 if (level_zero)
9568 opcode = aco_opcode::image_sample_lz;
9569 if (has_lod)
9570 opcode = aco_opcode::image_sample_l;
9571 }
9572 }
9573
9574 if (instr->op == nir_texop_tg4) {
9575 /* GFX11 supports implicit LOD, but the extension is unsupported. */
9576 assert(level_zero || ctx->options->gfx_level < GFX11);
9577
9578 if (has_offset) { /* image_gather4_*_o */
9579 if (has_compare) {
9580 opcode = aco_opcode::image_gather4_c_o;
9581 if (level_zero)
9582 opcode = aco_opcode::image_gather4_c_lz_o;
9583 if (has_lod)
9584 opcode = aco_opcode::image_gather4_c_l_o;
9585 if (has_bias)
9586 opcode = aco_opcode::image_gather4_c_b_o;
9587 } else {
9588 opcode = aco_opcode::image_gather4_o;
9589 if (level_zero)
9590 opcode = aco_opcode::image_gather4_lz_o;
9591 if (has_lod)
9592 opcode = aco_opcode::image_gather4_l_o;
9593 if (has_bias)
9594 opcode = aco_opcode::image_gather4_b_o;
9595 }
9596 } else {
9597 if (has_compare) {
9598 opcode = aco_opcode::image_gather4_c;
9599 if (level_zero)
9600 opcode = aco_opcode::image_gather4_c_lz;
9601 if (has_lod)
9602 opcode = aco_opcode::image_gather4_c_l;
9603 if (has_bias)
9604 opcode = aco_opcode::image_gather4_c_b;
9605 } else {
9606 opcode = aco_opcode::image_gather4;
9607 if (level_zero)
9608 opcode = aco_opcode::image_gather4_lz;
9609 if (has_lod)
9610 opcode = aco_opcode::image_gather4_l;
9611 if (has_bias)
9612 opcode = aco_opcode::image_gather4_b;
9613 }
9614 }
9615 } else if (instr->op == nir_texop_lod) {
9616 opcode = aco_opcode::image_get_lod;
9617 }
9618
9619 bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9620 !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9621 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9622
9623 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9624 MIMG_instruction* tex = emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
9625 tex->dim = dim;
9626 tex->dmask = dmask & 0xf;
9627 tex->da = da;
9628 tex->unrm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
9629 tex->tfe = instr->is_sparse;
9630 tex->d16 = d16;
9631 tex->a16 = a16;
9632 if (implicit_derivs)
9633 set_wqm(ctx, true);
9634
9635 if (tg4_integer_cube_workaround) {
9636 assert(tmp_dst.id() != dst.id());
9637 assert(tmp_dst.size() == dst.size());
9638
9639 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9640 Temp val[4];
9641 for (unsigned i = 0; i < 4; i++) {
9642 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9643 Temp cvt_val;
9644 if (instr->dest_type & nir_type_uint)
9645 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9646 else
9647 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9648 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9649 tg4_compare_cube_wa64);
9650 }
9651
9652 Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9653 if (instr->is_sparse)
9654 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9655 val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9656 else
9657 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9658 val[3]);
9659 }
9660 unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9661 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
9662 }
9663
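/* Build the operand for one phi source: undefined sources become undefined operands of the
 * requested register class, and 1-bit constants are materialized as full lane-mask constants
 * (0 or -1), since divergent booleans are represented as lane masks. */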
9664 Operand
9665 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc)
9666 {
9667 Temp tmp = get_ssa_temp(ctx, ssa);
9668 if (ssa->parent_instr->type == nir_instr_type_undef) {
9669 return Operand(rc);
9670 } else if (ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
9671 bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
9672 return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
9673 } else {
9674 return Operand(tmp);
9675 }
9676 }
9677
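/* Lower a NIR phi to p_phi (or p_boolean_phi for 1-bit values). Sources are ordered by
 * predecessor block index so they match the sorted predecessor lists, and the phi is inserted
 * at the beginning of the current block. */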
9678 void
9679 visit_phi(isel_context* ctx, nir_phi_instr* instr)
9680 {
9681 Temp dst = get_ssa_temp(ctx, &instr->def);
9682 assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9683 aco_opcode opcode = instr->def.bit_size == 1 ? aco_opcode::p_boolean_phi : aco_opcode::p_phi;
9684
9685 /* we want a sorted list of sources, since the predecessor list is also sorted */
9686 std::map<unsigned, nir_def*> phi_src;
9687 nir_foreach_phi_src (src, instr)
9688 phi_src[src->pred->index] = src->src.ssa;
9689
9690 Instruction* phi = create_instruction(opcode, Format::PSEUDO, phi_src.size(), 1);
9691 unsigned i = 0;
9692 for (std::pair<unsigned, nir_def*> src : phi_src)
9693 phi->operands[i++] = get_phi_operand(ctx, src.second, dst.regClass());
9694 phi->definitions[0] = Definition(dst);
9695 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9696 }
9697
9698 void
9699 visit_undef(isel_context* ctx, nir_undef_instr* instr)
9700 {
9701 Temp dst = get_ssa_temp(ctx, &instr->def);
9702
9703 assert(dst.type() == RegType::sgpr);
9704
9705 if (dst.size() == 1) {
9706 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9707 } else {
9708 aco_ptr<Instruction> vec{
9709 create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9710 for (unsigned i = 0; i < dst.size(); i++)
9711 vec->operands[i] = Operand::zero();
9712 vec->definitions[0] = Definition(dst);
9713 ctx->block->instructions.emplace_back(std::move(vec));
9714 }
9715 }
9716
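/* End the loop preheader, create and enter the loop header block, and stash the surrounding
 * loop/if state in the loop_context so end_loop() can restore it. */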
9717 void
9718 begin_loop(isel_context* ctx, loop_context* lc)
9719 {
9720 append_logical_end(ctx->block);
9721 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9722 Builder bld(ctx->program, ctx->block);
9723 bld.branch(aco_opcode::p_branch);
9724 unsigned loop_preheader_idx = ctx->block->index;
9725
9726 lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9727
9728 ctx->program->next_loop_depth++;
9729
9730 Block* loop_header = ctx->program->create_and_insert_block();
9731 loop_header->kind |= block_kind_loop_header;
9732 add_edge(loop_preheader_idx, loop_header);
9733 ctx->block = loop_header;
9734
9735 append_logical_start(ctx->block);
9736
9737 lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
9738 lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
9739 lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
9740 lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
9741 lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
9742 }
9743
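/* Drop exec_info flags that can no longer apply at the current CF nesting level, e.g. a
 * potentially-empty break/continue once we have left the divergent control flow of the loop
 * that produced it. */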
9744 void
9745 update_exec_info(isel_context* ctx)
9746 {
9747 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
9748 ctx->cf_info.exec.potentially_empty_discard = false;
9749
9750 ctx->cf_info.exec.potentially_empty_break &=
9751 ctx->block->loop_nest_depth >= ctx->cf_info.exec.potentially_empty_break_depth;
9752 ctx->cf_info.exec.potentially_empty_continue &=
9753 ctx->block->loop_nest_depth >= ctx->cf_info.exec.potentially_empty_continue_depth;
9754
9755 if (ctx->block->loop_nest_depth == ctx->cf_info.exec.potentially_empty_break_depth &&
9756 !ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.parent_loop.has_divergent_continue) {
9757 ctx->cf_info.exec.potentially_empty_break = false;
9758 }
9759 if (ctx->block->loop_nest_depth == ctx->cf_info.exec.potentially_empty_continue_depth &&
9760 !ctx->cf_info.parent_if.is_divergent) {
9761 ctx->cf_info.exec.potentially_empty_continue = false;
9762 }
9763
9764 if (!ctx->cf_info.exec.potentially_empty_break)
9765 ctx->cf_info.exec.potentially_empty_break_depth = UINT16_MAX;
9766 if (!ctx->cf_info.exec.potentially_empty_continue)
9767 ctx->cf_info.exec.potentially_empty_continue_depth = UINT16_MAX;
9768 }
9769
9770 void
9771 end_loop(isel_context* ctx, loop_context* lc)
9772 {
9773 // TODO: what if a loop ends with an unconditional or uniformly branched continue
9774 // and this branch is never taken?
9775 if (!ctx->cf_info.has_branch) {
9776 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
9777 Builder bld(ctx->program, ctx->block);
9778 append_logical_end(ctx->block);
9779
9780 /* No need to check exec.potentially_empty_break/continue originating inside the loop. In the
9781 * only case where it's possible at this point (divergent break after divergent continue), we
9782 * should continue anyway. */
9783 if (ctx->cf_info.exec.potentially_empty_discard ||
9784 (ctx->cf_info.exec.potentially_empty_break &&
9785 ctx->cf_info.exec.potentially_empty_break_depth < ctx->block->loop_nest_depth) ||
9786 (ctx->cf_info.exec.potentially_empty_continue &&
9787 ctx->cf_info.exec.potentially_empty_continue_depth < ctx->block->loop_nest_depth)) {
9788 /* Discards can result in code running with an empty exec mask.
9789 * This would result in divergent breaks not ever being taken. As a
9790 * workaround, break the loop when the loop mask is empty instead of
9791 * always continuing. */
9792 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9793 unsigned block_idx = ctx->block->index;
9794
9795 /* create helper blocks to avoid critical edges */
9796 Block* break_block = ctx->program->create_and_insert_block();
9797 break_block->kind = block_kind_uniform;
9798 bld.reset(break_block);
9799 bld.branch(aco_opcode::p_branch);
9800 add_linear_edge(block_idx, break_block);
9801 add_linear_edge(break_block->index, &lc->loop_exit);
9802
9803 Block* continue_block = ctx->program->create_and_insert_block();
9804 continue_block->kind = block_kind_uniform;
9805 bld.reset(continue_block);
9806 bld.branch(aco_opcode::p_branch);
9807 add_linear_edge(block_idx, continue_block);
9808 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9809
9810 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9811 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9812 ctx->block = &ctx->program->blocks[block_idx];
9813
9814 /* SGPR temporaries might need loop exit phis to be created. */
9815 ctx->program->should_repair_ssa = true;
9816 } else {
9817 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9818 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9819 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9820 else
9821 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9822 }
9823
9824 bld.reset(ctx->block);
9825 bld.branch(aco_opcode::p_branch);
9826 }
9827
9828 ctx->cf_info.has_branch = false;
9829 ctx->program->next_loop_depth--;
9830
9831 /* emit loop successor block */
9832 ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
9833 append_logical_start(ctx->block);
9834
9835 ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
9836 ctx->cf_info.parent_loop.exit = lc->exit_old;
9837 ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
9838 ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
9839 ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
9840 update_exec_info(ctx);
9841 }
9842
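/* Emit a loop break (is_break) or continue. Uniform jumps branch straight to the loop exit or
 * header; divergent jumps record the branch, keep selecting code in a new fall-through block,
 * and insert helper blocks so the linear CFG stays free of critical edges. */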
9843 void
9844 emit_loop_jump(isel_context* ctx, bool is_break)
9845 {
9846 Builder bld(ctx->program, ctx->block);
9847 Block* logical_target;
9848 append_logical_end(ctx->block);
9849 unsigned idx = ctx->block->index;
9850
9851 /* If exec is empty inside uniform control flow in a loop, we can assume that all invocations
9852 * of the loop are inactive. Breaking from the loop is the right thing to do in that case.
9853 * We shouldn't perform a uniform continue, or else we might never reach a break.
9854 */
9855 bool potentially_empty_exec = ctx->cf_info.exec.potentially_empty_discard ||
9856 ctx->cf_info.exec.potentially_empty_break ||
9857 ctx->cf_info.exec.potentially_empty_continue;
9858
9859 if (is_break) {
9860 logical_target = ctx->cf_info.parent_loop.exit;
9861 add_logical_edge(idx, logical_target);
9862 ctx->block->kind |= block_kind_break;
9863
9864 if (!ctx->cf_info.parent_if.is_divergent &&
9865 !ctx->cf_info.parent_loop.has_divergent_continue) {
9866 /* uniform break - directly jump out of the loop */
9867 ctx->block->kind |= block_kind_uniform;
9868 ctx->cf_info.has_branch = true;
9869 bld.branch(aco_opcode::p_branch);
9870 add_linear_edge(idx, logical_target);
9871 return;
9872 }
9873 ctx->cf_info.parent_loop.has_divergent_branch = true;
9874
9875 if (!ctx->cf_info.exec.potentially_empty_break) {
9876 ctx->cf_info.exec.potentially_empty_break = true;
9877 ctx->cf_info.exec.potentially_empty_break_depth = ctx->block->loop_nest_depth;
9878 }
9879 } else {
9880 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9881 add_logical_edge(idx, logical_target);
9882 ctx->block->kind |= block_kind_continue;
9883
9884 if (!ctx->cf_info.parent_if.is_divergent && !potentially_empty_exec) {
9885 /* uniform continue - directly jump to the loop header */
9886 ctx->block->kind |= block_kind_uniform;
9887 ctx->cf_info.has_branch = true;
9888 bld.branch(aco_opcode::p_branch);
9889 add_linear_edge(idx, logical_target);
9890 return;
9891 }
9892
9893 ctx->cf_info.parent_loop.has_divergent_branch = true;
9894
9895 if (ctx->cf_info.parent_if.is_divergent) {
9896 /* for potential uniform breaks after this continue,
9897 we must ensure that they are handled correctly */
9898 ctx->cf_info.parent_loop.has_divergent_continue = true;
9899
9900 if (!ctx->cf_info.exec.potentially_empty_continue) {
9901 ctx->cf_info.exec.potentially_empty_continue = true;
9902 ctx->cf_info.exec.potentially_empty_continue_depth = ctx->block->loop_nest_depth;
9903 }
9904 }
9905 }
9906
9907 /* remove critical edges from linear CFG */
9908 bld.branch(aco_opcode::p_branch);
9909 Block* break_block = ctx->program->create_and_insert_block();
9910 break_block->kind |= block_kind_uniform;
9911 add_linear_edge(idx, break_block);
9912 /* the loop_header pointer might be invalidated by this point */
9913 if (!is_break)
9914 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9915 add_linear_edge(break_block->index, logical_target);
9916 bld.reset(break_block);
9917 bld.branch(aco_opcode::p_branch);
9918
9919 Block* continue_block = ctx->program->create_and_insert_block();
9920 add_linear_edge(idx, continue_block);
9921 append_logical_start(continue_block);
9922 ctx->block = continue_block;
9923 }
9924
9925 void
9926 emit_loop_break(isel_context* ctx)
9927 {
9928 emit_loop_jump(ctx, true);
9929 }
9930
9931 void
9932 emit_loop_continue(isel_context* ctx)
9933 {
9934 emit_loop_jump(ctx, false);
9935 }
9936
9937 void
9938 visit_jump(isel_context* ctx, nir_jump_instr* instr)
9939 {
9940 end_empty_exec_skip(ctx);
9941
9942 switch (instr->type) {
9943 case nir_jump_break: emit_loop_break(ctx); break;
9944 case nir_jump_continue: emit_loop_continue(ctx); break;
9945 default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
9946 }
9947 }
9948
9949 void
9950 visit_debug_info(isel_context* ctx, nir_debug_info_instr* instr)
9951 {
9952 ac_shader_debug_info info;
9953 memset(&info, 0, sizeof(info));
9954
9955 switch (instr->type) {
9956 case nir_debug_info_src_loc:
9957 info.type = ac_shader_debug_info_src_loc;
9958 info.src_loc.file = strdup(nir_src_as_string(instr->src_loc.filename));
9959 info.src_loc.line = instr->src_loc.line;
9960 info.src_loc.column = instr->src_loc.column;
9961 info.src_loc.spirv_offset = instr->src_loc.spirv_offset;
9962 break;
9963 default:
9964 return;
9965 }
9966
9967 Builder bld(ctx->program, ctx->block);
9968 bld.pseudo(aco_opcode::p_debug_info, Operand::c32(ctx->program->debug_info.size()));
9969
9970 ctx->program->debug_info.push_back(info);
9971 }
9972
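/* Lower one NIR block: end any pending linear VGPRs when back at top-level CF, emit phis first,
 * and then select each remaining instruction. The empty-exec skip can only start after the phis,
 * which have to stay at the top of the block. */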
9973 void
9974 visit_block(isel_context* ctx, nir_block* block)
9975 {
9976 if (ctx->block->kind & block_kind_top_level) {
9977 Builder bld(ctx->program, ctx->block);
9978 for (Temp tmp : ctx->unended_linear_vgprs) {
9979 bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
9980 }
9981 ctx->unended_linear_vgprs.clear();
9982 }
9983
9984 nir_foreach_phi (instr, block)
9985 visit_phi(ctx, instr);
9986
9987 nir_phi_instr* last_phi = nir_block_last_phi_instr(block);
9988 begin_empty_exec_skip(ctx, last_phi ? &last_phi->instr : NULL, block);
9989
9990 ctx->block->instructions.reserve(ctx->block->instructions.size() +
9991 exec_list_length(&block->instr_list) * 2);
9992 nir_foreach_instr (instr, block) {
9993 switch (instr->type) {
9994 case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
9995 case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
9996 case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
9997 case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
9998 case nir_instr_type_phi: break;
9999 case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10000 case nir_instr_type_deref: break;
10001 case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10002 case nir_instr_type_debug_info: visit_debug_info(ctx, nir_instr_as_debug_info(instr)); break;
10003 default: isel_err(instr, "Unknown NIR instr type");
10004 }
10005 }
10006 }
10007
10008 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10009 static void begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else = true);
10010 static void end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else = true);
10011
10012 static void
10013 visit_loop(isel_context* ctx, nir_loop* loop)
10014 {
10015 assert(!nir_loop_has_continue_construct(loop));
10016 loop_context lc;
10017 begin_loop(ctx, &lc);
10018
10019 visit_cf_list(ctx, &loop->body);
10020
10021 end_loop(ctx, &lc);
10022 }
10023
10024 static void
10025 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10026 nir_selection_control sel_ctrl = nir_selection_control_none)
10027 {
10028 append_logical_end(ctx->block);
10029 ctx->block->kind |= block_kind_branch;
10030
10031 /* branch to linear then block */
10032 assert(cond.regClass() == ctx->program->lane_mask);
10033 aco_ptr<Instruction> branch;
10034 branch.reset(create_instruction(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
10035 branch->operands[0] = Operand(cond);
10036 bool never_taken =
10037 sel_ctrl == nir_selection_control_divergent_always_taken &&
10038 !(ctx->cf_info.exec.potentially_empty_discard || ctx->cf_info.exec.potentially_empty_break ||
10039 ctx->cf_info.exec.potentially_empty_continue);
10040 branch->branch().rarely_taken = sel_ctrl == nir_selection_control_flatten || never_taken;
10041 branch->branch().never_taken = never_taken;
10042 ctx->block->instructions.push_back(std::move(branch));
10043
10044 ic->BB_if_idx = ctx->block->index;
10045 ic->BB_invert = Block();
10046 /* Invert blocks are intentionally not marked as top level because they
10047 * are not part of the logical cfg. */
10048 ic->BB_invert.kind |= block_kind_invert;
10049 ic->BB_endif = Block();
10050 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10051
10052 ic->exec_old = ctx->cf_info.exec;
10053 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10054 ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10055 ctx->cf_info.parent_if.is_divergent = true;
10056
10057 /* divergent branches use cbranch_execz */
10058 ctx->cf_info.exec = exec_info();
10059
10060 /** emit logical then block */
10061 ctx->program->next_divergent_if_logical_depth++;
10062 Block* BB_then_logical = ctx->program->create_and_insert_block();
10063 add_edge(ic->BB_if_idx, BB_then_logical);
10064 ctx->block = BB_then_logical;
10065 append_logical_start(BB_then_logical);
10066 }
10067
10068 static void
10069 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10070 nir_selection_control sel_ctrl = nir_selection_control_none)
10071 {
10072 Block* BB_then_logical = ctx->block;
10073 append_logical_end(BB_then_logical);
10074 /* branch from logical then block to invert block */
10075 aco_ptr<Instruction> branch;
10076 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10077 BB_then_logical->instructions.emplace_back(std::move(branch));
10078 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10079 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10080 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10081 BB_then_logical->kind |= block_kind_uniform;
10082 assert(!ctx->cf_info.has_branch);
10083 ctx->cf_info.parent_loop.has_divergent_branch = false;
10084 ctx->program->next_divergent_if_logical_depth--;
10085
10086 /** emit linear then block */
10087 Block* BB_then_linear = ctx->program->create_and_insert_block();
10088 BB_then_linear->kind |= block_kind_uniform;
10089 add_linear_edge(ic->BB_if_idx, BB_then_linear);
10090 /* branch from linear then block to invert block */
10091 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10092 BB_then_linear->instructions.emplace_back(std::move(branch));
10093 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10094
10095 /** emit invert merge block */
10096 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10097 ic->invert_idx = ctx->block->index;
10098
10099 /* branch to linear else block (skip else) */
10100 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10101 bool never_taken =
10102 sel_ctrl == nir_selection_control_divergent_always_taken &&
10103 !(ctx->cf_info.exec.potentially_empty_discard || ctx->cf_info.exec.potentially_empty_break ||
10104 ctx->cf_info.exec.potentially_empty_continue);
10105 branch->branch().rarely_taken = sel_ctrl == nir_selection_control_flatten || never_taken;
10106 branch->branch().never_taken = never_taken;
10107 ctx->block->instructions.push_back(std::move(branch));
10108
10109 ic->exec_old.combine(ctx->cf_info.exec);
10110 /* divergent branches use cbranch_execz */
10111 ctx->cf_info.exec = exec_info();
10112
10113 ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10114 ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10115
10116 /** emit logical else block */
10117 ctx->program->next_divergent_if_logical_depth++;
10118 Block* BB_else_logical = ctx->program->create_and_insert_block();
10119 add_logical_edge(ic->BB_if_idx, BB_else_logical);
10120 add_linear_edge(ic->invert_idx, BB_else_logical);
10121 ctx->block = BB_else_logical;
10122 append_logical_start(BB_else_logical);
10123 }
10124
10125 static void
10126 end_divergent_if(isel_context* ctx, if_context* ic)
10127 {
10128 Block* BB_else_logical = ctx->block;
10129 append_logical_end(BB_else_logical);
10130
10131 /* branch from logical else block to endif block */
10132 aco_ptr<Instruction> branch;
10133 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10134 BB_else_logical->instructions.emplace_back(std::move(branch));
10135 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10136 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10137 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10138 BB_else_logical->kind |= block_kind_uniform;
10139 ctx->program->next_divergent_if_logical_depth--;
10140
10141 assert(!ctx->cf_info.has_branch);
10142 ctx->cf_info.parent_loop.has_divergent_branch = false;
10143
10144 /** emit linear else block */
10145 Block* BB_else_linear = ctx->program->create_and_insert_block();
10146 BB_else_linear->kind |= block_kind_uniform;
10147 add_linear_edge(ic->invert_idx, BB_else_linear);
10148
10149 /* branch from linear else block to endif block */
10150 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10151 BB_else_linear->instructions.emplace_back(std::move(branch));
10152 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10153
10154 /** emit endif merge block */
10155 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10156 append_logical_start(ctx->block);
10157
10158 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10159 ctx->cf_info.exec.combine(ic->exec_old);
10160 update_exec_info(ctx);
10161 ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10162
10163 /* We shouldn't create unreachable blocks. */
10164 assert(!ctx->block->logical_preds.empty());
10165 }
10166
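/* Begin a uniform if. With a valid condition the branch tests SCC; with an empty Temp() the
 * branch tests whether exec is zero instead (used by the empty-exec skip below). */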
10167 static void
10168 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10169 {
10170 assert(!cond.id() || cond.regClass() == s1);
10171
10172 ic->cond = cond;
10173
10174 append_logical_end(ctx->block);
10175 ctx->block->kind |= block_kind_uniform;
10176
10177 aco_ptr<Instruction> branch;
10178 aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10179 branch.reset(create_instruction(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
10180 if (cond.id()) {
10181 branch->operands[0] = Operand(cond);
10182 branch->operands[0].setPrecolored(scc);
10183 } else {
10184 branch->operands[0] = Operand(exec, ctx->program->lane_mask);
10185 branch->branch().rarely_taken = true;
10186 }
10187 ctx->block->instructions.emplace_back(std::move(branch));
10188
10189 ic->BB_if_idx = ctx->block->index;
10190 ic->BB_endif = Block();
10191 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10192
10193 ctx->cf_info.has_branch = false;
10194 ctx->cf_info.parent_loop.has_divergent_branch = false;
10195
10196 ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10197 ic->has_divergent_continue_old = ctx->cf_info.parent_loop.has_divergent_continue;
10198
10199 /** emit then block */
10200 if (ic->cond.id())
10201 ctx->program->next_uniform_if_depth++;
10202 Block* BB_then = ctx->program->create_and_insert_block();
10203 add_edge(ic->BB_if_idx, BB_then);
10204 append_logical_start(BB_then);
10205 ctx->block = BB_then;
10206 }
10207
10208 static void
10209 begin_uniform_if_else(isel_context* ctx, if_context* ic, bool logical_else)
10210 {
10211 Block* BB_then = ctx->block;
10212
10213 if (!ctx->cf_info.has_branch) {
10214 append_logical_end(BB_then);
10215 /* branch from then block to endif block */
10216 aco_ptr<Instruction> branch;
10217 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10218 BB_then->instructions.emplace_back(std::move(branch));
10219 add_linear_edge(BB_then->index, &ic->BB_endif);
10220 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10221 add_logical_edge(BB_then->index, &ic->BB_endif);
10222 BB_then->kind |= block_kind_uniform;
10223 }
10224
10225 ctx->cf_info.has_branch = false;
10226 ctx->cf_info.parent_loop.has_divergent_branch = false;
10227
10228 ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10229 ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10230
10231 ic->has_divergent_continue_then = ctx->cf_info.parent_loop.has_divergent_continue;
10232 ctx->cf_info.parent_loop.has_divergent_continue = ic->has_divergent_continue_old;
10233
10234 /** emit else block */
10235 Block* BB_else = ctx->program->create_and_insert_block();
10236 if (logical_else) {
10237 add_edge(ic->BB_if_idx, BB_else);
10238 append_logical_start(BB_else);
10239 } else {
10240 add_linear_edge(ic->BB_if_idx, BB_else);
10241 }
10242 ctx->block = BB_else;
10243 }
10244
10245 static void
10246 end_uniform_if(isel_context* ctx, if_context* ic, bool logical_else)
10247 {
10248 Block* BB_else = ctx->block;
10249
10250 if (!ctx->cf_info.has_branch) {
10251 if (logical_else)
10252 append_logical_end(BB_else);
10253 /* branch from then block to endif block */
10254 aco_ptr<Instruction> branch;
10255 branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
10256 BB_else->instructions.emplace_back(std::move(branch));
10257 add_linear_edge(BB_else->index, &ic->BB_endif);
10258 if (logical_else && !ctx->cf_info.parent_loop.has_divergent_branch)
10259 add_logical_edge(BB_else->index, &ic->BB_endif);
10260 BB_else->kind |= block_kind_uniform;
10261 }
10262
10263 ctx->cf_info.has_branch = false;
10264 ctx->cf_info.parent_loop.has_divergent_branch = false;
10265 ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10266 ctx->cf_info.parent_loop.has_divergent_continue |= ic->has_divergent_continue_then;
10267
10268 /** emit endif merge block */
10269 if (ic->cond.id())
10270 ctx->program->next_uniform_if_depth--;
10271 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10272 append_logical_start(ctx->block);
10273
10274 /* We shouldn't create unreachable blocks. */
10275 assert(!ctx->block->logical_preds.empty());
10276 }
10277
10278 static void
10279 end_empty_exec_skip(isel_context* ctx)
10280 {
10281 if (ctx->cf_info.skipping_empty_exec) {
10282 begin_uniform_if_else(ctx, &ctx->cf_info.empty_exec_skip, false);
10283 end_uniform_if(ctx, &ctx->cf_info.empty_exec_skip, false);
10284 ctx->cf_info.skipping_empty_exec = false;
10285
10286 ctx->cf_info.exec.combine(ctx->cf_info.empty_exec_skip.exec_old);
10287 }
10288 }
10289
10290 /*
10291 * If necessary, begin a branch which skips over instructions if exec is empty.
10292 *
10293 * The linear CFG:
10294 * BB_IF
10295 * / \
10296 * BB_THEN (logical) BB_ELSE (linear)
10297 * \ /
10298 * BB_ENDIF
10299 *
10300 * The logical CFG:
10301 * BB_IF
10302 * |
10303 * BB_THEN (logical)
10304 * |
10305 * BB_ENDIF
10306 *
10307 * BB_THEN should not end with a branch, since that would make BB_ENDIF unreachable.
10308 */
10309 static void
10310 begin_empty_exec_skip(isel_context* ctx, nir_instr* after_instr, nir_block* block)
10311 {
10312 if (!ctx->cf_info.exec.potentially_empty_discard && !ctx->cf_info.exec.potentially_empty_break &&
10313 !ctx->cf_info.exec.potentially_empty_continue)
10314 return;
10315
10316 assert(!(ctx->block->kind & block_kind_top_level));
10317
10318 bool further_cf_empty = !nir_cf_node_next(&block->cf_node);
10319
10320 bool rest_of_block_empty = false;
10321 if (after_instr) {
10322 rest_of_block_empty =
10323 nir_instr_is_last(after_instr) || nir_instr_next(after_instr)->type == nir_instr_type_jump;
10324 } else {
10325 rest_of_block_empty = exec_list_is_empty(&block->instr_list) ||
10326 nir_block_first_instr(block)->type == nir_instr_type_jump;
10327 }
10328
10329 assert(!(ctx->block->kind & block_kind_export_end) || rest_of_block_empty);
10330
10331 if (rest_of_block_empty && further_cf_empty)
10332 return;
10333
10334 /* Don't nest these skipping branches. It is not worth the complexity. */
10335 end_empty_exec_skip(ctx);
10336
10337 begin_uniform_if_then(ctx, &ctx->cf_info.empty_exec_skip, Temp());
10338 ctx->cf_info.skipping_empty_exec = true;
10339
10340 ctx->cf_info.empty_exec_skip.exec_old = ctx->cf_info.exec;
10341 ctx->cf_info.exec = exec_info();
10342
10343 ctx->program->should_repair_ssa = true;
10344 }
10345
10346 static void
10347 visit_if(isel_context* ctx, nir_if* if_stmt)
10348 {
10349 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10350 Builder bld(ctx->program, ctx->block);
10351 aco_ptr<Instruction> branch;
10352 if_context ic;
10353
10354 if (!nir_src_is_divergent(&if_stmt->condition)) { /* uniform condition */
10355 /**
10356 * Uniform conditionals are represented in the following way*) :
10357 *
10358 * The linear and logical CFG:
10359 * BB_IF
10360 * / \
10361 * BB_THEN (logical) BB_ELSE (logical)
10362 * \ /
10363 * BB_ENDIF
10364 *
10365 * *) Exceptions may be due to break and continue statements within loops
10366 * If a break/continue happens within uniform control flow, it branches
10367 * to the loop exit/entry block. Otherwise, it branches to the next
10368 * merge block.
10369 **/
10370
10371 assert(cond.regClass() == ctx->program->lane_mask);
10372 cond = bool_to_scalar_condition(ctx, cond);
10373
10374 begin_uniform_if_then(ctx, &ic, cond);
10375 visit_cf_list(ctx, &if_stmt->then_list);
10376
10377 begin_uniform_if_else(ctx, &ic);
10378 visit_cf_list(ctx, &if_stmt->else_list);
10379
10380 end_uniform_if(ctx, &ic);
10381 } else { /* non-uniform condition */
10382 /**
10383 * To maintain a logical and linear CFG without critical edges,
10384 * non-uniform conditionals are represented in the following way*) :
10385 *
10386 * The linear CFG:
10387 * BB_IF
10388 * / \
10389 * BB_THEN (logical) BB_THEN (linear)
10390 * \ /
10391 * BB_INVERT (linear)
10392 * / \
10393 * BB_ELSE (logical) BB_ELSE (linear)
10394 * \ /
10395 * BB_ENDIF
10396 *
10397 * The logical CFG:
10398 * BB_IF
10399 * / \
10400 * BB_THEN (logical) BB_ELSE (logical)
10401 * \ /
10402 * BB_ENDIF
10403 *
10404 * *) Exceptions may be due to break and continue statements within loops
10405 **/
10406
10407 begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10408 visit_cf_list(ctx, &if_stmt->then_list);
10409
10410 begin_divergent_if_else(ctx, &ic, if_stmt->control);
10411 visit_cf_list(ctx, &if_stmt->else_list);
10412
10413 end_divergent_if(ctx, &ic);
10414 }
10415 }
10416
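/* Visit a NIR CF list. The caller's empty-exec skip state is saved and restored around the list,
 * and any skip started inside the list is ended before returning. */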
10417 static void
10418 visit_cf_list(isel_context* ctx, struct exec_list* list)
10419 {
10420 if (nir_cf_list_is_empty_block(list))
10421 return;
10422
10423 bool skipping_empty_exec_old = ctx->cf_info.skipping_empty_exec;
10424 if_context empty_exec_skip_old = std::move(ctx->cf_info.empty_exec_skip);
10425 ctx->cf_info.skipping_empty_exec = false;
10426
10427 foreach_list_typed (nir_cf_node, node, node, list) {
10428 switch (node->type) {
10429 case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10430 case nir_cf_node_if: visit_if(ctx, nir_cf_node_as_if(node)); break;
10431 case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10432 default: unreachable("unimplemented cf list type");
10433 }
10434 }
10435
10436 end_empty_exec_skip(ctx);
10437 ctx->cf_info.skipping_empty_exec = skipping_empty_exec_old;
10438 ctx->cf_info.empty_exec_skip = std::move(empty_exec_skip_old);
10439 }
10440
10441 static void
10442 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10443 {
10444 Builder bld(ctx->program, ctx->block);
10445
10446 bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10447 mrt->enabled_channels, mrt->target, mrt->compr);
10448
10449 ctx->program->has_color_exports = true;
10450 }
10451
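/* Pack the four color components of one MRT according to the slot's SPI_SHADER_COL_FORMAT
 * (applying the optional NaN fixup and int8/int10 clamping) and fill *mrt. Returns false if the
 * format is SPI_SHADER_ZERO and the export should be skipped. */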
10452 static bool
10453 export_fs_mrt_color(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
10454 unsigned slot, struct aco_export_mrt* mrt)
10455 {
10456 unsigned col_format = (info->spi_shader_col_format >> (slot * 4)) & 0xf;
10457
10458 if (col_format == V_028714_SPI_SHADER_ZERO)
10459 return false;
10460
10461 Builder bld(ctx->program, ctx->block);
10462 Operand values[4];
10463
10464 for (unsigned i = 0; i < 4; ++i) {
10465 values[i] = Operand(colors[i]);
10466 }
10467
10468 unsigned enabled_channels = 0;
10469 aco_opcode compr_op = aco_opcode::num_opcodes;
10470 bool compr = false;
10471 bool is_16bit = colors[0].regClass() == v2b;
10472 bool is_int8 = (info->color_is_int8 >> slot) & 1;
10473 bool is_int10 = (info->color_is_int10 >> slot) & 1;
10474 bool enable_mrt_output_nan_fixup = (ctx->options->enable_mrt_output_nan_fixup >> slot) & 1;
10475
10476 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10477 if (enable_mrt_output_nan_fixup && !is_16bit &&
10478 (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10479 col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10480 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10481 for (unsigned i = 0; i < 4; i++) {
10482 Temp is_not_nan =
10483 bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
10484 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
10485 is_not_nan);
10486 }
10487 }
10488
10489 switch (col_format) {
10490 case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10491
10492 case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10493
10494 case V_028714_SPI_SHADER_32_AR:
10495 if (ctx->options->gfx_level >= GFX10) {
10496 /* Special case: on GFX10, the outputs are different for 32_AR */
10497 enabled_channels = 0x3;
10498 values[1] = values[3];
10499 values[3] = Operand(v1);
10500 } else {
10501 enabled_channels = 0x9;
10502 }
10503 break;
10504
10505 case V_028714_SPI_SHADER_FP16_ABGR:
10506 for (int i = 0; i < 2; i++) {
10507 if (is_16bit) {
10508 values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), values[i * 2],
10509 values[i * 2 + 1]);
10510 } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
10511 values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), values[i * 2],
10512 values[i * 2 + 1]);
10513 } else {
10514 values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), values[i * 2],
10515 values[i * 2 + 1]);
10516 }
10517 }
10518 values[2] = Operand(v1);
10519 values[3] = Operand(v1);
10520 enabled_channels = 0xf;
10521 compr = true;
10522 break;
10523
10524 case V_028714_SPI_SHADER_UNORM16_ABGR:
10525 if (is_16bit && ctx->options->gfx_level >= GFX9) {
10526 compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10527 } else {
10528 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10529 }
10530 break;
10531
10532 case V_028714_SPI_SHADER_SNORM16_ABGR:
10533 if (is_16bit && ctx->options->gfx_level >= GFX9) {
10534 compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10535 } else {
10536 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10537 }
10538 break;
10539
10540 case V_028714_SPI_SHADER_UINT16_ABGR:
10541 compr_op = aco_opcode::v_cvt_pk_u16_u32;
10542 if (is_int8 || is_int10) {
10543 /* clamp */
10544 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10545
10546 for (unsigned i = 0; i < 4; i++) {
10547 uint32_t max = i == 3 && is_int10 ? 3 : max_rgb;
10548
10549 values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
10550 }
10551 } else if (is_16bit) {
10552 for (unsigned i = 0; i < 4; i++) {
10553 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10554 values[i] = Operand(tmp);
10555 }
10556 }
10557 break;
10558
10559 case V_028714_SPI_SHADER_SINT16_ABGR:
10560 compr_op = aco_opcode::v_cvt_pk_i16_i32;
10561 if (is_int8 || is_int10) {
10562 /* clamp */
10563 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10564 uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
10565
10566 for (unsigned i = 0; i < 4; i++) {
10567 uint32_t max = i == 3 && is_int10 ? 1 : max_rgb;
10568 uint32_t min = i == 3 && is_int10 ? -2u : min_rgb;
10569
10570 values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
10571 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
10572 }
10573 } else if (is_16bit) {
10574 for (unsigned i = 0; i < 4; i++) {
10575 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10576 values[i] = Operand(tmp);
10577 }
10578 }
10579 break;
10580
10581 case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
10582
10583 case V_028714_SPI_SHADER_ZERO:
10584 default: return false;
10585 }
10586
10587 if (compr_op != aco_opcode::num_opcodes) {
10588 values[0] = bld.vop3(compr_op, bld.def(v1), values[0], values[1]);
10589 values[1] = bld.vop3(compr_op, bld.def(v1), values[2], values[3]);
10590 values[2] = Operand(v1);
10591 values[3] = Operand(v1);
10592 enabled_channels = 0xf;
10593 compr = true;
10594 } else if (!compr) {
10595 for (int i = 0; i < 4; i++)
10596 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10597 }
10598
10599 if (ctx->program->gfx_level >= GFX11) {
10600 /* GFX11 doesn't use COMPR for exports, but the channel mask should be
10601 * 0x3 instead.
10602 */
10603 enabled_channels = compr ? 0x3 : enabled_channels;
10604 compr = false;
10605 }
10606
10607 for (unsigned i = 0; i < 4; i++)
10608 mrt->out[i] = values[i];
10609 mrt->target = V_008DFC_SQ_EXP_MRT;
10610 mrt->enabled_channels = enabled_channels;
10611 mrt->compr = compr;
10612
10613 return true;
10614 }
10615
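/* Emit the MRTZ export for depth/stencil/sample mask/alpha, choosing between the 16-bit
 * (compressed) and 32-bit Z export formats based on which outputs are written. */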
10616 static void
10617 export_fs_mrtz(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp depth, Temp stencil,
10618 Temp samplemask, Temp alpha)
10619 {
10620 Builder bld(ctx->program, ctx->block);
10621 unsigned enabled_channels = 0;
10622 bool compr = false;
10623 Operand values[4];
10624
10625 for (unsigned i = 0; i < 4; ++i) {
10626 values[i] = Operand(v1);
10627 }
10628
10629 const unsigned format =
10630 ac_get_spi_shader_z_format(depth.id(), stencil.id(), samplemask.id(), alpha.id());
10631 assert(format != V_028710_SPI_SHADER_ZERO);
10632
10633 /* Both stencil and sample mask only need 16 bits. */
10634 if (format == V_028710_SPI_SHADER_UINT16_ABGR) {
10635 compr = ctx->program->gfx_level < GFX11; /* COMPR flag */
10636
10637 if (stencil.id()) {
10638 /* Stencil should be in X[23:16]. */
10639 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), stencil);
10640 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3;
10641 }
10642
10643 if (samplemask.id()) {
10644 /* SampleMask should be in Y[15:0]. */
10645 values[1] = Operand(samplemask);
10646 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc;
10647 }
10648 } else {
10649 if (depth.id()) {
10650 values[0] = Operand(depth);
10651 enabled_channels |= 0x1;
10652 }
10653
10654 if (stencil.id()) {
10655 assert(format == V_028710_SPI_SHADER_32_GR || format == V_028710_SPI_SHADER_32_ABGR);
10656 values[1] = Operand(stencil);
10657 enabled_channels |= 0x2;
10658 }
10659
10660 if (samplemask.id()) {
10661 assert(format == V_028710_SPI_SHADER_32_ABGR);
10662 values[2] = Operand(samplemask);
10663 enabled_channels |= 0x4;
10664 }
10665
10666 if (alpha.id()) {
10667 assert(format == V_028710_SPI_SHADER_32_AR || format == V_028710_SPI_SHADER_32_ABGR);
10668 assert(ctx->program->gfx_level >= GFX11 || info->alpha_to_one);
10669 values[3] = Operand(alpha);
10670 enabled_channels |= 0x8;
10671 }
10672 }
10673
10674 /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X
10675 * writemask component.
10676 */
10677 if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND &&
10678 ctx->options->family != CHIP_HAINAN) {
10679 enabled_channels |= 0x1;
10680 }
10681
10682 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
10683 V_008DFC_SQ_EXP_MRTZ, compr);
10684 }
10685
10686 static void
10687 create_fs_null_export(isel_context* ctx)
10688 {
10689 /* FS must always have exports.
10690 * So when there are none, we need to add a null export.
10691 */
10692
10693 Builder bld(ctx->program, ctx->block);
10694 /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */
10695 unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
10696 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
10697 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
10698
10699 ctx->program->has_color_exports = true;
10700 }
10701
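/* Gather depth/stencil/sample mask and the per-MRT color outputs into operands precolored to the
 * VGPR layout expected by the PS epilog, then emit p_jump_to_epilog. 16-bit colors are widened to
 * 32 bits first. */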
10702 static void
10703 create_fs_jump_to_epilog(isel_context* ctx)
10704 {
10705 Builder bld(ctx->program, ctx->block);
10706 std::vector<Operand> exports;
10707 unsigned vgpr = 256; /* VGPR 0 */
10708
10709 if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
10710 exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u], PhysReg{vgpr++}));
10711
10712 if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
10713 exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u], PhysReg{vgpr++}));
10714
10715 if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
10716 exports.emplace_back(
10717 Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u], PhysReg{vgpr++}));
10718
10719 PhysReg exports_start(vgpr);
10720
10721 for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
10722 unsigned color_index = slot - FRAG_RESULT_DATA0;
10723 unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
10724 unsigned write_mask = ctx->outputs.mask[slot];
10725
10726 if (!write_mask)
10727 continue;
10728
10729 PhysReg color_start(exports_start.reg() + color_index * 4);
10730
10731 for (unsigned i = 0; i < 4; i++) {
10732 if (!(write_mask & BITFIELD_BIT(i))) {
10733 exports.emplace_back(Operand(v1));
10734 continue;
10735 }
10736
10737 PhysReg chan_reg = color_start.advance(i * 4u);
10738 Operand chan(ctx->outputs.temps[slot * 4u + i]);
10739
10740 if (color_type == ACO_TYPE_FLOAT16) {
10741 chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
10742 } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
10743 bool sign_ext = color_type == ACO_TYPE_INT16;
10744 Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
10745 chan = Operand(tmp);
10746 }
10747
10748 chan.setPrecolored(chan_reg);
10749 exports.emplace_back(chan);
10750 }
10751 }
10752
10753 Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.epilog_pc));
10754
10755 aco_ptr<Instruction> jump{
10756 create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + exports.size(), 0)};
10757 jump->operands[0] = Operand(continue_pc);
10758 for (unsigned i = 0; i < exports.size(); i++) {
10759 jump->operands[i + 1] = exports[i];
10760 }
10761 ctx->block->instructions.emplace_back(std::move(jump));
10762 }
10763
10764 PhysReg
10765 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
10766 {
10767 assert(arg.used);
10768 enum ac_arg_regfile file = args->args[arg.arg_index].file;
10769 unsigned reg = args->args[arg.arg_index].offset;
10770 return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
10771 }
10772
10773 static Operand
10774 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
10775 {
10776 return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
10777 }
10778
10779 static void
10780 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
10781 {
10782 struct ac_arg arg;
10783 arg.used = true;
10784
10785 for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
10786 regs.emplace_back(get_arg_for_end(ctx, arg));
10787 }
10788
10789 static void
10790 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
10791 {
10792 aco_ptr<Instruction> end{
10793 create_instruction(aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
10794
10795 for (unsigned i = 0; i < regs.size(); i++)
10796 end->operands[i] = regs[i];
10797
10798 ctx->block->instructions.emplace_back(std::move(end));
10799
10800 ctx->block->kind |= block_kind_end_with_regs;
10801 }
10802
10803 static void
10804 create_fs_end_for_epilog(isel_context* ctx)
10805 {
10806 Builder bld(ctx->program, ctx->block);
10807
10808 std::vector<Operand> regs;
10809
10810 regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.ps.alpha_reference));
10811
10812 unsigned vgpr = 256;
10813
10814 for (unsigned slot = FRAG_RESULT_DATA0; slot <= FRAG_RESULT_DATA7; slot++) {
10815 unsigned index = slot - FRAG_RESULT_DATA0;
10816 unsigned type = (ctx->output_color_types >> (index * 2)) & 0x3;
10817 unsigned write_mask = ctx->outputs.mask[slot];
10818
10819 if (!write_mask)
10820 continue;
10821
10822 if (type == ACO_TYPE_ANY32) {
10823 u_foreach_bit (i, write_mask) {
10824 regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
10825 }
10826 } else {
10827 for (unsigned i = 0; i < 2; i++) {
10828 unsigned mask = (write_mask >> (i * 2)) & 0x3;
10829 if (!mask)
10830 continue;
10831
10832 unsigned chan = slot * 4 + i * 2;
10833 Operand lo = mask & 0x1 ? Operand(ctx->outputs.temps[chan]) : Operand(v2b);
10834 Operand hi = mask & 0x2 ? Operand(ctx->outputs.temps[chan + 1]) : Operand(v2b);
10835
10836 Temp dst = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi);
10837 regs.emplace_back(Operand(dst, PhysReg{vgpr + i}));
10838 }
10839 }
10840 vgpr += 4;
10841 }
10842
10843 if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
10844 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4], PhysReg{vgpr++}));
10845
10846 if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
10847 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4], PhysReg{vgpr++}));
10848
10849 if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
10850 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4], PhysReg{vgpr++}));
10851
10852 build_end_with_regs(ctx, regs);
10853
10854 /* Exit WQM mode finally. */
10855 ctx->program->needs_exact = true;
10856 }
10857
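/* Create p_startpgm, which defines every enabled shader argument as a temporary precolored to its
 * ABI register. Misaligned SGPR arguments are defined per dword and reassembled into a vector.
 * On GFX12 compute shaders the workgroup IDs are additionally read from ttmp7/8/9. */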
10858 Instruction*
10859 add_startpgm(struct isel_context* ctx)
10860 {
10861 unsigned def_count = 0;
10862 for (unsigned i = 0; i < ctx->args->arg_count; i++) {
10863 if (ctx->args->args[i].skip)
10864 continue;
10865 unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
10866 if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
10867 def_count += ctx->args->args[i].size;
10868 else
10869 def_count++;
10870 }
10871
10872 if (ctx->stage.hw == AC_HW_COMPUTE_SHADER && ctx->program->gfx_level >= GFX12)
10873 def_count += 3;
10874
10875 Instruction* startpgm = create_instruction(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
10876 ctx->block->instructions.emplace_back(startpgm);
10877 for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
10878 if (ctx->args->args[i].skip)
10879 continue;
10880
10881 enum ac_arg_regfile file = ctx->args->args[i].file;
10882 unsigned size = ctx->args->args[i].size;
10883 unsigned reg = ctx->args->args[i].offset;
10884 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
10885
10886 if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
10887 Temp elems[16];
10888 for (unsigned j = 0; j < size; j++) {
10889 elems[j] = ctx->program->allocateTmp(s1);
10890 startpgm->definitions[arg++] = Definition(elems[j], PhysReg{reg + j});
10891 }
10892 ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
10893 } else {
10894 Temp dst = ctx->program->allocateTmp(type);
10895 Definition def(dst);
10896 def.setPrecolored(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
10897 ctx->arg_temps[i] = dst;
10898 startpgm->definitions[arg++] = def;
10899
10900 if (ctx->args->args[i].pending_vmem) {
10901 assert(file == AC_ARG_VGPR);
10902 ctx->program->args_pending_vmem.push_back(def);
10903 }
10904 }
10905 }
10906
10907 if (ctx->program->gfx_level >= GFX12 && ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
10908 Temp idx = ctx->program->allocateTmp(s1);
10909 Temp idy = ctx->program->allocateTmp(s1);
10910 ctx->ttmp8 = ctx->program->allocateTmp(s1);
10911 startpgm->definitions[def_count - 3] = Definition(idx);
10912 startpgm->definitions[def_count - 3].setPrecolored(PhysReg(108 + 9 /*ttmp9*/));
10913 startpgm->definitions[def_count - 2] = Definition(ctx->ttmp8);
10914 startpgm->definitions[def_count - 2].setPrecolored(PhysReg(108 + 8 /*ttmp8*/));
10915 startpgm->definitions[def_count - 1] = Definition(idy);
10916 startpgm->definitions[def_count - 1].setPrecolored(PhysReg(108 + 7 /*ttmp7*/));
10917 ctx->workgroup_id[0] = Operand(idx);
10918 if (ctx->args->workgroup_ids[2].used) {
10919 Builder bld(ctx->program, ctx->block);
10920 ctx->workgroup_id[1] =
10921 bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::zero(),
10922 Operand::c32(16u), Operand::zero());
10923 ctx->workgroup_id[2] =
10924 bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), idy, Operand::c32(1u),
10925 Operand::c32(16u), Operand::zero());
10926 } else {
10927 ctx->workgroup_id[1] = Operand(idy);
10928 ctx->workgroup_id[2] = Operand::zero();
10929 }
10930 } else if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
10931 const struct ac_arg* ids = ctx->args->workgroup_ids;
10932 for (unsigned i = 0; i < 3; i++)
10933 ctx->workgroup_id[i] = ids[i].used ? Operand(get_arg(ctx, ids[i])) : Operand::zero();
10934 }
10935
10936 /* epilog has no scratch */
10937 if (ctx->args->scratch_offset.used) {
10938 if (ctx->program->gfx_level < GFX9) {
10939 /* Stash these in the program so that they can be accessed later when
10940 * handling spilling.
10941 */
10942 if (ctx->args->ring_offsets.used)
10943 ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
10944
10945 ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
10946 } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
10947 /* Manually initialize scratch. For RT stages scratch initialization is done in the prolog.
10948 */
10949 Operand scratch_addr = ctx->args->ring_offsets.used
10950 ? Operand(get_arg(ctx, ctx->args->ring_offsets))
10951 : Operand(s2);
10952
10953 Builder bld(ctx->program, ctx->block);
10954 bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
10955 get_arg(ctx, ctx->args->scratch_offset));
10956 }
10957 }
10958
10959 return startpgm;
10960 }
10961
10962 void
10963 split_arguments(isel_context* ctx, Instruction* startpgm)
10964 {
10965 /* Split all arguments except for the first (ring_offsets) and the last
10966 * (exec) so that the dead channels don't stay live throughout the program.
10967 */
10968 for (int i = 1; i < startpgm->definitions.size(); i++) {
10969 if (startpgm->definitions[i].regClass().size() > 1) {
10970 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
10971 startpgm->definitions[i].regClass().size());
10972 }
10973 }
10974 }
10975
10976 void
10977 setup_fp_mode(isel_context* ctx, nir_shader* shader)
10978 {
10979 Program* program = ctx->program;
10980
10981 unsigned float_controls = shader->info.float_controls_execution_mode;
10982
10983 program->next_fp_mode.must_flush_denorms32 =
10984 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
10985 program->next_fp_mode.must_flush_denorms16_64 =
10986 float_controls &
10987 (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
10988
10989 program->next_fp_mode.care_about_round32 =
10990 float_controls &
10991 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
10992
10993 program->next_fp_mode.care_about_round16_64 =
10994 float_controls &
10995 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
10996 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
10997
10998 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
10999 * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11000 if (program->next_fp_mode.must_flush_denorms16_64)
11001 program->next_fp_mode.denorm16_64 = 0;
11002 else
11003 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11004
11005 /* preserving fp32 denorms is expensive, so only do it if asked */
11006 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11007 program->next_fp_mode.denorm32 = fp_denorm_keep;
11008 else
11009 program->next_fp_mode.denorm32 = 0;
11010
11011 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11012 program->next_fp_mode.round32 = fp_round_tz;
11013 else
11014 program->next_fp_mode.round32 = fp_round_ne;
11015
11016 if (float_controls &
11017 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11018 program->next_fp_mode.round16_64 = fp_round_tz;
11019 else
11020 program->next_fp_mode.round16_64 = fp_round_ne;
11021
11022 ctx->block->fp_mode = program->next_fp_mode;
11023 }
11024
11025 void
11026 cleanup_cfg(Program* program)
11027 {
11028 /* create linear_succs/logical_succs */
11029 for (Block& BB : program->blocks) {
11030 for (unsigned idx : BB.linear_preds)
11031 program->blocks[idx].linear_succs.emplace_back(BB.index);
11032 for (unsigned idx : BB.logical_preds)
11033 program->blocks[idx].logical_succs.emplace_back(BB.index);
11034 }
11035 }
11036
11037 void
11038 finish_program(isel_context* ctx)
11039 {
11040 cleanup_cfg(ctx->program);
11041
11042 /* Insert a single p_end_wqm instruction after the last derivative calculation */
11043 if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm && ctx->program->needs_exact) {
11044 /* Find the next BB at top-level CFG */
11045 while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
11046 ctx->wqm_block_idx++;
11047 ctx->wqm_instruction_idx = 0;
11048 }
11049
11050 std::vector<aco_ptr<Instruction>>* instrs =
11051 &ctx->program->blocks[ctx->wqm_block_idx].instructions;
11052 auto it = instrs->begin() + ctx->wqm_instruction_idx;
11053
11054 /* Delay the transition to Exact to help optimizations and scheduling */
11055 while (it != instrs->end()) {
11056 aco_ptr<Instruction>& instr = *it;
11057 /* End WQM before: */
11058 if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP() ||
11059 instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
11060 instr->opcode == aco_opcode::p_jump_to_epilog ||
11061 instr->opcode == aco_opcode::p_logical_start)
11062 break;
11063
11064 ++it;
11065
11066 /* End WQM after: */
11067 if (instr->opcode == aco_opcode::p_logical_end ||
11068 instr->opcode == aco_opcode::p_discard_if ||
11069 instr->opcode == aco_opcode::p_demote_to_helper ||
11070 instr->opcode == aco_opcode::p_end_with_regs)
11071 break;
11072 }
11073
11074 Builder bld(ctx->program);
11075 bld.reset(instrs, it);
11076 bld.pseudo(aco_opcode::p_end_wqm);
11077 }
11078 }
11079
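/* Turn a lane count, stored at bit_offset within the count SGPR, into a lane mask with that many
 * low bits set (using s_bfm_b64 when possible and s_bfe_u32/u64 otherwise). */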
11080 Temp
11081 lanecount_to_mask(isel_context* ctx, Temp count, unsigned bit_offset)
11082 {
11083 assert(count.regClass() == s1);
11084
11085 Builder bld(ctx->program, ctx->block);
11086
11087 /* We could optimize other cases, but they are unused at the moment. */
11088 if (bit_offset != 0 && bit_offset != 8) {
11089 assert(bit_offset < 32);
11090 count = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), count,
11091 Operand::c32(bit_offset));
11092 bit_offset = 0;
11093 }
11094
11095 if (ctx->program->wave_size == 32 && bit_offset == 0) {
11096 /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11097 * the register. It doesn't work for 64 because it only uses 6 bits. */
11098 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11099 return emit_extract_vector(ctx, mask, 0, bld.lm);
11100 } else {
11101 /* s_bfe (both u32 and u64) uses 7 bits for the size, but it needs them in the high word.
11102 * The low word is used for the offset, which has to be zero for our use case.
11103 */
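      /* For example (a sketch): with count == 24 and bit_offset == 0, the packed operand below
       * is 24 << 16, and s_bfe_u32/u64(~0, 24 << 16) extracts a 24-bit field at offset 0,
       * i.e. a mask covering the first 24 lanes. */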
11104 if (bit_offset == 0 && ctx->program->gfx_level >= GFX9) {
11105 /* Avoid writing scc for better scheduling. */
11106 count = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), Operand::c32(0), count);
11107 } else {
11108 count = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), count,
11109 Operand::c32(16 - bit_offset));
11110 }
11111
11112 if (ctx->program->wave_size == 32) {
11113 return bld.sop2(aco_opcode::s_bfe_u32, bld.def(bld.lm), bld.def(s1, scc), Operand::c32(-1),
11114 count);
11115 } else {
11116 return bld.sop2(aco_opcode::s_bfe_u64, bld.def(bld.lm), bld.def(s1, scc),
11117 Operand::c64(-1ll), count);
11118 }
11119 }
11120 }
11121
11122 Temp
11123 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11124 {
11125    /* lanecount_to_mask() only cares about s0.byte[i].[6:0],
11126     * so we need neither s_bfe nor s_and here.
11127 */
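   /* merged_wave_info packs one lane count per byte: byte 0 holds the lane count of the
    * first-stage (LS/ES) threads and byte 1 that of the second-stage (HS/GS) threads. */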
11128 Temp count = get_arg(ctx, ctx->args->merged_wave_info);
11129
11130 return lanecount_to_mask(ctx, count, i * 8u);
11131 }
11132
11133 static void
11134 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11135 {
11136 unsigned src_count = 0;
11137 for (unsigned i = 0; i < ctx.args->arg_count; i++)
11138 src_count += !!BITSET_TEST(ctx.output_args, i);
11139
11140 Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11141 ctx.block->instructions.emplace_back(ret);
11142
11143 src_count = 0;
11144 for (unsigned i = 0; i < ctx.args->arg_count; i++) {
11145 if (!BITSET_TEST(ctx.output_args, i))
11146 continue;
11147
11148 enum ac_arg_regfile file = ctx.args->args[i].file;
11149 unsigned size = ctx.args->args[i].size;
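      /* In ACO's PhysReg numbering, VGPRs start at 256 (PhysReg 256 == v0). */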
11150 unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11151 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11152 Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11153 : Operand(PhysReg{reg}, type);
11154 ret->operands[src_count] = op;
11155 src_count++;
11156 }
11157
11158 Builder bld(ctx.program, ctx.block);
11159 bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11160 }
11161
11162 void
11163 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11164 const struct ac_shader_args* args)
11165 {
11166 for (unsigned i = 0; i < shader_count; i++) {
11167 if (i) {
11168 ctx.block = ctx.program->create_and_insert_block();
11169 ctx.block->kind = block_kind_top_level | block_kind_resume;
11170 }
11171
11172 nir_shader* nir = shaders[i];
11173 init_context(&ctx, nir);
11174 setup_fp_mode(&ctx, nir);
11175
11176 Instruction* startpgm = add_startpgm(&ctx);
11177 append_logical_start(ctx.block);
11178 split_arguments(&ctx, startpgm);
11179 visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11180 append_logical_end(ctx.block);
11181 ctx.block->kind |= block_kind_uniform;
11182
11183 /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11184 * shader without shader calls.
11185 */
11186 if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11187 insert_rt_jump_next(ctx, args);
11188
11189 cleanup_context(&ctx);
11190 }
11191
11192 ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11193 finish_program(&ctx);
11194 }
11195
11196 void
11197 pops_await_overlapped_waves(isel_context* ctx)
11198 {
11199 ctx->program->has_pops_overlapped_waves_wait = true;
11200
11201 Builder bld(ctx->program, ctx->block);
11202
11203 if (ctx->program->gfx_level >= GFX11) {
11204 /* GFX11+ - waiting for the export from the overlapped waves.
11205 * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11206 */
11207 bld.sopp(aco_opcode::s_wait_event,
11208 ctx->program->gfx_level >= GFX12 ? wait_event_imm_wait_export_ready_gfx12 : 0);
11209 return;
11210 }
11211
11212 /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11213
11214 const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11215
11216 /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11217 const Temp did_overlap =
11218 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11219 if_context did_overlap_if_context;
11220 begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11221 bld.reset(ctx->block);
11222
11223 /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11224 if (ctx->program->gfx_level >= GFX10) {
11225 /* 2 packer ID bits on GFX10-10.3. */
11226 const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11227 collision, Operand::c32(0x2001c));
11228 /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11229 const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11230 bld.def(s1, scc), packer_id, Operand::c32(1));
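      /* s_setreg_b32 immediate encoding: hwreg id in bits [5:0], bit offset in [10:6],
       * (field size - 1) in [15:11]; here hwreg 25 (POPS_PACKER), offset 0, size 3. */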
11231 bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
11232 } else {
11233 /* 1 packer ID bit on GFX9. */
11234 const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11235 collision, Operand::c32(0x1001c));
11236 /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11237 * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11238 */
11239 const Temp packer_id_hwreg_bits =
11240 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11241 bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11242 }
11243
11244 Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11245 collision, Operand::c32(0xa0010));
11246 if (ctx->program->gfx_level < GFX10) {
11247 /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11248 * actual wave ID by 1 in case of wraparound.
11249 */
11250 const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11251 collision, Operand::c32(0x3ff));
11252 const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11253 aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11254 newest_overlapped_wave_id =
11255 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11256 newest_overlapped_wave_id_wrapped);
11257 }
11258
11259 /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11260 * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11261 * no more than 1023 values behind the current wave ID.
11262 * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11263 * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11264 * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11265 * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11266 * `a - (b + 1)` is `a + ~b`.
11267 * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11268 * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11269 * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11270 * expected.
11271 */
11272 const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11273 collision, Operand::c32(0x3ff));
11274 newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11275 newest_overlapped_wave_id, wave_id_offset);
11276
11277 /* Await the overlapped waves. */
11278
11279 loop_context wait_loop_context;
11280 begin_loop(ctx, &wait_loop_context);
11281 bld.reset(ctx->block);
11282
11283 const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11284 bld.def(s1, scc), wave_id_offset);
11285 /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11286 * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11287 * exited the ordered section.
11288 */
11289 const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11290 newest_overlapped_wave_id, exiting_wave_id);
11291 if_context newest_overlapped_wave_exited_if_context;
11292 begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11293 newest_overlapped_wave_exited);
11294 emit_loop_break(ctx);
11295 begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11296 end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11297 bld.reset(ctx->block);
11298
11299 /* Sleep before rechecking to let overlapped waves run for some time. */
11300 bld.sopp(aco_opcode::s_sleep, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11301
11302 end_loop(ctx, &wait_loop_context);
11303 bld.reset(ctx->block);
11304
11305 /* Indicate the wait has been done to subsequent compilation stages. */
11306 bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11307
11308 begin_uniform_if_else(ctx, &did_overlap_if_context);
11309 end_uniform_if(ctx, &did_overlap_if_context);
11310 bld.reset(ctx->block);
11311 }
11312
11313 static void
11314 create_merged_jump_to_epilog(isel_context* ctx)
11315 {
11316 Builder bld(ctx->program, ctx->block);
11317 std::vector<Operand> regs;
11318
11319 for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11320 if (!ctx->args->args[i].preserved)
11321 continue;
11322
11323 const enum ac_arg_regfile file = ctx->args->args[i].file;
11324 const unsigned reg = ctx->args->args[i].offset;
11325
11326 Operand op(ctx->arg_temps[i]);
11327 op.setPrecolored(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11328 regs.emplace_back(op);
11329 }
11330
11331 Temp continue_pc =
11332 convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11333
11334 aco_ptr<Instruction> jump{
11335 create_instruction(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11336 jump->operands[0] = Operand(continue_pc);
11337 for (unsigned i = 0; i < regs.size(); i++) {
11338 jump->operands[i + 1] = regs[i];
11339 }
11340 ctx->block->instructions.emplace_back(std::move(jump));
11341 }
11342
11343 static void
11344 create_end_for_merged_shader(isel_context* ctx)
11345 {
11346 std::vector<Operand> regs;
11347
11348 unsigned max_args;
11349 if (ctx->stage.sw == SWStage::VS) {
11350 assert(ctx->args->vertex_id.used);
11351 max_args = ctx->args->vertex_id.arg_index;
11352 } else {
11353 assert(ctx->stage.sw == SWStage::TES);
11354 assert(ctx->args->tes_u.used);
11355 max_args = ctx->args->tes_u.arg_index;
11356 }
11357
11358 struct ac_arg arg;
11359 arg.used = true;
11360
11361 for (arg.arg_index = 0; arg.arg_index < max_args; arg.arg_index++)
11362 regs.emplace_back(get_arg_for_end(ctx, arg));
11363
11364 build_end_with_regs(ctx, regs);
11365 }
11366
11367 void
11368 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_endpgm,
11369 const bool need_barrier, if_context* ic_merged_wave_info,
11370 const bool check_merged_wave_info, const bool endif_merged_wave_info)
11371 {
11372 init_context(&ctx, nir);
11373 setup_fp_mode(&ctx, nir);
11374
11375 Program* program = ctx.program;
11376
11377 if (need_startpgm) {
11378 /* Needs to be after init_context() for FS. */
11379 Instruction* startpgm = add_startpgm(&ctx);
11380
11381 if (!program->info.vs.has_prolog &&
11382 (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11383 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, 0x3u);
11384 }
11385
11386 append_logical_start(ctx.block);
11387 split_arguments(&ctx, startpgm);
11388 }
11389
11390 if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11391 !program->stage.has(SWStage::GS)) {
11392 /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11393 * s_sendmsg(GS_ALLOC_REQ).
11394 */
11395 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, 0u);
11396 }
11397
11398 if (check_merged_wave_info) {
11399 const unsigned i =
11400 nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11401 const Temp cond = merged_wave_info_to_mask(&ctx, i);
11402 begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11403 }
11404
11405 if (need_barrier) {
11406 const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11407 program->wave_size % nir->info.tess.tcs_vertices_out == 0
11408 ? scope_subgroup
11409 : scope_workgroup;
11410
11411 Builder(ctx.program, ctx.block)
11412 .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11413 scope);
11414 }
11415
11416 nir_function_impl* func = nir_shader_get_entrypoint(nir);
11417 visit_cf_list(&ctx, &func->body);
11418
11419 if (ctx.program->info.ps.has_epilog) {
11420 if (ctx.stage == fragment_fs) {
11421 if (ctx.options->is_opengl)
11422 create_fs_end_for_epilog(&ctx);
11423 else
11424 create_fs_jump_to_epilog(&ctx);
11425
11426 /* FS epilogs always have at least one color/null export. */
11427 ctx.program->has_color_exports = true;
11428 }
11429 }
11430
11431 if (endif_merged_wave_info) {
11432 begin_divergent_if_else(&ctx, ic_merged_wave_info);
11433 end_divergent_if(&ctx, ic_merged_wave_info);
11434 }
11435
11436 bool is_first_stage_of_merged_shader = false;
11437
11438 if (ctx.program->info.merged_shader_compiled_separately &&
11439 (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11440 assert(program->gfx_level >= GFX9);
11441 if (ctx.options->is_opengl)
11442 create_end_for_merged_shader(&ctx);
11443 else
11444 create_merged_jump_to_epilog(&ctx);
11445
11446 is_first_stage_of_merged_shader = true;
11447 }
11448
11449 cleanup_context(&ctx);
11450
11451 if (need_endpgm) {
11452 program->config->float_mode = program->blocks[0].fp_mode.val;
11453
11454 append_logical_end(ctx.block);
11455 ctx.block->kind |= block_kind_uniform;
11456
11457 if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) ||
11458 (nir->info.stage == MESA_SHADER_TESS_CTRL && program->gfx_level >= GFX9)) {
11459 Builder(program, ctx.block).sopp(aco_opcode::s_endpgm);
11460 }
11461
11462 finish_program(&ctx);
11463 }
11464 }
11465
11466 void
11467 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
11468 {
11469 if_context ic_merged_wave_info;
11470 const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
11471
11472 for (unsigned i = 0; i < shader_count; i++) {
11473 nir_shader* nir = shaders[i];
11474
11475 /* We always need to insert p_startpgm at the beginning of the first shader. */
11476 const bool need_startpgm = i == 0;
11477
11478 /* Need to handle program end for last shader stage. */
11479 const bool need_endpgm = i == shader_count - 1;
11480
11481 /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11482 nir_function_impl* func = nir_shader_get_entrypoint(nir);
11483 const bool empty_shader =
11484 nir_cf_list_is_empty_block(&func->body) &&
11485 ((nir->info.stage == MESA_SHADER_VERTEX &&
11486 (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11487 (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11488
11489 /* See if we need to emit a check of the merged wave info SGPR. */
11490 const bool check_merged_wave_info =
11491 ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11492 const bool endif_merged_wave_info =
11493 ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11494
11495 /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11496 const bool tcs_skip_barrier =
11497 ctx.stage == vertex_tess_control_hs && !ctx.any_tcs_inputs_via_lds;
11498
11499 /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
11500 const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
11501
11502 select_shader(ctx, nir, need_startpgm, need_endpgm, need_barrier, &ic_merged_wave_info,
11503 check_merged_wave_info, endif_merged_wave_info);
11504
11505 if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11506 /* Special handling when TCS input and output patch size is the same.
11507 * Outputs of the previous stage are inputs to the next stage.
11508 */
11509 ctx.inputs = ctx.outputs;
11510 ctx.outputs = shader_io_state();
11511 }
11512 }
11513 }
11514
11515 void
11516 emit_polygon_stipple(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
11517 {
11518 Builder bld(ctx->program, ctx->block);
11519
11520 /* Use the fixed-point gl_FragCoord input.
11521 * Since the stipple pattern is 32x32 and it repeats, just get 5 bits
11522 * per coordinate to get the repeating effect.
11523 */
11524 Temp pos_fixed_pt = get_arg(ctx, ctx->args->pos_fixed_pt);
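   /* pos_fixed_pt packs the X coordinate in bits [15:0] and Y in bits [31:16]. */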
11525 Temp addr0 = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1f), pos_fixed_pt);
11526 Temp addr1 = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), pos_fixed_pt, Operand::c32(16u),
11527 Operand::c32(5u));
11528
11529 /* Load the buffer descriptor. */
11530 Temp list = get_arg(ctx, finfo->internal_bindings);
11531 list = convert_pointer_to_64_bit(ctx, list);
11532 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
11533 Operand::c32(finfo->poly_stipple_buf_offset));
11534
11535 /* The stipple pattern is 32x32, each row has 32 bits. */
11536 Temp offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr1);
11537 Temp row = bld.mubuf(aco_opcode::buffer_load_dword, bld.def(v1), desc, offset, Operand::c32(0u),
11538 0, true);
11539 Temp bit = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), row, addr0, Operand::c32(1u));
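   /* A zero bit in the stipple pattern rejects the fragment, so demote when bit == 0. */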
11540 Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), bit);
11541 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
11542
11543 ctx->block->kind |= block_kind_uses_discard;
11544 ctx->program->needs_exact = true;
11545 }
11546
11547 void
11548 overwrite_interp_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
11549 {
11550 Builder bld(ctx->program, ctx->block);
11551
11552 if (finfo->bc_optimize_for_persp || finfo->bc_optimize_for_linear) {
11553 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
11554 * The hw doesn't compute CENTROID if the whole wave only
11555 * contains fully-covered quads.
11556 */
11557 Temp bc_optimize = get_arg(ctx, ctx->args->prim_mask);
11558
11559 /* enabled when bit 31 is set */
11560 Temp cond =
11561 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), bc_optimize, Operand::c32(31u));
11562
11563       /* Scale the 1-bit scc result to a wave-sized mask for use by v_cndmask. */
11564 cond = bool_to_vector_condition(ctx, cond);
11565
11566 if (finfo->bc_optimize_for_persp) {
11567 Temp center = get_arg(ctx, ctx->args->persp_center);
11568 Temp centroid = get_arg(ctx, ctx->args->persp_centroid);
11569
11570 Temp dst = bld.tmp(v2);
11571 select_vec2(ctx, dst, cond, center, centroid);
11572 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = dst;
11573 }
11574
11575 if (finfo->bc_optimize_for_linear) {
11576 Temp center = get_arg(ctx, ctx->args->linear_center);
11577 Temp centroid = get_arg(ctx, ctx->args->linear_centroid);
11578
11579 Temp dst = bld.tmp(v2);
11580 select_vec2(ctx, dst, cond, center, centroid);
11581 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = dst;
11582 }
11583 }
11584
11585 if (finfo->force_persp_sample_interp) {
11586 Temp persp_sample = get_arg(ctx, ctx->args->persp_sample);
11587 ctx->arg_temps[ctx->args->persp_center.arg_index] = persp_sample;
11588 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_sample;
11589 }
11590
11591 if (finfo->force_linear_sample_interp) {
11592 Temp linear_sample = get_arg(ctx, ctx->args->linear_sample);
11593 ctx->arg_temps[ctx->args->linear_center.arg_index] = linear_sample;
11594 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_sample;
11595 }
11596
11597 if (finfo->force_persp_center_interp) {
11598 Temp persp_center = get_arg(ctx, ctx->args->persp_center);
11599 ctx->arg_temps[ctx->args->persp_sample.arg_index] = persp_center;
11600 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_center;
11601 }
11602
11603 if (finfo->force_linear_center_interp) {
11604 Temp linear_center = get_arg(ctx, ctx->args->linear_center);
11605 ctx->arg_temps[ctx->args->linear_sample.arg_index] = linear_center;
11606 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_center;
11607 }
11608 }
11609
11610 void
11611 overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
11612 {
11613 Builder bld(ctx->program, ctx->block);
11614
11615 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
11616 * says:
11617 *
11618 * "When per-sample shading is active due to the use of a fragment
11619 * input qualified by sample or due to the use of the gl_SampleID
11620 * or gl_SamplePosition variables, only the bit for the current
11621 * sample is set in gl_SampleMaskIn. When state specifies multiple
11622 * fragment shader invocations for a given fragment, the sample
11623 * mask for any single fragment shader invocation may specify a
11624 * subset of the covered samples for the fragment. In this case,
11625 * the bit corresponding to each covered sample will be set in
11626 * exactly one fragment shader invocation."
11627 *
11628 * The samplemask loaded by hardware is always the coverage of the
11629 * entire pixel/fragment, so mask bits out based on the sample ID.
11630 */
11631 if (finfo->samplemask_log_ps_iter) {
11632 Temp ancillary = get_arg(ctx, ctx->args->ancillary);
11633 Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u),
11634 Operand::c32(4u));
11635 Temp samplemask = get_arg(ctx, ctx->args->sample_coverage);
11636
11637 uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
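      /* ps_iter_mask is a repeating bit pattern with one bit set per group of N samples
       * (N = number of PS invocations per pixel); shifted by the sample ID, it selects
       * the samples owned by this invocation before masking the coverage. */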
11638 Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
11639
11640 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask);
11641 samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask);
11642
11643 ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask;
11644 }
11645 }
11646
11647 Temp
11648 get_interp_color(isel_context* ctx, int interp_vgpr, unsigned attr_index, unsigned comp)
11649 {
11650 Builder bld(ctx->program, ctx->block);
11651
11652 Temp dst = bld.tmp(v1);
11653
11654 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
11655
11656 if (interp_vgpr != -1) {
11657 /* interp args are all 2 vgprs */
11658 int arg_index = ctx->args->persp_sample.arg_index + interp_vgpr / 2;
11659 Temp interp_ij = ctx->arg_temps[arg_index];
11660
11661 emit_interp_instr(ctx, attr_index, comp, interp_ij, dst, prim_mask, false);
11662 } else {
11663 emit_interp_mov_instr(ctx, attr_index, comp, 0, dst, prim_mask, false);
11664 }
11665
11666 return dst;
11667 }
11668
11669 void
11670 interpolate_color_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo,
11671 std::vector<Operand>& regs)
11672 {
11673 if (!finfo->colors_read)
11674 return;
11675
11676 Builder bld(ctx->program, ctx->block);
11677
11678 unsigned vgpr = 256 + ctx->args->num_vgprs_used;
11679
11680 if (finfo->color_two_side) {
11681 Temp face = get_arg(ctx, ctx->args->front_face);
11682 Temp is_face_positive =
11683 bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::zero(), face);
11684
11685 u_foreach_bit (i, finfo->colors_read) {
11686 unsigned color_index = i / 4;
11687 unsigned front_index = finfo->color_attr_index[color_index];
11688 int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
11689
11690 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
11691 * otherwise it's at offset "num_inputs".
11692 */
11693 unsigned back_index = finfo->num_interp_inputs;
11694 if (color_index == 1 && finfo->colors_read & 0xf)
11695 back_index++;
11696
11697 Temp front = get_interp_color(ctx, interp_vgpr, front_index, i % 4);
11698 Temp back = get_interp_color(ctx, interp_vgpr, back_index, i % 4);
11699
11700 Temp color =
11701 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), back, front, is_face_positive);
11702
11703 regs.emplace_back(Operand(color, PhysReg{vgpr++}));
11704 }
11705 } else {
11706 u_foreach_bit (i, finfo->colors_read) {
11707 unsigned color_index = i / 4;
11708 unsigned attr_index = finfo->color_attr_index[color_index];
11709 int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
11710 Temp color = get_interp_color(ctx, interp_vgpr, attr_index, i % 4);
11711
11712 regs.emplace_back(Operand(color, PhysReg{vgpr++}));
11713 }
11714 }
11715 }
11716
11717 void
11718 emit_clamp_alpha_test(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
11719 unsigned color_index)
11720 {
11721 Builder bld(ctx->program, ctx->block);
11722
11723 if (info->clamp_color) {
11724 for (unsigned i = 0; i < 4; i++) {
11725 if (colors[i].regClass() == v2b) {
11726 colors[i] = bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
11727 Operand::c16(0x3c00), colors[i]);
11728 } else {
11729 assert(colors[i].regClass() == v1);
11730 colors[i] = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
11731 Operand::c32(0x3f800000u), colors[i]);
11732 }
11733 }
11734 }
11735
11736 if (info->alpha_to_one) {
11737 if (colors[3].regClass() == v2b)
11738 colors[3] = bld.copy(bld.def(v2b), Operand::c16(0x3c00));
11739 else
11740 colors[3] = bld.copy(bld.def(v1), Operand::c32(0x3f800000u));
11741 }
11742
11743 if (color_index == 0 && info->alpha_func != COMPARE_FUNC_ALWAYS) {
11744 Operand cond = Operand::c32(-1u);
11745 if (info->alpha_func != COMPARE_FUNC_NEVER) {
11746 aco_opcode opcode = aco_opcode::num_opcodes;
11747
11748 switch (info->alpha_func) {
11749 case COMPARE_FUNC_LESS: opcode = aco_opcode::v_cmp_ngt_f32; break;
11750 case COMPARE_FUNC_EQUAL: opcode = aco_opcode::v_cmp_neq_f32; break;
11751 case COMPARE_FUNC_LEQUAL: opcode = aco_opcode::v_cmp_nge_f32; break;
11752 case COMPARE_FUNC_GREATER: opcode = aco_opcode::v_cmp_nlt_f32; break;
11753 case COMPARE_FUNC_NOTEQUAL: opcode = aco_opcode::v_cmp_nlg_f32; break;
11754 case COMPARE_FUNC_GEQUAL: opcode = aco_opcode::v_cmp_nle_f32; break;
11755 default: unreachable("invalid alpha func");
11756 }
11757
11758 Temp ref = get_arg(ctx, info->alpha_reference);
11759
11760 Temp alpha = colors[3].regClass() == v2b
11761 ? bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), colors[3])
11762 : colors[3];
11763
11764          /* true if the alpha test does not pass (the fragment gets discarded) */
11765 cond = bld.vopc(opcode, bld.def(bld.lm), ref, alpha);
11766 }
11767
11768 bld.pseudo(aco_opcode::p_discard_if, cond);
11769 ctx->block->kind |= block_kind_uses_discard;
11770 ctx->program->needs_exact = true;
11771 }
11772 }
11773
11774 } /* end namespace */
11775
11776 void
11777 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11778 ac_shader_config* config, const struct aco_compiler_options* options,
11779 const struct aco_shader_info* info, const struct ac_shader_args* args)
11780 {
11781 isel_context ctx =
11782 setup_isel_context(program, shader_count, shaders, config, options, info, args);
11783
11784 if (ctx.stage == raytracing_cs)
11785 return select_program_rt(ctx, shader_count, shaders, args);
11786
11787 if (shader_count >= 2) {
11788 select_program_merged(ctx, shader_count, shaders);
11789 } else {
11790 bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
11791 if_context ic_merged_wave_info;
11792
11793 /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
11794 if (ctx.program->info.merged_shader_compiled_separately) {
11795 assert(ctx.program->gfx_level >= GFX9);
11796 if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
11797 check_merged_wave_info = endif_merged_wave_info = true;
11798 } else {
11799 const bool ngg_gs =
11800 ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
11801 assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
11802 check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
11803 need_barrier = !ngg_gs;
11804 }
11805 }
11806
11807 select_shader(ctx, shaders[0], true, true, need_barrier, &ic_merged_wave_info,
11808 check_merged_wave_info, endif_merged_wave_info);
11809 }
11810 }
11811
11812 void
11813 dump_sgpr_to_mem(isel_context* ctx, Operand rsrc, Operand data, uint32_t offset)
11814 {
11815 Builder bld(ctx->program, ctx->block);
11816
11817 ac_hw_cache_flags cache_glc;
11818 cache_glc.value = ac_glc;
11819
11820 if (ctx->program->gfx_level >= GFX9) {
11821 bld.copy(Definition(PhysReg{256}, v1) /* v0 */, data);
11822
11823 bld.mubuf(aco_opcode::buffer_store_dword, Operand(rsrc), Operand(v1), Operand::c32(0u),
11824 Operand(PhysReg{256}, v1) /* v0 */, offset, false /* offen */, false /* idxen */,
11825 /* addr64 */ false, /* disable_wqm */ false, cache_glc);
11826 } else {
11827 bld.smem(aco_opcode::s_buffer_store_dword, Operand(rsrc), Operand::c32(offset), data,
11828 memory_sync_info(), cache_glc);
11829 }
11830 }
11831
11832 void
11833 enable_thread_indexing(isel_context* ctx, Operand rsrc)
11834 {
11835 Builder bld(ctx->program, ctx->block);
11836 PhysReg rsrc_word3(rsrc.physReg() + 3);
11837
11838 bld.sop2(aco_opcode::s_or_b32, Definition(rsrc_word3, s1), bld.def(s1, scc),
11839 Operand(rsrc_word3, s1), Operand::c32(S_008F0C_ADD_TID_ENABLE(1)));
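   /* With ADD_TID_ENABLE set, the hardware adds the lane index into the buffer address
    * calculation, so each lane of the wave gets its own slot instead of all lanes hitting
    * the same address. */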
11840 if (ctx->program->gfx_level < GFX10) {
11841 /* This is part of the stride if ADD_TID_ENABLE=1. */
11842 bld.sop2(aco_opcode::s_and_b32, Definition(rsrc_word3, s1), bld.def(s1, scc),
11843 Operand(rsrc_word3, s1), Operand::c32(C_008F0C_DATA_FORMAT));
11844 }
11845 }
11846
11847 void
11848 disable_thread_indexing(isel_context* ctx, Operand rsrc)
11849 {
11850 Builder bld(ctx->program, ctx->block);
11851 PhysReg rsrc_word3(rsrc.physReg() + 3);
11852
11853 bld.sop2(aco_opcode::s_and_b32, Definition(rsrc_word3, s1), bld.def(s1, scc),
11854 Operand(rsrc_word3, s1), Operand::c32(C_008F0C_ADD_TID_ENABLE));
11855 if (ctx->program->gfx_level < GFX10) {
11856 bld.sop2(aco_opcode::s_or_b32, Definition(rsrc_word3, s1), bld.def(s1, scc),
11857 Operand(rsrc_word3, s1),
11858 Operand::c32(S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32)));
11859 }
11860 }
11861
11862 void
11863 save_or_restore_vgprs(isel_context* ctx, Operand rsrc, bool save)
11864 {
11865 Builder bld(ctx->program, ctx->block);
11866 uint32_t offset = offsetof(struct aco_trap_handler_layout, saved_vgprs[0]);
11867
11868 ac_hw_cache_flags cache_glc;
11869 cache_glc.value = ac_glc;
11870
11871 enable_thread_indexing(ctx, rsrc);
11872
11873 for (uint32_t i = 0; i < NUM_SAVED_VGPRS; i++) {
11874 if (save) {
11875 bld.mubuf(aco_opcode::buffer_store_dword, Operand(rsrc), Operand(v1), Operand::c32(0u),
11876 Operand(PhysReg{256 + i}, v1) /* v0 */, offset, false /* offen */,
11877 false /* idxen */,
11878 /* addr64 */ false, /* disable_wqm */ false, cache_glc);
11879 } else {
11880 bld.mubuf(aco_opcode::buffer_load_dword, Definition(PhysReg{256 + i}, v1), Operand(rsrc),
11881 Operand(v1), Operand::c32(0u), offset, false /* offen */, false /* idxen */,
11882 /* addr64 */ false, /* disable_wqm */ false, cache_glc);
11883 }
11884
11885 offset += 256;
11886 }
11887
11888 disable_thread_indexing(ctx, rsrc);
11889 }
11890
11891 void
11892 save_vgprs_to_mem(isel_context* ctx, Operand rsrc)
11893 {
11894 save_or_restore_vgprs(ctx, rsrc, true);
11895 }
11896
11897 void
11898 restore_vgprs_from_mem(isel_context* ctx, Operand rsrc)
11899 {
11900 save_or_restore_vgprs(ctx, rsrc, false);
11901 }
11902
11903 void
11904 dump_vgprs_to_mem(isel_context* ctx, Builder& bld, Operand rsrc)
11905 {
11906 const uint32_t ttmp0_idx = ctx->program->gfx_level >= GFX9 ? 108 : 112;
11907 const uint32_t base_offset = offsetof(struct aco_trap_handler_layout, vgprs[0]);
11908
11909 ac_hw_cache_flags cache_glc;
11910 cache_glc.value = ac_glc;
11911
11912 PhysReg num_vgprs{ttmp0_idx + 2};
11913 PhysReg soffset{ttmp0_idx + 3};
11914
11915 enable_thread_indexing(ctx, rsrc);
11916
11917    /* Determine the number of VGPRs to dump, at a 4-VGPR allocation granularity. */
11918 const uint32_t vgpr_size_offset = ctx->program->gfx_level >= GFX11 ? 12 : 8;
11919 const uint32_t vgpr_size_width = ctx->program->gfx_level >= GFX10 ? 8 : 6;
11920
11921 bld.sopk(aco_opcode::s_getreg_b32, Definition(num_vgprs, s1),
11922 ((32 - 1) << 11) | 5 /* GPR_ALLOC */);
11923 bld.sop2(aco_opcode::s_bfe_u32, Definition(num_vgprs, s1), bld.def(s1, scc),
11924 Operand(num_vgprs, s1), Operand::c32((vgpr_size_width << 16) | vgpr_size_offset));
11925 bld.sop2(aco_opcode::s_add_u32, Definition(num_vgprs, s1), bld.def(s1, scc),
11926 Operand(num_vgprs, s1), Operand::c32(1u));
11927 bld.sop2(aco_opcode::s_lshl_b32, Definition(num_vgprs, s1), bld.def(s1, scc),
11928 Operand(num_vgprs, s1), Operand::c32(2u));
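   /* Convert the VGPR count to a byte count for a wave64 dump (64 lanes * 4 bytes = 256 bytes
    * per VGPR); this is used as the loop bound below. */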
11929 bld.sop2(aco_opcode::s_mul_i32, Definition(num_vgprs, s1), Operand::c32(256),
11930 Operand(num_vgprs, s1));
11931
11932 /* Initialize m0/soffset to zero. */
11933 bld.copy(Definition(m0, s1), Operand::c32(0u));
11934 bld.copy(Definition(soffset, s1), Operand::c32(0u));
11935
11936 if (ctx->program->gfx_level < GFX10) {
11937 /* Enable VGPR indexing with m0 as source index. */
11938 bld.sopc(aco_opcode::s_set_gpr_idx_on, Definition(m0, s1), Operand(m0, s1),
11939 Operand(PhysReg{1}, s1) /* SRC0 mode */);
11940 }
11941
11942 loop_context lc;
11943 begin_loop(ctx, &lc);
11944 {
11945 bld.reset(ctx->block);
11946
11947 /* Move from a relative source addr (v0 = v[0 + m0]). */
11948 if (ctx->program->gfx_level >= GFX10) {
11949 bld.vop1(aco_opcode::v_movrels_b32, Definition(PhysReg{256}, v1),
11950 Operand(PhysReg{256}, v1), Operand(m0, s1));
11951 } else {
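         /* With s_set_gpr_idx_on(SRC0) enabled above, this plain v_mov reads v[0 + m0]. */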
11952 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{256}, v1), Operand(PhysReg{256}, v1));
11953 }
11954
11955 bld.mubuf(aco_opcode::buffer_store_dword, Operand(rsrc), Operand(v1),
11956 Operand(PhysReg{soffset}, s1), Operand(PhysReg{256}, v1) /* v0 */, base_offset,
11957 false /* offen */, false /* idxen */,
11958 /* addr64 */ false, /* disable_wqm */ false, cache_glc);
11959
11960 /* Increase m0 and the offset assuming it's wave64. */
11961 bld.sop2(aco_opcode::s_add_u32, Definition(m0, s1), bld.def(s1, scc), Operand(m0, s1),
11962 Operand::c32(1u));
11963 bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), bld.def(s1, scc),
11964 Operand(soffset, s1), Operand::c32(256u));
11965
11966 const Temp cond = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), Operand(soffset, s1),
11967 Operand(num_vgprs, s1));
11968
11969 if_context loop_break;
11970 begin_uniform_if_then(ctx, &loop_break, cond);
11971 {
11972 emit_loop_break(ctx);
11973 }
11974 begin_uniform_if_else(ctx, &loop_break);
11975 end_uniform_if(ctx, &loop_break);
11976 }
11977 end_loop(ctx, &lc);
11978 bld.reset(ctx->block);
11979
11980 if (ctx->program->gfx_level < GFX10) {
11981 /* Disable VGPR indexing. */
11982 bld.sopp(aco_opcode::s_set_gpr_idx_off);
11983 }
11984
11985 disable_thread_indexing(ctx, rsrc);
11986 }
11987
11988 void
11989 dump_lds_to_mem(isel_context* ctx, Builder& bld, Operand rsrc)
11990 {
11991 const uint32_t ttmp0_idx = ctx->program->gfx_level >= GFX9 ? 108 : 112;
11992 const uint32_t base_offset = offsetof(struct aco_trap_handler_layout, lds[0]);
11993
11994 ac_hw_cache_flags cache_glc;
11995 cache_glc.value = ac_glc;
11996
11997 PhysReg lds_size{ttmp0_idx + 2};
11998 PhysReg soffset{ttmp0_idx + 3};
11999
12000 enable_thread_indexing(ctx, rsrc);
12001
12002 /* Determine the LDS size. */
12003 const uint32_t lds_size_offset = 12;
12004 const uint32_t lds_size_width = 9;
12005
12006 bld.sopk(aco_opcode::s_getreg_b32, Definition(lds_size, s1),
12007 ((lds_size_width - 1) << 11) | (lds_size_offset << 6) | 6 /* LDS_ALLOC */);
12008 Temp lds_size_non_zero =
12009 bld.sopc(aco_opcode::s_cmp_lg_i32, bld.def(s1, scc), Operand(lds_size, s1), Operand::c32(0));
12010
12011 if_context ic;
12012 begin_uniform_if_then(ctx, &ic, lds_size_non_zero);
12013 {
12014 bld.reset(ctx->block);
12015
12016 /* Wait for other waves in the same threadgroup. */
12017 bld.sopp(aco_opcode::s_barrier, 0u);
12018
12019 /* Compute the LDS size in bytes (64 dw * 4). */
12020 bld.sop2(aco_opcode::s_lshl_b32, Definition(lds_size, s1), bld.def(s1, scc),
12021 Operand(lds_size, s1), Operand::c32(8u));
12022
12023 /* Add the base offset because this is used to exit the loop. */
12024 bld.sop2(aco_opcode::s_add_u32, Definition(lds_size, s1), bld.def(s1, scc),
12025 Operand(lds_size, s1), Operand::c32(base_offset));
12026
12027 /* Initialize soffset to base offset. */
12028 bld.copy(Definition(soffset, s1), Operand::c32(base_offset));
12029
12030 /* Compute the LDS offset from the thread ID. */
12031 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(PhysReg{256}, v1), Operand::c32(-1u),
12032 Operand::c32(0u));
12033 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(PhysReg{256}, v1), Operand::c32(-1u),
12034 Operand(PhysReg{256}, v1));
12035 bld.vop2(aco_opcode::v_mul_u32_u24, Definition(PhysReg{256}, v1), Operand::c32(4u),
12036 Operand(PhysReg{256}, v1));
12037
12038 Operand m = load_lds_size_m0(bld);
12039
12040 loop_context lc;
12041 begin_loop(ctx, &lc);
12042 {
12043 bld.reset(ctx->block);
12044
12045 if (ctx->program->gfx_level >= GFX9) {
12046 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg{257}, v1), Operand(PhysReg{256}, v1),
12047 0);
12048 } else {
12049 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg{257}, v1), Operand(PhysReg{256}, v1),
12050 m, 0);
12051 }
12052
12053 bld.mubuf(aco_opcode::buffer_store_dword, Operand(rsrc), Operand(v1),
12054 Operand(PhysReg{soffset}, s1), Operand(PhysReg{257}, v1) /* v0 */,
12055 0 /* offset */, false /* offen */, false /* idxen */,
12056 /* addr64 */ false, /* disable_wqm */ false, cache_glc);
12057
12058 /* Increase v0 and the offset assuming it's wave64. */
12059 bld.vop3(aco_opcode::v_mad_u32_u24, Definition(PhysReg{256}, v1), Operand::c32(4u),
12060 Operand::c32(64u), Operand(PhysReg{256}, v1));
12061 bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), bld.def(s1, scc),
12062 Operand(soffset, s1), Operand::c32(256u));
12063
12064 const Temp cond = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc),
12065 Operand(soffset, s1), Operand(lds_size, s1));
12066
12067 if_context loop_break;
12068 begin_uniform_if_then(ctx, &loop_break, cond);
12069 {
12070 emit_loop_break(ctx);
12071 }
12072 begin_uniform_if_else(ctx, &loop_break);
12073 end_uniform_if(ctx, &loop_break);
12074 }
12075 end_loop(ctx, &lc);
12076 bld.reset(ctx->block);
12077 }
12078 begin_uniform_if_else(ctx, &ic);
12079 end_uniform_if(ctx, &ic);
12080 bld.reset(ctx->block);
12081
12082 disable_thread_indexing(ctx, rsrc);
12083 }
12084
12085 void
12086 select_trap_handler_shader(Program* program, ac_shader_config* config,
12087 const struct aco_compiler_options* options,
12088 const struct aco_shader_info* info, const struct ac_shader_args* args)
12089 {
12090 uint32_t offset = 0;
12091
12092 assert(options->gfx_level >= GFX8 && options->gfx_level <= GFX11);
12093
12094 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12095 config);
12096
12097 isel_context ctx = {};
12098 ctx.program = program;
12099 ctx.args = args;
12100 ctx.options = options;
12101 ctx.stage = program->stage;
12102
12103 ctx.block = ctx.program->create_and_insert_block();
12104 ctx.block->kind = block_kind_top_level;
12105
12106 program->workgroup_size = 1; /* XXX */
12107
12108 add_startpgm(&ctx);
12109 append_logical_start(ctx.block);
12110
12111 Builder bld(ctx.program, ctx.block);
12112
12113 ac_hw_cache_flags cache_glc;
12114 cache_glc.value = ac_glc;
12115
12116 const uint32_t ttmp0_idx = ctx.program->gfx_level >= GFX9 ? 108 : 112;
12117 PhysReg ttmp0_reg{ttmp0_idx};
12118 PhysReg ttmp1_reg{ttmp0_idx + 1};
12119 PhysReg ttmp2_reg{ttmp0_idx + 2};
12120 PhysReg ttmp3_reg{ttmp0_idx + 3};
12121 PhysReg tma_rsrc{ttmp0_idx + 4}; /* s4 */
12122 PhysReg save_wave_status{ttmp0_idx + 8};
12123 PhysReg save_m0{ttmp0_idx + 9};
12124 PhysReg save_exec{ttmp0_idx + 10}; /* s2 */
12125
12126 /* Save SQ_WAVE_STATUS because SCC needs to be restored. */
12127 bld.sopk(aco_opcode::s_getreg_b32, Definition(save_wave_status, s1), ((32 - 1) << 11) | 2);
12128
12129 /* Save m0. */
12130 bld.copy(Definition(save_m0, s1), Operand(m0, s1));
12131
12132 /* Save exec and use all invocations from the wave. */
12133 bld.sop1(Builder::s_or_saveexec, Definition(save_exec, bld.lm), Definition(scc, s1),
12134 Definition(exec, bld.lm), Operand::c32_or_c64(-1u, bld.lm == s2),
12135 Operand(exec, bld.lm));
12136
12137 if (options->gfx_level < GFX11) {
12138       /* Clear the current wave exception; this is required to re-enable VALU
12139        * instructions in this wave. It seems to be needed only for float exceptions.
12140 */
12141 bld.vop1(aco_opcode::v_clrexcp);
12142 }
12143
12144 offset = offsetof(struct aco_trap_handler_layout, ttmp0);
12145
12146 if (ctx.program->gfx_level >= GFX9) {
12147 /* Get TMA. */
12148 if (ctx.program->gfx_level >= GFX11) {
12149 bld.sop1(aco_opcode::s_sendmsg_rtn_b32, Definition(ttmp2_reg, s1),
12150 Operand::c32(sendmsg_rtn_get_tma));
12151 } else {
12152 bld.sopk(aco_opcode::s_getreg_b32, Definition(ttmp2_reg, s1), ((32 - 1) << 11) | 18);
12153 }
12154
12155 bld.sop2(aco_opcode::s_lshl_b32, Definition(ttmp2_reg, s1), Definition(scc, s1),
12156 Operand(ttmp2_reg, s1), Operand::c32(8u));
12157 bld.copy(Definition(ttmp3_reg, s1), Operand::c32((unsigned)ctx.options->address32_hi));
12158
12159 /* Load the buffer descriptor from TMA. */
12160 bld.smem(aco_opcode::s_load_dwordx4, Definition(tma_rsrc, s4), Operand(ttmp2_reg, s2),
12161 Operand::c32(0u));
12162
12163       /* Save the VGPRs that need to be restored. */
12164 save_vgprs_to_mem(&ctx, Operand(tma_rsrc, s4));
12165
12166 /* Dump VGPRs. */
12167 dump_vgprs_to_mem(&ctx, bld, Operand(tma_rsrc, s4));
12168
12169 /* Store TTMP0-TTMP1. */
12170 bld.copy(Definition(PhysReg{256}, v2) /* v[0-1] */, Operand(ttmp0_reg, s2));
12171
12172 bld.mubuf(aco_opcode::buffer_store_dwordx2, Operand(tma_rsrc, s4), Operand(v1),
12173 Operand::c32(0u), Operand(PhysReg{256}, v2) /* v[0-1] */, offset /* offset */,
12174 false /* offen */, false /* idxen */, /* addr64 */ false,
12175 /* disable_wqm */ false, cache_glc);
12176 } else {
12177 /* Load the buffer descriptor from TMA. */
12178 bld.smem(aco_opcode::s_load_dwordx4, Definition(tma_rsrc, s4), Operand(PhysReg{tma_lo}, s2),
12179 Operand::zero());
12180
12181       /* Save the VGPRs that need to be restored. */
12182 save_vgprs_to_mem(&ctx, Operand(tma_rsrc, s4));
12183
12184 /* Dump VGPRs. */
12185 dump_vgprs_to_mem(&ctx, bld, Operand(tma_rsrc, s4));
12186
12187 /* Store TTMP0-TTMP1. */
12188 bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(tma_rsrc, s4), Operand::c32(offset),
12189 Operand(ttmp0_reg, s2), memory_sync_info(), cache_glc);
12190 }
12191
12192 /* Store some hardware registers. */
12193 const uint32_t hw_regs_idx[] = {
12194 1, /* HW_REG_MODE */
12195 3, /* HW_REG_TRAP_STS */
12196 4, /* HW_REG_HW_ID */
12197       5, /* HW_REG_GPR_ALLOC */
12198       6, /* HW_REG_LDS_ALLOC */
12199 7, /* HW_REG_IB_STS */
12200 };
12201
12202 offset = offsetof(struct aco_trap_handler_layout, sq_wave_regs.status);
12203
12204 /* Store saved SQ_WAVE_STATUS which can change inside the trap. */
12205 dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(save_wave_status, s1), offset);
12206 offset += 4;
12207
12208 for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
12209 /* "((size - 1) << 11) | register" */
12210 bld.sopk(aco_opcode::s_getreg_b32, Definition(ttmp0_reg, s1),
12211 ((32 - 1) << 11) | hw_regs_idx[i]);
12212
12213 dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(ttmp0_reg, s1), offset);
12214 offset += 4;
12215 }
12216
12217 assert(offset == offsetof(struct aco_trap_handler_layout, m0));
12218
12219 /* Dump shader registers (m0, exec). */
12220 dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(save_m0, s1), offset);
12221 offset += 4;
12222 dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(save_exec, s1), offset);
12223 offset += 4;
12224 dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(save_exec.advance(4), s1), offset);
12225 offset += 4;
12226
12227 assert(offset == offsetof(struct aco_trap_handler_layout, sgprs[0]));
12228
12229 /* Dump all SGPRs. */
12230 for (uint32_t i = 0; i < program->dev.sgpr_limit; i++) {
12231 dump_sgpr_to_mem(&ctx, Operand(tma_rsrc, s4), Operand(PhysReg{i}, s1), offset);
12232 offset += 4;
12233 }
12234
12235 /* Dump LDS. */
12236 dump_lds_to_mem(&ctx, bld, Operand(tma_rsrc, s4));
12237
12238    /* Restore VGPRs. */
12239 restore_vgprs_from_mem(&ctx, Operand(tma_rsrc, s4));
12240
12241 /* Restore m0 and exec. */
12242 bld.copy(Definition(m0, s1), Operand(save_m0, s1));
12243 bld.copy(Definition(exec, bld.lm), Operand(save_exec, bld.lm));
12244
12245 /* Restore SCC which is the first bit of SQ_WAVE_STATUS. */
12246 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), Operand(save_wave_status, s1),
12247 Operand::c32(0u));
12248
12249 program->config->float_mode = program->blocks[0].fp_mode.val;
12250
12251 append_logical_end(ctx.block);
12252 ctx.block->kind |= block_kind_uniform;
12253 bld.sopp(aco_opcode::s_endpgm);
12254
12255 finish_program(&ctx);
12256 }
12257
12258 Operand
12259 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
12260 {
12261 enum ac_arg_regfile file = args->args[arg.arg_index].file;
12262 unsigned size = args->args[arg.arg_index].size;
12263 RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
12264 return Operand(get_arg_reg(args, arg), rc);
12265 }
12266
12267 unsigned
12268 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
12269 {
12270 unsigned sgpr_limit = get_addr_sgpr_from_waves(bld.program, bld.program->min_waves);
12271 unsigned count = MIN2((sgpr_limit - dest.reg()) / 4u, max);
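   /* Each vertex-buffer descriptor is 4 dwords; load the largest chunk (4, 2 or 1 descriptors)
    * that still fits, e.g. count == 7 emits a dwordx16, a dwordx8 and a dwordx4 load. */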
12272 for (unsigned i = 0; i < count;) {
12273 unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
12274
12275 if (size == 4)
12276 bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
12277 Operand::c32((start + i) * 16u));
12278 else if (size == 2)
12279 bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
12280 Operand::c32((start + i) * 16u));
12281 else
12282 bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
12283 Operand::c32((start + i) * 16u));
12284
12285 dest = dest.advance(size * 16u);
12286 i += size;
12287 }
12288
12289 return count;
12290 }
12291
12292 void
12293 wait_for_smem_loads(Builder& bld)
12294 {
12295 if (bld.program->gfx_level >= GFX12) {
12296 bld.sopp(aco_opcode::s_wait_kmcnt, 0);
12297 } else {
12298 wait_imm lgkm_imm;
12299 lgkm_imm.lgkm = 0;
12300 bld.sopp(aco_opcode::s_waitcnt, lgkm_imm.pack(bld.program->gfx_level));
12301 }
12302 }
12303
12304 void
12305 wait_for_vmem_loads(Builder& bld)
12306 {
12307 if (bld.program->gfx_level >= GFX12) {
12308 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
12309 } else {
12310 wait_imm vm_imm;
12311 vm_imm.vm = 0;
12312 bld.sopp(aco_opcode::s_waitcnt, vm_imm.pack(bld.program->gfx_level));
12313 }
12314 }
12315
12316 Operand
12317 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
12318 const struct aco_vs_prolog_info* pinfo, unsigned index,
12319 Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
12320 PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
12321 {
12322 bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
12323 get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
12324
12325 wait_for_smem_loads(bld);
12326
12327 Definition fetch_index_def(tmp_vgpr0, v1);
12328 Operand fetch_index(tmp_vgpr0, v1);
12329
12330 Operand div_info(tmp_sgpr, s1);
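   /* A sketch of the assumed layout of the constants loaded above (fast division by a constant,
    * as in util/fast_idiv_by_const): byte 0 = pre-shift, byte 1 = increment, byte 2 = post-shift,
    * and the second dword = multiplier, so that
    * fetch_index = mul_hi((instance_id >> pre) + inc, multiplier) >> post, plus start_instance. */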
12331 if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
12332 /* use SDWA */
12333 if (bld.program->gfx_level < GFX9) {
12334 bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
12335 div_info = Operand(tmp_vgpr1, v1);
12336 }
12337
12338 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12339
12340 Instruction* instr;
12341 if (bld.program->gfx_level >= GFX9)
12342 instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
12343 else
12344 instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
12345 div_info, fetch_index)
12346 .instr;
12347 instr->sdwa().sel[0] = SubdwordSel::ubyte1;
12348
12349 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
12350 fetch_index);
12351
12352 instr =
12353 bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
12354 instr->sdwa().sel[0] = SubdwordSel::ubyte2;
12355 } else {
12356 Operand tmp_op(tmp_vgpr1, v1);
12357 Definition tmp_def(tmp_vgpr1, v1);
12358
12359 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12360
12361 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
12362 bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
12363
12364 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
12365 Operand(tmp_sgpr.advance(4), s1));
12366
12367 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
12368 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
12369 }
12370
12371 bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
12372
12373 return fetch_index;
12374 }
12375
12376 void
12377 select_rt_prolog(Program* program, ac_shader_config* config,
12378 const struct aco_compiler_options* options, const struct aco_shader_info* info,
12379 const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
12380 {
12381 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12382 config);
12383 Block* block = program->create_and_insert_block();
12384 block->kind = block_kind_top_level;
12385 program->workgroup_size = info->workgroup_size;
12386 program->wave_size = info->workgroup_size;
12387 calc_min_waves(program);
12388 Builder bld(program, block);
12389 block->instructions.reserve(32);
12390 unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
12391 unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
12392
12393 /* Inputs:
12394 * Ring offsets: s[0-1]
12395 * Indirect descriptor sets: s[2]
12396 * Push constants pointer: s[3]
12397 * SBT descriptors: s[4-5]
12398 * Traversal shader address: s[6-7]
12399 * Ray launch size address: s[8-9]
12400 * Dynamic callable stack base: s[10]
12401 * Workgroup IDs (xyz): s[11], s[12], s[13]
12402 * Scratch offset: s[14]
12403 * Local invocation IDs: v[0-2]
12404 */
12405 PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
12406 PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
12407 PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
12408 PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
12409 PhysReg in_wg_id_x;
12410 PhysReg in_wg_id_y;
12411 PhysReg in_wg_id_z;
12412 PhysReg in_scratch_offset;
12413 if (options->gfx_level < GFX12) {
12414 in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
12415 in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
12416 in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
12417 } else {
12418 in_wg_id_x = PhysReg(108 + 9 /*ttmp9*/);
12419 in_wg_id_y = PhysReg(108 + 7 /*ttmp7*/);
12420 }
12421 if (options->gfx_level < GFX11)
12422 in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
12423 struct ac_arg arg_id = options->gfx_level >= GFX11 ? in_args->local_invocation_ids_packed
12424 : in_args->local_invocation_id_x;
12425 PhysReg in_local_ids[2] = {
12426 get_arg_reg(in_args, arg_id),
12427 get_arg_reg(in_args, arg_id).advance(4),
12428 };
12429
12430 /* Outputs:
12431 * Callee shader PC: s[0-1]
12432 * Indirect descriptor sets: s[2]
12433 * Push constants pointer: s[3]
12434 * SBT descriptors: s[4-5]
12435 * Traversal shader address: s[6-7]
12436 * Ray launch sizes (xyz): s[8], s[9], s[10]
12437 * Scratch offset (<GFX9 only): s[11]
12438 * Ring offsets (<GFX9 only): s[12-13]
12439 * Ray launch IDs: v[0-2]
12440 * Stack pointer: v[3]
12441 * Shader VA: v[4-5]
12442 * Shader Record Ptr: v[6-7]
12443 */
12444 PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12445 PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]);
12446 PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]);
12447 PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]);
12448 PhysReg out_launch_ids[3];
12449 for (unsigned i = 0; i < 3; i++)
12450 out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]);
12451 PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12452 PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12453
12454 /* Temporaries: */
12455 num_sgprs = align(num_sgprs, 2);
12456 PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
12457 num_sgprs += 2;
12458 PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
12459 num_sgprs += 2;
12460 PhysReg tmp_wg_id_x_times_size = PhysReg{num_sgprs};
12461 num_sgprs++;
12462
12463 PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++};
12464
12465 /* Confirm some assumptions about register aliasing */
12466 assert(in_ring_offsets == out_uniform_shader_addr);
12467 assert(get_arg_reg(in_args, in_args->push_constants) ==
12468 get_arg_reg(out_args, out_args->push_constants));
12469 assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12470 get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12471 assert(in_launch_size_addr == out_launch_size_x);
12472 assert(in_stack_base == out_launch_size_z);
12473 assert(in_local_ids[0] == out_launch_ids[0]);
12474
12475    /* Chips below GFX9 read in_scratch_offset at the end of the prolog to write out the
12476     * scratch_offset arg. Make sure no other outputs have overwritten it by then.
12477     */
12478 assert(options->gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used);
12479
12480 /* load raygen sbt */
12481 bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12482 Operand::c32(0u));
12483
12484 /* init scratch */
12485 if (options->gfx_level < GFX9) {
12486       /* copy ring offsets to temporary location */
12487 bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12488 Operand(in_ring_offsets, s2));
12489 } else if (options->gfx_level < GFX11) {
12490 hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12491 Operand(in_scratch_offset, s1));
12492 }
12493
12494 /* set stack ptr */
12495 bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12496
12497 /* load raygen address */
12498 bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12499 Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12500
12501 /* load ray launch sizes */
12502 bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12503 Operand(in_launch_size_addr, s2), Operand::c32(8u));
12504 bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12505 Operand(in_launch_size_addr, s2), Operand::c32(0u));
12506
12507 /* calculate ray launch ids */
12508 if (options->gfx_level >= GFX11) {
12509 /* Thread IDs are packed in VGPR0, 10 bits per component. */
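      /* Extract local_id.y from bits [19:10] and mask local_id.x down to bits [2:0]; 3 bits per
       * component suffice for the 8x4 (wave32) or 8x8 (wave64) RT workgroup used here. */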
12510 bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12511 Operand::c32(10u), Operand::c32(3u));
12512 bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12513 Operand(in_local_ids[0], v1));
12514 }
12515    /* Compute the launch IDs in reverse order (z, y, then x) to reduce some RAW hazards on GFX11+ */
12516 if (options->gfx_level >= GFX12) {
12517 bld.vop2_e64(aco_opcode::v_lshrrev_b32, Definition(out_launch_ids[2], v1), Operand::c32(16),
12518 Operand(in_wg_id_y, s1));
12519 bld.vop3(aco_opcode::v_mad_u32_u16, Definition(out_launch_ids[1], v1),
12520 Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8),
12521 Operand(in_local_ids[1], v1));
12522 } else {
12523 bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12524 bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1),
12525 Operand(in_wg_id_y, s1), Operand::c32(program->workgroup_size == 32 ? 4 : 8),
12526 Operand(in_local_ids[1], v1));
12527 }
12528 bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12529 Operand::c32(8), Operand(in_local_ids[0], v1));
12530
12531 /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
12532 if (options->gfx_level < GFX9) {
12533 bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12534 Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12535 } else {
12536 bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12537 Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12538 }
12539 bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12540 Operand(tmp_raygen_sbt.advance(4), s1));
12541
12542    /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs.
12543     * The 1D launch ID is computed as: id = local_invocation_index + (wg_id.x * wg_size).
12544     * The s_lshl below puts wg_id.x * wg_size into tmp_wg_id_x_times_size.
12545     */
12546 bld.sop2(aco_opcode::s_lshl_b32, Definition(tmp_wg_id_x_times_size, s1), Definition(scc, s1),
12547 Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6));
12548
12549 /* Calculate and add local_invocation_index */
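   /* v_mbcnt_lo(mask, base) returns base + popcount(mask & lanes_below_this_one); with mask = ~0
    * this is base + lane_id for lanes 0-31 (e.g. lane 5 with base 64 yields 69), and the _hi
    * variant extends the count to lanes 32-63 on wave64. */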
12550 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u),
12551 Operand(tmp_wg_id_x_times_size, s1));
12552 if (program->wave_size == 64) {
12553 if (program->gfx_level <= GFX7)
12554 bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1),
12555 Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12556 else
12557 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(tmp_invocation_idx, v1),
12558 Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12559 }
12560
12561 /* Make fixup operations a no-op if this is not a converted 2D dispatch. */
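   /* scc is set when launch_size.y differs from the marker, i.e. when the dispatch was NOT
    * converted; in that case vcc selects the unmodified launch IDs in the v_cndmask below. */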
12562 bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1),
12563 Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1));
12564 bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
12565 Operand::c32_or_c64(-1u, program->wave_size == 64),
12566 Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
12567 bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
12568 Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
12569 bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
12570 Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
12571
12572 if (options->gfx_level < GFX9) {
12573 /* write scratch/ring offsets to outputs, if needed */
12574 bld.sop1(aco_opcode::s_mov_b32,
12575 Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12576 Operand(in_scratch_offset, s1));
12577 bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12578 Operand(tmp_ring_offsets, s2));
12579 }
12580
12581 /* jump to raygen */
12582 bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12583
12584 program->config->float_mode = program->blocks[0].fp_mode.val;
12585 program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12586 program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12587 }
12588
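/* Allocates `size` consecutive VGPRs at *num + *offset (or at *num when no offset is given) and
 * returns the first one. If the allocation reaches the current high-water mark, *num is raised to
 * cover it and *offset is reset to zero; otherwise *offset advances so that later allocations
 * pack into the gap below the high-water mark. */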
12589 PhysReg
12590 get_next_vgpr(unsigned size, unsigned* num, int *offset = NULL)
12591 {
12592 unsigned reg = *num + (offset ? *offset : 0);
12593 if (reg + size >= *num) {
12594 *num = reg + size;
12595 if (offset)
12596 *offset = 0;
12597 } else if (offset) {
12598 *offset += size;
12599 }
12600 return PhysReg(256 + reg);
12601 }
12602
12603 struct UnalignedVsAttribLoad {
12604 /* dst/scratch are PhysReg converted to unsigned */
12605 unsigned dst;
12606 unsigned scratch;
12607 bool d16;
12608 const struct ac_vtx_format_info* vtx_info;
12609 };
12610
12611 struct UnalignedVsAttribLoadState {
12612 unsigned max_vgprs;
12613 unsigned initial_num_vgprs;
12614 unsigned* num_vgprs;
12615 unsigned overflow_num_vgprs;
12616 aco::small_vec<UnalignedVsAttribLoad, 16> current_loads;
12617 };
12618
12619 void
12620 convert_unaligned_vs_attrib(Builder& bld, UnalignedVsAttribLoad load)
12621 {
12622 PhysReg dst(load.dst);
12623 PhysReg scratch(load.scratch);
12624 const struct ac_vtx_format_info* vtx_info = load.vtx_info;
12625 unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12626 unsigned nfmt = vtx_info->hw_format[0] >> 4;
12627
12628 unsigned size = vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size;
12629 if (load.d16) {
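      /* The d16 loads placed bytes 0 and 2 into dst (bits [7:0] and [23:16]) and bytes 1 and 3
       * into scratch; shifting scratch left by 8 and ORing it in assembles the full dword. */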
12630 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(dst, v1), Operand(scratch, v1),
12631 Operand::c32(8), Operand(dst, v1));
12632 } else {
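      /* Byte 0 was loaded directly into dst; bytes 1..size-1 sit in consecutive scratch VGPRs and
       * are shifted into their final position before being ORed in. */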
12633 for (unsigned i = 1; i < size; i++) {
12634 PhysReg byte_reg = scratch.advance(i * 4 - 4);
12635 if (bld.program->gfx_level >= GFX9) {
12636 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(dst, v1), Operand(byte_reg, v1),
12637 Operand::c32(i * 8), Operand(dst, v1));
12638 } else {
12639 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(byte_reg, v1), Operand::c32(i * 8),
12640 Operand(byte_reg, v1));
12641 bld.vop2(aco_opcode::v_or_b32, Definition(dst, v1), Operand(dst, v1),
12642 Operand(byte_reg, v1));
12643 }
12644 }
12645 }
12646
12647 unsigned num_channels = vtx_info->chan_byte_size ? 1 : vtx_info->num_channels;
12648 PhysReg chan[4] = {dst, dst.advance(4), dst.advance(8), dst.advance(12)};
12649
12650 if (dfmt == V_008F0C_BUF_DATA_FORMAT_10_11_11) {
12651 bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[2], v1), Operand(dst, v1), Operand::c32(22),
12652 Operand::c32(10));
12653 bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[1], v1), Operand(dst, v1), Operand::c32(11),
12654 Operand::c32(11));
12655 bld.vop3(aco_opcode::v_bfe_u32, Definition(chan[0], v1), Operand(dst, v1), Operand::c32(0),
12656 Operand::c32(11));
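      /* 10- and 11-bit floats have a 5-bit exponent just like fp16, so shifting them left by
       * 5 (10-bit Z) or 4 (11-bit X/Y) bits yields a valid fp16 encoding, which is widened to
       * fp32 by v_cvt_f32_f16 further down. */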
12657 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[2], v1), Operand::c32(5),
12658 Operand(chan[2], v1));
12659 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[1], v1), Operand::c32(4),
12660 Operand(chan[1], v1));
12661 bld.vop2(aco_opcode::v_lshlrev_b32, Definition(chan[0], v1), Operand::c32(4),
12662 Operand(chan[0], v1));
12663 } else if (dfmt == V_008F0C_BUF_DATA_FORMAT_2_10_10_10) {
12664 aco_opcode bfe = aco_opcode::v_bfe_u32;
12665 switch (nfmt) {
12666 case V_008F0C_BUF_NUM_FORMAT_SNORM:
12667 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
12668 case V_008F0C_BUF_NUM_FORMAT_SINT: bfe = aco_opcode::v_bfe_i32; break;
12669 default: break;
12670 }
12671
12672 bool swapxz = G_008F0C_DST_SEL_X(vtx_info->dst_sel) != V_008F0C_SQ_SEL_X;
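      /* With a swapped dst_sel (BGRA-style ordering), the X component sits in bits [29:20] and Z
       * in bits [9:0], so the extraction offsets for chan[0] and chan[2] are exchanged. */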
12673 bld.vop3(bfe, Definition(chan[3], v1), Operand(dst, v1), Operand::c32(30), Operand::c32(2));
12674 bld.vop3(bfe, Definition(chan[2], v1), Operand(dst, v1), Operand::c32(swapxz ? 0 : 20),
12675 Operand::c32(10));
12676 bld.vop3(bfe, Definition(chan[1], v1), Operand(dst, v1), Operand::c32(10), Operand::c32(10));
12677 bld.vop3(bfe, Definition(chan[0], v1), Operand(dst, v1), Operand::c32(swapxz ? 20 : 0),
12678 Operand::c32(10));
12679 } else if (dfmt == V_008F0C_BUF_DATA_FORMAT_8 || dfmt == V_008F0C_BUF_DATA_FORMAT_16) {
12680 unsigned bits = dfmt == V_008F0C_BUF_DATA_FORMAT_8 ? 8 : 16;
12681 switch (nfmt) {
12682 case V_008F0C_BUF_NUM_FORMAT_SNORM:
12683 case V_008F0C_BUF_NUM_FORMAT_SSCALED:
12684 case V_008F0C_BUF_NUM_FORMAT_SINT:
12685 bld.vop3(aco_opcode::v_bfe_i32, Definition(dst, v1), Operand(dst, v1), Operand::c32(0),
12686 Operand::c32(bits));
12687 break;
12688 default: break;
12689 }
12690 }
12691
12692 if (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT &&
12693 (dfmt == V_008F0C_BUF_DATA_FORMAT_16 || dfmt == V_008F0C_BUF_DATA_FORMAT_10_11_11)) {
12694 for (unsigned i = 0; i < num_channels; i++)
12695 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(chan[i], v1), Operand(chan[i], v1));
12696 } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_USCALED || nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
12697 for (unsigned i = 0; i < num_channels; i++)
12698 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(chan[i], v1), Operand(chan[i], v1));
12699 } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_SSCALED || nfmt == V_008F0C_BUF_NUM_FORMAT_SNORM) {
12700 for (unsigned i = 0; i < num_channels; i++)
12701 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(chan[i], v1), Operand(chan[i], v1));
12702 }
12703
12704 std::array<unsigned, 4> chan_max;
12705 switch (dfmt) {
12706 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10: chan_max = {1023, 1023, 1023, 3}; break;
12707 case V_008F0C_BUF_DATA_FORMAT_8: chan_max = {255, 255, 255, 255}; break;
12708 case V_008F0C_BUF_DATA_FORMAT_16: chan_max = {65535, 65535, 65535, 65535}; break;
12709 }
12710
12711 if (nfmt == V_008F0C_BUF_NUM_FORMAT_UNORM) {
12712 for (unsigned i = 0; i < num_channels; i++)
12713 bld.vop2(aco_opcode::v_mul_f32, Definition(chan[i], v1),
12714 Operand::c32(fui(1.0 / chan_max[i])), Operand(chan[i], v1));
12715 } else if (nfmt == V_008F0C_BUF_NUM_FORMAT_SNORM) {
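      /* 0xbf800000 is -1.0f. SNORM needs the clamp because the most negative code (e.g. -512 for
       * a 10-bit channel scaled by 1/511) would otherwise map slightly below -1.0. */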
12716 for (unsigned i = 0; i < num_channels; i++) {
12717 bld.vop2(aco_opcode::v_mul_f32, Definition(chan[i], v1),
12718 Operand::c32(fui(1.0 / (chan_max[i] >> 1))), Operand(chan[i], v1));
12719 bld.vop2(aco_opcode::v_max_f32, Definition(chan[i], v1), Operand::c32(0xbf800000),
12720 Operand(chan[i], v1));
12721 }
12722 }
12723 }
12724
12725 void
12726 convert_current_unaligned_vs_attribs(Builder& bld, UnalignedVsAttribLoadState* state)
12727 {
12728 if (state->current_loads.empty())
12729 return;
12730
12731 wait_for_vmem_loads(bld);
12732
12733 for (UnalignedVsAttribLoad load : state->current_loads)
12734 convert_unaligned_vs_attrib(bld, load);
12735 state->current_loads.clear();
12736
12737 state->overflow_num_vgprs = state->initial_num_vgprs;
12738 state->num_vgprs = &state->overflow_num_vgprs;
12739 }
12740
12741 void
12742 load_unaligned_vs_attrib(Builder& bld, PhysReg dst, Operand desc, Operand index, uint32_t offset,
12743 const struct ac_vtx_format_info* vtx_info,
12744 UnalignedVsAttribLoadState* state)
12745 {
12746 unsigned size = vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size;
12747
12748 UnalignedVsAttribLoad load;
12749 load.dst = dst;
12750 load.vtx_info = vtx_info;
12751 load.d16 = bld.program->gfx_level >= GFX9 && !bld.program->dev.sram_ecc_enabled && size == 4;
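   /* With d16 the dword can be assembled with a single v_lshl_or_b32. It is not used when SRAM
    * ECC is enabled, presumably because sub-dword VGPR writes are problematic there. */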
12752
12753 unsigned num_scratch_vgprs = load.d16 ? 1 : (size - 1);
12754 if (!vtx_info->chan_byte_size) {
12755 /* When chan_byte_size==0, we're loading the entire attribute, so we can use the last 3
12756 * components of the destination.
12757 */
12758 assert(num_scratch_vgprs <= 3);
12759 load.scratch = dst.advance(4);
12760 } else {
12761 if (*state->num_vgprs + num_scratch_vgprs > state->max_vgprs)
12762 convert_current_unaligned_vs_attribs(bld, state);
12763
12764 load.scratch = get_next_vgpr(num_scratch_vgprs, state->num_vgprs, NULL);
12765 }
12766
12767 PhysReg scratch(load.scratch);
12768 if (load.d16) {
12769 bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(dst, v1), desc, index,
12770 Operand::c32(0u), offset, false, true);
12771 bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(dst, v1), desc, index,
12772 Operand::c32(0u), offset + 2, false, true);
12773 bld.mubuf(aco_opcode::buffer_load_ubyte_d16, Definition(scratch, v1), desc, index,
12774 Operand::c32(0u), offset + 1, false, true);
12775 bld.mubuf(aco_opcode::buffer_load_ubyte_d16_hi, Definition(scratch, v1), desc, index,
12776 Operand::c32(0u), offset + 3, false, true);
12777 } else {
12778 for (unsigned i = 0; i < size; i++) {
12779 Definition def(i ? scratch.advance(i * 4 - 4) : dst, v1);
12780 unsigned soffset = 0, const_offset = 0;
12781
12782 if (bld.program->gfx_level >= GFX12) {
12783 const_offset = offset + i;
12784 } else {
12785 soffset = offset + i;
12786 }
12787
12788 bld.mubuf(aco_opcode::buffer_load_ubyte, def, desc, index, Operand::c32(soffset),
12789 const_offset, false, true);
12790 }
12791 }
12792
12793 state->current_loads.push_back(load);
12794 }
12795
12796 void
12797 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12798 const struct aco_compiler_options* options, const struct aco_shader_info* info,
12799 const struct ac_shader_args* args)
12800 {
12801 assert(pinfo->num_attributes > 0);
12802
12803 /* This should be enough for any shader/stage. */
12804 unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12805
12806 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12807 config);
12808 program->dev.vgpr_limit = 256;
12809
12810 Block* block = program->create_and_insert_block();
12811 block->kind = block_kind_top_level;
12812
12813 program->workgroup_size = 64;
12814 calc_min_waves(program);
12815
12816 /* Addition on GFX6-8 requires a carry-out (we use VCC) */
12817 program->needs_vcc = program->gfx_level <= GFX8;
12818
12819 Builder bld(program, block);
12820
12821 block->instructions.reserve(16 + pinfo->num_attributes * 4);
12822
12823    /* Besides being a performance hint, this is also needed to work around the
12824     * FeatureRequiredExportPriority issue on GFX11.5. */
12825 bld.sopp(aco_opcode::s_setprio, 3);
12826
12827 uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12828 bool has_nontrivial_divisors = pinfo->nontrivial_divisors;
12829
12830 /* choose sgprs */
12831 PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12832 PhysReg prolog_input = vertex_buffers.advance(8);
12833 PhysReg desc(
12834 align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12835
12836 Operand start_instance = get_arg_fixed(args, args->start_instance);
12837 Operand instance_id = get_arg_fixed(args, args->instance_id);
12838
12839 bool needs_instance_index =
12840 pinfo->instance_rate_inputs &
12841 ~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
12842 bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
12843 bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
12844 bool needs_tmp_vgpr0 = has_nontrivial_divisors;
12845 bool needs_tmp_vgpr1 = has_nontrivial_divisors &&
12846 (program->gfx_level <= GFX8 || program->gfx_level >= GFX11);
12847
12848 int vgpr_offset = pinfo->misaligned_mask & (1u << (pinfo->num_attributes - 1)) ? 0 : -4;
12849
12850 unsigned num_vgprs = args->num_vgprs_used;
12851 PhysReg attributes_start = get_next_vgpr(pinfo->num_attributes * 4, &num_vgprs);
12852 PhysReg vertex_index, instance_index, start_instance_vgpr, nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1;
12853 if (needs_vertex_index)
12854 vertex_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12855 if (needs_instance_index)
12856 instance_index = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12857 if (needs_start_instance)
12858 start_instance_vgpr = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12859 if (needs_tmp_vgpr0)
12860 nontrivial_tmp_vgpr0 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12861 if (needs_tmp_vgpr1)
12862 nontrivial_tmp_vgpr1 = get_next_vgpr(1, &num_vgprs, &vgpr_offset);
12863
12864 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
12865 get_arg_fixed(args, args->vertex_buffers));
12866 if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
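      /* s_movk_i32 sign-extends its 16-bit immediate, so it can only be used when address32_hi
       * survives that sign-extension (top 17 bits all ones or all zeros). */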
12867 bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
12868 options->address32_hi & 0xFFFF);
12869 } else {
12870 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
12871 Operand::c32((unsigned)options->address32_hi));
12872 }
12873
12874 const struct ac_vtx_format_info* vtx_info_table =
12875 ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
12876
12877 UnalignedVsAttribLoadState unaligned_state;
12878 unaligned_state.max_vgprs = MAX2(84, num_vgprs + 8);
12879 unaligned_state.initial_num_vgprs = num_vgprs;
12880 unaligned_state.num_vgprs = &num_vgprs;
12881
12882 unsigned num_sgprs = 0;
12883 for (unsigned loc = 0; loc < pinfo->num_attributes;) {
12884 unsigned num_descs =
12885 load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
12886 num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
12887
12888 if (loc == 0) {
12889 /* perform setup while we load the descriptors */
12890 if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
12891 Operand count = get_arg_fixed(args, args->merged_wave_info);
12892 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
12893 if (program->wave_size == 64) {
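            /* s_bfm_b64 only uses the low 6 bits of the size operand, so a count of 64 would
             * produce an empty mask; test bit 6 and select a full exec mask in that case. */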
12894 bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12895 Operand::c32(6u /* log2(64) */));
12896 bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12897 Operand(exec, s2), Operand(scc, s1));
12898 }
12899 }
12900
12901 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
12902 if (info->hw_stage == AC_HW_HULL_SHADER && options->has_ls_vgpr_init_bug) {
12903 /* We don't want load_vb_descs() to write vcc. */
12904 assert(program->dev.sgpr_limit <= vcc.reg());
12905
12906 bld.sop2(aco_opcode::s_bfe_u32, Definition(vcc, s1), Definition(scc, s1),
12907 get_arg_fixed(args, args->merged_wave_info), Operand::c32((8u << 16) | 8u));
12908 bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm), Operand::c32(-1), Operand::zero(),
12909 Operand(scc, s1));
12910
12911          /* These copies are ordered so that vertex_id=tcs_patch_id doesn't overwrite vertex_id
12912           * before instance_id=vertex_id reads it. */
12913 ac_arg src_args[] = {args->vertex_id, args->tcs_rel_ids, args->tcs_patch_id};
12914 ac_arg dst_args[] = {args->instance_id, args->vs_rel_patch_id, args->vertex_id};
12915 for (unsigned i = 0; i < 3; i++) {
12916 bld.vop2(aco_opcode::v_cndmask_b32, Definition(get_arg_reg(args, dst_args[i]), v1),
12917 get_arg_fixed(args, src_args[i]), get_arg_fixed(args, dst_args[i]),
12918 Operand(vcc, bld.lm));
12919 }
12920 }
12921
12922 if (needs_vertex_index)
12923 bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
12924 get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
12925 if (needs_instance_index)
12926 bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12927 Operand(s2), true);
12928 if (needs_start_instance)
12929 bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12930 }
12931
12932 wait_for_smem_loads(bld);
12933
12934 for (unsigned i = 0; i < num_descs;) {
12935 PhysReg dest(attributes_start.reg() + loc * 4u);
12936
12937 /* calculate index */
12938 Operand fetch_index = Operand(vertex_index, v1);
12939 if (pinfo->instance_rate_inputs & (1u << loc)) {
12940 if (!(pinfo->zero_divisors & (1u << loc))) {
12941 fetch_index = instance_id;
12942 if (pinfo->nontrivial_divisors & (1u << loc)) {
12943 unsigned index = util_bitcount(pinfo->nontrivial_divisors & BITFIELD_MASK(loc));
12944 fetch_index = calc_nontrivial_instance_id(
12945 bld, args, pinfo, index, instance_id, start_instance, prolog_input,
12946 nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12947 } else {
12948 fetch_index = Operand(instance_index, v1);
12949 }
12950 } else {
12951 fetch_index = Operand(start_instance_vgpr, v1);
12952 }
12953 }
12954
12955 /* perform load */
12956 PhysReg cur_desc = desc.advance(i * 16);
12957 if ((pinfo->misaligned_mask & (1u << loc))) {
12958 const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
12959
12960 assert(vtx_info->has_hw_format & 0x1);
12961 unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12962 unsigned nfmt = vtx_info->hw_format[0] >> 4;
12963
12964 for (unsigned j = 0; j < (vtx_info->chan_byte_size ? vtx_info->num_channels : 1); j++) {
12965 bool post_shuffle = pinfo->post_shuffle & (1u << loc);
12966 unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12967 unsigned soffset = 0, const_offset = 0;
12968
12969 /* We need to use soffset on GFX6-7 to avoid being considered
12970 * out-of-bounds when offset>=stride. GFX12 doesn't support a
12971 * non-zero constant soffset.
12972 */
12973 if (program->gfx_level >= GFX12) {
12974 const_offset = offset;
12975 } else {
12976 soffset = offset;
12977 }
12978
12979 if ((pinfo->unaligned_mask & (1u << loc)) && vtx_info->chan_byte_size <= 4)
12980 load_unaligned_vs_attrib(bld, dest.advance(j * 4u), Operand(cur_desc, s4),
12981 fetch_index, offset, vtx_info, &unaligned_state);
12982 else if (vtx_info->chan_byte_size == 8)
12983 bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
12984 Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
12985 fetch_index, Operand::c32(soffset), dfmt, nfmt, const_offset, false,
12986 true);
12987 else
12988 bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12989 Operand(cur_desc, s4), fetch_index, Operand::c32(soffset), dfmt, nfmt,
12990 const_offset, false, true);
12991 }
12992
12993 unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
12994 loc += slots;
12995 i += slots;
12996 } else {
12997 bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12998 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, true);
12999 loc++;
13000 i++;
13001 }
13002 }
13003 }
13004
13005 uint32_t constant_mask = pinfo->misaligned_mask;
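   /* For misaligned attributes loaded per-channel above, fill the components the format does not
    * provide with the default (0, 0, 0, 1). */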
13006 while (constant_mask) {
13007 unsigned loc = u_bit_scan(&constant_mask);
13008 const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
13009
13010    /* Section 22.1.1 "Attribute Location and Component Assignment" of the Vulkan 1.3 specification:
13011 * For 64-bit data types, no default attribute values are provided. Input variables must
13012 * not use more components than provided by the attribute.
13013 */
13014 if (vtx_info->chan_byte_size == 8) {
13015 if (vtx_info->num_channels > 2)
13016 u_bit_scan(&constant_mask);
13017 continue;
13018 }
13019
13020 assert(vtx_info->has_hw_format & 0x1);
13021 unsigned nfmt = vtx_info->hw_format[0] >> 4;
13022
13023 uint32_t one = nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
13024 ? 1u
13025 : 0x3f800000u;
13026 PhysReg dest(attributes_start.reg() + loc * 4u);
13027 for (unsigned j = vtx_info->num_channels; j < 4; j++) {
13028 bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
13029 Operand::c32(j == 3 ? one : 0u));
13030 }
13031 }
13032
13033 convert_current_unaligned_vs_attribs(bld, &unaligned_state);
13034
13035 if (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)
13036 wait_for_vmem_loads(bld);
13037
13038    /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
13039     * so we may need to fix it up. */
13040 u_foreach_bit (loc, (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)) {
13041 PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
13042
13043 unsigned alpha_adjust = (pinfo->alpha_adjust_lo >> loc) & 0x1;
13044 alpha_adjust |= ((pinfo->alpha_adjust_hi >> loc) & 0x1) << 1;
13045
13046 if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
13047 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
13048
13049 /* For the integer-like cases, do a natural sign extension.
13050 *
13051 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
13052 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
13053 * exponent.
13054 */
13055 unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
13056 bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
13057 Operand::c32(offset), Operand::c32(2u));
13058
13059 /* Convert back to the right type. */
13060 if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
13061 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
13062 bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
13063 Operand(alpha, v1));
13064 } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
13065 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
13066 }
13067 }
13068
13069 block->kind |= block_kind_uniform;
13070
13071 /* continue on to the main shader */
13072 Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
13073 if (has_nontrivial_divisors) {
13074 bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
13075 get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
13076 wait_for_smem_loads(bld);
13077 continue_pc = Operand(prolog_input, s2);
13078 }
13079
13080 bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
13081
13082 program->config->float_mode = program->blocks[0].fp_mode.val;
13083 program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
13084 program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
13085 }
13086
13087 void
13088 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
13089 const struct aco_compiler_options* options, const struct aco_shader_info* info,
13090 const struct ac_shader_args* args)
13091 {
13092 const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
13093 isel_context ctx =
13094 setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13095
13096 ctx.block->fp_mode = program->next_fp_mode;
13097
13098 add_startpgm(&ctx);
13099 append_logical_start(ctx.block);
13100
13101 Builder bld(ctx.program, ctx.block);
13102
13103 bool has_mrtz_alpha = einfo->alpha_to_coverage_via_mrtz && einfo->colors[0].used;
13104 Temp mrtz_alpha;
13105
13106 Temp colors[MAX_DRAW_BUFFERS][4];
13107 for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13108 if (!einfo->colors[i].used)
13109 continue;
13110
13111 Temp color = get_arg(&ctx, einfo->colors[i]);
13112 unsigned col_types = (einfo->color_types >> (i * 2)) & 0x3;
13113
13114 emit_split_vector(&ctx, color, col_types == ACO_TYPE_ANY32 ? 4 : 8);
13115 for (unsigned c = 0; c < 4; ++c) {
13116 colors[i][c] = emit_extract_vector(&ctx, color, c, col_types == ACO_TYPE_ANY32 ? v1 : v2b);
13117 }
13118
13119 /* Store MRTZ.a before applying alpha-to-one if enabled. */
13120 if (has_mrtz_alpha && i == 0)
13121 mrtz_alpha = colors[0][3];
13122
13123 emit_clamp_alpha_test(&ctx, einfo, colors[i], i);
13124 }
13125
13126 bool has_mrtz_depth = einfo->depth.used && !einfo->kill_depth;
13127 bool has_mrtz_stencil = einfo->stencil.used && !einfo->kill_stencil;
13128 bool has_mrtz_samplemask = einfo->samplemask.used && !einfo->kill_samplemask;
13129 bool has_mrtz_export =
13130 has_mrtz_depth || has_mrtz_stencil || has_mrtz_samplemask || has_mrtz_alpha;
13131 if (has_mrtz_export) {
13132 Temp depth = has_mrtz_depth ? get_arg(&ctx, einfo->depth) : Temp();
13133 Temp stencil = has_mrtz_stencil ? get_arg(&ctx, einfo->stencil) : Temp();
13134 Temp samplemask = has_mrtz_samplemask ? get_arg(&ctx, einfo->samplemask) : Temp();
13135
13136 export_fs_mrtz(&ctx, einfo, depth, stencil, samplemask, mrtz_alpha);
13137 }
13138
13139 /* Export all color render targets */
13140 struct aco_export_mrt mrts[MAX_DRAW_BUFFERS];
13141 unsigned mrt_num = 0;
13142
13143 if (einfo->broadcast_last_cbuf) {
13144 for (unsigned i = 0; i <= einfo->broadcast_last_cbuf; i++) {
13145 struct aco_export_mrt* mrt = &mrts[mrt_num];
13146 if (export_fs_mrt_color(&ctx, einfo, colors[0], i, mrt))
13147 mrt->target += mrt_num++;
13148 }
13149 } else {
13150 for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13151 struct aco_export_mrt* mrt = &mrts[mrt_num];
13152 const uint8_t cb_idx = einfo->color_map[i];
13153
13154 if (cb_idx == 0xff || !einfo->colors[cb_idx].used)
13155 continue;
13156
13157 if (export_fs_mrt_color(&ctx, einfo, colors[cb_idx], i, mrt)) {
13158 mrt->target += mrt_num++;
13159 }
13160 }
13161 }
13162
13163 if (mrt_num) {
13164 if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
13165 assert(mrt_num == 2);
13166 create_fs_dual_src_export_gfx11(&ctx, &mrts[0], &mrts[1]);
13167 } else {
13168 for (unsigned i = 0; i < mrt_num; i++)
13169 export_mrt(&ctx, &mrts[i]);
13170 }
13171 } else if (!has_mrtz_export && !einfo->skip_null_export) {
13172 create_fs_null_export(&ctx);
13173 }
13174
13175 program->config->float_mode = program->blocks[0].fp_mode.val;
13176
13177 append_logical_end(ctx.block);
13178 ctx.block->kind |= block_kind_export_end;
13179 bld.reset(ctx.block);
13180 bld.sopp(aco_opcode::s_endpgm);
13181
13182 finish_program(&ctx);
13183 }
13184
13185 void
13186 select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
13187 const struct aco_compiler_options* options, const struct aco_shader_info* info,
13188 const struct ac_shader_args* args)
13189 {
13190 const struct aco_ps_prolog_info* finfo = (const struct aco_ps_prolog_info*)pinfo;
13191 isel_context ctx =
13192 setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13193
13194 ctx.block->fp_mode = program->next_fp_mode;
13195
13196 add_startpgm(&ctx);
13197 append_logical_start(ctx.block);
13198
13199 if (finfo->poly_stipple)
13200 emit_polygon_stipple(&ctx, finfo);
13201
13202 overwrite_interp_args(&ctx, finfo);
13203
13204 overwrite_samplemask_arg(&ctx, finfo);
13205
13206 std::vector<Operand> regs;
13207 passthrough_all_args(&ctx, regs);
13208
13209 interpolate_color_args(&ctx, finfo, regs);
13210
13211 program->config->float_mode = program->blocks[0].fp_mode.val;
13212
13213 append_logical_end(ctx.block);
13214
13215 build_end_with_regs(&ctx, regs);
13216
13217    /* Compute all end args in WQM mode if the main part requires it. */
13218 if (finfo->needs_wqm)
13219 set_wqm(&ctx, true);
13220
13221    /* Finally, exit WQM mode. */
13222 program->needs_exact = true;
13223
13224 finish_program(&ctx);
13225 }
13226
13227 } // namespace aco
13228