1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include "aco_instruction_selection.h"
27
28 #include "aco_builder.h"
29 #include "aco_interface.h"
30 #include "aco_ir.h"
31
32 #include "common/ac_nir.h"
33 #include "common/sid.h"
34
35 #include "util/fast_idiv_by_const.h"
36 #include "util/memstream.h"
37
38 #include <array>
39 #include <functional>
40 #include <map>
41 #include <numeric>
42 #include <stack>
43 #include <utility>
44 #include <vector>
45
46 namespace aco {
47 namespace {
48
49 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
50
51 static void
52 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
53 const char* msg)
54 {
55 char* out;
56 size_t outsize;
57 struct u_memstream mem;
58 u_memstream_open(&mem, &out, &outsize);
59 FILE* const memf = u_memstream_get(&mem);
60
61 fprintf(memf, "%s: ", msg);
62 nir_print_instr(instr, memf);
63 u_memstream_close(&mem);
64
65 _aco_err(ctx->program, file, line, out);
66 free(out);
67 }
68
69 struct if_context {
70 Temp cond;
71
72 bool divergent_old;
73 bool exec_potentially_empty_discard_old;
74 bool exec_potentially_empty_break_old;
75 bool had_divergent_discard_old;
76 bool had_divergent_discard_then;
77 uint16_t exec_potentially_empty_break_depth_old;
78
79 unsigned BB_if_idx;
80 unsigned invert_idx;
81 bool uniform_has_then_branch;
82 bool then_branch_divergent;
83 Block BB_invert;
84 Block BB_endif;
85 };
86
87 struct loop_context {
88 Block loop_exit;
89
90 unsigned header_idx_old;
91 Block* exit_old;
92 bool divergent_cont_old;
93 bool divergent_branch_old;
94 bool divergent_if_old;
95 };
96
97 static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);
98
99 static void
100 add_logical_edge(unsigned pred_idx, Block* succ)
101 {
102 succ->logical_preds.emplace_back(pred_idx);
103 }
104
105 static void
106 add_linear_edge(unsigned pred_idx, Block* succ)
107 {
108 succ->linear_preds.emplace_back(pred_idx);
109 }
110
111 static void
112 add_edge(unsigned pred_idx, Block* succ)
113 {
114 add_logical_edge(pred_idx, succ);
115 add_linear_edge(pred_idx, succ);
116 }
117
118 static void
119 append_logical_start(Block* b)
120 {
121 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
122 }
123
124 static void
125 append_logical_end(Block* b)
126 {
127 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
128 }
129
130 Temp
131 get_ssa_temp(struct isel_context* ctx, nir_def* def)
132 {
133 uint32_t id = ctx->first_temp_id + def->index;
134 return Temp(id, ctx->program->temp_rc[id]);
135 }
136
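/* Emits a masked bit count (mbcnt): for each lane, the number of bits set in
 * `mask` below that lane, added to `base`. With an undefined mask this yields the
 * lane index. Wave64 needs a lo/hi pair of v_mbcnt instructions. */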
137 Temp
138 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
139 {
140 Builder bld(ctx->program, ctx->block);
141 assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
142 assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
143
144 if (ctx->program->wave_size == 32) {
145 Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
146 return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
147 }
148
149 Operand mask_lo = Operand::c32(-1u);
150 Operand mask_hi = Operand::c32(-1u);
151
152 if (mask.isTemp()) {
153 RegClass rc = RegClass(mask.regClass().type(), 1);
154 Builder::Result mask_split =
155 bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
156 mask_lo = Operand(mask_split.def(0).getTemp());
157 mask_hi = Operand(mask_split.def(1).getTemp());
158 } else if (mask.physReg() == exec) {
159 mask_lo = Operand(exec_lo, s1);
160 mask_hi = Operand(exec_hi, s1);
161 }
162
163 Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
164
165 if (ctx->program->gfx_level <= GFX7)
166 return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
167 else
168 return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
169 }
170
171 inline void
172 set_wqm(isel_context* ctx, bool enable_helpers = false)
173 {
174 if (ctx->program->stage == fragment_fs) {
175 ctx->wqm_block_idx = ctx->block->index;
176 ctx->wqm_instruction_idx = ctx->block->instructions.size();
177 if (ctx->shader)
178 enable_helpers |= ctx->shader->info.fs.require_full_quads;
179 ctx->program->needs_wqm |= enable_helpers;
180 }
181 }
182
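/* Reads `data` from the lane selected by the per-lane `index` (a wave-wide
 * backwards permute). Uniform indices use v_readlane; otherwise GFX-level-specific
 * lowerings are used, since ds_bpermute_b32 doesn't exist before GFX8 and only
 * covers half a wave in GFX10 wave64 mode. */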
183 static Temp
184 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
185 {
186 if (index.regClass() == s1)
187 return bld.readlane(bld.def(s1), data, index);
188
189 /* Avoid using shared VGPRs for shuffle on GFX10 when the shader consists
190 * of multiple binaries, because the VGPR use is not known when choosing
191 * which registers to use for the shared VGPRs.
192 */
193 const bool avoid_shared_vgprs =
194 ctx->options->gfx_level >= GFX10 && ctx->options->gfx_level < GFX11 &&
195 ctx->program->wave_size == 64 &&
196 (ctx->program->info.has_epilog || ctx->program->info.merged_shader_compiled_separately ||
197 ctx->stage == raytracing_cs);
198
199 if (ctx->options->gfx_level <= GFX7 || avoid_shared_vgprs) {
200 /* GFX6-7: there is no bpermute instruction */
201 Operand index_op(index);
202 Operand input_data(data);
203 index_op.setLateKill(true);
204 input_data.setLateKill(true);
205
206 return bld.pseudo(aco_opcode::p_bpermute_readlane, bld.def(v1), bld.def(bld.lm),
207 bld.def(bld.lm, vcc), index_op, input_data);
208 } else if (ctx->options->gfx_level >= GFX10 && ctx->program->wave_size == 64) {
209
210 /* GFX10 wave64 mode: emulate full-wave bpermute */
211 Temp index_is_lo =
212 bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
213 Builder::Result index_is_lo_split =
214 bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
215 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
216 index_is_lo_split.def(1).getTemp());
217 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
218 index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
219 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
220 Operand input_data(data);
221
222 index_x4.setLateKill(true);
223 input_data.setLateKill(true);
224 same_half.setLateKill(true);
225
226 if (ctx->options->gfx_level <= GFX10_3) {
227 /* We need one pair of shared VGPRs.
228 * Note that these have twice the allocation granularity of normal VGPRs.
229 */
230 ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
231
232 return bld.pseudo(aco_opcode::p_bpermute_shared_vgpr, bld.def(v1), bld.def(s2),
233 bld.def(s1, scc), index_x4, input_data, same_half);
234 } else {
235 return bld.pseudo(aco_opcode::p_bpermute_permlane, bld.def(v1), bld.def(s2),
236 bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
237 same_half);
238 }
239 } else {
240 /* GFX8-9 or GFX10 wave32: bpermute works normally */
241 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
242 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
243 }
244 }
245
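/* Emits a subgroup swizzle described by a ds_swizzle-style mask
 * (and_mask | or_mask << 5 | xor_mask << 10). On GFX8+ it first tries to express
 * the swizzle as DPP16, DPP8 or v_permlane(x)16_b32 and only falls back to
 * ds_swizzle_b32 if none of those apply. */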
246 static Temp
247 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask, bool allow_fi)
248 {
249 if (ctx->options->gfx_level >= GFX8) {
250 unsigned and_mask = mask & 0x1f;
251 unsigned or_mask = (mask >> 5) & 0x1f;
252 unsigned xor_mask = (mask >> 10) & 0x1f;
253
254 /* Eliminate or_mask. */
255 and_mask &= ~or_mask;
256 xor_mask ^= or_mask;
257
258 uint16_t dpp_ctrl = 0xffff;
259
260 /* DPP16 before DPP8 before v_permlane(x)16_b32
261 * because DPP16 supports modifiers and v_permlane
262 * can't be folded into valu instructions.
263 */
264 if ((and_mask & 0x1c) == 0x1c && xor_mask < 4) {
265 unsigned res[4];
266 for (unsigned i = 0; i < 4; i++)
267 res[i] = ((i & and_mask) ^ xor_mask);
268 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
269 } else if (and_mask == 0x1f && xor_mask == 8) {
270 dpp_ctrl = dpp_row_rr(8);
271 } else if (and_mask == 0x1f && xor_mask == 0xf) {
272 dpp_ctrl = dpp_row_mirror;
273 } else if (and_mask == 0x1f && xor_mask == 0x7) {
274 dpp_ctrl = dpp_row_half_mirror;
275 } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x10 && xor_mask < 0x10) {
276 dpp_ctrl = dpp_row_share(xor_mask);
277 } else if (ctx->options->gfx_level >= GFX11 && and_mask == 0x1f && xor_mask < 0x10) {
278 dpp_ctrl = dpp_row_xmask(xor_mask);
279 } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 && xor_mask < 8) {
280 uint32_t lane_sel = 0;
281 for (unsigned i = 0; i < 8; i++)
282 lane_sel |= ((i & and_mask) ^ xor_mask) << (i * 3);
283 return bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), src, lane_sel, allow_fi);
284 } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10) {
285 uint64_t lane_mask = 0;
286 for (unsigned i = 0; i < 16; i++)
287 lane_mask |= uint64_t((i & and_mask) ^ (xor_mask & 0xf)) << i * 4;
288 aco_opcode opcode =
289 xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : aco_opcode::v_permlane16_b32;
290 Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 0xffffffff));
291 Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
292 Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
293 ret->valu().opsel[0] = allow_fi; /* set FETCH_INACTIVE */
294 ret->valu().opsel[1] = true; /* set BOUND_CTRL */
295 return ret;
296 }
297
298 if (dpp_ctrl != 0xffff)
299 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl, 0xf, 0xf, true,
300 allow_fi);
301 }
302
303 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
304 }
305
306 Temp
307 as_vgpr(Builder& bld, Temp val)
308 {
309 if (val.type() == RegType::sgpr)
310 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
311 assert(val.type() == RegType::vgpr);
312 return val;
313 }
314
315 Temp
316 as_vgpr(isel_context* ctx, Temp val)
317 {
318 Builder bld(ctx->program, ctx->block);
319 return as_vgpr(bld, val);
320 }
321
322 void
323 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
324 {
325 Builder bld(ctx->program, ctx->block);
326 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
327 }
328
329 Temp
330 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
331 {
332 /* no need to extract the whole vector */
333 if (src.regClass() == dst_rc) {
334 assert(idx == 0);
335 return src;
336 }
337
338 assert(src.bytes() > (idx * dst_rc.bytes()));
339 Builder bld(ctx->program, ctx->block);
340 auto it = ctx->allocated_vec.find(src.id());
341 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
342 if (it->second[idx].regClass() == dst_rc) {
343 return it->second[idx];
344 } else {
345 assert(!dst_rc.is_subdword());
346 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
347 return bld.copy(bld.def(dst_rc), it->second[idx]);
348 }
349 }
350
351 if (dst_rc.is_subdword())
352 src = as_vgpr(ctx, src);
353
354 if (src.bytes() == dst_rc.bytes()) {
355 assert(idx == 0);
356 return bld.copy(bld.def(dst_rc), src);
357 } else {
358 Temp dst = bld.tmp(dst_rc);
359 emit_extract_vector(ctx, src, idx, dst);
360 return dst;
361 }
362 }
363
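/* Splits vec_src into num_components equally sized parts and caches them in
 * ctx->allocated_vec so that later emit_extract_vector() calls can reuse them. */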
364 void
365 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
366 {
367 if (num_components == 1)
368 return;
369 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
370 return;
371 RegClass rc;
372 if (num_components > vec_src.size()) {
373 if (vec_src.type() == RegType::sgpr) {
374 /* should still help get_alu_src() */
375 emit_split_vector(ctx, vec_src, vec_src.size());
376 return;
377 }
378 /* sub-dword split */
379 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
380 } else {
381 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
382 }
383 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
384 aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
385 split->operands[0] = Operand(vec_src);
386 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
387 for (unsigned i = 0; i < num_components; i++) {
388 elems[i] = ctx->program->allocateTmp(rc);
389 split->definitions[i] = Definition(elems[i]);
390 }
391 ctx->block->instructions.emplace_back(std::move(split));
392 ctx->allocated_vec.emplace(vec_src.id(), elems);
393 }
394
395 /* This vector expansion uses a mask to determine which elements in the new vector
396 * come from the original vector. The other elements are undefined. */
397 void
398 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask,
399 bool zero_padding = false)
400 {
401 assert(vec_src.type() == RegType::vgpr);
402 Builder bld(ctx->program, ctx->block);
403
404 if (dst.type() == RegType::sgpr && num_components > dst.size()) {
405 Temp tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, 2 * num_components));
406 expand_vector(ctx, vec_src, tmp_dst, num_components, mask, zero_padding);
407 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp_dst);
408 ctx->allocated_vec[dst.id()] = ctx->allocated_vec[tmp_dst.id()];
409 return;
410 }
411
412 emit_split_vector(ctx, vec_src, util_bitcount(mask));
413
414 if (vec_src == dst)
415 return;
416
417 if (num_components == 1) {
418 if (dst.type() == RegType::sgpr)
419 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
420 else
421 bld.copy(Definition(dst), vec_src);
422 return;
423 }
424
425 unsigned component_bytes = dst.bytes() / num_components;
426 RegClass src_rc = RegClass::get(RegType::vgpr, component_bytes);
427 RegClass dst_rc = RegClass::get(dst.type(), component_bytes);
428 assert(dst.type() == RegType::vgpr || !src_rc.is_subdword());
429 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
430
431 Temp padding = Temp(0, dst_rc);
432 if (zero_padding)
433 padding = bld.copy(bld.def(dst_rc), Operand::zero(component_bytes));
434
435 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
436 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
437 vec->definitions[0] = Definition(dst);
438 unsigned k = 0;
439 for (unsigned i = 0; i < num_components; i++) {
440 if (mask & (1 << i)) {
441 Temp src = emit_extract_vector(ctx, vec_src, k++, src_rc);
442 if (dst.type() == RegType::sgpr)
443 src = bld.as_uniform(src);
444 vec->operands[i] = Operand(src);
445 elems[i] = src;
446 } else {
447 vec->operands[i] = Operand::zero(component_bytes);
448 elems[i] = padding;
449 }
450 }
451 ctx->block->instructions.emplace_back(std::move(vec));
452 ctx->allocated_vec.emplace(dst.id(), elems);
453 }
454
455 /* adjust misaligned small bit size loads */
456 void
457 byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
458 {
459 Builder bld(ctx->program, ctx->block);
460 Operand shift;
461 Temp select = Temp();
462 if (offset.isConstant()) {
463 assert(offset.constantValue() && offset.constantValue() < 4);
464 shift = Operand::c32(offset.constantValue() * 8);
465 } else {
466 /* bit_offset = 8 * (offset & 0x3) */
467 Temp tmp =
468 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
469 select = bld.tmp(s1);
470 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
471 Operand::c32(3u));
472 }
473
474 if (vec.size() == 1) {
475 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
476 } else if (vec.size() == 2) {
477 Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
478 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
479 if (tmp == dst)
480 emit_split_vector(ctx, dst, 2);
481 else
482 emit_extract_vector(ctx, tmp, 0, dst);
483 } else if (vec.size() == 3 || vec.size() == 4) {
484 Temp lo = bld.tmp(s2), hi;
485 if (vec.size() == 3) {
486 /* this can happen if we use VMEM for a uniform load */
487 hi = bld.tmp(s1);
488 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
489 } else {
490 hi = bld.tmp(s2);
491 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
492 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
493 }
494 if (select != Temp())
495 hi =
496 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
497 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
498 Temp mid = bld.tmp(s1);
499 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
500 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
501 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
502 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
503 emit_split_vector(ctx, dst, 2);
504 }
505 }
506
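/* Re-aligns a loaded vector that starts `offset` bytes into `vec`: divergent
 * offsets are handled with v_alignbyte_b32, constant offsets by skipping whole
 * components (or by deferring to byte_align_scalar for SGPR destinations). */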
507 void
508 byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
509 {
510 Builder bld(ctx->program, ctx->block);
511 if (offset.isTemp()) {
512 Temp tmp[4] = {vec, vec, vec, vec};
513
514 if (vec.size() == 4) {
515 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
516 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
517 Definition(tmp[2]), Definition(tmp[3]), vec);
518 } else if (vec.size() == 3) {
519 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
520 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
521 Definition(tmp[2]), vec);
522 } else if (vec.size() == 2) {
523 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
524 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
525 }
526 for (unsigned i = 0; i < dst.size(); i++)
527 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
528
529 vec = tmp[0];
530 if (dst.size() == 2)
531 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
532
533 offset = Operand::zero();
534 }
535
536 unsigned num_components = vec.bytes() / component_size;
537 if (vec.regClass() == dst.regClass()) {
538 assert(offset.constantValue() == 0);
539 bld.copy(Definition(dst), vec);
540 emit_split_vector(ctx, dst, num_components);
541 return;
542 }
543
544 emit_split_vector(ctx, vec, num_components);
545 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
546 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
547
548 assert(offset.constantValue() % component_size == 0);
549 unsigned skip = offset.constantValue() / component_size;
550 for (unsigned i = skip; i < num_components; i++)
551 elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
552
553 if (dst.type() == RegType::vgpr) {
554 /* if dst is vgpr - assemble a shrunk vector from the re-aligned components. */
555 num_components = dst.bytes() / component_size;
556 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
557 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
558 for (unsigned i = 0; i < num_components; i++)
559 create_vec->operands[i] = Operand(elems[i]);
560 create_vec->definitions[0] = Definition(dst);
561 bld.insert(std::move(create_vec));
562
563 } else if (skip) {
564 /* if dst is sgpr - split the src, but move the original to sgpr. */
565 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
566 byte_align_scalar(ctx, vec, offset, dst);
567 } else {
568 assert(dst.size() == vec.size());
569 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
570 }
571
572 ctx->allocated_vec.emplace(dst.id(), elems);
573 }
574
575 Temp
576 get_ssa_temp_tex(struct isel_context* ctx, nir_def* def, bool is_16bit)
577 {
578 RegClass rc = RegClass::get(RegType::vgpr, (is_16bit ? 2 : 4) * def->num_components);
579 Temp tmp = get_ssa_temp(ctx, def);
580 if (tmp.bytes() != rc.bytes())
581 return emit_extract_vector(ctx, tmp, 0, rc);
582 else
583 return tmp;
584 }
585
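/* Converts a scalar s1 boolean (read via SCC) into a per-lane mask (bld.lm) by
 * selecting -1 or 0 with s_cselect. */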
586 Temp
587 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
588 {
589 Builder bld(ctx->program, ctx->block);
590 if (!dst.id())
591 dst = bld.tmp(bld.lm);
592
593 assert(val.regClass() == s1);
594 assert(dst.regClass() == bld.lm);
595
596 return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
597 bld.scc(val));
598 }
599
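/* Converts a uniform per-lane boolean mask into a scalar s1 condition by ANDing
 * it with exec, so that inactive lanes are ignored. */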
600 Temp
601 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
602 {
603 Builder bld(ctx->program, ctx->block);
604 if (!dst.id())
605 dst = bld.tmp(s1);
606
607 assert(val.regClass() == bld.lm);
608 assert(dst.regClass() == s1);
609
610 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
611 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(dst)), val, Operand(exec, bld.lm));
612 return dst;
613 }
614
615 /**
616 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
617 * src_bits and dst_bits are truncated.
618 *
619 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
620 * bit is indicated by src_bits in this case.
621 *
622 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
623 */
624 Temp
625 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
626 bool sign_extend, Temp dst = Temp())
627 {
628 assert(!(sign_extend && dst_bits < src_bits) &&
629 "Shrinking integers is not supported for signed inputs");
630
631 if (!dst.id()) {
632 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
633 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
634 else
635 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
636 }
637
638 assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
639 assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
640
641 if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
642 /* Copy the raw value, leaving an undefined value in the upper bits for
643 * the caller to handle appropriately */
644 return bld.copy(Definition(dst), src);
645 } else if (dst.bytes() < src.bytes()) {
646 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
647 }
648
649 Temp tmp = dst;
650 if (dst_bits == 64)
651 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
652
653 if (tmp == src) {
654 } else if (src.regClass() == s1) {
655 assert(src_bits < 32);
656 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
657 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
658 } else {
659 assert(src_bits < 32);
660 bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(),
661 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
662 }
663
664 if (dst_bits == 64) {
665 if (sign_extend && dst.regClass() == s2) {
666 Temp high =
667 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
668 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
669 } else if (sign_extend && dst.regClass() == v2) {
670 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
671 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
672 } else {
673 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
674 }
675 }
676
677 return dst;
678 }
679
680 enum sgpr_extract_mode {
681 sgpr_extract_sext,
682 sgpr_extract_zext,
683 sgpr_extract_undef,
684 };
685
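/* Extracts a single 8- or 16-bit component from an SGPR source with p_extract;
 * `mode` selects sign-extension, zero-extension or undefined upper bits. */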
686 Temp
687 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
688 {
689 Temp vec = get_ssa_temp(ctx, src->src.ssa);
690 unsigned src_size = src->src.ssa->bit_size;
691 unsigned swizzle = src->swizzle[0];
692
693 if (vec.size() > 1) {
694 assert(src_size == 16);
695 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
696 swizzle = swizzle & 1;
697 }
698
699 Builder bld(ctx->program, ctx->block);
700 Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
701
702 if (mode == sgpr_extract_undef && swizzle == 0)
703 bld.copy(Definition(tmp), vec);
704 else
705 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
706 Operand::c32(swizzle), Operand::c32(src_size),
707 Operand::c32((mode == sgpr_extract_sext)));
708
709 if (dst.regClass() == s2)
710 convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
711
712 return dst;
713 }
714
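/* Returns the (possibly swizzled) NIR ALU source as a Temp of `size` components.
 * Identity swizzles reuse the existing vector; otherwise the requested components
 * are extracted and recombined with p_create_vector. */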
715 Temp
716 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
717 {
718 if (src.src.ssa->num_components == 1 && size == 1)
719 return get_ssa_temp(ctx, src.src.ssa);
720
721 Temp vec = get_ssa_temp(ctx, src.src.ssa);
722 unsigned elem_size = src.src.ssa->bit_size / 8u;
723 bool identity_swizzle = true;
724
725 for (unsigned i = 0; identity_swizzle && i < size; i++) {
726 if (src.swizzle[i] != i)
727 identity_swizzle = false;
728 }
729 if (identity_swizzle)
730 return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
731
732 assert(elem_size > 0);
733 assert(vec.bytes() % elem_size == 0);
734
735 if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
736 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
737 return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
738 sgpr_extract_undef);
739 }
740
741 bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
742 if (as_uniform)
743 vec = as_vgpr(ctx, vec);
744
745 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
746 : RegClass(vec.type(), elem_size / 4);
747 if (size == 1) {
748 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
749 } else {
750 assert(size <= 4);
751 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
752 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
753 aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
754 for (unsigned i = 0; i < size; ++i) {
755 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
756 vec_instr->operands[i] = Operand{elems[i]};
757 }
758 Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
759 vec_instr->definitions[0] = Definition(dst);
760 ctx->block->instructions.emplace_back(std::move(vec_instr));
761 ctx->allocated_vec.emplace(dst.id(), elems);
762 return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
763 }
764 }
765
766 Temp
767 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
768 {
769 /* Returns a v2b or v1 temp for vop3p usage.
770 * The source must consist of exactly two 16-bit components
771 * located within the same dword.
772 */
773 assert(src.src.ssa->bit_size == 16);
774 assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
775
776 Temp tmp = get_ssa_temp(ctx, src.src.ssa);
777 if (tmp.size() == 1)
778 return tmp;
779
780 /* the size is larger than 1 dword: check the swizzle */
781 unsigned dword = src.swizzle[0] >> 1;
782
783 /* extract a full dword if possible */
784 if (tmp.bytes() >= (dword + 1) * 4) {
785 /* if the source is split into components, use p_create_vector */
786 auto it = ctx->allocated_vec.find(tmp.id());
787 if (it != ctx->allocated_vec.end()) {
788 unsigned index = dword << 1;
789 Builder bld(ctx->program, ctx->block);
790 if (it->second[index].regClass() == v2b)
791 return bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), it->second[index],
792 it->second[index + 1]);
793 }
794 return emit_extract_vector(ctx, tmp, dword, v1);
795 } else {
796 /* This must be a swizzled access to %a.zz where %a is v6b */
797 assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
798 assert(tmp.regClass() == v6b && dword == 1);
799 return emit_extract_vector(ctx, tmp, dword * 2, v2b);
800 }
801 }
802
803 uint32_t
804 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
805 {
806 nir_scalar scalar = nir_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
807 return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
808 }
809
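/* Extends a 32-bit address to a 64-bit pointer by appending the high dword from
 * options->address32_hi. Unless non_uniform is set, VGPR pointers are moved to
 * SGPRs first. */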
810 Temp
811 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
812 {
813 if (ptr.size() == 2)
814 return ptr;
815 Builder bld(ctx->program, ctx->block);
816 if (ptr.type() == RegType::vgpr && !non_uniform)
817 ptr = bld.as_uniform(ptr);
818 return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
819 Operand::c32((unsigned)ctx->options->address32_hi));
820 }
821
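/* Emits a SOP2 instruction for a two-source NIR ALU op. Bits in uses_ub mark
 * operands whose range-analysis upper bound may allow the 16-/24-bit operand
 * hints to be set. */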
822 void
823 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
824 bool writes_scc, uint8_t uses_ub = 0)
825 {
826 aco_ptr<SOP2_instruction> sop2{
827 create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
828 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
829 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
830 sop2->definitions[0] = Definition(dst);
831 if (instr->no_unsigned_wrap)
832 sop2->definitions[0].setNUW(true);
833 if (writes_scc)
834 sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);
835
836 for (int i = 0; i < 2; i++) {
837 if (uses_ub & (1 << i)) {
838 uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
839 if (src_ub <= 0xffff)
840 sop2->operands[i].set16bit(true);
841 else if (src_ub <= 0xffffff)
842 sop2->operands[i].set24bit(true);
843 }
844 }
845
846 ctx->block->instructions.emplace_back(std::move(sop2));
847 }
848
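/* Emits a VOP2 instruction. Since VOP2 allows an SGPR only in src0, an SGPR src1
 * is either swapped into src0 (for commutative ops) or copied to a VGPR. On GFX8
 * and earlier, flush_denorms appends a multiply by 1.0 to flush denormal results. */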
849 void
850 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
851 bool commutative, bool swap_srcs = false, bool flush_denorms = false,
852 bool nuw = false, uint8_t uses_ub = 0)
853 {
854 Builder bld(ctx->program, ctx->block);
855 bld.is_precise = instr->exact;
856
857 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
858 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
859 if (src1.type() == RegType::sgpr) {
860 if (commutative && src0.type() == RegType::vgpr) {
861 Temp t = src0;
862 src0 = src1;
863 src1 = t;
864 } else {
865 src1 = as_vgpr(ctx, src1);
866 }
867 }
868
869 Operand op[2] = {Operand(src0), Operand(src1)};
870
871 for (int i = 0; i < 2; i++) {
872 if (uses_ub & (1 << i)) {
873 uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
874 if (src_ub <= 0xffff)
875 op[i].set16bit(true);
876 else if (src_ub <= 0xffffff)
877 op[i].set24bit(true);
878 }
879 }
880
881 if (flush_denorms && ctx->program->gfx_level < GFX9) {
882 assert(dst.size() == 1);
883 Temp tmp = bld.vop2(opc, bld.def(dst.regClass()), op[0], op[1]);
884 if (dst.bytes() == 2)
885 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), tmp);
886 else
887 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
888 } else {
889 if (nuw) {
890 bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
891 } else {
892 bld.vop2(opc, Definition(dst), op[0], op[1]);
893 }
894 }
895 }
896
897 void
898 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
899 {
900 Builder bld(ctx->program, ctx->block);
901 bld.is_precise = instr->exact;
902
903 Temp src0 = get_alu_src(ctx, instr->src[0]);
904 Temp src1 = get_alu_src(ctx, instr->src[1]);
905
906 if (src1.type() == RegType::sgpr) {
907 assert(src0.type() == RegType::vgpr);
908 std::swap(src0, src1);
909 }
910
911 Temp src00 = bld.tmp(src0.type(), 1);
912 Temp src01 = bld.tmp(src0.type(), 1);
913 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
914 Temp src10 = bld.tmp(v1);
915 Temp src11 = bld.tmp(v1);
916 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
917 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
918 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
919 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
920 }
921
922 void
923 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
924 bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
925 {
926 assert(num_sources == 2 || num_sources == 3);
927 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
928 bool has_sgpr = false;
929 for (unsigned i = 0; i < num_sources; i++) {
930 src[i] = get_alu_src(ctx, instr->src[(swap_srcs && i < 2) ? 1 - i : i]);
931 if (has_sgpr)
932 src[i] = as_vgpr(ctx, src[i]);
933 else
934 has_sgpr = src[i].type() == RegType::sgpr;
935 }
936
937 Builder bld(ctx->program, ctx->block);
938 bld.is_precise = instr->exact;
939 if (flush_denorms && ctx->program->gfx_level < GFX9) {
940 Temp tmp;
941 if (num_sources == 3)
942 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
943 else
944 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
945 if (dst.size() == 1)
946 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
947 else
948 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
949 } else if (num_sources == 3) {
950 bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
951 } else {
952 bld.vop3(op, Definition(dst), src[0], src[1]);
953 }
954 }
955
956 Builder::Result
957 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
958 bool swap_srcs = false)
959 {
960 Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
961 Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
962 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
963 src1 = as_vgpr(ctx, src1);
964 assert(instr->def.num_components == 2);
965
966 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
967 unsigned opsel_lo =
968 (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
969 unsigned opsel_hi =
970 (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
971
972 Builder bld(ctx->program, ctx->block);
973 bld.is_precise = instr->exact;
974 Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
975 return res;
976 }
977
978 void
979 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp,
980 unsigned neg_lo = 0)
981 {
982 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
983 bool has_sgpr = false;
984 for (unsigned i = 0; i < 3; i++) {
985 src[i] = get_alu_src(ctx, instr->src[i]);
986 if (has_sgpr)
987 src[i] = as_vgpr(ctx, src[i]);
988 else
989 has_sgpr = src[i].type() == RegType::sgpr;
990 }
991
992 Builder bld(ctx->program, ctx->block);
993 bld.is_precise = instr->exact;
994 VALU_instruction& vop3p =
995 bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7)->valu();
996 vop3p.clamp = clamp;
997 vop3p.neg_lo = neg_lo;
998 }
999
1000 void
1001 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1002 {
1003 Builder bld(ctx->program, ctx->block);
1004 bld.is_precise = instr->exact;
1005 if (dst.type() == RegType::sgpr)
1006 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1007 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
1008 else
1009 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
1010 }
1011
1012 void
1013 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1014 {
1015 Temp src0 = get_alu_src(ctx, instr->src[0]);
1016 Temp src1 = get_alu_src(ctx, instr->src[1]);
1017 assert(src0.size() == src1.size());
1018
1019 aco_ptr<Instruction> vopc;
1020 if (src1.type() == RegType::sgpr) {
1021 if (src0.type() == RegType::vgpr) {
1022 /* to swap the operands, we might also have to change the opcode */
1023 switch (op) {
1024 case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
1025 case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
1026 case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
1027 case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
1028 case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
1029 case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
1030 case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
1031 case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
1032 case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
1033 case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
1034 case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
1035 case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
1036 case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
1037 case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
1038 case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
1039 case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
1040 case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
1041 case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
1042 default: /* eq and ne are commutative */ break;
1043 }
1044 Temp t = src0;
1045 src0 = src1;
1046 src1 = t;
1047 } else {
1048 src1 = as_vgpr(ctx, src1);
1049 }
1050 }
1051
1052 Builder bld(ctx->program, ctx->block);
1053 bld.vopc(op, Definition(dst), src0, src1);
1054 }
1055
1056 void
1057 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1058 {
1059 Temp src0 = get_alu_src(ctx, instr->src[0]);
1060 Temp src1 = get_alu_src(ctx, instr->src[1]);
1061 Builder bld(ctx->program, ctx->block);
1062
1063 assert(dst.regClass() == bld.lm);
1064 assert(src0.type() == RegType::sgpr);
1065 assert(src1.type() == RegType::sgpr);
1066
1067 /* Emit the SALU comparison instruction */
1068 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
1069 /* Turn the result into a per-lane bool */
1070 bool_to_vector_condition(ctx, cmp, dst);
1071 }
1072
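/* Chooses between the SALU and VALU form of a NIR comparison: the scalar opcode
 * is only used when one exists for the source bit size, the result is uniform and
 * both sources live in SGPRs. */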
1073 void
1074 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1075 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
1076 aco_opcode s64_op = aco_opcode::num_opcodes)
1077 {
1078 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op
1079 : instr->src[0].src.ssa->bit_size == 32 ? s32_op
1080 : aco_opcode::num_opcodes;
1081 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op
1082 : instr->src[0].src.ssa->bit_size == 32 ? v32_op
1083 : v16_op;
1084 bool use_valu = s_op == aco_opcode::num_opcodes || instr->def.divergent ||
1085 get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1086 get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1087 aco_opcode op = use_valu ? v_op : s_op;
1088 assert(op != aco_opcode::num_opcodes);
1089 assert(dst.regClass() == ctx->program->lane_mask);
1090
1091 if (use_valu)
1092 emit_vopc_instruction(ctx, instr, op, dst);
1093 else
1094 emit_sopc_instruction(ctx, instr, op, dst);
1095 }
1096
1097 void
1098 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1099 Temp dst)
1100 {
1101 Builder bld(ctx->program, ctx->block);
1102 Temp src0 = get_alu_src(ctx, instr->src[0]);
1103 Temp src1 = get_alu_src(ctx, instr->src[1]);
1104
1105 assert(dst.regClass() == bld.lm);
1106 assert(src0.regClass() == bld.lm);
1107 assert(src1.regClass() == bld.lm);
1108
1109 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
1110 }
1111
1112 void
1113 select_vec2(isel_context* ctx, Temp dst, Temp cond, Temp then, Temp els)
1114 {
1115 Builder bld(ctx->program, ctx->block);
1116
1117 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1118 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
1119 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1120 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
1121
1122 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
1123 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
1124
1125 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1126 }
1127
1128 void
1129 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1130 {
1131 Builder bld(ctx->program, ctx->block);
1132 Temp cond = get_alu_src(ctx, instr->src[0]);
1133 Temp then = get_alu_src(ctx, instr->src[1]);
1134 Temp els = get_alu_src(ctx, instr->src[2]);
1135
1136 assert(cond.regClass() == bld.lm);
1137
1138 if (dst.type() == RegType::vgpr) {
1139 aco_ptr<Instruction> bcsel;
1140 if (dst.size() == 1) {
1141 then = as_vgpr(ctx, then);
1142 els = as_vgpr(ctx, els);
1143
1144 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
1145 } else if (dst.size() == 2) {
1146 select_vec2(ctx, dst, cond, then, els);
1147 } else {
1148 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1149 }
1150 return;
1151 }
1152
1153 if (instr->def.bit_size == 1) {
1154 assert(dst.regClass() == bld.lm);
1155 assert(then.regClass() == bld.lm);
1156 assert(els.regClass() == bld.lm);
1157 }
1158
1159 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
1160 if (dst.regClass() == s1 || dst.regClass() == s2) {
1161 assert((then.regClass() == s1 || then.regClass() == s2) &&
1162 els.regClass() == then.regClass());
1163 assert(dst.size() == then.size());
1164 aco_opcode op =
1165 dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
1166 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1167 } else {
1168 isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
1169 }
1170 return;
1171 }
1172
1173 /* divergent boolean bcsel
1174 * this implements bcsel on bools: dst = s0 ? s1 : s2,
1175 * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
1176 assert(instr->def.bit_size == 1);
1177
1178 if (cond.id() != then.id())
1179 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1180
1181 if (cond.id() == els.id())
1182 bld.copy(Definition(dst), then);
1183 else
1184 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1185 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1186 }
1187
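/* Emits `op` with a workaround for denormal inputs: denormal sources are
 * pre-scaled by 2^24 before the operation and the result is re-scaled by `undo`,
 * since these hardware ops flush denormal inputs. */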
1188 void
1189 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
1190 uint32_t undo)
1191 {
1192 /* multiply by 16777216 to handle denormals */
1193 Temp is_denormal = bld.tmp(bld.lm);
1194 VALU_instruction& valu =
1195 bld.vopc_e64(aco_opcode::v_cmp_class_f32, Definition(is_denormal), val, Operand::c32(1u << 4))
1196 ->valu();
1197 valu.neg[0] = true;
1198 valu.abs[0] = true;
1199 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
1200 scaled = bld.vop1(op, bld.def(v1), scaled);
1201 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);
1202
1203 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
1204
1205 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
1206 }
1207
1208 void
1209 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1210 {
1211 if (ctx->block->fp_mode.denorm32 == 0) {
1212 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
1213 return;
1214 }
1215
1216 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1217 }
1218
1219 void
1220 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1221 {
1222 if (ctx->block->fp_mode.denorm32 == 0) {
1223 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
1224 return;
1225 }
1226
1227 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1228 }
1229
1230 void
1231 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1232 {
1233 if (ctx->block->fp_mode.denorm32 == 0) {
1234 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
1235 return;
1236 }
1237
1238 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1239 }
1240
1241 void
1242 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1243 {
1244 if (ctx->block->fp_mode.denorm32 == 0) {
1245 bld.vop1(aco_opcode::v_log_f32, dst, val);
1246 return;
1247 }
1248
1249 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
1250 }
1251
1252 Temp
1253 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1254 {
1255 if (ctx->options->gfx_level >= GFX7)
1256 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1257
1258 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1259 /* TODO: create more efficient code! */
1260 if (val.type() == RegType::sgpr)
1261 val = as_vgpr(ctx, val);
1262
1263 /* Split the input value. */
1264 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1265 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1266
1267 /* Extract the exponent and compute the unbiased value. */
1268 Temp exponent =
1269 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1270 exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1271
1272 /* Extract the fractional part. */
1273 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1274 Operand::c32(0x000fffffu));
1275 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1276
1277 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1278 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1279 fract_mask);
1280
1281 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1282 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1283 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1284 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1285 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1286
1287 /* Get the sign bit. */
1288 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1289
1290 /* Decide the operation to apply depending on the unbiased exponent. */
1291 Temp exp_lt0 =
1292 bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.def(bld.lm), exponent, Operand::zero());
1293 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1294 bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1295 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1296 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1297 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1298 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1299
1300 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1301 }
1302
1303 Temp
1304 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1305 {
1306 if (ctx->options->gfx_level >= GFX7)
1307 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1308
1309 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1310 * lowered at NIR level for precision reasons). */
1311 Temp src0 = as_vgpr(ctx, val);
1312
1313 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1314 Operand::c32(0x3fefffffu));
1315
1316 Temp isnan = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), src0, src0);
1317 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1318 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1319
1320 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1321 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1322 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1323 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1324
1325 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1326 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1327
1328 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1329
1330 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1331 add->valu().neg[1] = true;
1332
1333 return add->definitions[0].getTemp();
1334 }
1335
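/* 32-bit unsigned saturating add: uses the VALU clamp bit on GFX8+ and a
 * carry-based select against -1 on earlier chips. */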
1336 Temp
1337 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1338 {
1339 if (bld.program->gfx_level < GFX8) {
1340 Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1341 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1342 add.def(1).getTemp());
1343 }
1344
1345 Builder::Result add(NULL);
1346 if (bld.program->gfx_level >= GFX9) {
1347 add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1348 } else {
1349 add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.def(bld.lm), src0, src1);
1350 }
1351 add->valu().clamp = 1;
1352 return dst.getTemp();
1353 }
1354
1355 Temp
1356 usub32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1357 {
1358 if (bld.program->gfx_level < GFX8) {
1359 Builder::Result sub = bld.vsub32(bld.def(v1), src0, src1, true);
1360 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, sub.def(0).getTemp(), Operand::c32(0u),
1361 sub.def(1).getTemp());
1362 }
1363
1364 Builder::Result sub(NULL);
1365 if (bld.program->gfx_level >= GFX9) {
1366 sub = bld.vop2_e64(aco_opcode::v_sub_u32, dst, src0, src1);
1367 } else {
1368 sub = bld.vop2_e64(aco_opcode::v_sub_co_u32, dst, bld.def(bld.lm), src0, src1);
1369 }
1370 sub->valu().clamp = 1;
1371 return dst.getTemp();
1372 }
1373
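/* Converts a two-component 32-bit float source into a packed pair of f16 values
 * using v_cvt_pkrtz_f16_f32 (round-towards-zero packing). */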
1374 void
1375 emit_vec2_f2f16(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1376 {
1377 Builder bld(ctx->program, ctx->block);
1378 Temp src = get_ssa_temp(ctx, instr->src[0].src.ssa);
1379 RegClass rc = RegClass(src.regClass().type(), instr->src[0].src.ssa->bit_size / 32);
1380 Temp src0 = emit_extract_vector(ctx, src, instr->src[0].swizzle[0], rc);
1381 Temp src1 = emit_extract_vector(ctx, src, instr->src[0].swizzle[1], rc);
1382
1383 src1 = as_vgpr(ctx, src1);
1384 if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
1385 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src0, src1);
1386 else
1387 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
1388 emit_split_vector(ctx, dst, 2);
1389 }
1390
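/* Translates a single nir_alu_instr into ACO IR, dispatching on the NIR opcode. */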
1391 void
1392 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1393 {
1394 Builder bld(ctx->program, ctx->block);
1395 bld.is_precise = instr->exact;
1396 Temp dst = get_ssa_temp(ctx, &instr->def);
1397 switch (instr->op) {
1398 case nir_op_vec2:
1399 case nir_op_vec3:
1400 case nir_op_vec4:
1401 case nir_op_vec5:
1402 case nir_op_vec8:
1403 case nir_op_vec16: {
1404 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1405 unsigned num = instr->def.num_components;
1406 for (unsigned i = 0; i < num; ++i)
1407 elems[i] = get_alu_src(ctx, instr->src[i]);
1408
1409 if (instr->def.bit_size >= 32 || dst.type() == RegType::vgpr) {
1410 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1411 aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1)};
1412 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->def.bit_size / 8u);
1413 for (unsigned i = 0; i < num; ++i) {
1414 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1415 elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1416 vec->operands[i] = Operand{elems[i]};
1417 }
1418 vec->definitions[0] = Definition(dst);
1419 ctx->block->instructions.emplace_back(std::move(vec));
1420 ctx->allocated_vec.emplace(dst.id(), elems);
1421 } else {
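         /* Sub-dword SGPR vector: pack the 8/16-bit elements into 32-bit SGPRs. Non-constant
          * elements are masked, shifted and OR'd together (or combined with s_pack_ll_b32_b16 on
          * GFX9+), while constant elements are accumulated in const_vals and OR'd in at the end. */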
1422 bool use_s_pack = ctx->program->gfx_level >= GFX9;
1423 Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->def.bit_size) - 1));
1424
1425 std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1426 uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1427 for (unsigned i = 0; i < num; i++) {
1428 unsigned packed_size = use_s_pack ? 16 : 32;
1429 unsigned idx = i * instr->def.bit_size / packed_size;
1430 unsigned offset = i * instr->def.bit_size % packed_size;
1431 if (nir_src_is_const(instr->src[i].src)) {
1432 const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1433 continue;
1434 }
1435 if (nir_src_is_undef(instr->src[i].src))
1436 continue;
1437
1438 if (offset != packed_size - instr->def.bit_size)
1439 elems[i] =
1440 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1441
1442 if (offset)
1443 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1444 Operand::c32(offset));
1445
1446 if (packed[idx].id())
1447 packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1448 packed[idx]);
1449 else
1450 packed[idx] = elems[i];
1451 }
1452
1453 if (use_s_pack) {
1454 for (unsigned i = 0; i < dst.size(); i++) {
1455 bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1456
1457 if (packed[i * 2].id() && packed[i * 2 + 1].id())
1458 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1459 packed[i * 2 + 1]);
1460 else if (packed[i * 2 + 1].id())
1461 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1462 Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1463 else if (packed[i * 2].id())
1464 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1465 Operand::c32(const_vals[i * 2 + 1]));
1466 else
1467 packed[i] = Temp(); /* Both constants, so reset the entry */
1468
1469 if (same)
1470 const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1471 else
1472 const_vals[i] = 0;
1473 }
1474 }
1475
1476 for (unsigned i = 0; i < dst.size(); i++) {
1477 if (const_vals[i] && packed[i].id())
1478 packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1479 Operand::c32(const_vals[i]), packed[i]);
1480 else if (!packed[i].id())
1481 packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1482 }
1483
1484 if (dst.size() == 1)
1485 bld.copy(Definition(dst), packed[0]);
1486 else {
1487 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1488 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
1489 vec->definitions[0] = Definition(dst);
1490 for (unsigned i = 0; i < dst.size(); ++i)
1491 vec->operands[i] = Operand(packed[i]);
1492 bld.insert(std::move(vec));
1493 }
1494 }
1495 break;
1496 }
1497 case nir_op_mov: {
1498 Temp src = get_alu_src(ctx, instr->src[0]);
1499 if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1500 /* use size() instead of bytes() for 8/16-bit */
1501 assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1502 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1503 } else {
1504 assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1505 bld.copy(Definition(dst), src);
1506 }
1507 break;
1508 }
1509 case nir_op_inot: {
1510 Temp src = get_alu_src(ctx, instr->src[0]);
1511 if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1512 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1513 } else if (dst.regClass() == v2) {
1514 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1515 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1516 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1517 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1518 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1519 } else if (dst.type() == RegType::sgpr) {
1520 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1521 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1522 } else {
1523 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1524 }
1525 break;
1526 }
1527 case nir_op_iabs: {
1528 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1529 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
1530
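         /* Packed 16-bit iabs: compute 0 - src and take the signed maximum with src, routing the
          * NIR swizzle through opsel. */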
1531 unsigned opsel_lo = (instr->src[0].swizzle[0] & 1) << 1;
1532 unsigned opsel_hi = ((instr->src[0].swizzle[1] & 1) << 1) | 1;
1533
1534 Temp sub = bld.vop3p(aco_opcode::v_pk_sub_u16, Definition(bld.tmp(v1)), Operand::zero(),
1535 src, opsel_lo, opsel_hi);
1536 bld.vop3p(aco_opcode::v_pk_max_i16, Definition(dst), sub, src, opsel_lo, opsel_hi);
1537 break;
1538 }
1539 Temp src = get_alu_src(ctx, instr->src[0]);
1540 if (dst.regClass() == s1) {
1541 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1542 } else if (dst.regClass() == v1) {
1543 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1544 bld.vsub32(bld.def(v1), Operand::zero(), src));
1545 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1546 bld.vop3(
1547 aco_opcode::v_max_i16_e64, Definition(dst), src,
1548 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1549 } else if (dst.regClass() == v2b) {
1550 src = as_vgpr(ctx, src);
1551 bld.vop2(aco_opcode::v_max_i16, Definition(dst), src,
1552 bld.vop2(aco_opcode::v_sub_u16, Definition(bld.tmp(v2b)), Operand::zero(2), src));
1553 } else {
1554 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1555 }
1556 break;
1557 }
1558 case nir_op_isign: {
1559 Temp src = get_alu_src(ctx, instr->src[0]);
1560 if (dst.regClass() == s1) {
1561 Temp tmp =
1562 bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1563 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1564 } else if (dst.regClass() == s2) {
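         /* 64-bit isign(x) = (x >> 63) | (x != 0): the arithmetic shift yields -1 for negative
          * values, and OR-ing in the zero-extended SCC turns any other non-zero value into 1. */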
1565 Temp neg =
1566 bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1567 Temp neqz;
1568 if (ctx->program->gfx_level >= GFX8)
1569 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1570 else
1571 neqz =
1572 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1573 .def(1)
1574 .getTemp();
1575 /* SCC gets zero-extended to 64 bit */
1576 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1577 } else if (dst.regClass() == v1) {
1578 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1579 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
1580 bld.vop3(aco_opcode::v_med3_i16, Definition(dst), Operand::c16(-1), src, Operand::c16(1u));
1581 } else if (dst.regClass() == v2b) {
1582 src = as_vgpr(ctx, src);
1583 bld.vop2(aco_opcode::v_max_i16, Definition(dst), Operand::c16(-1),
1584 bld.vop2(aco_opcode::v_min_i16, Definition(bld.tmp(v1)), Operand::c16(1u), src));
1585 } else if (dst.regClass() == v2) {
1586 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1587 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1588 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.def(bld.lm), Operand::zero(), src);
1589 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1590 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1591 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1592 } else {
1593 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1594 }
1595 break;
1596 }
1597 case nir_op_imax: {
1598 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1599 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1600 } else if (dst.regClass() == v2b) {
1601 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1602 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1603 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1604 } else if (dst.regClass() == v1) {
1605 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1606 } else if (dst.regClass() == s1) {
1607 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1608 } else {
1609 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1610 }
1611 break;
1612 }
1613 case nir_op_umax: {
1614 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1615 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1616 } else if (dst.regClass() == v2b) {
1617 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1618 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1619 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1620 } else if (dst.regClass() == v1) {
1621 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1622 } else if (dst.regClass() == s1) {
1623 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1624 } else {
1625 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1626 }
1627 break;
1628 }
1629 case nir_op_imin: {
1630 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1631 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1632 } else if (dst.regClass() == v2b) {
1633 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1634 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1635 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1636 } else if (dst.regClass() == v1) {
1637 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1638 } else if (dst.regClass() == s1) {
1639 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1640 } else {
1641 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1642 }
1643 break;
1644 }
1645 case nir_op_umin: {
1646 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1647 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1648 } else if (dst.regClass() == v2b) {
1649 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1650 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1651 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1652 } else if (dst.regClass() == v1) {
1653 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1654 } else if (dst.regClass() == s1) {
1655 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1656 } else {
1657 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1658 }
1659 break;
1660 }
1661 case nir_op_ior: {
1662 if (instr->def.bit_size == 1) {
1663 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1664 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1665 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1666 } else if (dst.regClass() == v2) {
1667 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1668 } else if (dst.regClass() == s1) {
1669 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1670 } else if (dst.regClass() == s2) {
1671 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1672 } else {
1673 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1674 }
1675 break;
1676 }
1677 case nir_op_iand: {
1678 if (instr->def.bit_size == 1) {
1679 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1680 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1681 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1682 } else if (dst.regClass() == v2) {
1683 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1684 } else if (dst.regClass() == s1) {
1685 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1686 } else if (dst.regClass() == s2) {
1687 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1688 } else {
1689 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1690 }
1691 break;
1692 }
1693 case nir_op_ixor: {
1694 if (instr->def.bit_size == 1) {
1695 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1696 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1697 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1698 } else if (dst.regClass() == v2) {
1699 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1700 } else if (dst.regClass() == s1) {
1701 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1702 } else if (dst.regClass() == s2) {
1703 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1704 } else {
1705 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1706 }
1707 break;
1708 }
1709 case nir_op_ushr: {
1710 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1711 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1712 } else if (dst.regClass() == v2b) {
1713 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1714 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1715 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1716 } else if (dst.regClass() == v1) {
1717 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1718 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1719 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1720 get_alu_src(ctx, instr->src[0]));
1721 } else if (dst.regClass() == v2) {
1722 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1723 } else if (dst.regClass() == s2) {
1724 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1725 } else if (dst.regClass() == s1) {
1726 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1727 } else {
1728 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1729 }
1730 break;
1731 }
1732 case nir_op_ishl: {
1733 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1734 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1735 } else if (dst.regClass() == v2b) {
1736 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1737 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1738 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1739 } else if (dst.regClass() == v1) {
1740 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1741 false, 2);
1742 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1743 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1744 get_alu_src(ctx, instr->src[0]));
1745 } else if (dst.regClass() == v2) {
1746 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1747 } else if (dst.regClass() == s1) {
1748 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1749 } else if (dst.regClass() == s2) {
1750 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1751 } else {
1752 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1753 }
1754 break;
1755 }
1756 case nir_op_ishr: {
1757 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1758 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1759 } else if (dst.regClass() == v2b) {
1760 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1761 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1762 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1763 } else if (dst.regClass() == v1) {
1764 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1765 } else if (dst.regClass() == v2 && ctx->program->gfx_level >= GFX8) {
1766 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1767 get_alu_src(ctx, instr->src[0]));
1768 } else if (dst.regClass() == v2) {
1769 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1770 } else if (dst.regClass() == s1) {
1771 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1772 } else if (dst.regClass() == s2) {
1773 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1774 } else {
1775 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1776 }
1777 break;
1778 }
1779 case nir_op_find_lsb: {
1780 Temp src = get_alu_src(ctx, instr->src[0]);
1781 if (src.regClass() == s1) {
1782 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1783 } else if (src.regClass() == v1) {
1784 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1785 } else if (src.regClass() == s2) {
1786 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1787 } else if (src.regClass() == v2) {
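         /* 64-bit: find the first set bit in each half, offset the high half's result by 32
          * (saturating, so "not found" stays at -1) and take the unsigned minimum. */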
1788 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1789 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1790 lo = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), lo);
1791 hi = bld.vop1(aco_opcode::v_ffbl_b32, bld.def(v1), hi);
1792 hi = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)), hi);
1793 bld.vop2(aco_opcode::v_min_u32, Definition(dst), lo, hi);
1794 } else {
1795 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1796 }
1797 break;
1798 }
1799 case nir_op_ufind_msb:
1800 case nir_op_ifind_msb: {
1801 Temp src = get_alu_src(ctx, instr->src[0]);
1802 if (src.regClass() == s1 || src.regClass() == s2) {
1803 aco_opcode op = src.regClass() == s2
1804 ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1805 : aco_opcode::s_flbit_i32_i64)
1806 : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1807 : aco_opcode::s_flbit_i32);
1808 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1809
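         /* The s_flbit opcodes count from the MSB and return -1 if nothing is found; convert to an
          * LSB-based bit index via (size * 32 - 1) - msb_rev. When msb_rev is -1, the subtraction
          * borrows and the carry selects -1. */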
1810 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1811 Operand::c32(src.size() * 32u - 1u), msb_rev);
1812 Temp msb = sub.def(0).getTemp();
1813 Temp carry = sub.def(1).getTemp();
1814
1815 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1816 bld.scc(carry));
1817 } else if (src.regClass() == v1) {
1818 aco_opcode op =
1819 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1820 Temp msb_rev = bld.tmp(v1);
1821 emit_vop1_instruction(ctx, instr, op, msb_rev);
1822 Temp msb = bld.tmp(v1);
1823 Temp carry =
1824 bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1825 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1826 } else if (src.regClass() == v2) {
1827 aco_opcode op =
1828 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1829
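         /* 64-bit: run the 32-bit find-MSB on each half. The low half's count is offset by 32 with
          * a saturating add so that "not found" (-1) is preserved, and the high half's result is
          * preferred whenever it found a bit. 63 - msb_rev converts back to an LSB-based index,
          * with the borrow selecting -1. */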
1830 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1831 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1832
1833 lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1834 bld.vop1(op, bld.def(v1), lo));
1835 hi = bld.vop1(op, bld.def(v1), hi);
1836 Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1837
1838 Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1839
1840 Temp msb = bld.tmp(v1);
1841 Temp carry =
1842 bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1843 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), msb, msb_rev, carry);
1844 } else {
1845 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1846 }
1847 break;
1848 }
1849 case nir_op_ufind_msb_rev:
1850 case nir_op_ifind_msb_rev: {
1851 Temp src = get_alu_src(ctx, instr->src[0]);
1852 if (src.regClass() == s1) {
1853 aco_opcode op = instr->op == nir_op_ufind_msb_rev ? aco_opcode::s_flbit_i32_b32
1854 : aco_opcode::s_flbit_i32;
1855 bld.sop1(op, Definition(dst), src);
1856 } else if (src.regClass() == v1) {
1857 aco_opcode op =
1858 instr->op == nir_op_ufind_msb_rev ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1859 emit_vop1_instruction(ctx, instr, op, dst);
1860 } else {
1861 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1862 }
1863 break;
1864 }
1865 case nir_op_bitfield_reverse: {
1866 if (dst.regClass() == s1) {
1867 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1868 } else if (dst.regClass() == v1) {
1869 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1870 } else {
1871 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1872 }
1873 break;
1874 }
1875 case nir_op_iadd: {
1876 if (dst.regClass() == s1) {
1877 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1878 break;
1879 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
1880 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1881 break;
1882 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
1883 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1884 break;
1885 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1886 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1887 break;
1888 }
1889
1890 Temp src0 = get_alu_src(ctx, instr->src[0]);
1891 Temp src1 = get_alu_src(ctx, instr->src[1]);
1892 if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1893 if (instr->no_unsigned_wrap)
1894 bld.nuw().vadd32(Definition(dst), Operand(src0), Operand(src1));
1895 else
1896 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1897 break;
1898 }
1899
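      /* 64-bit addition: split both operands into dwords and add with carry propagation. */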
1900 assert(src0.size() == 2 && src1.size() == 2);
1901 Temp src00 = bld.tmp(src0.type(), 1);
1902 Temp src01 = bld.tmp(dst.type(), 1);
1903 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1904 Temp src10 = bld.tmp(src1.type(), 1);
1905 Temp src11 = bld.tmp(dst.type(), 1);
1906 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1907
1908 if (dst.regClass() == s2) {
1909 Temp carry = bld.tmp(s1);
1910 Temp dst0 =
1911 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1912 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1913 bld.scc(carry));
1914 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1915 } else if (dst.regClass() == v2) {
1916 Temp dst0 = bld.tmp(v1);
1917 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1918 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1919 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1920 } else {
1921 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1922 }
1923 break;
1924 }
1925 case nir_op_uadd_sat: {
1926 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
1927 Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1928 add_instr->valu().clamp = 1;
1929 break;
1930 }
1931 Temp src0 = get_alu_src(ctx, instr->src[0]);
1932 Temp src1 = get_alu_src(ctx, instr->src[1]);
1933 if (dst.regClass() == s1) {
1934 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1935 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1936 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1937 bld.scc(carry));
1938 break;
1939 } else if (dst.regClass() == v2b) {
1940 Instruction* add_instr;
1941 if (ctx->program->gfx_level >= GFX10) {
1942 add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1943 } else {
1944 if (src1.type() == RegType::sgpr)
1945 std::swap(src0, src1);
1946 add_instr =
1947 bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1948 }
1949 add_instr->valu().clamp = 1;
1950 break;
1951 } else if (dst.regClass() == v1) {
1952 uadd32_sat(bld, Definition(dst), src0, src1);
1953 break;
1954 }
1955
1956 assert(src0.size() == 2 && src1.size() == 2);
1957
1958 Temp src00 = bld.tmp(src0.type(), 1);
1959 Temp src01 = bld.tmp(src0.type(), 1);
1960 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1961 Temp src10 = bld.tmp(src1.type(), 1);
1962 Temp src11 = bld.tmp(src1.type(), 1);
1963 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1964
1965 if (dst.regClass() == s2) {
1966 Temp carry0 = bld.tmp(s1);
1967 Temp carry1 = bld.tmp(s1);
1968
1969 Temp no_sat0 =
1970 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
1971 Temp no_sat1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(Definition(carry1)),
1972 src01, src11, bld.scc(carry0));
1973
1974 Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
1975
1976 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(-1), no_sat,
1977 bld.scc(carry1));
1978 } else if (dst.regClass() == v2) {
1979 Temp no_sat0 = bld.tmp(v1);
1980 Temp dst0 = bld.tmp(v1);
1981 Temp dst1 = bld.tmp(v1);
1982
1983 Temp carry0 = bld.vadd32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
1984 Temp carry1;
1985
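         /* If the high-dword addition carries out, the 64-bit result must saturate to UINT64_MAX.
          * On GFX8+ the clamped v_addc_co_u32 saturates the high dword itself; the low dword is
          * selected against the final carry in both paths. */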
1986 if (ctx->program->gfx_level >= GFX8) {
1987 carry1 = bld.tmp(bld.lm);
1988 bld.vop2_e64(aco_opcode::v_addc_co_u32, Definition(dst1), Definition(carry1),
1989 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
1990 ->valu()
1991 .clamp = 1;
1992 } else {
1993 Temp no_sat1 = bld.tmp(v1);
1994 carry1 = bld.vadd32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
1995 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(-1),
1996 carry1);
1997 }
1998
1999 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(-1),
2000 carry1);
2001 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2002 } else {
2003 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2004 }
2005 break;
2006 }
2007 case nir_op_iadd_sat: {
2008 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2009 Instruction* add_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_i16, dst);
2010 add_instr->valu().clamp = 1;
2011 break;
2012 }
2013 Temp src0 = get_alu_src(ctx, instr->src[0]);
2014 Temp src1 = get_alu_src(ctx, instr->src[1]);
2015 if (dst.regClass() == s1) {
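         /* Signed saturation: the bound is INT32_MAX when src1 >= 0 and wraps to INT32_MIN when
          * src1 < 0. s_add_i32 sets SCC on signed overflow, in which case the bound is selected. */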
2016 Temp cond = bld.sopc(aco_opcode::s_cmp_lt_i32, bld.def(s1, scc), src1, Operand::zero());
2017 Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
2018 Operand::c32(INT32_MAX), cond);
2019 Temp overflow = bld.tmp(s1);
2020 Temp add =
2021 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
2022 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, add, bld.scc(overflow));
2023 break;
2024 }
2025
2026 src1 = as_vgpr(ctx, src1);
2027
2028 if (dst.regClass() == v2b) {
2029 Instruction* add_instr =
2030 bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
2031 add_instr->valu().clamp = 1;
2032 } else if (dst.regClass() == v1) {
2033 Instruction* add_instr =
2034 bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
2035 add_instr->valu().clamp = 1;
2036 } else {
2037 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2038 }
2039 break;
2040 }
2041 case nir_op_uadd_carry: {
2042 Temp src0 = get_alu_src(ctx, instr->src[0]);
2043 Temp src1 = get_alu_src(ctx, instr->src[1]);
2044 if (dst.regClass() == s1) {
2045 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2046 break;
2047 }
2048 if (dst.regClass() == v1) {
2049 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
2050 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2051 carry);
2052 break;
2053 }
2054
2055 Temp src00 = bld.tmp(src0.type(), 1);
2056 Temp src01 = bld.tmp(dst.type(), 1);
2057 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2058 Temp src10 = bld.tmp(src1.type(), 1);
2059 Temp src11 = bld.tmp(dst.type(), 1);
2060 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2061 if (dst.regClass() == s2) {
2062 Temp carry = bld.tmp(s1);
2063 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
2064 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2065 bld.scc(carry))
2066 .def(1)
2067 .getTemp();
2068 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2069 } else if (dst.regClass() == v2) {
2070 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
2071 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
2072 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2073 Operand::c32(1u), carry);
2074 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
2075 } else {
2076 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2077 }
2078 break;
2079 }
2080 case nir_op_isub: {
2081 if (dst.regClass() == s1) {
2082 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
2083 break;
2084 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2085 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2086 break;
2087 }
2088
2089 Temp src0 = get_alu_src(ctx, instr->src[0]);
2090 Temp src1 = get_alu_src(ctx, instr->src[1]);
2091 if (dst.regClass() == v1) {
2092 bld.vsub32(Definition(dst), src0, src1);
2093 break;
2094 } else if (dst.bytes() <= 2) {
2095 if (ctx->program->gfx_level >= GFX10)
2096 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
2097 else if (src1.type() == RegType::sgpr)
2098 bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
2099 else if (ctx->program->gfx_level >= GFX8)
2100 bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
2101 else
2102 bld.vsub32(Definition(dst), src0, src1);
2103 break;
2104 }
2105
2106 Temp src00 = bld.tmp(src0.type(), 1);
2107 Temp src01 = bld.tmp(dst.type(), 1);
2108 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2109 Temp src10 = bld.tmp(src1.type(), 1);
2110 Temp src11 = bld.tmp(dst.type(), 1);
2111 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2112 if (dst.regClass() == s2) {
2113 Temp borrow = bld.tmp(s1);
2114 Temp dst0 =
2115 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2116 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
2117 bld.scc(borrow));
2118 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2119 } else if (dst.regClass() == v2) {
2120 Temp lower = bld.tmp(v1);
2121 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
2122 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
2123 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2124 } else {
2125 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2126 }
2127 break;
2128 }
2129 case nir_op_usub_borrow: {
2130 Temp src0 = get_alu_src(ctx, instr->src[0]);
2131 Temp src1 = get_alu_src(ctx, instr->src[1]);
2132 if (dst.regClass() == s1) {
2133 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
2134 break;
2135 } else if (dst.regClass() == v1) {
2136 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
2137 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
2138 borrow);
2139 break;
2140 }
2141
2142 Temp src00 = bld.tmp(src0.type(), 1);
2143 Temp src01 = bld.tmp(dst.type(), 1);
2144 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2145 Temp src10 = bld.tmp(src1.type(), 1);
2146 Temp src11 = bld.tmp(dst.type(), 1);
2147 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2148 if (dst.regClass() == s2) {
2149 Temp borrow = bld.tmp(s1);
2150 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
2151 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
2152 bld.scc(borrow))
2153 .def(1)
2154 .getTemp();
2155 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2156 } else if (dst.regClass() == v2) {
2157 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
2158 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
2159 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
2160 Operand::c32(1u), borrow);
2161 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
2162 } else {
2163 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2164 }
2165 break;
2166 }
2167 case nir_op_usub_sat: {
2168 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2169 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
2170 sub_instr->valu().clamp = 1;
2171 break;
2172 }
2173 Temp src0 = get_alu_src(ctx, instr->src[0]);
2174 Temp src1 = get_alu_src(ctx, instr->src[1]);
2175 if (dst.regClass() == s1) {
2176 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
2177 bld.sop2(aco_opcode::s_sub_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
2178 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(0), tmp, bld.scc(carry));
2179 break;
2180 } else if (dst.regClass() == v2b) {
2181 Instruction* sub_instr;
2182 if (ctx->program->gfx_level >= GFX10) {
2183 sub_instr = bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1).instr;
2184 } else {
2185 aco_opcode op = aco_opcode::v_sub_u16;
2186 if (src1.type() == RegType::sgpr) {
2187 std::swap(src0, src1);
2188 op = aco_opcode::v_subrev_u16;
2189 }
2190 sub_instr = bld.vop2_e64(op, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
2191 }
2192 sub_instr->valu().clamp = 1;
2193 break;
2194 } else if (dst.regClass() == v1) {
2195 usub32_sat(bld, Definition(dst), src0, as_vgpr(ctx, src1));
2196 break;
2197 }
2198
2199 assert(src0.size() == 2 && src1.size() == 2);
2200 Temp src00 = bld.tmp(src0.type(), 1);
2201 Temp src01 = bld.tmp(src0.type(), 1);
2202 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
2203 Temp src10 = bld.tmp(src1.type(), 1);
2204 Temp src11 = bld.tmp(src1.type(), 1);
2205 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
2206
2207 if (dst.regClass() == s2) {
2208 Temp carry0 = bld.tmp(s1);
2209 Temp carry1 = bld.tmp(s1);
2210
2211 Temp no_sat0 =
2212 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry0)), src00, src10);
2213 Temp no_sat1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(Definition(carry1)),
2214 src01, src11, bld.scc(carry0));
2215
2216 Temp no_sat = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), no_sat0, no_sat1);
2217
2218 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c64(0ull), no_sat,
2219 bld.scc(carry1));
2220 } else if (dst.regClass() == v2) {
2221 Temp no_sat0 = bld.tmp(v1);
2222 Temp dst0 = bld.tmp(v1);
2223 Temp dst1 = bld.tmp(v1);
2224
2225 Temp carry0 = bld.vsub32(Definition(no_sat0), src00, src10, true).def(1).getTemp();
2226 Temp carry1;
2227
2228 if (ctx->program->gfx_level >= GFX8) {
2229 carry1 = bld.tmp(bld.lm);
2230 bld.vop2_e64(aco_opcode::v_subb_co_u32, Definition(dst1), Definition(carry1),
2231 as_vgpr(ctx, src01), as_vgpr(ctx, src11), carry0)
2232 ->valu()
2233 .clamp = 1;
2234 } else {
2235 Temp no_sat1 = bld.tmp(v1);
2236 carry1 = bld.vsub32(Definition(no_sat1), src01, src11, true, carry0).def(1).getTemp();
2237 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst1), no_sat1, Operand::c32(0u),
2238 carry1);
2239 }
2240
2241 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst0), no_sat0, Operand::c32(0u),
2242 carry1);
2243 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2244 } else {
2245 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2246 }
2247 break;
2248 }
2249 case nir_op_isub_sat: {
2250 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2251 Instruction* sub_instr = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_i16, dst);
2252 sub_instr->valu().clamp = 1;
2253 break;
2254 }
2255 Temp src0 = get_alu_src(ctx, instr->src[0]);
2256 Temp src1 = get_alu_src(ctx, instr->src[1]);
2257 if (dst.regClass() == s1) {
2258 Temp cond = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src1, Operand::zero());
2259 Temp bound = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(bld.def(s1, scc)),
2260 Operand::c32(INT32_MAX), cond);
2261 Temp overflow = bld.tmp(s1);
2262 Temp sub =
2263 bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.scc(Definition(overflow)), src0, src1);
2264 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bound, sub, bld.scc(overflow));
2265 break;
2266 }
2267
2268 src1 = as_vgpr(ctx, src1);
2269
2270 if (dst.regClass() == v2b) {
2271 Instruction* sub_instr =
2272 bld.vop3(aco_opcode::v_sub_i16, Definition(dst), src0, src1).instr;
2273 sub_instr->valu().clamp = 1;
2274 } else if (dst.regClass() == v1) {
2275 Instruction* sub_instr =
2276 bld.vop3(aco_opcode::v_sub_i32, Definition(dst), src0, src1).instr;
2277 sub_instr->valu().clamp = 1;
2278 } else {
2279 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2280 }
2281 break;
2282 }
2283 case nir_op_imul: {
2284 if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
2285 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
2286 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
2287 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
2288 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2289 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
2290 } else if (dst.type() == RegType::vgpr) {
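         /* Prefer v_mul_u32_u24 when both operands are known to fit in 24 bits (cheaper than
          * v_mul_lo_u32). Multiplications by a constant go through v_mul_imm; everything else
          * falls back to v_mul_lo_u32. */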
2291 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2292 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2293
2294 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2295 bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
2296 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
2297 true /* commutative */, false, false, nuw_16bit);
2298 } else if (nir_src_is_const(instr->src[0].src)) {
2299 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
2300 nir_src_as_uint(instr->src[0].src), false);
2301 } else if (nir_src_is_const(instr->src[1].src)) {
2302 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
2303 nir_src_as_uint(instr->src[1].src), false);
2304 } else {
2305 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
2306 }
2307 } else if (dst.regClass() == s1) {
2308 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
2309 } else {
2310 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2311 }
2312 break;
2313 }
2314 case nir_op_umul_high: {
2315 if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2316 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
2317 } else if (dst.bytes() == 4) {
2318 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
2319 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
2320
2321 Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
2322 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2323 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2324 } else {
2325 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2326 }
2327
2328 if (dst.regClass() == s1)
2329 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2330 } else {
2331 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2332 }
2333 break;
2334 }
2335 case nir_op_imul_high: {
2336 if (dst.regClass() == v1) {
2337 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2338 } else if (dst.regClass() == s1 && ctx->options->gfx_level >= GFX9) {
2339 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2340 } else if (dst.regClass() == s1) {
2341 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2342 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2343 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2344 } else {
2345 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2346 }
2347 break;
2348 }
2349 case nir_op_fmul: {
2350 if (dst.regClass() == v2b) {
2351 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2352 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2353 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2354 } else if (dst.regClass() == v1) {
2355 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2356 } else if (dst.regClass() == v2) {
2357 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2358 } else {
2359 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2360 }
2361 break;
2362 }
2363 case nir_op_fmulz: {
2364 if (dst.regClass() == v1) {
2365 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_legacy_f32, dst, true);
2366 } else {
2367 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2368 }
2369 break;
2370 }
2371 case nir_op_fadd: {
2372 if (dst.regClass() == v2b) {
2373 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2374 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2375 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2376 } else if (dst.regClass() == v1) {
2377 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2378 } else if (dst.regClass() == v2) {
2379 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2380 } else {
2381 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2382 }
2383 break;
2384 }
2385 case nir_op_fsub: {
2386 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2387 Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2388 VALU_instruction& sub = add->valu();
2389 sub.neg_lo[1] = true;
2390 sub.neg_hi[1] = true;
2391 break;
2392 }
2393
2394 Temp src0 = get_alu_src(ctx, instr->src[0]);
2395 Temp src1 = get_alu_src(ctx, instr->src[1]);
2396 if (dst.regClass() == v2b) {
2397 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2398 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2399 else
2400 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2401 } else if (dst.regClass() == v1) {
2402 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2403 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2404 else
2405 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2406 } else if (dst.regClass() == v2) {
2407 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2408 as_vgpr(ctx, src1));
2409 add->valu().neg[1] = true;
2410 } else {
2411 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2412 }
2413 break;
2414 }
2415 case nir_op_ffma: {
2416 if (dst.regClass() == v2b) {
2417 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f16, dst, false, 3);
2418 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2419 assert(instr->def.num_components == 2);
2420
2421 Temp src0 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
2422 Temp src1 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[1]));
2423 Temp src2 = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[2]));
2424
2425 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
2426 unsigned opsel_lo = 0, opsel_hi = 0;
2427 for (unsigned i = 0; i < 3; i++) {
2428 opsel_lo |= (instr->src[i].swizzle[0] & 1) << i;
2429 opsel_hi |= (instr->src[i].swizzle[1] & 1) << i;
2430 }
2431
2432 bld.vop3p(aco_opcode::v_pk_fma_f16, Definition(dst), src0, src1, src2, opsel_lo, opsel_hi);
2433 } else if (dst.regClass() == v1) {
2434 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f32, dst,
2435 ctx->block->fp_mode.must_flush_denorms32, 3);
2436 } else if (dst.regClass() == v2) {
2437 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_f64, dst, false, 3);
2438 } else {
2439 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2440 }
2441 break;
2442 }
2443 case nir_op_ffmaz: {
2444 if (dst.regClass() == v1) {
2445 emit_vop3a_instruction(ctx, instr, aco_opcode::v_fma_legacy_f32, dst,
2446 ctx->block->fp_mode.must_flush_denorms32, 3);
2447 } else {
2448 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2449 }
2450 break;
2451 }
2452 case nir_op_fmax: {
2453 if (dst.regClass() == v2b) {
2454 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true, false,
2455 ctx->block->fp_mode.must_flush_denorms16_64);
2456 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2457 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2458 } else if (dst.regClass() == v1) {
2459 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2460 ctx->block->fp_mode.must_flush_denorms32);
2461 } else if (dst.regClass() == v2) {
2462 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2463 ctx->block->fp_mode.must_flush_denorms16_64);
2464 } else {
2465 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2466 }
2467 break;
2468 }
2469 case nir_op_fmin: {
2470 if (dst.regClass() == v2b) {
2471 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true, false,
2472 ctx->block->fp_mode.must_flush_denorms16_64);
2473 } else if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2474 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2475 } else if (dst.regClass() == v1) {
2476 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2477 ctx->block->fp_mode.must_flush_denorms32);
2478 } else if (dst.regClass() == v2) {
2479 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2480 ctx->block->fp_mode.must_flush_denorms16_64);
2481 } else {
2482 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2483 }
2484 break;
2485 }
2486 case nir_op_sdot_4x8_iadd: {
2487 if (ctx->options->gfx_level >= GFX11)
2488 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x3);
2489 else
2490 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2491 break;
2492 }
2493 case nir_op_sdot_4x8_iadd_sat: {
2494 if (ctx->options->gfx_level >= GFX11)
2495 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x3);
2496 else
2497 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2498 break;
2499 }
2500 case nir_op_sudot_4x8_iadd: {
2501 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, false, 0x1);
2502 break;
2503 }
2504 case nir_op_sudot_4x8_iadd_sat: {
2505 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_iu8, dst, true, 0x1);
2506 break;
2507 }
2508 case nir_op_udot_4x8_uadd: {
2509 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2510 break;
2511 }
2512 case nir_op_udot_4x8_uadd_sat: {
2513 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2514 break;
2515 }
2516 case nir_op_sdot_2x16_iadd: {
2517 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2518 break;
2519 }
2520 case nir_op_sdot_2x16_iadd_sat: {
2521 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2522 break;
2523 }
2524 case nir_op_udot_2x16_uadd: {
2525 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2526 break;
2527 }
2528 case nir_op_udot_2x16_uadd_sat: {
2529 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2530 break;
2531 }
2532 case nir_op_cube_amd: {
2533 Temp in = get_alu_src(ctx, instr->src[0], 3);
2534 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2535 emit_extract_vector(ctx, in, 2, v1)};
2536 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2537 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2538 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2539 Temp id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), src[0], src[1], src[2]);
2540 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tc, sc, ma, id);
2541 break;
2542 }
2543 case nir_op_bcsel: {
2544 emit_bcsel(ctx, instr, dst);
2545 break;
2546 }
2547 case nir_op_frsq: {
2548 if (dst.regClass() == v2b) {
2549 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2550 } else if (dst.regClass() == v1) {
2551 Temp src = get_alu_src(ctx, instr->src[0]);
2552 emit_rsq(ctx, bld, Definition(dst), src);
2553 } else if (dst.regClass() == v2) {
2554 /* Lowered at NIR level for precision reasons. */
2555 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2556 } else {
2557 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2558 }
2559 break;
2560 }
2561 case nir_op_fneg: {
2562 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2563 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2564 Instruction* vop3p =
2565 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2566 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2567 vop3p->valu().neg_lo[0] = true;
2568 vop3p->valu().neg_hi[0] = true;
2569 break;
2570 }
2571 Temp src = get_alu_src(ctx, instr->src[0]);
2572 if (dst.regClass() == v2b) {
2573 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2574 } else if (dst.regClass() == v1) {
2575 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2576 as_vgpr(ctx, src));
2577 } else if (dst.regClass() == v2) {
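         /* 64-bit fneg: flush denorms via a multiply by 1.0 if required, then flip the sign bit of
          * the high dword. */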
2578 if (ctx->block->fp_mode.must_flush_denorms16_64)
2579 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2580 as_vgpr(ctx, src));
2581 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2582 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2583 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2584 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2585 } else {
2586 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2587 }
2588 break;
2589 }
2590 case nir_op_fabs: {
2591 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2592 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2593 Instruction* vop3p =
2594 bld.vop3p(aco_opcode::v_pk_max_f16, Definition(dst), src, src,
2595 instr->src[0].swizzle[0] & 1 ? 3 : 0, instr->src[0].swizzle[1] & 1 ? 3 : 0)
2596 .instr;
2597 vop3p->valu().neg_lo[1] = true;
2598 vop3p->valu().neg_hi[1] = true;
2599 break;
2600 }
2601 Temp src = get_alu_src(ctx, instr->src[0]);
2602 if (dst.regClass() == v2b) {
2603 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2604 Operand::c16(0x3c00), as_vgpr(ctx, src))
2605 .instr;
2606 mul->valu().abs[1] = true;
2607 } else if (dst.regClass() == v1) {
2608 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2609 Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2610 .instr;
2611 mul->valu().abs[1] = true;
2612 } else if (dst.regClass() == v2) {
2613 if (ctx->block->fp_mode.must_flush_denorms16_64)
2614 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2615 as_vgpr(ctx, src));
2616 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2617 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2618 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2619 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2620 } else {
2621 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2622 }
2623 break;
2624 }
2625 case nir_op_fsat: {
2626 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
2627 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2628 Instruction* vop3p =
2629 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2630 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2631 vop3p->valu().clamp = true;
2632 break;
2633 }
2634 Temp src = get_alu_src(ctx, instr->src[0]);
2635 if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX9) {
2636 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2637 src);
2638 } else if (dst.regClass() == v2b) {
2639 bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0x3c00), src)
2640 ->valu()
2641 .clamp = true;
2642 } else if (dst.regClass() == v1) {
2643 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2644 Operand::c32(0x3f800000u), src);
2645 /* apparently, it is not necessary to flush denorms if this instruction is used with these
2646 * operands */
2647 // TODO: confirm that this holds under any circumstances
2648 } else if (dst.regClass() == v2) {
2649 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2650 add->valu().clamp = true;
2651 } else {
2652 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2653 }
2654 break;
2655 }
2656 case nir_op_flog2: {
2657 if (dst.regClass() == v2b) {
2658 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2659 } else if (dst.regClass() == v1) {
2660 Temp src = get_alu_src(ctx, instr->src[0]);
2661 emit_log2(ctx, bld, Definition(dst), src);
2662 } else {
2663 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2664 }
2665 break;
2666 }
2667 case nir_op_frcp: {
2668 if (dst.regClass() == v2b) {
2669 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2670 } else if (dst.regClass() == v1) {
2671 Temp src = get_alu_src(ctx, instr->src[0]);
2672 emit_rcp(ctx, bld, Definition(dst), src);
2673 } else if (dst.regClass() == v2) {
2674 /* Lowered at NIR level for precision reasons. */
2675 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2676 } else {
2677 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2678 }
2679 break;
2680 }
2681 case nir_op_fexp2: {
2682 if (dst.regClass() == v2b) {
2683 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2684 } else if (dst.regClass() == v1) {
2685 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2686 } else {
2687 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2688 }
2689 break;
2690 }
2691 case nir_op_fsqrt: {
2692 if (dst.regClass() == v2b) {
2693 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2694 } else if (dst.regClass() == v1) {
2695 Temp src = get_alu_src(ctx, instr->src[0]);
2696 emit_sqrt(ctx, bld, Definition(dst), src);
2697 } else if (dst.regClass() == v2) {
2698 /* Lowered at NIR level for precision reasons. */
2699 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2700 } else {
2701 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2702 }
2703 break;
2704 }
2705 case nir_op_ffract: {
2706 if (dst.regClass() == v2b) {
2707 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2708 } else if (dst.regClass() == v1) {
2709 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2710 } else if (dst.regClass() == v2) {
2711 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2712 } else {
2713 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2714 }
2715 break;
2716 }
2717 case nir_op_ffloor: {
2718 if (dst.regClass() == v2b) {
2719 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2720 } else if (dst.regClass() == v1) {
2721 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2722 } else if (dst.regClass() == v2) {
2723 Temp src = get_alu_src(ctx, instr->src[0]);
2724 emit_floor_f64(ctx, bld, Definition(dst), src);
2725 } else {
2726 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2727 }
2728 break;
2729 }
2730 case nir_op_fceil: {
2731 if (dst.regClass() == v2b) {
2732 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2733 } else if (dst.regClass() == v1) {
2734 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2735 } else if (dst.regClass() == v2) {
2736 if (ctx->options->gfx_level >= GFX7) {
2737 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2738 } else {
2739 /* GFX6 doesn't support V_CEIL_F64, lower it. */
2740 /* trunc = trunc(src0)
2741 * if (src0 > 0.0 && src0 != trunc)
2742 * trunc += 1.0
2743 */
2744 Temp src0 = get_alu_src(ctx, instr->src[0]);
2745 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2746 Temp tmp0 =
2747 bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2748 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.def(bld.lm), src0, trunc);
2749 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp0, tmp1);
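/* add = cond ? 1.0 : 0.0 (0x3ff00000 is the high dword of the double 1.0) */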
2750 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2751 bld.copy(bld.def(v1), Operand::zero()),
2752 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2753 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2754 bld.copy(bld.def(v1), Operand::zero()), add);
2755 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2756 }
2757 } else {
2758 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2759 }
2760 break;
2761 }
2762 case nir_op_ftrunc: {
2763 if (dst.regClass() == v2b) {
2764 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2765 } else if (dst.regClass() == v1) {
2766 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2767 } else if (dst.regClass() == v2) {
2768 Temp src = get_alu_src(ctx, instr->src[0]);
2769 emit_trunc_f64(ctx, bld, Definition(dst), src);
2770 } else {
2771 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2772 }
2773 break;
2774 }
2775 case nir_op_fround_even: {
2776 if (dst.regClass() == v2b) {
2777 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2778 } else if (dst.regClass() == v1) {
2779 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2780 } else if (dst.regClass() == v2) {
2781 if (ctx->options->gfx_level >= GFX7) {
2782 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2783 } else {
2784 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
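/* Round to nearest even by adding and then subtracting +-2^52 (high dword 0x43300000, sign
* copied from the source via v_bfi); sources with |src| at or above ~2^52 are already integral
* and are passed through unchanged. */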
2785 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2786 Temp src0 = get_alu_src(ctx, instr->src[0]);
2787 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2788
2789 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2790 bld.copy(bld.def(s1), Operand::c32(-2u)));
2791 Temp bfi =
2792 bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2793 bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2794 Temp tmp =
2795 bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2796 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2797 Instruction* sub =
2798 bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2799 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2800 sub->valu().neg[1] = true;
2801 tmp = sub->definitions[0].getTemp();
2802
2803 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2804 Operand::c32(0x432fffffu));
2805 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, v);
2806 vop3->valu().abs[0] = true;
2807 Temp cond = vop3->definitions[0].getTemp();
2808
2809 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2810 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2811 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2812 as_vgpr(ctx, src0_lo), cond);
2813 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2814 as_vgpr(ctx, src0_hi), cond);
2815
2816 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2817 }
2818 } else {
2819 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2820 }
2821 break;
2822 }
2823 case nir_op_fsin_amd:
2824 case nir_op_fcos_amd: {
2825 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2826 aco_ptr<Instruction> norm;
2827 if (dst.regClass() == v2b) {
2828 aco_opcode opcode =
2829 instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2830 bld.vop1(opcode, Definition(dst), src);
2831 } else if (dst.regClass() == v1) {
2832 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2833 if (ctx->options->gfx_level < GFX9)
2834 src = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), src);
2835
2836 aco_opcode opcode =
2837 instr->op == nir_op_fsin_amd ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2838 bld.vop1(opcode, Definition(dst), src);
2839 } else {
2840 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2841 }
2842 break;
2843 }
2844 case nir_op_ldexp: {
2845 if (dst.regClass() == v2b) {
2846 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2847 } else if (dst.regClass() == v1) {
2848 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2849 } else if (dst.regClass() == v2) {
2850 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2851 } else {
2852 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2853 }
2854 break;
2855 }
2856 case nir_op_frexp_sig: {
2857 if (dst.regClass() == v2b) {
2858 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2859 } else if (dst.regClass() == v1) {
2860 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2861 } else if (dst.regClass() == v2) {
2862 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2863 } else {
2864 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2865 }
2866 break;
2867 }
2868 case nir_op_frexp_exp: {
2869 if (instr->src[0].src.ssa->bit_size == 16) {
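/* The f16 exponent fits in a signed byte: extract the low byte of the i16 result and
* sign-extend it to 32 bits. */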
2870 Temp src = get_alu_src(ctx, instr->src[0]);
2871 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2872 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2873 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2874 } else if (instr->src[0].src.ssa->bit_size == 32) {
2875 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2876 } else if (instr->src[0].src.ssa->bit_size == 64) {
2877 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2878 } else {
2879 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2880 }
2881 break;
2882 }
2883 case nir_op_fsign: {
2884 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2885 if (dst.regClass() == v2b) {
2886 /* replace negative zero with positive zero */
2887 src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2888 if (ctx->program->gfx_level >= GFX9) {
2889 src = bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src,
2890 Operand::c16(1u));
2891 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2892 } else {
2893 src = convert_int(ctx, bld, src, 16, 32, true);
2894 src = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src,
2895 Operand::c32(1u));
2896 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2897 }
2898 } else if (dst.regClass() == v1) {
2899 /* Legacy multiply with +Inf turns +-0.0 into +0.0 and all other numbers into
2900 * the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
2901 */
2902 Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
2903 src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, src);
2904 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
2905 Operand::c32(0xbf800000));
2906 } else if (dst.regClass() == v2) {
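/* The low dword of the result is always zero; the high dword is +1.0 for src > 0,
* -1.0 for src < 0, and the source's own high dword for +-0.0. */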
2907 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
2908 Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2909 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2910 emit_extract_vector(ctx, src, 1, v1), cond);
2911
2912 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.def(bld.lm), Operand::zero(), src);
2913 tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2914 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2915
2916 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2917 } else {
2918 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2919 }
2920 break;
2921 }
2922 case nir_op_f2f16:
2923 case nir_op_f2f16_rtne: {
2924 assert(instr->src[0].src.ssa->bit_size == 32);
2925 if (instr->def.num_components == 2) {
2926 /* Vectorizing f2f16 is only possible with rtz. */
2927 assert(instr->op != nir_op_f2f16_rtne);
2928 assert(ctx->block->fp_mode.round16_64 == fp_round_tz ||
2929 !ctx->block->fp_mode.care_about_round16_64);
2930 emit_vec2_f2f16(ctx, instr, dst);
2931 break;
2932 }
2933 Temp src = get_alu_src(ctx, instr->src[0]);
2934 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2935 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2936 * keep value numbering and the scheduler simpler.
2937 */
2938 bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2939 else
2940 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2941 break;
2942 }
2943 case nir_op_f2f16_rtz: {
2944 assert(instr->src[0].src.ssa->bit_size == 32);
2945 if (instr->def.num_components == 2) {
2946 emit_vec2_f2f16(ctx, instr, dst);
2947 break;
2948 }
2949 Temp src = get_alu_src(ctx, instr->src[0]);
2950 if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2951 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2952 else if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
2953 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2954 else
2955 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2956 break;
2957 }
2958 case nir_op_f2f32: {
2959 if (instr->src[0].src.ssa->bit_size == 16) {
2960 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2961 } else if (instr->src[0].src.ssa->bit_size == 64) {
2962 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2963 } else {
2964 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2965 }
2966 break;
2967 }
2968 case nir_op_f2f64: {
2969 assert(instr->src[0].src.ssa->bit_size == 32);
2970 Temp src = get_alu_src(ctx, instr->src[0]);
2971 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2972 break;
2973 }
2974 case nir_op_i2f16: {
2975 assert(dst.regClass() == v2b);
2976 Temp src = get_alu_src(ctx, instr->src[0]);
2977 const unsigned input_size = instr->src[0].src.ssa->bit_size;
2978 if (input_size <= 16) {
2979 /* Expand the integer to the size expected by the int→float conversion used below */
2980 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
2981 if (input_size != target_size) {
2982 src = convert_int(ctx, bld, src, input_size, target_size, true);
2983 }
2984 }
2985
2986 if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
2987 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2988 } else {
2989 /* Large 32-bit inputs need to return +-Inf/FLOAT_MAX.
2990 *
2991 * This is also the fallback path taken on GFX7 and earlier, which
2992 * do not support direct f16⟷i16 conversions.
2993 */
2994 src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2995 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2996 }
2997 break;
2998 }
2999 case nir_op_i2f32: {
3000 assert(dst.size() == 1);
3001 Temp src = get_alu_src(ctx, instr->src[0]);
3002 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3003 if (input_size <= 32) {
3004 if (input_size <= 16) {
3005 /* Sign-extend to 32 bits */
3006 src = convert_int(ctx, bld, src, input_size, 32, true);
3007 }
3008 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
3009 } else {
3010 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3011 }
3012 break;
3013 }
3014 case nir_op_i2f64: {
3015 if (instr->src[0].src.ssa->bit_size <= 32) {
3016 Temp src = get_alu_src(ctx, instr->src[0]);
3017 if (instr->src[0].src.ssa->bit_size <= 16)
3018 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
3019 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
3020 } else {
3021 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3022 }
3023 break;
3024 }
3025 case nir_op_u2f16: {
3026 assert(dst.regClass() == v2b);
3027 Temp src = get_alu_src(ctx, instr->src[0]);
3028 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3029 if (input_size <= 16) {
3030 /* Expand integer to the size expected by the uint→float converter used below */
3031 unsigned target_size = (ctx->program->gfx_level >= GFX8 ? 16 : 32);
3032 if (input_size != target_size) {
3033 src = convert_int(ctx, bld, src, input_size, target_size, false);
3034 }
3035 }
3036
3037 if (ctx->program->gfx_level >= GFX8 && input_size <= 16) {
3038 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
3039 } else {
3040 /* Large 32-bit inputs need to return Inf/FLOAT_MAX.
3041 *
3042 * This is also the fallback path taken on GFX7 and earlier, which
3043 * do not support direct f16⟷u16 conversions.
3044 */
3045 src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
3046 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
3047 }
3048 break;
3049 }
3050 case nir_op_u2f32: {
3051 assert(dst.size() == 1);
3052 Temp src = get_alu_src(ctx, instr->src[0]);
3053 const unsigned input_size = instr->src[0].src.ssa->bit_size;
3054 if (input_size == 8) {
3055 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
3056 } else if (input_size <= 32) {
3057 if (input_size == 16)
3058 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3059 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
3060 } else {
3061 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3062 }
3063 break;
3064 }
3065 case nir_op_u2f64: {
3066 if (instr->src[0].src.ssa->bit_size <= 32) {
3067 Temp src = get_alu_src(ctx, instr->src[0]);
3068 if (instr->src[0].src.ssa->bit_size <= 16)
3069 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
3070 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
3071 } else {
3072 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3073 }
3074 break;
3075 }
3076 case nir_op_f2i8:
3077 case nir_op_f2i16: {
3078 if (instr->src[0].src.ssa->bit_size == 16) {
3079 if (ctx->program->gfx_level >= GFX8) {
3080 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
3081 } else {
3082 /* GFX7 and earlier do not support direct f16⟷i16 conversions */
3083 Temp tmp = bld.tmp(v1);
3084 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3085 tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
3086 tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3087 (dst.type() == RegType::sgpr) ? Temp() : dst);
3088 if (dst.type() == RegType::sgpr) {
3089 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3090 }
3091 }
3092 } else if (instr->src[0].src.ssa->bit_size == 32) {
3093 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3094 } else {
3095 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3096 }
3097 break;
3098 }
3099 case nir_op_f2u8:
3100 case nir_op_f2u16: {
3101 if (instr->src[0].src.ssa->bit_size == 16) {
3102 if (ctx->program->gfx_level >= GFX8) {
3103 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
3104 } else {
3105 /* GFX7 and earlier do not support direct f16⟷u16 conversions */
3106 Temp tmp = bld.tmp(v1);
3107 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
3108 tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
3109 tmp = convert_int(ctx, bld, tmp, 32, instr->def.bit_size, false,
3110 (dst.type() == RegType::sgpr) ? Temp() : dst);
3111 if (dst.type() == RegType::sgpr) {
3112 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
3113 }
3114 }
3115 } else if (instr->src[0].src.ssa->bit_size == 32) {
3116 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3117 } else {
3118 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3119 }
3120 break;
3121 }
3122 case nir_op_f2i32: {
3123 Temp src = get_alu_src(ctx, instr->src[0]);
3124 if (instr->src[0].src.ssa->bit_size == 16) {
3125 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3126 if (dst.type() == RegType::vgpr) {
3127 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
3128 } else {
3129 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3130 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
3131 }
3132 } else if (instr->src[0].src.ssa->bit_size == 32) {
3133 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
3134 } else if (instr->src[0].src.ssa->bit_size == 64) {
3135 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
3136 } else {
3137 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3138 }
3139 break;
3140 }
3141 case nir_op_f2u32: {
3142 Temp src = get_alu_src(ctx, instr->src[0]);
3143 if (instr->src[0].src.ssa->bit_size == 16) {
3144 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
3145 if (dst.type() == RegType::vgpr) {
3146 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
3147 } else {
3148 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
3149 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
3150 }
3151 } else if (instr->src[0].src.ssa->bit_size == 32) {
3152 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
3153 } else if (instr->src[0].src.ssa->bit_size == 64) {
3154 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
3155 } else {
3156 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3157 }
3158 break;
3159 }
3160 case nir_op_b2f16: {
3161 Temp src = get_alu_src(ctx, instr->src[0]);
3162 assert(src.regClass() == bld.lm);
3163
3164 if (dst.regClass() == s1) {
3165 src = bool_to_scalar_condition(ctx, src);
3166 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3167 } else if (dst.regClass() == v2b) {
3168 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3169 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3170 } else {
3171 unreachable("Wrong destination register class for nir_op_b2f16.");
3172 }
3173 break;
3174 }
3175 case nir_op_b2f32: {
3176 Temp src = get_alu_src(ctx, instr->src[0]);
3177 assert(src.regClass() == bld.lm);
3178
3179 if (dst.regClass() == s1) {
3180 src = bool_to_scalar_condition(ctx, src);
3181 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3182 } else if (dst.regClass() == v1) {
3183 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3184 Operand::c32(0x3f800000u), src);
3185 } else {
3186 unreachable("Wrong destination register class for nir_op_b2f32.");
3187 }
3188 break;
3189 }
3190 case nir_op_b2f64: {
3191 Temp src = get_alu_src(ctx, instr->src[0]);
3192 assert(src.regClass() == bld.lm);
3193
3194 if (dst.regClass() == s2) {
3195 src = bool_to_scalar_condition(ctx, src);
3196 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3197 Operand::zero(), bld.scc(src));
3198 } else if (dst.regClass() == v2) {
3199 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3200 Temp upper =
3201 bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3202 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3203 } else {
3204 unreachable("Wrong destination register class for nir_op_b2f64.");
3205 }
3206 break;
3207 }
3208 case nir_op_i2i8:
3209 case nir_op_i2i16:
3210 case nir_op_i2i32: {
3211 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3212 /* no need to do the extract in get_alu_src() */
3213 sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3214 ? sgpr_extract_sext
3215 : sgpr_extract_undef;
3216 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3217 } else {
3218 const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3219 const unsigned output_bitsize = instr->def.bit_size;
3220 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3221 output_bitsize > input_bitsize, dst);
3222 }
3223 break;
3224 }
3225 case nir_op_u2u8:
3226 case nir_op_u2u16:
3227 case nir_op_u2u32: {
3228 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3229 /* no need to do the extract in get_alu_src() */
3230 sgpr_extract_mode mode = instr->def.bit_size > instr->src[0].src.ssa->bit_size
3231 ? sgpr_extract_zext
3232 : sgpr_extract_undef;
3233 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3234 } else {
3235 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3236 instr->def.bit_size, false, dst);
3237 }
3238 break;
3239 }
3240 case nir_op_b2b32:
3241 case nir_op_b2i8:
3242 case nir_op_b2i16:
3243 case nir_op_b2i32: {
3244 Temp src = get_alu_src(ctx, instr->src[0]);
3245 assert(src.regClass() == bld.lm);
3246
3247 if (dst.regClass() == s1) {
3248 bool_to_scalar_condition(ctx, src, dst);
3249 } else if (dst.type() == RegType::vgpr) {
3250 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
3251 src);
3252 } else {
3253 unreachable("Invalid register class for b2i32");
3254 }
3255 break;
3256 }
3257 case nir_op_b2b1: {
3258 Temp src = get_alu_src(ctx, instr->src[0]);
3259 assert(dst.regClass() == bld.lm);
3260
3261 if (src.type() == RegType::vgpr) {
3262 assert(src.regClass() == v1 || src.regClass() == v2);
3263 assert(dst.regClass() == bld.lm);
3264 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3265 Definition(dst), Operand::zero(), src);
3266 } else {
3267 assert(src.regClass() == s1 || src.regClass() == s2);
3268 Temp tmp;
3269 if (src.regClass() == s2 && ctx->program->gfx_level <= GFX7) {
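/* GFX7 and earlier lack s_cmp_lg_u64; use the SCC definition of s_or_b64 instead. */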
3270 tmp =
3271 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3272 .def(1)
3273 .getTemp();
3274 } else {
3275 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3276 bld.scc(bld.def(s1)), Operand::zero(), src);
3277 }
3278 bool_to_vector_condition(ctx, tmp, dst);
3279 }
3280 break;
3281 }
3282 case nir_op_unpack_64_2x32:
3283 case nir_op_unpack_32_2x16:
3284 case nir_op_unpack_64_4x16:
3285 case nir_op_unpack_32_4x8:
3286 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3287 emit_split_vector(
3288 ctx, dst, instr->op == nir_op_unpack_32_4x8 || instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3289 break;
3290 case nir_op_pack_64_2x32_split: {
3291 Temp src0 = get_alu_src(ctx, instr->src[0]);
3292 Temp src1 = get_alu_src(ctx, instr->src[1]);
3293
3294 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3295 break;
3296 }
3297 case nir_op_unpack_64_2x32_split_x:
3298 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3299 get_alu_src(ctx, instr->src[0]));
3300 break;
3301 case nir_op_unpack_64_2x32_split_y:
3302 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3303 get_alu_src(ctx, instr->src[0]));
3304 break;
3305 case nir_op_unpack_32_2x16_split_x:
3306 if (dst.type() == RegType::vgpr) {
3307 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3308 get_alu_src(ctx, instr->src[0]));
3309 } else {
3310 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3311 }
3312 break;
3313 case nir_op_unpack_32_2x16_split_y:
3314 if (dst.type() == RegType::vgpr) {
3315 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3316 get_alu_src(ctx, instr->src[0]));
3317 } else {
3318 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3319 get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3320 Operand::zero());
3321 }
3322 break;
3323 case nir_op_pack_32_2x16_split: {
3324 Temp src0 = get_alu_src(ctx, instr->src[0]);
3325 Temp src1 = get_alu_src(ctx, instr->src[1]);
3326 if (dst.regClass() == v1) {
3327 src0 = emit_extract_vector(ctx, src0, 0, v2b);
3328 src1 = emit_extract_vector(ctx, src1, 0, v2b);
3329 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3330 } else {
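/* SGPR path: mask src0 to its low 16 bits, shift src1 into the high half and OR them together. */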
3331 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3332 Operand::c32(0xFFFFu));
3333 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3334 Operand::c32(16u));
3335 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3336 }
3337 break;
3338 }
3339 case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3340 case nir_op_pack_half_2x16_rtz_split:
3341 case nir_op_pack_half_2x16_split: {
3342 if (dst.regClass() == v1) {
3343 if (ctx->program->gfx_level == GFX8 || ctx->program->gfx_level == GFX9)
3344 emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3345 else
3346 emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3347 } else {
3348 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3349 }
3350 break;
3351 }
3352 case nir_op_pack_unorm_2x16:
3353 case nir_op_pack_snorm_2x16: {
3354 unsigned bit_size = instr->src[0].src.ssa->bit_size;
3355 /* Only 16-bit and 32-bit sources are supported. */
3356 assert(bit_size == 32 || bit_size == 16);
3357
3358 RegClass src_rc = bit_size == 32 ? v1 : v2b;
3359 Temp src = get_alu_src(ctx, instr->src[0], 2);
3360 Temp src0 = emit_extract_vector(ctx, src, 0, src_rc);
3361 Temp src1 = emit_extract_vector(ctx, src, 1, src_rc);
3362
3363 /* Workaround for pre-GFX9 GPUs, which don't have fp16 pknorm instructions. */
3364 if (bit_size == 16 && ctx->program->gfx_level < GFX9) {
3365 src0 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0);
3366 src1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1);
3367 bit_size = 32;
3368 }
3369
3370 aco_opcode opcode;
3371 if (bit_size == 32) {
3372 opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f32
3373 : aco_opcode::v_cvt_pknorm_i16_f32;
3374 } else {
3375 opcode = instr->op == nir_op_pack_unorm_2x16 ? aco_opcode::v_cvt_pknorm_u16_f16
3376 : aco_opcode::v_cvt_pknorm_i16_f16;
3377 }
3378 bld.vop3(opcode, Definition(dst), src0, src1);
3379 break;
3380 }
3381 case nir_op_pack_uint_2x16:
3382 case nir_op_pack_sint_2x16: {
3383 Temp src = get_alu_src(ctx, instr->src[0], 2);
3384 Temp src0 = emit_extract_vector(ctx, src, 0, v1);
3385 Temp src1 = emit_extract_vector(ctx, src, 1, v1);
3386 aco_opcode opcode = instr->op == nir_op_pack_uint_2x16 ? aco_opcode::v_cvt_pk_u16_u32
3387 : aco_opcode::v_cvt_pk_i16_i32;
3388 bld.vop3(opcode, Definition(dst), src0, src1);
3389 break;
3390 }
3391 case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3392 case nir_op_unpack_half_2x16_split_x: {
3393 Temp src = get_alu_src(ctx, instr->src[0]);
3394 if (src.regClass() == v1)
3395 src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3396 if (dst.regClass() == v1) {
3397 assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3398 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3399 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3400 } else {
3401 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3402 }
3403 break;
3404 }
3405 case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3406 case nir_op_unpack_half_2x16_split_y: {
3407 Temp src = get_alu_src(ctx, instr->src[0]);
3408 if (src.regClass() == s1)
3409 src = bld.pseudo(aco_opcode::p_extract, bld.def(s1), bld.def(s1, scc), src,
3410 Operand::c32(1u), Operand::c32(16u), Operand::zero());
3411 else
3412 src =
3413 bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3414 if (dst.regClass() == v1) {
3415 assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3416 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3417 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3418 } else {
3419 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3420 }
3421 break;
3422 }
3423 case nir_op_msad_4x8: {
3424 assert(dst.regClass() == v1);
3425 emit_vop3a_instruction(ctx, instr, aco_opcode::v_msad_u8, dst, false, 3u, true);
3426 break;
3427 }
3428 case nir_op_fquantize2f16: {
3429 Temp src = get_alu_src(ctx, instr->src[0]);
3430 Temp f16;
3431 if (ctx->block->fp_mode.round16_64 != fp_round_ne)
3432 f16 = bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, bld.def(v2b), src);
3433 else
3434 f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), src);
3435 Temp f32, cmp_res;
3436
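/* Quantize by converting to f16 and back to f32; results that would be f16 denormals are
* flushed to zero below (preserving the sign if required). */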
3437 if (ctx->program->gfx_level >= GFX8) {
3438 Temp mask = bld.copy(
3439 bld.def(s1), Operand::c32(0x36Fu)); /* true iff the value is not a negative or positive denormal */
3440 cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.def(bld.lm), f16, mask);
3441 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3442 } else {
3443 /* 0x38800000 is the smallest normal half-float value (2^-14) represented as a 32-bit float,
3444 * so compare the result and flush it to 0 if it's smaller.
3445 */
3446 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3447 Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3448 Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3449 tmp0->valu().abs[0] = true;
3450 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f32, bld.def(bld.lm), Operand::zero(), f32);
3451 cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3452 tmp0->definitions[0].getTemp(), tmp1);
3453 }
3454
3455 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3456 Temp copysign_0 =
3457 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3458 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3459 } else {
3460 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3461 }
3462 break;
3463 }
3464 case nir_op_bfm: {
3465 Temp bits = get_alu_src(ctx, instr->src[0]);
3466 Temp offset = get_alu_src(ctx, instr->src[1]);
3467
3468 if (dst.regClass() == s1) {
3469 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3470 } else if (dst.regClass() == v1) {
3471 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3472 } else {
3473 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3474 }
3475 break;
3476 }
3477 case nir_op_bitfield_select: {
3478
3479 /* dst = (insert & bitmask) | (base & ~bitmask) */
3480 if (dst.regClass() == s1) {
3481 Temp bitmask = get_alu_src(ctx, instr->src[0]);
3482 Temp insert = get_alu_src(ctx, instr->src[1]);
3483 Temp base = get_alu_src(ctx, instr->src[2]);
3484 aco_ptr<Instruction> sop2;
3485 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3486 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3487 Operand lhs;
3488 if (const_insert && const_bitmask) {
3489 lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3490 } else {
3491 insert =
3492 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3493 lhs = Operand(insert);
3494 }
3495
3496 Operand rhs;
3497 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3498 if (const_base && const_bitmask) {
3499 rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3500 } else {
3501 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3502 rhs = Operand(base);
3503 }
3504
3505 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3506
3507 } else if (dst.regClass() == v1) {
3508 emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3509 } else {
3510 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3511 }
3512 break;
3513 }
3514 case nir_op_ubfe:
3515 case nir_op_ibfe: {
3516 if (dst.bytes() != 4)
3517 unreachable("Unsupported BFE bit size");
3518
3519 if (dst.type() == RegType::sgpr) {
3520 Temp base = get_alu_src(ctx, instr->src[0]);
3521
3522 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3523 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
3524 aco_opcode opcode =
3525 instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3526 if (const_offset && const_bits) {
3527 uint32_t extract = ((const_bits->u32 & 0x1f) << 16) | (const_offset->u32 & 0x1f);
3528 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3529 break;
3530 }
3531
3532 Temp offset = get_alu_src(ctx, instr->src[1]);
3533 Temp bits = get_alu_src(ctx, instr->src[2]);
3534
3535 if (ctx->program->gfx_level >= GFX9) {
3536 Operand bits_op = const_bits ? Operand::c32(const_bits->u32 & 0x1f)
3537 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3538 bld.def(s1, scc), bits, Operand::c32(0x1fu));
3539 Temp extract = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), offset, bits_op);
3540 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
3541 } else if (instr->op == nir_op_ubfe) {
3542 Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3543 Temp masked =
3544 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3545 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3546 } else {
3547 Operand bits_op = const_bits
3548 ? Operand::c32((const_bits->u32 & 0x1f) << 16)
3549 : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
3550 bld.sop2(aco_opcode::s_and_b32, bld.def(s1),
3551 bld.def(s1, scc), bits, Operand::c32(0x1fu)),
3552 Operand::c32(16u));
3553 Operand offset_op = const_offset
3554 ? Operand::c32(const_offset->u32 & 0x1fu)
3555 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3556 offset, Operand::c32(0x1fu));
3557
3558 Temp extract =
3559 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3560 bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3561 }
3562
3563 } else {
3564 aco_opcode opcode =
3565 instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3566 emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3567 }
3568 break;
3569 }
3570 case nir_op_extract_u8:
3571 case nir_op_extract_i8:
3572 case nir_op_extract_u16:
3573 case nir_op_extract_i16: {
3574 bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3575 unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3576 uint32_t bits = comp == 4 ? 8 : 16;
3577 unsigned index = nir_src_as_uint(instr->src[1].src);
3578 if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3579 assert(index == 0);
3580 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3581 } else if (dst.regClass() == s1 && instr->def.bit_size == 16) {
3582 Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3583 unsigned swizzle = instr->src[0].swizzle[0];
3584 if (vec.size() > 1) {
3585 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3586 swizzle = swizzle & 1;
3587 }
3588 index += swizzle * instr->def.bit_size / bits;
3589 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3590 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3591 } else {
3592 Temp src = get_alu_src(ctx, instr->src[0]);
3593 Definition def(dst);
3594 if (dst.bytes() == 8) {
3595 src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3596 index %= comp;
3597 def = bld.def(src.type(), 1);
3598 }
3599 assert(def.bytes() <= 4);
3600 if (def.regClass() == s1) {
3601 bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3602 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3603 } else {
3604 src = emit_extract_vector(ctx, src, 0, def.regClass());
3605 bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3606 Operand::c32(bits), Operand::c32(is_signed));
3607 }
3608 if (dst.size() == 2)
3609 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3610 Operand::zero());
3611 }
3612 break;
3613 }
3614 case nir_op_insert_u8:
3615 case nir_op_insert_u16: {
3616 unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3617 uint32_t bits = comp == 4 ? 8 : 16;
3618 unsigned index = nir_src_as_uint(instr->src[1].src);
3619 if (bits >= instr->def.bit_size || index * bits >= instr->def.bit_size) {
3620 assert(index == 0);
3621 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3622 } else {
3623 Temp src = get_alu_src(ctx, instr->src[0]);
3624 Definition def(dst);
3625 bool swap = false;
3626 if (dst.bytes() == 8) {
3627 src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3628 swap = index >= comp;
3629 index %= comp;
3630 def = bld.def(src.type(), 1);
3631 }
3632 if (def.regClass() == s1) {
3633 bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3634 Operand::c32(index), Operand::c32(bits));
3635 } else {
3636 src = emit_extract_vector(ctx, src, 0, def.regClass());
3637 bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3638 Operand::c32(bits));
3639 }
3640 if (dst.size() == 2 && swap)
3641 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3642 def.getTemp());
3643 else if (dst.size() == 2)
3644 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3645 Operand::zero());
3646 }
3647 break;
3648 }
3649 case nir_op_bit_count: {
3650 Temp src = get_alu_src(ctx, instr->src[0]);
3651 if (src.regClass() == s1) {
3652 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3653 } else if (src.regClass() == v1) {
3654 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3655 } else if (src.regClass() == v2) {
3656 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3657 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3658 emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3659 } else if (src.regClass() == s2) {
3660 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3661 } else {
3662 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3663 }
3664 break;
3665 }
3666 case nir_op_flt: {
3667 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3668 aco_opcode::v_cmp_lt_f64);
3669 break;
3670 }
3671 case nir_op_fge: {
3672 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3673 aco_opcode::v_cmp_ge_f64);
3674 break;
3675 }
3676 case nir_op_feq: {
3677 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3678 aco_opcode::v_cmp_eq_f64);
3679 break;
3680 }
3681 case nir_op_fneu: {
3682 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3683 aco_opcode::v_cmp_neq_f64);
3684 break;
3685 }
3686 case nir_op_ilt: {
3687 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3688 aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3689 break;
3690 }
3691 case nir_op_ige: {
3692 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3693 aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3694 break;
3695 }
3696 case nir_op_ieq: {
3697 if (instr->src[0].src.ssa->bit_size == 1)
3698 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3699 else
3700 emit_comparison(
3701 ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3702 aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3703 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3704 break;
3705 }
3706 case nir_op_ine: {
3707 if (instr->src[0].src.ssa->bit_size == 1)
3708 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3709 else
3710 emit_comparison(
3711 ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3712 aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3713 ctx->program->gfx_level >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3714 break;
3715 }
3716 case nir_op_ult: {
3717 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3718 aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3719 break;
3720 }
3721 case nir_op_uge: {
3722 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3723 aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3724 break;
3725 }
3726 case nir_op_bitz:
3727 case nir_op_bitnz: {
3728 assert(instr->src[0].src.ssa->bit_size != 1);
3729 bool test0 = instr->op == nir_op_bitz;
3730 Temp src0 = get_alu_src(ctx, instr->src[0]);
3731 Temp src1 = get_alu_src(ctx, instr->src[1]);
3732 bool use_valu = src0.type() == RegType::vgpr || src1.type() == RegType::vgpr;
3733 if (!use_valu) {
3734 aco_opcode op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp1_b64
3735 : aco_opcode::s_bitcmp1_b32;
3736 if (test0)
3737 op = instr->src[0].src.ssa->bit_size == 64 ? aco_opcode::s_bitcmp0_b64
3738 : aco_opcode::s_bitcmp0_b32;
3739 emit_sopc_instruction(ctx, instr, op, dst);
3740 break;
3741 }
3742
3743 /* We do not have a VALU version of s_bitcmp.
3744 * But if the second source is constant, we can use
3745 * v_cmp_class_f32's LUT to check the bit.
3746 * The LUT only has 10 entries, so extract a higher byte if we have to.
3747 * For sign bits, comparison with 0 is better because v_cmp_class
3748 * can't be inverted.
3749 */
3750 if (nir_src_is_const(instr->src[1].src)) {
3751 uint32_t bit = nir_alu_src_as_uint(instr->src[1]);
3752 bit &= instr->src[0].src.ssa->bit_size - 1;
3753 src0 = as_vgpr(ctx, src0);
3754
3755 if (src0.regClass() == v2) {
3756 src0 = emit_extract_vector(ctx, src0, (bit & 32) != 0, v1);
3757 bit &= 31;
3758 }
3759
3760 if (bit == 31) {
3761 bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3762 Operand::c32(0), src0);
3763 break;
3764 }
3765
3766 if (bit == 15 && ctx->program->gfx_level >= GFX8) {
3767 bld.vopc(test0 ? aco_opcode::v_cmp_le_i16 : aco_opcode::v_cmp_gt_i16, Definition(dst),
3768 Operand::c32(0), src0);
3769 break;
3770 }
3771
3772 /* Set max_bit lower to avoid +inf if we can use sdwa+qnan instead. */
3773 const bool can_sdwa = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11;
3774 const unsigned max_bit = can_sdwa ? 0x8 : 0x9;
3775 const bool use_opsel = bit > 0xf && (bit & 0xf) <= max_bit;
3776 if (use_opsel) {
3777 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(1),
3778 Operand::c32(16), Operand::c32(0));
3779 bit &= 0xf;
3780 }
3781
3782 /* If we can use SDWA, the extract is free, while test0's s_not is not. */
3783 if (bit == 7 && test0 && can_sdwa) {
3784 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3785 Operand::c32(8), Operand::c32(1));
3786 bld.vopc(test0 ? aco_opcode::v_cmp_le_i32 : aco_opcode::v_cmp_gt_i32, Definition(dst),
3787 Operand::c32(0), src0);
3788 break;
3789 }
3790
3791 if (bit > max_bit) {
3792 src0 = bld.pseudo(aco_opcode::p_extract, bld.def(v1), src0, Operand::c32(bit / 8),
3793 Operand::c32(8), Operand::c32(0));
3794 bit &= 0x7;
3795 }
3796
3797 /* denorm and snan/qnan inputs are preserved using all float control modes. */
3798 static const struct {
3799 uint32_t fp32;
3800 uint32_t fp16;
3801 bool negate;
3802 } float_lut[10] = {
3803 {0x7f800001, 0x7c01, false}, /* snan */
3804 {~0u, ~0u, false}, /* qnan */
3805 {0xff800000, 0xfc00, false}, /* -inf */
3806 {0xbf800000, 0xbc00, false}, /* -normal (-1.0) */
3807 {1, 1, true}, /* -denormal */
3808 {0, 0, true}, /* -0.0 */
3809 {0, 0, false}, /* +0.0 */
3810 {1, 1, false}, /* +denormal */
3811 {0x3f800000, 0x3c00, false}, /* +normal (+1.0) */
3812 {0x7f800000, 0x7c00, false}, /* +inf */
3813 };
3814
3815 Temp tmp = test0 ? bld.tmp(bld.lm) : dst;
3816 /* fp16 can use s_movk for bit 0. It also supports opsel on GFX11. */
3817 const bool use_fp16 = (ctx->program->gfx_level >= GFX8 && bit == 0) ||
3818 (ctx->program->gfx_level >= GFX11 && use_opsel);
3819 const aco_opcode op = use_fp16 ? aco_opcode::v_cmp_class_f16 : aco_opcode::v_cmp_class_f32;
3820 const uint32_t c = use_fp16 ? float_lut[bit].fp16 : float_lut[bit].fp32;
3821
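/* v_cmp_class(value, mask) tests whether value's class is in the mask: float_lut[bit] is a
* constant whose class index equals 'bit', and src0 supplies the mask, so this checks bit
* 'bit' of src0. */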
3822 VALU_instruction& res =
3823 bld.vopc(op, Definition(tmp), bld.copy(bld.def(s1), Operand::c32(c)), src0)->valu();
3824 if (float_lut[bit].negate) {
3825 res.format = asVOP3(res.format);
3826 res.neg[0] = true;
3827 }
3828
3829 if (test0)
3830 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), tmp);
3831
3832 break;
3833 }
3834
3835 Temp res;
3836 aco_opcode op = test0 ? aco_opcode::v_cmp_eq_i32 : aco_opcode::v_cmp_lg_i32;
3837 if (instr->src[0].src.ssa->bit_size == 16) {
3838 op = test0 ? aco_opcode::v_cmp_eq_i16 : aco_opcode::v_cmp_lg_i16;
3839 if (ctx->program->gfx_level < GFX10)
3840 res = bld.vop2_e64(aco_opcode::v_lshlrev_b16, bld.def(v2b), src1, Operand::c32(1));
3841 else
3842 res = bld.vop3(aco_opcode::v_lshlrev_b16_e64, bld.def(v2b), src1, Operand::c32(1));
3843
3844 res = bld.vop2(aco_opcode::v_and_b32, bld.def(v2b), src0, res);
3845 } else if (instr->src[0].src.ssa->bit_size == 32) {
3846 res = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), src0, src1, Operand::c32(1));
3847 } else if (instr->src[0].src.ssa->bit_size == 64) {
3848 if (ctx->program->gfx_level < GFX8)
3849 res = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src0, src1);
3850 else
3851 res = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), src1, src0);
3852
3853 res = emit_extract_vector(ctx, res, 0, v1);
3854 res = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1), res);
3855 } else {
3856 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3857 }
3858 bld.vopc(op, Definition(dst), Operand::c32(0), res);
3859 break;
3860 }
3861 case nir_op_fddx:
3862 case nir_op_fddy:
3863 case nir_op_fddx_fine:
3864 case nir_op_fddy_fine:
3865 case nir_op_fddx_coarse:
3866 case nir_op_fddy_coarse: {
3867 if (!nir_src_is_divergent(instr->src[0].src)) {
3868 /* Source is the same in all lanes, so the derivative is zero.
3869 * This also avoids emitting invalid IR.
3870 */
3871 bld.copy(Definition(dst), Operand::zero(dst.bytes()));
3872 break;
3873 }
3874
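/* Pick the two quad lanes whose difference gives the derivative: DPP quad permutes on GFX8+,
* ds_swizzle on earlier chips. */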
3875 uint16_t dpp_ctrl1, dpp_ctrl2;
3876 if (instr->op == nir_op_fddx_fine) {
3877 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3878 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3879 } else if (instr->op == nir_op_fddy_fine) {
3880 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3881 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3882 } else {
3883 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3884 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3885 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3886 else
3887 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3888 }
3889
3890 if (dst.regClass() == v1 && instr->def.bit_size == 16) {
3891 assert(instr->def.num_components == 2);
3892
3893 Temp src = as_vgpr(ctx, get_alu_src_vop3p(ctx, instr->src[0]));
3894
3895 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
3896 unsigned opsel_lo = instr->src[0].swizzle[0] & 1;
3897 unsigned opsel_hi = instr->src[0].swizzle[1] & 1;
3898 opsel_lo |= opsel_lo << 1;
3899 opsel_hi |= opsel_hi << 1;
3900
3901 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3902 Temp tr = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl2);
3903
3904 VALU_instruction& sub =
3905 bld.vop3p(aco_opcode::v_pk_add_f16, Definition(dst), tr, tl, opsel_lo, opsel_hi)
3906 .instr->valu();
3907 sub.neg_lo[1] = true;
3908 sub.neg_hi[1] = true;
3909 } else {
3910 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
3911
3912 if (ctx->program->gfx_level >= GFX8) {
3913 aco_opcode sub =
3914 instr->def.bit_size == 16 ? aco_opcode::v_sub_f16 : aco_opcode::v_sub_f32;
3915 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3916 bld.vop2_dpp(sub, Definition(dst), src, tl, dpp_ctrl2);
3917 } else {
3918 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3919 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3920 bld.vop2(aco_opcode::v_sub_f32, Definition(dst), tr, tl);
3921 }
3922 }
3923 set_wqm(ctx, true);
3924 break;
3925 }
3926 default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3927 }
3928 }
3929
3930 void
3931 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3932 {
3933 Temp dst = get_ssa_temp(ctx, &instr->def);
3934
3935 // TODO: we really want to have the resulting type, as this would allow for 64-bit literals,
3936 // which get the lsb truncated if double and the msb if int.
3937 // For now, we only use s_mov_b64 with 64-bit inline constants.
3938 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3939 assert(dst.type() == RegType::sgpr);
3940
3941 Builder bld(ctx->program, ctx->block);
3942
3943 if (instr->def.bit_size == 1) {
3944 assert(dst.regClass() == bld.lm);
3945 int val = instr->value[0].b ? -1 : 0;
3946 Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3947 bld.copy(Definition(dst), op);
3948 } else if (instr->def.bit_size == 8) {
3949 bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3950 } else if (instr->def.bit_size == 16) {
3951 /* sign-extend to use s_movk_i32 instead of a literal */
3952 bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3953 } else if (dst.size() == 1) {
3954 bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3955 } else {
3956 assert(dst.size() != 1);
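/* Constants wider than one dword are assembled from 32-bit pieces with p_create_vector. */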
3957 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3958 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3959 if (instr->def.bit_size == 64)
3960 for (unsigned i = 0; i < dst.size(); i++)
3961 vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3962 else {
3963 for (unsigned i = 0; i < dst.size(); i++)
3964 vec->operands[i] = Operand::c32(instr->value[i].u32);
3965 }
3966 vec->definitions[0] = Definition(dst);
3967 ctx->block->instructions.emplace_back(std::move(vec));
3968 }
3969 }
3970
3971 Temp
3972 emit_readfirstlane(isel_context* ctx, Temp src, Temp dst)
3973 {
3974 Builder bld(ctx->program, ctx->block);
3975
3976 if (src.regClass().type() == RegType::sgpr) {
3977 bld.copy(Definition(dst), src);
3978 } else if (src.size() == 1) {
3979 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src);
3980 } else {
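/* Multi-dword source: split it into dwords, read the first lane of each and recombine into dst. */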
3981 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3982 aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())};
3983 split->operands[0] = Operand(src);
3984
3985 for (unsigned i = 0; i < src.size(); i++) {
3986 split->definitions[i] =
3987 bld.def(RegClass::get(RegType::vgpr, MIN2(src.bytes() - i * 4, 4)));
3988 }
3989
3990 Instruction* split_raw = split.get();
3991 ctx->block->instructions.emplace_back(std::move(split));
3992
3993 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3994 aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)};
3995 vec->definitions[0] = Definition(dst);
3996 for (unsigned i = 0; i < src.size(); i++) {
3997 vec->operands[i] = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1),
3998 split_raw->definitions[i].getTemp());
3999 }
4000
4001 ctx->block->instructions.emplace_back(std::move(vec));
4002 if (src.bytes() % 4 == 0)
4003 emit_split_vector(ctx, dst, src.size());
4004 }
4005
4006 return dst;
4007 }
4008
4009 bool
4010 can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size,
4011 unsigned align_, bool support_12_byte)
4012 {
4013 /* Only use byte alignment for 8/16-bit loads if we won't have to increase its size and won't
4014 * have to use unsupported load sizes.
4015 */
4016 assert(util_is_power_of_two_nonzero(align_));
4017 if (align_ < 4) {
4018 assert(component_size < 4);
4019 unsigned load_size = num_components * component_size;
4020 uint32_t new_size = align(load_size + (4 - align_), 4);
4021 return new_size == align(load_size, 4) && (new_size != 12 || support_12_byte);
4022 }
4023 return true;
4024 }
4025
4026 struct LoadEmitInfo {
4027 Operand offset;
4028 Temp dst;
4029 unsigned num_components;
4030 unsigned component_size;
4031 Temp resource = Temp(0, s1); /* buffer resource or base 64-bit address */
4032 Temp idx = Temp(0, v1); /* buffer index */
4033 unsigned component_stride = 0;
4034 unsigned const_offset = 0;
4035 unsigned align_mul = 0;
4036 unsigned align_offset = 0;
4037 pipe_format format;
4038
4039 bool glc = false;
4040 bool slc = false;
4041 bool split_by_component_stride = true;
4042 bool readfirstlane_for_uniform = false;
4043 unsigned swizzle_component_size = 0;
4044 memory_sync_info sync;
4045 Temp soffset = Temp(0, s1);
4046 };
4047
4048 struct EmitLoadParameters {
4049 using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
4050 unsigned bytes_needed, unsigned align, unsigned const_offset,
4051 Temp dst_hint);
4052
4053 Callback callback;
4054 bool byte_align_loads;
4055 bool supports_8bit_16bit_loads;
4056 unsigned max_const_offset_plus_one;
4057 };
4058
4059 void
4060 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
4061 const EmitLoadParameters& params)
4062 {
4063 unsigned load_size = info.num_components * info.component_size;
4064 unsigned component_size = info.component_size;
4065
4066 unsigned num_vals = 0;
4067 Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
4068
4069 unsigned const_offset = info.const_offset;
4070
4071 const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
4072 unsigned align_offset = info.align_offset % align_mul;
4073
4074 unsigned bytes_read = 0;
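/* Emit one load per iteration until all bytes are read; each load's size is limited by
* alignment, component stride and swizzle constraints. */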
4075 while (bytes_read < load_size) {
4076 unsigned bytes_needed = load_size - bytes_read;
4077
4078 /* add buffer for unaligned loads */
4079 int byte_align = 0;
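/* byte_align is the misalignment within a dword when known at compile time, or -1 if it is
* only known at run time. */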
4080 if (params.byte_align_loads) {
4081 byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
4082 }
4083
4084 if (byte_align) {
4085 if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
4086 !params.supports_8bit_16bit_loads) {
4087 if (info.component_stride) {
4088 assert(params.supports_8bit_16bit_loads && "unimplemented");
4089 bytes_needed = 2;
4090 byte_align = 0;
4091 } else {
4092 bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
4093 bytes_needed = align(bytes_needed, 4);
4094 }
4095 } else {
4096 byte_align = 0;
4097 }
4098 }
4099
4100 if (info.split_by_component_stride) {
4101 if (info.swizzle_component_size)
4102 bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
4103 if (info.component_stride)
4104 bytes_needed = MIN2(bytes_needed, info.component_size);
4105 }
4106
4107 bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
4108
4109 /* reduce constant offset */
4110 Operand offset = info.offset;
4111 unsigned reduced_const_offset = const_offset;
4112 bool remove_const_offset_completely = need_to_align_offset;
4113 if (const_offset &&
4114 (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
4115 unsigned to_add = const_offset;
4116 if (remove_const_offset_completely) {
4117 reduced_const_offset = 0;
4118 } else {
4119 to_add =
4120 const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
4121 reduced_const_offset %= params.max_const_offset_plus_one;
4122 }
4123 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4124 if (offset.isConstant()) {
4125 offset = Operand::c32(offset.constantValue() + to_add);
4126 } else if (offset.isUndefined()) {
4127 offset = Operand::c32(to_add);
4128 } else if (offset_tmp.regClass() == s1) {
4129 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
4130 Operand::c32(to_add));
4131 } else if (offset_tmp.regClass() == v1) {
4132 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
4133 } else {
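/* 64-bit offset: add the constant to the low half and propagate the carry into the high half. */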
4134 Temp lo = bld.tmp(offset_tmp.type(), 1);
4135 Temp hi = bld.tmp(offset_tmp.type(), 1);
4136 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4137
4138 if (offset_tmp.regClass() == s2) {
4139 Temp carry = bld.tmp(s1);
4140 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
4141 Operand::c32(to_add));
4142 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
4143 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
4144 } else {
4145 Temp new_lo = bld.tmp(v1);
4146 Temp carry =
4147 bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
4148 hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
4149 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
4150 }
4151 }
4152 }
4153
4154 /* align offset down if needed */
4155 Operand aligned_offset = offset;
4156 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
4157 if (need_to_align_offset) {
4158 align = 4;
4159 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
4160 if (offset.isConstant()) {
4161 aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
4162 } else if (offset.isUndefined()) {
4163 aligned_offset = Operand::zero();
4164 } else if (offset_tmp.regClass() == s1) {
4165 aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
4166 Operand::c32(0xfffffffcu), offset_tmp);
4167 } else if (offset_tmp.regClass() == s2) {
4168 aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
4169 Operand::c64(0xfffffffffffffffcllu), offset_tmp);
4170 } else if (offset_tmp.regClass() == v1) {
4171 aligned_offset =
4172 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
4173 } else if (offset_tmp.regClass() == v2) {
4174 Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
4175 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
4176 lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
4177 aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
4178 }
4179 }
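/* Turn the aligned offset into a Temp for the callback: constants are copied into an SGPR,
 * undefined offsets become a null Temp. */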
4180 Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp()
4181 : aligned_offset.isConstant()
4182 ? bld.copy(bld.def(s1), aligned_offset)
4183 : Temp(0, s1);
4184
4185 Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
4186 reduced_const_offset, byte_align ? Temp() : info.dst);
4187
4188 /* the callback wrote directly to dst */
4189 if (val == info.dst) {
4190 assert(num_vals == 0);
4191 emit_split_vector(ctx, info.dst, info.num_components);
4192 return;
4193 }
4194
4195 /* shift result right if needed */
4196 if (params.byte_align_loads && info.component_size < 4) {
4197 Operand byte_align_off = Operand::c32(byte_align);
4198 if (byte_align == -1) {
4199 if (offset.isConstant())
4200 byte_align_off = Operand::c32(offset.constantValue() % 4u);
4201 else if (offset.isUndefined())
4202 byte_align_off = Operand::zero();
4203 else if (offset.size() == 2)
4204 byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
4205 RegClass(offset.getTemp().type(), 1)));
4206 else
4207 byte_align_off = offset;
4208 }
4209
4210 assert(val.bytes() >= load_size && "unimplemented");
4211 if (val.type() == RegType::sgpr)
4212 byte_align_scalar(ctx, val, byte_align_off, info.dst);
4213 else
4214 byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
4215 return;
4216 }
4217
4218 /* add result to list and advance */
4219 if (info.component_stride) {
4220 assert(val.bytes() % info.component_size == 0);
4221 unsigned num_loaded_components = val.bytes() / info.component_size;
4222 unsigned advance_bytes = info.component_stride * num_loaded_components;
4223 const_offset += advance_bytes;
4224 align_offset = (align_offset + advance_bytes) % align_mul;
4225 } else {
4226 const_offset += val.bytes();
4227 align_offset = (align_offset + val.bytes()) % align_mul;
4228 }
4229 bytes_read += val.bytes();
4230 vals[num_vals++] = val;
4231 }
4232
4233 /* create array of components */
4234 unsigned components_split = 0;
4235 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4236 bool has_vgprs = false;
4237 for (unsigned i = 0; i < num_vals;) {
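/* Group consecutive results until their combined size is a non-zero multiple of
 * component_size, so the group can be split into whole components. */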
4238 Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
4239 unsigned num_tmps = 0;
4240 unsigned tmp_size = 0;
4241 RegType reg_type = RegType::sgpr;
4242 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
4243 if (vals[i].type() == RegType::vgpr)
4244 reg_type = RegType::vgpr;
4245 tmp_size += vals[i].bytes();
4246 tmp[num_tmps++] = vals[i++];
4247 }
4248 if (num_tmps > 1) {
4249 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4250 aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
4251 for (unsigned j = 0; j < num_tmps; j++)
4252 vec->operands[j] = Operand(tmp[j]);
4253 tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
4254 vec->definitions[0] = Definition(tmp[0]);
4255 bld.insert(std::move(vec));
4256 }
4257
4258 if (tmp[0].bytes() % component_size) {
4259 /* trim tmp[0] */
4260 assert(i == num_vals);
4261 RegClass new_rc =
4262 RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
4263 tmp[0] =
4264 bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
4265 }
4266
4267 RegClass elem_rc = RegClass::get(reg_type, component_size);
4268
4269 unsigned start = components_split;
4270
4271 if (tmp_size == elem_rc.bytes()) {
4272 allocated_vec[components_split++] = tmp[0];
4273 } else {
4274 assert(tmp_size % elem_rc.bytes() == 0);
4275 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
4276 aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
4277 for (auto& def : split->definitions) {
4278 Temp component = bld.tmp(elem_rc);
4279 allocated_vec[components_split++] = component;
4280 def = Definition(component);
4281 }
4282 split->operands[0] = Operand(tmp[0]);
4283 bld.insert(std::move(split));
4284 }
4285
4286 /* try to p_as_uniform early so we can create more optimizable code and
4287 * also update allocated_vec */
4288 for (unsigned j = start; j < components_split; j++) {
4289 if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) {
4290 if (info.readfirstlane_for_uniform) {
4291 allocated_vec[j] = emit_readfirstlane(
4292 ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size())));
4293 } else {
4294 allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
4295 }
4296 }
4297 has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
4298 }
4299 }
4300
4301 /* concatenate components and p_as_uniform() result if needed */
4302 if (info.dst.type() == RegType::vgpr || !has_vgprs)
4303 ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
4304
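/* If dst is larger than the gathered components, pad the final vector with an undefined
 * operand covering the leftover bytes. */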
4305 int padding_bytes =
4306 MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
4307
4308 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
4309 aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
4310 for (unsigned i = 0; i < info.num_components; i++)
4311 vec->operands[i] = Operand(allocated_vec[i]);
4312 if (padding_bytes)
4313 vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
4314 if (info.dst.type() == RegType::sgpr && has_vgprs) {
4315 Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
4316 vec->definitions[0] = Definition(tmp);
4317 bld.insert(std::move(vec));
4318 if (info.readfirstlane_for_uniform)
4319 emit_readfirstlane(ctx, tmp, info.dst);
4320 else
4321 bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
4322 } else {
4323 vec->definitions[0] = Definition(info.dst);
4324 bld.insert(std::move(vec));
4325 }
4326 }
4327
4328 Operand
4329 load_lds_size_m0(Builder& bld)
4330 {
4331 /* m0 does not need to be initialized on GFX9+ */
4332 if (bld.program->gfx_level >= GFX9)
4333 return Operand(s1);
4334
4335 return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
4336 }
4337
4338 Temp
4339 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4340 unsigned align, unsigned const_offset, Temp dst_hint)
4341 {
4342 offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
4343
4344 Operand m = load_lds_size_m0(bld);
4345
4346 bool large_ds_read = bld.program->gfx_level >= GFX7;
4347 bool usable_read2 = bld.program->gfx_level >= GFX7;
4348
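/* Pick the widest DS read the alignment allows; the read2 forms load two elements with
 * independent per-element offsets. */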
4349 bool read2 = false;
4350 unsigned size = 0;
4351 aco_opcode op;
4352 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
4353 size = 16;
4354 op = aco_opcode::ds_read_b128;
4355 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
4356 size = 16;
4357 read2 = true;
4358 op = aco_opcode::ds_read2_b64;
4359 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
4360 size = 12;
4361 op = aco_opcode::ds_read_b96;
4362 } else if (bytes_needed >= 8 && align % 8 == 0) {
4363 size = 8;
4364 op = aco_opcode::ds_read_b64;
4365 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
4366 size = 8;
4367 read2 = true;
4368 op = aco_opcode::ds_read2_b32;
4369 } else if (bytes_needed >= 4 && align % 4 == 0) {
4370 size = 4;
4371 op = aco_opcode::ds_read_b32;
4372 } else if (bytes_needed >= 2 && align % 2 == 0) {
4373 size = 2;
4374 op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
4375 } else {
4376 size = 1;
4377 op = bld.program->gfx_level >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
4378 }
4379
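/* read2 offsets are 8-bit and counted in elements, plain DS offsets are 16-bit byte offsets;
 * fold any excess into the address, then convert to element units for read2. */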
4380 unsigned const_offset_unit = read2 ? size / 2u : 1u;
4381 unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
4382
4383 if (const_offset > (const_offset_range - const_offset_unit)) {
4384 unsigned excess = const_offset - (const_offset % const_offset_range);
4385 offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
4386 const_offset -= excess;
4387 }
4388
4389 const_offset /= const_offset_unit;
4390
4391 RegClass rc = RegClass::get(RegType::vgpr, size);
4392 Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
4393 Instruction* instr;
4394 if (read2)
4395 instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
4396 else
4397 instr = bld.ds(op, Definition(val), offset, m, const_offset);
4398 instr->ds().sync = info.sync;
4399
4400 if (m.isUndefined())
4401 instr->operands.pop_back();
4402
4403 return val;
4404 }
4405
4406 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
4407
4408 Temp
4409 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4410 unsigned align, unsigned const_offset, Temp dst_hint)
4411 {
4412 assert(align >= 4u);
4413
4414 bld.program->has_smem_buffer_or_global_loads = true;
4415
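/* A 16-byte resource is a buffer descriptor; otherwise the resource (or, if absent, the
 * offset) is used as a raw 64-bit address. */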
4416 bool buffer = info.resource.id() && info.resource.bytes() == 16;
4417 Temp addr = info.resource;
4418 if (!buffer && !addr.id()) {
4419 addr = offset;
4420 offset = Temp();
4421 }
4422
4423 bytes_needed = MIN2(bytes_needed, 64);
4424 unsigned needed_round_up = util_next_power_of_two(bytes_needed);
4425 unsigned needed_round_down = needed_round_up >> (needed_round_up != bytes_needed ? 1 : 0);
4426    /* Only round up global loads when the access is aligned to the rounded-up size, so the load can't cross a page boundary */
4427 bytes_needed = buffer || align % needed_round_up == 0 ? needed_round_up : needed_round_down;
4428
4429 aco_opcode op;
4430 if (bytes_needed <= 4) {
4431 op = buffer ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4432 } else if (bytes_needed <= 8) {
4433 op = buffer ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4434 } else if (bytes_needed <= 16) {
4435 op = buffer ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4436 } else if (bytes_needed <= 32) {
4437 op = buffer ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4438 } else {
4439 assert(bytes_needed == 64);
4440 op = buffer ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4441 }
4442
4443 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4444 if (buffer) {
4445 if (const_offset)
4446 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4447 Operand::c32(const_offset));
4448 load->operands[0] = Operand(info.resource);
4449 load->operands[1] = Operand(offset);
4450 } else {
4451 load->operands[0] = Operand(addr);
4452 if (offset.id() && const_offset)
4453 load->operands[1] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
4454 Operand::c32(const_offset));
4455 else if (offset.id())
4456 load->operands[1] = Operand(offset);
4457 else
4458 load->operands[1] = Operand::c32(const_offset);
4459 }
4460 RegClass rc(RegType::sgpr, DIV_ROUND_UP(bytes_needed, 4u));
4461 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4462 load->definitions[0] = Definition(val);
4463 load->glc = info.glc;
4464 load->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4465 load->sync = info.sync;
4466 bld.insert(std::move(load));
4467 return val;
4468 }
4469
4470 const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4471
4472 Temp
4473 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4474 unsigned align_, unsigned const_offset, Temp dst_hint)
4475 {
4476 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4477 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4478
4479 if (info.soffset.id()) {
4480 if (soffset.isTemp())
4481 vaddr = bld.copy(bld.def(v1), soffset);
4482 soffset = Operand(info.soffset);
4483 }
4484
4485 if (soffset.isUndefined())
4486 soffset = Operand::zero();
4487
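/* offen: a per-thread byte offset is supplied in a VGPR; idxen: a structured index is
 * supplied. With both, vaddr holds (index, offset). */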
4488 bool offen = !vaddr.isUndefined();
4489 bool idxen = info.idx.id();
4490
4491 if (offen && idxen)
4492 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4493 else if (idxen)
4494 vaddr = Operand(info.idx);
4495
4496 unsigned bytes_size = 0;
4497 aco_opcode op;
4498 if (bytes_needed == 1 || align_ % 2) {
4499 bytes_size = 1;
4500 op = aco_opcode::buffer_load_ubyte;
4501 } else if (bytes_needed == 2 || align_ % 4) {
4502 bytes_size = 2;
4503 op = aco_opcode::buffer_load_ushort;
4504 } else if (bytes_needed <= 4) {
4505 bytes_size = 4;
4506 op = aco_opcode::buffer_load_dword;
4507 } else if (bytes_needed <= 8) {
4508 bytes_size = 8;
4509 op = aco_opcode::buffer_load_dwordx2;
4510 } else if (bytes_needed <= 12 && bld.program->gfx_level > GFX6) {
4511 bytes_size = 12;
4512 op = aco_opcode::buffer_load_dwordx3;
4513 } else {
4514 bytes_size = 16;
4515 op = aco_opcode::buffer_load_dwordx4;
4516 }
4517 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4518 mubuf->operands[0] = Operand(info.resource);
4519 mubuf->operands[1] = vaddr;
4520 mubuf->operands[2] = soffset;
4521 mubuf->offen = offen;
4522 mubuf->idxen = idxen;
4523 mubuf->glc = info.glc;
4524 mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4525 mubuf->slc = info.slc;
4526 mubuf->sync = info.sync;
4527 mubuf->offset = const_offset;
4528 mubuf->swizzled = info.swizzle_component_size != 0;
4529 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4530 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4531 mubuf->definitions[0] = Definition(val);
4532 bld.insert(std::move(mubuf));
4533
4534 return val;
4535 }
4536
4537 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4538
4539 Temp
4540 mubuf_load_format_callback(Builder& bld, const LoadEmitInfo& info, Temp offset,
4541 unsigned bytes_needed, unsigned align_, unsigned const_offset,
4542 Temp dst_hint)
4543 {
4544 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4545 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4546
4547 if (info.soffset.id()) {
4548 if (soffset.isTemp())
4549 vaddr = bld.copy(bld.def(v1), soffset);
4550 soffset = Operand(info.soffset);
4551 }
4552
4553 if (soffset.isUndefined())
4554 soffset = Operand::zero();
4555
4556 bool offen = !vaddr.isUndefined();
4557 bool idxen = info.idx.id();
4558
4559 if (offen && idxen)
4560 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
4561 else if (idxen)
4562 vaddr = Operand(info.idx);
4563
4564 aco_opcode op = aco_opcode::num_opcodes;
4565 if (info.component_size == 2) {
4566 switch (bytes_needed) {
4567 case 2: op = aco_opcode::buffer_load_format_d16_x; break;
4568 case 4: op = aco_opcode::buffer_load_format_d16_xy; break;
4569 case 6: op = aco_opcode::buffer_load_format_d16_xyz; break;
4570 case 8: op = aco_opcode::buffer_load_format_d16_xyzw; break;
4571 default: unreachable("invalid buffer load format size"); break;
4572 }
4573 } else {
4574 assert(info.component_size == 4);
4575 switch (bytes_needed) {
4576 case 4: op = aco_opcode::buffer_load_format_x; break;
4577 case 8: op = aco_opcode::buffer_load_format_xy; break;
4578 case 12: op = aco_opcode::buffer_load_format_xyz; break;
4579 case 16: op = aco_opcode::buffer_load_format_xyzw; break;
4580 default: unreachable("invalid buffer load format size"); break;
4581 }
4582 }
4583
4584 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4585 mubuf->operands[0] = Operand(info.resource);
4586 mubuf->operands[1] = vaddr;
4587 mubuf->operands[2] = soffset;
4588 mubuf->offen = offen;
4589 mubuf->idxen = idxen;
4590 mubuf->glc = info.glc;
4591 mubuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4592 mubuf->slc = info.slc;
4593 mubuf->sync = info.sync;
4594 mubuf->offset = const_offset;
4595 RegClass rc = RegClass::get(RegType::vgpr, bytes_needed);
4596 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4597 mubuf->definitions[0] = Definition(val);
4598 bld.insert(std::move(mubuf));
4599
4600 return val;
4601 }
4602
4603 const EmitLoadParameters mubuf_load_format_params{mubuf_load_format_callback, false, true, 4096};
4604
4605 Temp
4606 scratch_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4607 unsigned align_, unsigned const_offset, Temp dst_hint)
4608 {
4609 unsigned bytes_size = 0;
4610 aco_opcode op;
4611 if (bytes_needed == 1 || align_ % 2u) {
4612 bytes_size = 1;
4613 op = aco_opcode::scratch_load_ubyte;
4614 } else if (bytes_needed == 2 || align_ % 4u) {
4615 bytes_size = 2;
4616 op = aco_opcode::scratch_load_ushort;
4617 } else if (bytes_needed <= 4) {
4618 bytes_size = 4;
4619 op = aco_opcode::scratch_load_dword;
4620 } else if (bytes_needed <= 8) {
4621 bytes_size = 8;
4622 op = aco_opcode::scratch_load_dwordx2;
4623 } else if (bytes_needed <= 12) {
4624 bytes_size = 12;
4625 op = aco_opcode::scratch_load_dwordx3;
4626 } else {
4627 bytes_size = 16;
4628 op = aco_opcode::scratch_load_dwordx4;
4629 }
4630 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4631 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4632 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, Format::SCRATCH, 2, 1)};
4633 flat->operands[0] = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
4634 flat->operands[1] = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
4635 flat->sync = info.sync;
4636 flat->offset = const_offset;
4637 flat->definitions[0] = Definition(val);
4638 bld.insert(std::move(flat));
4639
4640 return val;
4641 }
4642
4643 const EmitLoadParameters scratch_mubuf_load_params{mubuf_load_callback, false, true, 4096};
4644 const EmitLoadParameters scratch_flat_load_params{scratch_load_callback, false, true, 2048};
4645
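/* GFX6 has no FLAT/GLOBAL; global access is emulated with MUBUF through a descriptor with
 * num_records = ~0 (the base is the SGPR address, or zero when the address is per-thread
 * in VGPRs). */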
4646 Temp
4647 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4648 {
4649 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4650 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4651
4652 if (addr.type() == RegType::vgpr)
4653 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4654 Operand::c32(-1u), Operand::c32(rsrc_conf));
4655 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
4656 Operand::c32(rsrc_conf));
4657 }
4658
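/* Add a 32-bit value to a 64-bit address (SGPR or VGPR), propagating the carry into the
 * high half. */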
4659 Temp
4660 add64_32(Builder& bld, Temp src0, Temp src1)
4661 {
4662 Temp src00 = bld.tmp(src0.type(), 1);
4663 Temp src01 = bld.tmp(src0.type(), 1);
4664 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
4665
4666 if (src0.type() == RegType::vgpr || src1.type() == RegType::vgpr) {
4667 Temp dst0 = bld.tmp(v1);
4668 Temp carry = bld.vadd32(Definition(dst0), src00, src1, true).def(1).getTemp();
4669 Temp dst1 = bld.vadd32(bld.def(v1), src01, Operand::zero(), false, carry);
4670 return bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
4671 } else {
4672 Temp carry = bld.tmp(s1);
4673 Temp dst0 =
4674 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src1);
4675 Temp dst1 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), src01, carry);
4676 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), dst0, dst1);
4677 }
4678 }
4679
4680 void
4681 lower_global_address(Builder& bld, uint32_t offset_in, Temp* address_inout,
4682 uint32_t* const_offset_inout, Temp* offset_inout)
4683 {
4684 Temp address = *address_inout;
4685 uint64_t const_offset = *const_offset_inout + offset_in;
4686 Temp offset = *offset_inout;
4687
4688 uint64_t max_const_offset_plus_one =
4689 1; /* GFX7/8/9: FLAT loads do not support constant offsets */
4690 if (bld.program->gfx_level >= GFX9)
4691 max_const_offset_plus_one = bld.program->dev.scratch_global_offset_max;
4692 else if (bld.program->gfx_level == GFX6)
4693 max_const_offset_plus_one = 4096; /* MUBUF has a 12-bit unsigned offset field */
4694 uint64_t excess_offset = const_offset - (const_offset % max_const_offset_plus_one);
4695 const_offset %= max_const_offset_plus_one;
4696
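/* No separate offset register yet: anything above 32 bits of excess is added to the
 * address, and the remainder becomes a new SGPR offset. */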
4697 if (!offset.id()) {
4698 while (unlikely(excess_offset > UINT32_MAX)) {
4699 address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(UINT32_MAX)));
4700 excess_offset -= UINT32_MAX;
4701 }
4702 if (excess_offset)
4703 offset = bld.copy(bld.def(s1), Operand::c32(excess_offset));
4704 } else {
4705       /* If we add to "offset", we would transform the intended
4706 * "address + u2u64(offset) + u2u64(const_offset)" into
4707 * "address + u2u64(offset + const_offset)", so add to the address.
4708 * This could be more efficient if excess_offset>UINT32_MAX by doing a full 64-bit addition,
4709 * but that should be really rare.
4710 */
4711 while (excess_offset) {
4712 uint32_t src2 = MIN2(excess_offset, UINT32_MAX);
4713 address = add64_32(bld, address, bld.copy(bld.def(s1), Operand::c32(src2)));
4714 excess_offset -= src2;
4715 }
4716 }
4717
4718 if (bld.program->gfx_level == GFX6) {
4719 /* GFX6 (MUBUF): (SGPR address, SGPR offset) or (VGPR address, SGPR offset) */
4720 if (offset.type() != RegType::sgpr) {
4721 address = add64_32(bld, address, offset);
4722 offset = Temp();
4723 }
4724 offset = offset.id() ? offset : bld.copy(bld.def(s1), Operand::zero());
4725 } else if (bld.program->gfx_level <= GFX8) {
4726 /* GFX7,8 (FLAT): VGPR address */
4727 if (offset.id()) {
4728 address = add64_32(bld, address, offset);
4729 offset = Temp();
4730 }
4731 address = as_vgpr(bld, address);
4732 } else {
4733 /* GFX9+ (GLOBAL): (VGPR address), or (SGPR address and VGPR offset) */
4734 if (address.type() == RegType::vgpr && offset.id()) {
4735 address = add64_32(bld, address, offset);
4736 offset = Temp();
4737 } else if (address.type() == RegType::sgpr && offset.id()) {
4738 offset = as_vgpr(bld, offset);
4739 }
4740 if (address.type() == RegType::sgpr && !offset.id())
4741 offset = bld.copy(bld.def(v1), bld.copy(bld.def(s1), Operand::zero()));
4742 }
4743
4744 *address_inout = address;
4745 *const_offset_inout = const_offset;
4746 *offset_inout = offset;
4747 }
4748
4749 Temp
4750 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4751 unsigned align_, unsigned const_offset, Temp dst_hint)
4752 {
4753 Temp addr = info.resource;
4754 if (!addr.id()) {
4755 addr = offset;
4756 offset = Temp();
4757 }
4758 lower_global_address(bld, 0, &addr, &const_offset, &offset);
4759
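/* GFX6 emulates global loads with MUBUF, GFX7-8 use FLAT, GFX9+ use GLOBAL. */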
4760 unsigned bytes_size = 0;
4761 bool use_mubuf = bld.program->gfx_level == GFX6;
4762 bool global = bld.program->gfx_level >= GFX9;
4763 aco_opcode op;
4764 if (bytes_needed == 1 || align_ % 2u) {
4765 bytes_size = 1;
4766 op = use_mubuf ? aco_opcode::buffer_load_ubyte
4767 : global ? aco_opcode::global_load_ubyte
4768 : aco_opcode::flat_load_ubyte;
4769 } else if (bytes_needed == 2 || align_ % 4u) {
4770 bytes_size = 2;
4771 op = use_mubuf ? aco_opcode::buffer_load_ushort
4772 : global ? aco_opcode::global_load_ushort
4773 : aco_opcode::flat_load_ushort;
4774 } else if (bytes_needed <= 4) {
4775 bytes_size = 4;
4776 op = use_mubuf ? aco_opcode::buffer_load_dword
4777 : global ? aco_opcode::global_load_dword
4778 : aco_opcode::flat_load_dword;
4779 } else if (bytes_needed <= 8 || (bytes_needed <= 12 && use_mubuf)) {
4780 bytes_size = 8;
4781 op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4782 : global ? aco_opcode::global_load_dwordx2
4783 : aco_opcode::flat_load_dwordx2;
4784 } else if (bytes_needed <= 12 && !use_mubuf) {
4785 bytes_size = 12;
4786 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4787 } else {
4788 bytes_size = 16;
4789 op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4790 : global ? aco_opcode::global_load_dwordx4
4791 : aco_opcode::flat_load_dwordx4;
4792 }
4793 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4794 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4795 if (use_mubuf) {
4796 aco_ptr<MUBUF_instruction> mubuf{
4797 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4798 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, addr));
4799 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
4800 mubuf->operands[2] = Operand(offset);
4801 mubuf->glc = info.glc;
4802 mubuf->dlc = false;
4803 mubuf->offset = const_offset;
4804 mubuf->addr64 = addr.type() == RegType::vgpr;
4805 mubuf->disable_wqm = false;
4806 mubuf->sync = info.sync;
4807 mubuf->definitions[0] = Definition(val);
4808 bld.insert(std::move(mubuf));
4809 } else {
4810 aco_ptr<FLAT_instruction> flat{
4811 create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4812 if (addr.regClass() == s2) {
4813 assert(global && offset.id() && offset.type() == RegType::vgpr);
4814 flat->operands[0] = Operand(offset);
4815 flat->operands[1] = Operand(addr);
4816 } else {
4817 assert(addr.type() == RegType::vgpr && !offset.id());
4818 flat->operands[0] = Operand(addr);
4819 flat->operands[1] = Operand(s1);
4820 }
4821 flat->glc = info.glc;
4822 flat->dlc =
4823 info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
4824 flat->sync = info.sync;
4825 assert(global || !const_offset);
4826 flat->offset = const_offset;
4827 flat->definitions[0] = Definition(val);
4828 bld.insert(std::move(flat));
4829 }
4830
4831 return val;
4832 }
4833
4834 const EmitLoadParameters global_load_params{global_load_callback, true, true, UINT32_MAX};
4835
4836 Temp
4837 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4838 Temp address, unsigned base_offset, unsigned align)
4839 {
4840 assert(util_is_power_of_two_nonzero(align));
4841
4842 Builder bld(ctx->program, ctx->block);
4843
4844 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4845 info.align_mul = align;
4846 info.align_offset = 0;
4847 info.sync = memory_sync_info(storage_shared);
4848 info.const_offset = base_offset;
4849 /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses,
4850 * if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order
4851 * to avoid copy-propagation.
4852 */
4853 info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 &&
4854 ctx->program->wave_size == 64 &&
4855 ctx->program->workgroup_size > 64;
4856 emit_load(ctx, bld, info, lds_load_params);
4857
4858 return dst;
4859 }
4860
4861 void
4862 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4863 Temp src)
4864 {
4865 if (!count)
4866 return;
4867
4868 Builder bld(ctx->program, ctx->block);
4869
4870 /* count == 1 fast path */
4871 if (count == 1) {
4872 if (dst_type == RegType::sgpr)
4873 dst[0] = bld.as_uniform(src);
4874 else
4875 dst[0] = as_vgpr(ctx, src);
4876 return;
4877 }
4878
4879 /* elem_size_bytes is the greatest common divisor which is a power of 2 */
4880 unsigned elem_size_bytes =
4881 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
4882
4883 ASSERTED bool is_subdword = elem_size_bytes < 4;
4884 assert(!is_subdword || dst_type == RegType::vgpr);
4885
4886 for (unsigned i = 0; i < count; i++)
4887 dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4888
4889 std::vector<Temp> temps;
4890 /* use allocated_vec if possible */
4891 auto it = ctx->allocated_vec.find(src.id());
4892 if (it != ctx->allocated_vec.end()) {
4893 if (!it->second[0].id())
4894 goto split;
4895 unsigned elem_size = it->second[0].bytes();
4896 assert(src.bytes() % elem_size == 0);
4897
4898 for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4899 if (!it->second[i].id())
4900 goto split;
4901 }
4902 if (elem_size_bytes % elem_size)
4903 goto split;
4904
4905 temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4906 elem_size_bytes = elem_size;
4907 }
4908
4909 split:
4910 /* split src if necessary */
4911 if (temps.empty()) {
4912 if (is_subdword && src.type() == RegType::sgpr)
4913 src = as_vgpr(ctx, src);
4914 if (dst_type == RegType::sgpr)
4915 src = bld.as_uniform(src);
4916
4917 unsigned num_elems = src.bytes() / elem_size_bytes;
4918 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4919 aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4920 split->operands[0] = Operand(src);
4921 for (unsigned i = 0; i < num_elems; i++) {
4922 temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4923 split->definitions[i] = Definition(temps.back());
4924 }
4925 bld.insert(std::move(split));
4926 }
4927
4928 unsigned idx = 0;
4929 for (unsigned i = 0; i < count; i++) {
4930 unsigned op_count = dst[i].bytes() / elem_size_bytes;
4931 if (op_count == 1) {
4932 if (dst_type == RegType::sgpr)
4933 dst[i] = bld.as_uniform(temps[idx++]);
4934 else
4935 dst[i] = as_vgpr(ctx, temps[idx++]);
4936 continue;
4937 }
4938
4939 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4940 Format::PSEUDO, op_count, 1)};
4941 for (unsigned j = 0; j < op_count; j++) {
4942 Temp tmp = temps[idx++];
4943 if (dst_type == RegType::sgpr)
4944 tmp = bld.as_uniform(tmp);
4945 vec->operands[j] = Operand(tmp);
4946 }
4947 vec->definitions[0] = Definition(dst[i]);
4948 bld.insert(std::move(vec));
4949 }
4950 return;
4951 }
4952
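/* Find the next consecutive byte range in todo_mask; returns false if that range is not
 * covered by the write mask and should be skipped. */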
4953 bool
4954 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4955 {
4956 unsigned start_elem = ffs(todo_mask) - 1;
4957 bool skip = !(mask & (1 << start_elem));
4958 if (skip)
4959 mask = ~mask & todo_mask;
4960
4961 mask &= todo_mask;
4962
4963 u_bit_scan_consecutive_range(&mask, start, count);
4964
4965 return !skip;
4966 }
4967
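/* Clear everything up to and including the processed range from todo_mask (bytes below
 * start are already done). */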
4968 void
4969 advance_write_mask(uint32_t* todo_mask, int start, int count)
4970 {
4971 *todo_mask &= ~u_bit_consecutive(0, count) << start;
4972 }
4973
4974 void
4975 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4976 unsigned base_offset, unsigned align)
4977 {
4978 assert(util_is_power_of_two_nonzero(align));
4979 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4980
4981 Builder bld(ctx->program, ctx->block);
4982 bool large_ds_write = ctx->options->gfx_level >= GFX7;
4983 bool usable_write2 = ctx->options->gfx_level >= GFX7;
4984
4985 unsigned write_count = 0;
4986 Temp write_datas[32];
4987 unsigned offsets[32];
4988 unsigned bytes[32];
4989 aco_opcode opcodes[32];
4990
4991 wrmask = util_widen_mask(wrmask, elem_size_bytes);
4992
4993 const unsigned wrmask_bitcnt = util_bitcount(wrmask);
4994 uint32_t todo = u_bit_consecutive(0, data.bytes());
4995
4996 if (u_bit_consecutive(0, wrmask_bitcnt) == wrmask)
4997 todo = MIN2(todo, wrmask);
4998
4999 while (todo) {
5000 int offset, byte;
5001 if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
5002 offsets[write_count] = offset;
5003 bytes[write_count] = byte;
5004 opcodes[write_count] = aco_opcode::num_opcodes;
5005 write_count++;
5006 advance_write_mask(&todo, offset, byte);
5007 continue;
5008 }
5009
5010 bool aligned2 = offset % 2 == 0 && align % 2 == 0;
5011 bool aligned4 = offset % 4 == 0 && align % 4 == 0;
5012 bool aligned8 = offset % 8 == 0 && align % 8 == 0;
5013 bool aligned16 = offset % 16 == 0 && align % 16 == 0;
5014
5015 // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
5016 aco_opcode op = aco_opcode::num_opcodes;
5017 if (byte >= 16 && aligned16 && large_ds_write) {
5018 op = aco_opcode::ds_write_b128;
5019 byte = 16;
5020 } else if (byte >= 12 && aligned16 && large_ds_write) {
5021 op = aco_opcode::ds_write_b96;
5022 byte = 12;
5023 } else if (byte >= 8 && aligned8) {
5024 op = aco_opcode::ds_write_b64;
5025 byte = 8;
5026 } else if (byte >= 4 && aligned4) {
5027 op = aco_opcode::ds_write_b32;
5028 byte = 4;
5029 } else if (byte >= 2 && aligned2) {
5030 op = aco_opcode::ds_write_b16;
5031 byte = 2;
5032 } else if (byte >= 1) {
5033 op = aco_opcode::ds_write_b8;
5034 byte = 1;
5035 } else {
5036 assert(false);
5037 }
5038
5039 offsets[write_count] = offset;
5040 bytes[write_count] = byte;
5041 opcodes[write_count] = op;
5042 write_count++;
5043 advance_write_mask(&todo, offset, byte);
5044 }
5045
5046 Operand m = load_lds_size_m0(bld);
5047
5048 split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
5049
5050 for (unsigned i = 0; i < write_count; i++) {
5051 aco_opcode op = opcodes[i];
5052 if (op == aco_opcode::num_opcodes)
5053 continue;
5054
5055 Temp split_data = write_datas[i];
5056
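/* Try to pair this write with a later one of the same size into a single ds_write2, which
 * encodes two element-granular offsets. */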
5057 unsigned second = write_count;
5058 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
5059 for (second = i + 1; second < write_count; second++) {
5060 if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
5061 op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
5062 opcodes[second] = aco_opcode::num_opcodes;
5063 break;
5064 }
5065 }
5066 }
5067
5068 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
5069 unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
5070
5071 unsigned inline_offset = base_offset + offsets[i];
5072 unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
5073 Temp address_offset = address;
5074 if (inline_offset > max_offset) {
5075 address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
5076 inline_offset = offsets[i];
5077 }
5078
5079 /* offsets[i] shouldn't be large enough for this to happen */
5080 assert(inline_offset <= max_offset);
5081
5082 Instruction* instr;
5083 if (write2) {
5084 Temp second_data = write_datas[second];
5085 inline_offset /= split_data.bytes();
5086 instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
5087 inline_offset + write2_off);
5088 } else {
5089 instr = bld.ds(op, address_offset, split_data, m, inline_offset);
5090 }
5091 instr->ds().sync = memory_sync_info(storage_shared);
5092
5093 if (m.isUndefined())
5094 instr->operands.pop_back();
5095 }
5096 }
5097
5098 aco_opcode
5099 get_buffer_store_op(unsigned bytes)
5100 {
5101 switch (bytes) {
5102 case 1: return aco_opcode::buffer_store_byte;
5103 case 2: return aco_opcode::buffer_store_short;
5104 case 4: return aco_opcode::buffer_store_dword;
5105 case 8: return aco_opcode::buffer_store_dwordx2;
5106 case 12: return aco_opcode::buffer_store_dwordx3;
5107 case 16: return aco_opcode::buffer_store_dwordx4;
5108 }
5109 unreachable("Unexpected store size");
5110 return aco_opcode::num_opcodes;
5111 }
5112
5113 void
5114 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
5115 Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
5116 Temp* write_datas, unsigned* offsets)
5117 {
5118 unsigned write_count_with_skips = 0;
5119 bool skips[16];
5120 unsigned bytes[16];
5121
5122 /* determine how to split the data */
5123 unsigned todo = u_bit_consecutive(0, data.bytes());
5124 while (todo) {
5125 int offset, byte;
5126 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
5127 offsets[write_count_with_skips] = offset;
5128 if (skips[write_count_with_skips]) {
5129 bytes[write_count_with_skips] = byte;
5130 advance_write_mask(&todo, offset, byte);
5131 write_count_with_skips++;
5132 continue;
5133 }
5134
5135       /* The only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, and the size can't be
5136        * larger than swizzle_element_size */
5137 byte = MIN2(byte, swizzle_element_size);
5138 if (byte % 4)
5139 byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
5140
5141 /* SMEM and GFX6 VMEM can't emit 12-byte stores */
5142 if ((ctx->program->gfx_level == GFX6 || smem) && byte == 12)
5143 byte = 8;
5144
5145 /* dword or larger stores have to be dword-aligned */
5146 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
5147 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
5148 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
5149 if (!dword_aligned)
5150 byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
5151
5152 bytes[write_count_with_skips] = byte;
5153 advance_write_mask(&todo, offset, byte);
5154 write_count_with_skips++;
5155 }
5156
5157 /* actually split data */
5158 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
5159
5160 /* remove skips */
5161 for (unsigned i = 0; i < write_count_with_skips; i++) {
5162 if (skips[i])
5163 continue;
5164 write_datas[*write_count] = write_datas[i];
5165 offsets[*write_count] = offsets[i];
5166 (*write_count)++;
5167 }
5168 }
5169
5170 Temp
5171 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
5172 unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
5173 {
5174 Builder bld(ctx->program, ctx->block);
5175 unsigned dword_size = elem_size_bytes / 4;
5176
5177 if (!dst.id())
5178 dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
5179
5180 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
5181 aco_ptr<Pseudo_instruction> instr{
5182 create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
5183 instr->definitions[0] = Definition(dst);
5184
5185 for (unsigned i = 0; i < cnt; ++i) {
5186 if (arr[i].id()) {
5187 assert(arr[i].size() == dword_size);
5188 allocated_vec[i] = arr[i];
5189 instr->operands[i] = Operand(arr[i]);
5190 } else {
5191 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
5192 Operand::zero(dword_size == 2 ? 8 : 4));
5193 allocated_vec[i] = zero;
5194 instr->operands[i] = Operand(zero);
5195 }
5196 }
5197
5198 bld.insert(std::move(instr));
5199
5200 if (split_cnt)
5201 emit_split_vector(ctx, dst, split_cnt);
5202 else
5203       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this in the other branch */
5204
5205 return dst;
5206 }
5207
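/* The MUBUF immediate offset is limited to 12 bits (< 4096); move any excess into voffset. */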
5208 inline unsigned
5209 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
5210 {
5211 if (const_offset >= 4096) {
5212 unsigned excess_const_offset = const_offset / 4096u * 4096u;
5213 const_offset %= 4096u;
5214
5215 if (!voffset.id())
5216 voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
5217 else if (unlikely(voffset.regClass() == s1))
5218 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
5219 Operand::c32(excess_const_offset), Operand(voffset));
5220 else if (likely(voffset.regClass() == v1))
5221 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
5222 else
5223 unreachable("Unsupported register class of voffset");
5224 }
5225
5226 return const_offset;
5227 }
5228
5229 void
5230 emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5231 Temp vdata, unsigned const_offset, memory_sync_info sync, bool glc,
5232 bool slc, bool swizzled)
5233 {
5234 assert(vdata.id());
5235 assert(vdata.size() != 3 || ctx->program->gfx_level != GFX6);
5236 assert(vdata.size() >= 1 && vdata.size() <= 4);
5237
5238 Builder bld(ctx->program, ctx->block);
5239 aco_opcode op = get_buffer_store_op(vdata.bytes());
5240 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
5241
5242 bool offen = voffset.id();
5243 bool idxen = idx.id();
5244
5245 Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
5246 glc &= ctx->program->gfx_level < GFX11;
5247
5248 Operand vaddr_op(v1);
5249 if (offen && idxen)
5250 vaddr_op = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), idx, voffset);
5251 else if (offen)
5252 vaddr_op = Operand(voffset);
5253 else if (idxen)
5254 vaddr_op = Operand(idx);
5255
5256 Builder::Result r =
5257 bld.mubuf(op, Operand(descriptor), vaddr_op, soffset_op, Operand(vdata), const_offset, offen,
5258 swizzled, idxen, /* addr64 */ false, /* disable_wqm */ false, glc,
5259 /* dlc*/ false, slc);
5260
5261 r->mubuf().sync = sync;
5262 }
5263
5264 void
5265 store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, Temp idx,
5266 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
5267 bool swizzled, memory_sync_info sync, bool glc, bool slc)
5268 {
5269 Builder bld(ctx->program, ctx->block);
5270 assert(elem_size_bytes == 1 || elem_size_bytes == 2 || elem_size_bytes == 4 ||
5271 elem_size_bytes == 8);
5272 assert(write_mask);
5273 write_mask = util_widen_mask(write_mask, elem_size_bytes);
5274
5275 unsigned write_count = 0;
5276 Temp write_datas[32];
5277 unsigned offsets[32];
5278 split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
5279 swizzled && ctx->program->gfx_level <= GFX8 ? 4 : 16, &write_count,
5280 write_datas, offsets);
5281
5282 for (unsigned i = 0; i < write_count; i++) {
5283 unsigned const_offset = offsets[i] + base_const_offset;
5284 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, idx, write_datas[i], const_offset,
5285 sync, glc, slc, swizzled);
5286 }
5287 }
5288
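/* Extract the wave id within the threadgroup from bits [24:27] of merged_wave_info
 * (the s_bfe operand packs offset | size << 16). */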
5289 Temp
5290 wave_id_in_threadgroup(isel_context* ctx)
5291 {
5292 Builder bld(ctx->program, ctx->block);
5293 return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
5294 get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(24u | (4u << 16)));
5295 }
5296
5297 Temp
5298 thread_id_in_threadgroup(isel_context* ctx)
5299 {
5300 /* tid_in_tg = wave_id * wave_size + tid_in_wave */
5301
5302 Builder bld(ctx->program, ctx->block);
5303 Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
5304
5305 if (ctx->program->workgroup_size <= ctx->program->wave_size)
5306 return tid_in_wave;
5307
5308 Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
5309 Temp num_pre_threads =
5310 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
5311 Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
5312 return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
5313 }
5314
5315 bool
5316 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
5317 {
5318 unsigned write_mask = nir_intrinsic_write_mask(instr);
5319 unsigned component = nir_intrinsic_component(instr);
5320 nir_src offset = *nir_get_io_offset_src(instr);
5321
5322 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5323 return false;
5324
5325 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5326
5327 if (instr->src[0].ssa->bit_size == 64)
5328 write_mask = util_widen_mask(write_mask, 2);
5329
5330 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
5331
5332    /* Use the semantic location as the index. radv already uses it as the intrinsic
5333     * base, but radeonsi does not. We need the LS output and TCS input indices to match,
5334     * so use the semantic location explicitly. This also lets the TCS epilog index the
5335     * tess factor temps by semantic location directly.
5336     */
5337 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5338 unsigned base = sem.location;
5339 if (ctx->stage == fragment_fs) {
5340       /* The color result is a legacy slot which never appears together with a data
5341        * result at the same time. Here we just use the data slot for it to simplify
5342        * handling of both.
5343        */
5344 if (base == FRAG_RESULT_COLOR)
5345 base = FRAG_RESULT_DATA0;
5346
5347       /* The second output of dual-source blending just uses the data1 slot for simplicity,
5348        * because dual-source blending does not support multiple render targets.
5349 */
5350 base += sem.dual_source_blend_index;
5351 }
5352 unsigned idx = base * 4u + component;
5353
5354 for (unsigned i = 0; i < 8; ++i) {
5355 if (write_mask & (1 << i)) {
5356 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
5357 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
5358 }
5359 idx++;
5360 }
5361
5362 if (ctx->stage == fragment_fs && ctx->program->info.has_epilog && base >= FRAG_RESULT_DATA0) {
5363 unsigned index = base - FRAG_RESULT_DATA0;
5364
5365 if (nir_intrinsic_src_type(instr) == nir_type_float16) {
5366 ctx->output_color_types |= ACO_TYPE_FLOAT16 << (index * 2);
5367 } else if (nir_intrinsic_src_type(instr) == nir_type_int16) {
5368 ctx->output_color_types |= ACO_TYPE_INT16 << (index * 2);
5369 } else if (nir_intrinsic_src_type(instr) == nir_type_uint16) {
5370 ctx->output_color_types |= ACO_TYPE_UINT16 << (index * 2);
5371 }
5372 }
5373
5374 return true;
5375 }
5376
5377 bool
5378 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
5379 {
5380    /* Only TCS per-vertex inputs are supported by this function.
5381     * A per-vertex input only matches the VS invocation with the same id when the
5382     * number of invocations is the same.
5383     */
5384 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
5385 return false;
5386
5387 nir_src* off_src = nir_get_io_offset_src(instr);
5388 nir_src* vertex_index_src = nir_get_io_arrayed_index_src(instr);
5389 nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
5390 bool can_use_temps =
5391 nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
5392 nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
5393
5394 if (!can_use_temps)
5395 return false;
5396
5397 nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
5398
5399 unsigned idx =
5400 sem.location * 4u + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
5401 Temp* src = &ctx->inputs.temps[idx];
5402 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
5403
5404 return true;
5405 }
5406
5407 void
5408 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
5409 {
5410    /* LS passes outputs to TCS via temps if they have the same in/out patch size. */
5411 bool ls_need_output = ctx->stage == vertex_tess_control_hs &&
5412 ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->tcs_in_out_eq;
5413
5414 bool tcs_need_output = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL &&
5415 ctx->program->info.has_epilog &&
5416 ctx->program->info.tcs.pass_tessfactors_by_reg;
5417
5418 bool ps_need_output = ctx->stage == fragment_fs;
5419
5420 if (ls_need_output || tcs_need_output || ps_need_output) {
5421 bool stored_to_temps = store_output_to_temps(ctx, instr);
5422 if (!stored_to_temps) {
5423 isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
5424 abort();
5425 }
5426 } else {
5427 unreachable("Shader stage not implemented");
5428 }
5429 }
5430
5431 bool
5432 in_exec_divergent_or_in_loop(isel_context* ctx)
5433 {
5434 return ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent ||
5435 ctx->cf_info.had_divergent_discard;
5436 }
5437
5438 void
5439 emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5440 Temp prim_mask)
5441 {
5442 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5443 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5444
5445 Builder bld(ctx->program, ctx->block);
5446
5447 if (in_exec_divergent_or_in_loop(ctx)) {
5448 Operand prim_mask_op = bld.m0(prim_mask);
5449 prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5450 Operand coord2_op(coord2);
5451 coord2_op.setLateKill(true); /* we re-use the destination reg in the middle */
5452 bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5453 Operand::c32(idx), Operand::c32(component), coord1, coord2_op, prim_mask_op);
5454 return;
5455 }
5456
5457 Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5458
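/* Fast path: load the attribute parameters from LDS, then do the two-stage barycentric
 * interpolation (p10, then p2) with the GFX11 VINTERP instructions. */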
5459 Temp res;
5460 if (dst.regClass() == v2b) {
5461 Temp p10 =
5462 bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), p, coord1, p);
5463 res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v1), p, coord2, p10);
5464 emit_extract_vector(ctx, res, 0, dst);
5465 } else {
5466 Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, bld.def(v1), p, coord1, p);
5467 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2, p10);
5468 }
5469 /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5470 set_wqm(ctx, true);
5471 }
5472
5473 void
5474 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
5475 Temp prim_mask)
5476 {
5477 if (ctx->options->gfx_level >= GFX11) {
5478 emit_interp_instr_gfx11(ctx, idx, component, src, dst, prim_mask);
5479 return;
5480 }
5481
5482 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
5483 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
5484
5485 Builder bld(ctx->program, ctx->block);
5486
5487 if (dst.regClass() == v2b) {
5488 if (ctx->program->dev.has_16bank_lds) {
5489 assert(ctx->options->gfx_level <= GFX8);
5490 Builder::Result interp_p1 =
5491 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
5492 bld.m0(prim_mask), idx, component);
5493 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
5494 bld.m0(prim_mask), interp_p1, idx, component);
5495 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
5496 interp_p1, idx, component);
5497 } else {
5498 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
5499
5500 if (ctx->options->gfx_level == GFX8)
5501 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
5502
5503 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
5504 bld.m0(prim_mask), idx, component);
5505 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
5506 component);
5507 }
5508 } else {
5509 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
5510 bld.m0(prim_mask), idx, component);
5511
5512 if (ctx->program->dev.has_16bank_lds)
5513 interp_p1->operands[0].setLateKill(true);
5514
5515 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
5516 idx, component);
5517 }
5518 }
5519
5520 void
5521 emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsigned vertex_id,
5522 Temp dst, Temp prim_mask)
5523 {
5524 Builder bld(ctx->program, ctx->block);
5525 if (ctx->options->gfx_level >= GFX11) {
5526 uint16_t dpp_ctrl = dpp_quad_perm(vertex_id, vertex_id, vertex_id, vertex_id);
5527 if (in_exec_divergent_or_in_loop(ctx)) {
5528 Operand prim_mask_op = bld.m0(prim_mask);
5529 prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition to use m0 */
5530 bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), Operand(v1.as_linear()),
5531 Operand::c32(idx), Operand::c32(component), Operand::c32(dpp_ctrl),
5532 prim_mask_op);
5533 } else {
5534 Temp p =
5535 bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), bld.m0(prim_mask), idx, component);
5536 if (dst.regClass() == v2b) {
5537 Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
5538 emit_extract_vector(ctx, res, 0, dst);
5539 } else {
5540 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
5541 }
5542 /* lds_param_load must be done in WQM, and the result kept valid for helper lanes. */
5543 set_wqm(ctx, true);
5544 }
5545 } else {
5546 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32((vertex_id + 2) % 3),
5547 bld.m0(prim_mask), idx, component);
5548 }
5549 }
5550
5551 void
5552 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
5553 {
5554 Builder bld(ctx->program, ctx->block);
5555
5556 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5557 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
5558 for (unsigned i = 0; i < num_components; i++) {
5559 if (ctx->args->frag_pos[i].used)
5560 vec->operands[i] = Operand(get_arg(ctx, ctx->args->frag_pos[i]));
5561 else
5562 vec->operands[i] = Operand(v1);
5563 }
5564 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
5565 assert(num_components == 4);
5566 vec->operands[3] =
5567 bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->frag_pos[3]));
5568 }
5569
5570 for (Operand& op : vec->operands)
5571 op = op.isUndefined() ? Operand::zero() : op;
5572
5573 vec->definitions[0] = Definition(dst);
5574 ctx->block->instructions.emplace_back(std::move(vec));
5575 emit_split_vector(ctx, dst, num_components);
5576 return;
5577 }
5578
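/* Decode the VRS rate from the ancillary VGPR: bit 2 of the result signals a
 * 2-pixel horizontal rate, bit 0 a 2-pixel vertical rate.
 */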
5579 void
5580 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
5581 {
5582 Builder bld(ctx->program, ctx->block);
5583 Temp cond;
5584
5585 /* VRS Rate X = Ancillary[2:3]
5586 * VRS Rate Y = Ancillary[4:5]
5587 */
5588 Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5589 Operand::c32(2u), Operand::c32(2u));
5590 Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ancillary),
5591 Operand::c32(4u), Operand::c32(2u));
5592
5593 /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
5594 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
5595 x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5596 bld.copy(bld.def(v1), Operand::c32(4u)), cond);
5597
5598 /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
5599 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
5600 y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
5601 bld.copy(bld.def(v1), Operand::c32(1u)), cond);
5602
5603 bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
5604 }
5605
5606 void
5607 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
5608 {
5609 Temp dst = get_ssa_temp(ctx, &instr->def);
5610 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
5611 unsigned idx = nir_intrinsic_base(instr);
5612 unsigned component = nir_intrinsic_component(instr);
5613 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5614
5615 assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
5616
5617 if (instr->def.num_components == 1) {
5618 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
5619 } else {
5620 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
5621 aco_opcode::p_create_vector, Format::PSEUDO, instr->def.num_components, 1));
5622 for (unsigned i = 0; i < instr->def.num_components; i++) {
5623 Temp tmp = ctx->program->allocateTmp(instr->def.bit_size == 16 ? v2b : v1);
5624 emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
5625 vec->operands[i] = Operand(tmp);
5626 }
5627 vec->definitions[0] = Definition(dst);
5628 ctx->block->instructions.emplace_back(std::move(vec));
5629 }
5630 }
5631
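/* emit_load() callback for typed buffer (MTBUF) loads: pick a hardware format that
 * covers as many of the requested components as the vertex format, alignment and
 * constant offset allow, then emit a single tbuffer_load_format_* instruction.
 */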
5632 Temp
5633 mtbuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
5634 unsigned alignment, unsigned const_offset, Temp dst_hint)
5635 {
5636 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5637 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
5638
5639 if (info.soffset.id()) {
5640 if (soffset.isTemp())
5641 vaddr = bld.copy(bld.def(v1), soffset);
5642 soffset = Operand(info.soffset);
5643 }
5644
5645 if (soffset.isUndefined())
5646 soffset = Operand::zero();
5647
5648 const bool offen = !vaddr.isUndefined();
5649 const bool idxen = info.idx.id();
5650
5651 if (offen && idxen)
5652 vaddr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), info.idx, vaddr);
5653 else if (idxen)
5654 vaddr = Operand(info.idx);
5655
5656 /* Determine the number of fetched components.
5657 * Note: ACO IR works with the GFX6-8 nfmt + dfmt fields; these are converted for GFX10+ later.
5658 */
5659 const struct ac_vtx_format_info* vtx_info =
5660 ac_get_vtx_format_info(GFX8, CHIP_POLARIS10, info.format);
5661 /* The number of channels in the format determines the memory range. */
5662 const unsigned max_components = vtx_info->num_channels;
5663 /* Calculate maximum number of components loaded according to alignment. */
5664 unsigned max_fetched_components = bytes_needed / info.component_size;
5665 max_fetched_components =
5666 ac_get_safe_fetch_size(bld.program->gfx_level, vtx_info, const_offset, max_components,
5667 alignment, max_fetched_components);
5668 const unsigned fetch_fmt = vtx_info->hw_format[max_fetched_components - 1];
5669 /* Adjust bytes needed in case we need to do a smaller load due to alignment.
5670 * If a larger format is selected, it's still OK to load a smaller amount from it.
5671 */
5672 bytes_needed = MIN2(bytes_needed, max_fetched_components * info.component_size);
5673 unsigned bytes_size = 0;
5674 const unsigned bit_size = info.component_size * 8;
5675 aco_opcode op = aco_opcode::num_opcodes;
5676
5677 if (bytes_needed == 2) {
5678 bytes_size = 2;
5679 op = aco_opcode::tbuffer_load_format_d16_x;
5680 } else if (bytes_needed <= 4) {
5681 bytes_size = 4;
5682 if (bit_size == 16)
5683 op = aco_opcode::tbuffer_load_format_d16_xy;
5684 else
5685 op = aco_opcode::tbuffer_load_format_x;
5686 } else if (bytes_needed <= 6) {
5687 bytes_size = 6;
5688 if (bit_size == 16)
5689 op = aco_opcode::tbuffer_load_format_d16_xyz;
5690 else
5691 op = aco_opcode::tbuffer_load_format_xy;
5692 } else if (bytes_needed <= 8) {
5693 bytes_size = 8;
5694 if (bit_size == 16)
5695 op = aco_opcode::tbuffer_load_format_d16_xyzw;
5696 else
5697 op = aco_opcode::tbuffer_load_format_xy;
5698 } else if (bytes_needed <= 12) {
5699 bytes_size = 12;
5700 op = aco_opcode::tbuffer_load_format_xyz;
5701 } else {
5702 bytes_size = 16;
5703 op = aco_opcode::tbuffer_load_format_xyzw;
5704 }
5705
5706 /* Abort when a suitable opcode wasn't found so we don't compile buggy shaders. */
5707 if (op == aco_opcode::num_opcodes) {
5708 aco_err(bld.program, "unsupported bit size for typed buffer load");
5709 abort();
5710 }
5711
5712 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(op, Format::MTBUF, 3, 1)};
5713 mtbuf->operands[0] = Operand(info.resource);
5714 mtbuf->operands[1] = vaddr;
5715 mtbuf->operands[2] = soffset;
5716 mtbuf->offen = offen;
5717 mtbuf->idxen = idxen;
5718 mtbuf->glc = info.glc;
5719 mtbuf->dlc = info.glc && (bld.program->gfx_level == GFX10 || bld.program->gfx_level == GFX10_3);
5720 mtbuf->slc = info.slc;
5721 mtbuf->sync = info.sync;
5722 mtbuf->offset = const_offset;
5723 mtbuf->dfmt = fetch_fmt & 0xf;
5724 mtbuf->nfmt = fetch_fmt >> 4;
5725 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
5726 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
5727 mtbuf->definitions[0] = Definition(val);
5728 bld.insert(std::move(mtbuf));
5729
5730 return val;
5731 }
5732
5733 const EmitLoadParameters mtbuf_load_params{mtbuf_load_callback, false, true, 4096};
5734
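/* Load a flat FS input (load_input / load_input_vertex): each component is fetched
 * with emit_interp_mov_instr and the results are combined into a vector.
 */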
5735 void
5736 visit_load_fs_input(isel_context* ctx, nir_intrinsic_instr* instr)
5737 {
5738 Builder bld(ctx->program, ctx->block);
5739 Temp dst = get_ssa_temp(ctx, &instr->def);
5740 nir_src offset = *nir_get_io_offset_src(instr);
5741
5742 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5743 isel_err(offset.ssa->parent_instr, "Unimplemented non-zero nir_intrinsic_load_input offset");
5744
5745 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
5746
5747 unsigned idx = nir_intrinsic_base(instr);
5748 unsigned component = nir_intrinsic_component(instr);
5749 unsigned vertex_id = 0; /* P0 */
5750
5751 if (instr->intrinsic == nir_intrinsic_load_input_vertex)
5752 vertex_id = nir_src_as_uint(instr->src[0]);
5753
5754 if (instr->def.num_components == 1 && instr->def.bit_size != 64) {
5755 emit_interp_mov_instr(ctx, idx, component, vertex_id, dst, prim_mask);
5756 } else {
5757 unsigned num_components = instr->def.num_components;
5758 if (instr->def.bit_size == 64)
5759 num_components *= 2;
5760 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5761 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5762 for (unsigned i = 0; i < num_components; i++) {
5763 unsigned chan_component = (component + i) % 4;
5764 unsigned chan_idx = idx + (component + i) / 4;
5765 vec->operands[i] = Operand(bld.tmp(instr->def.bit_size == 16 ? v2b : v1));
5766 emit_interp_mov_instr(ctx, chan_idx, chan_component, vertex_id, vec->operands[i].getTemp(),
5767 prim_mask);
5768 }
5769 vec->definitions[0] = Definition(dst);
5770 bld.insert(std::move(vec));
5771 }
5772 }
5773
5774 void
5775 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5776 {
5777 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5778
5779 Builder bld(ctx->program, ctx->block);
5780 Temp dst = get_ssa_temp(ctx, &instr->def);
5781
5782 if (load_input_from_temps(ctx, instr, dst))
5783 return;
5784
5785 unreachable("LDS-based TCS input should have been lowered in NIR.");
5786 }
5787
5788 void
5789 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5790 {
5791 switch (ctx->shader->info.stage) {
5792 case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5793 default: unreachable("Unimplemented shader stage");
5794 }
5795 }
5796
5797 void
5798 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5799 {
5800 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5801
5802 Builder bld(ctx->program, ctx->block);
5803 Temp dst = get_ssa_temp(ctx, &instr->def);
5804
5805 Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5806 Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5807 Operand tes_w = Operand::zero();
5808
5809 if (ctx->shader->info.tess._primitive_mode == TESS_PRIMITIVE_TRIANGLES) {
5810 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5811 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5812 tes_w = Operand(tmp);
5813 }
5814
5815 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5816 emit_split_vector(ctx, tess_coord, 3);
5817 }
5818
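/* Common helper for descriptor-based buffer loads: use SMEM when the result is
 * uniform and the access allows it, otherwise fall back to MUBUF.
 */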
5819 void
5820 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5821 Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5822 bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5823 {
5824 Builder bld(ctx->program, ctx->block);
5825
5826 bool use_smem =
5827 dst.type() != RegType::vgpr && (!glc || ctx->options->gfx_level >= GFX8) && allow_smem;
5828 if (use_smem)
5829 offset = bld.as_uniform(offset);
5830 else {
5831 /* GFX6-7 are affected by a hw bug that prevents address clamping from
5832 * working correctly when the SGPR offset is used.
5833 */
5834 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
5835 offset = as_vgpr(ctx, offset);
5836 }
5837
5838 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5839 info.glc = glc;
5840 info.sync = sync;
5841 info.align_mul = align_mul;
5842 info.align_offset = align_offset;
5843 if (use_smem)
5844 emit_load(ctx, bld, info, smem_load_params);
5845 else
5846 emit_load(ctx, bld, info, mubuf_load_params);
5847 }
5848
5849 void
5850 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5851 {
5852 Temp dst = get_ssa_temp(ctx, &instr->def);
5853 Builder bld(ctx->program, ctx->block);
5854 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5855
5856 unsigned size = instr->def.bit_size / 8;
5857 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5858 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5859 }
5860
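/* Load push constants: use the inline push constant SGPR arguments when the whole
 * range is covered by inline_push_const_mask, otherwise emit an SMEM load from the
 * push constant buffer, with extra handling for unaligned 8/16-bit results.
 */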
5861 void
5862 visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5863 {
5864 Builder bld(ctx->program, ctx->block);
5865 Temp dst = get_ssa_temp(ctx, &instr->def);
5866 unsigned offset = nir_intrinsic_base(instr);
5867 unsigned count = instr->def.num_components;
5868 nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5869
5870 if (instr->def.bit_size == 64)
5871 count *= 2;
5872
5873 if (index_cv && instr->def.bit_size >= 32) {
5874 unsigned start = (offset + index_cv->u32) / 4u;
5875 uint64_t mask = BITFIELD64_MASK(count) << start;
5876 if ((ctx->args->inline_push_const_mask | mask) == ctx->args->inline_push_const_mask &&
5877 start + count <= (sizeof(ctx->args->inline_push_const_mask) * 8u)) {
5878 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5879 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5880 aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5881 unsigned arg_index =
5882 util_bitcount64(ctx->args->inline_push_const_mask & BITFIELD64_MASK(start));
5883 for (unsigned i = 0; i < count; ++i) {
5884 elems[i] = get_arg(ctx, ctx->args->inline_push_consts[arg_index++]);
5885 vec->operands[i] = Operand{elems[i]};
5886 }
5887 vec->definitions[0] = Definition(dst);
5888 ctx->block->instructions.emplace_back(std::move(vec));
5889 ctx->allocated_vec.emplace(dst.id(), elems);
5890 return;
5891 }
5892 }
5893
5894 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5895 if (offset != 0) // TODO check if index != 0 as well
5896 index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5897 Operand::c32(offset), index);
5898 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->push_constants));
5899 Temp vec = dst;
5900 bool trim = false;
5901 bool aligned = true;
5902
5903 if (instr->def.bit_size == 8) {
5904 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5905 bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5906 if (!aligned)
5907 vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5908 } else if (instr->def.bit_size == 16) {
5909 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5910 if (!aligned)
5911 vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5912 }
5913
5914 aco_opcode op;
5915
5916 switch (vec.size()) {
5917 case 1: op = aco_opcode::s_load_dword; break;
5918 case 2: op = aco_opcode::s_load_dwordx2; break;
5919 case 3:
5920 vec = bld.tmp(s4);
5921 trim = true;
5922 FALLTHROUGH;
5923 case 4: op = aco_opcode::s_load_dwordx4; break;
5924 case 6:
5925 vec = bld.tmp(s8);
5926 trim = true;
5927 FALLTHROUGH;
5928 case 8: op = aco_opcode::s_load_dwordx8; break;
5929 default: unreachable("unimplemented or forbidden load_push_constant.");
5930 }
5931
5932 bld.smem(op, Definition(vec), ptr, index);
5933
5934 if (!aligned) {
5935 Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5936 byte_align_scalar(ctx, vec, byte_offset, dst);
5937 return;
5938 }
5939
5940 if (trim) {
5941 emit_split_vector(ctx, vec, 4);
5942 RegClass rc = dst.size() == 3 ? s1 : s2;
5943 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5944 emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5945 }
5946 emit_split_vector(ctx, dst, instr->def.num_components);
5947 }
5948
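/* Load from the shader's embedded constant data by building a buffer descriptor
 * over the constant data region and reusing load_buffer().
 */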
5949 void
5950 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5951 {
5952 Temp dst = get_ssa_temp(ctx, &instr->def);
5953
5954 Builder bld(ctx->program, ctx->block);
5955
5956 uint32_t desc_type =
5957 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5958 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5959 if (ctx->options->gfx_level >= GFX10) {
5960 desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5961 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5962 S_008F0C_RESOURCE_LEVEL(ctx->options->gfx_level < GFX11);
5963 } else {
5964 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5965 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5966 }
5967
5968 unsigned base = nir_intrinsic_base(instr);
5969 unsigned range = nir_intrinsic_range(instr);
5970
5971 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5972 if (base && offset.type() == RegType::sgpr)
5973 offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5974 Operand::c32(base));
5975 else if (base && offset.type() == RegType::vgpr)
5976 offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5977
5978 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5979 bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5980 Operand::c32(ctx->constant_data_offset)),
5981 Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5982 Operand::c32(desc_type));
5983 unsigned size = instr->def.bit_size / 8;
5984 // TODO: get alignment information for subdword constants
5985 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5986 }
5987
5988 /* Packs multiple Temps of different sizes into a vector of v1 Temps.
5989 * The byte count of each input Temp must be a multiple of 2.
5990 */
5991 static std::vector<Temp>
5992 emit_pack_v1(isel_context* ctx, const std::vector<Temp>& unpacked)
5993 {
5994 Builder bld(ctx->program, ctx->block);
5995 std::vector<Temp> packed;
5996 Temp low = Temp();
5997 for (Temp tmp : unpacked) {
5998 assert(tmp.bytes() % 2 == 0);
5999 unsigned byte_idx = 0;
6000 while (byte_idx < tmp.bytes()) {
6001 if (low != Temp()) {
6002 Temp high = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
6003 Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, high);
6004 low = Temp();
6005 packed.push_back(dword);
6006 byte_idx += 2;
6007 } else if (byte_idx % 4 == 0 && (byte_idx + 4) <= tmp.bytes()) {
6008 packed.emplace_back(emit_extract_vector(ctx, tmp, byte_idx / 4, v1));
6009 byte_idx += 4;
6010 } else {
6011 low = emit_extract_vector(ctx, tmp, byte_idx / 2, v2b);
6012 byte_idx += 2;
6013 }
6014 }
6015 }
6016 if (low != Temp()) {
6017 Temp dword = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), low, Operand(v2b));
6018 packed.push_back(dword);
6019 }
6020 return packed;
6021 }
6022
6023 static bool
6024 should_declare_array(ac_image_dim dim)
6025 {
6026 return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
6027 dim == ac_image_2darraymsaa;
6028 }
6029
6030 static int
6031 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
6032 {
6033 switch (dim) {
6034 case GLSL_SAMPLER_DIM_BUF: return 1;
6035 case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
6036 case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
6037 case GLSL_SAMPLER_DIM_MS: return array ? 3 : 2;
6038 case GLSL_SAMPLER_DIM_3D:
6039 case GLSL_SAMPLER_DIM_CUBE: return 3;
6040 case GLSL_SAMPLER_DIM_RECT:
6041 case GLSL_SAMPLER_DIM_SUBPASS: return 2;
6042 case GLSL_SAMPLER_DIM_SUBPASS_MS: return 2;
6043 default: break;
6044 }
6045 return 0;
6046 }
6047
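/* Build a MIMG instruction. Coordinates beyond the NSA (non-sequential address)
 * register budget are packed into a single contiguous VGPR vector; the rest are
 * passed as separate VGPR operands.
 */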
6048 static MIMG_instruction*
6049 emit_mimg(Builder& bld, aco_opcode op, Temp dst, Temp rsrc, Operand samp, std::vector<Temp> coords,
6050 Operand vdata = Operand(v1))
6051 {
6052 size_t nsa_size = bld.program->dev.max_nsa_vgprs;
6053 nsa_size = bld.program->gfx_level >= GFX11 || coords.size() <= nsa_size ? nsa_size : 0;
6054
6055 const bool strict_wqm = coords[0].regClass().is_linear_vgpr();
6056 if (strict_wqm)
6057 nsa_size = coords.size();
6058
6059 for (unsigned i = 0; i < std::min(coords.size(), nsa_size); i++) {
6060 if (!coords[i].id())
6061 continue;
6062
6063 coords[i] = as_vgpr(bld, coords[i]);
6064 }
6065
6066 if (nsa_size < coords.size()) {
6067 Temp coord = coords[nsa_size];
6068 if (coords.size() - nsa_size > 1) {
6069 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6070 aco_opcode::p_create_vector, Format::PSEUDO, coords.size() - nsa_size, 1)};
6071
6072 unsigned coord_size = 0;
6073 for (unsigned i = nsa_size; i < coords.size(); i++) {
6074 vec->operands[i - nsa_size] = Operand(coords[i]);
6075 coord_size += coords[i].size();
6076 }
6077
6078 coord = bld.tmp(RegType::vgpr, coord_size);
6079 vec->definitions[0] = Definition(coord);
6080 bld.insert(std::move(vec));
6081 } else {
6082 coord = as_vgpr(bld, coord);
6083 }
6084
6085 coords[nsa_size] = coord;
6086 coords.resize(nsa_size + 1);
6087 }
6088
6089 bool has_dst = dst.id() != 0;
6090
6091 aco_ptr<MIMG_instruction> mimg{
6092 create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), has_dst)};
6093 if (has_dst)
6094 mimg->definitions[0] = Definition(dst);
6095 mimg->operands[0] = Operand(rsrc);
6096 mimg->operands[1] = samp;
6097 mimg->operands[2] = vdata;
6098 for (unsigned i = 0; i < coords.size(); i++)
6099 mimg->operands[3 + i] = Operand(coords[i]);
6100 mimg->strict_wqm = strict_wqm;
6101
6102 MIMG_instruction* res = mimg.get();
6103 bld.insert(std::move(mimg));
6104 return res;
6105 }
6106
6107 void
6108 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6109 {
6110 Builder bld(ctx->program, ctx->block);
6111 Temp dst = get_ssa_temp(ctx, &instr->def);
6112 Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6113 Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6114 Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6115 Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6116 Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6117 Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6118
6119 /* On GFX11 image_bvh64_intersect_ray has a special vaddr layout with NSA:
6120 * There are five smaller vector groups:
6121 * node_pointer, ray_extent, ray_origin, ray_dir, ray_inv_dir.
6122 * These directly match the NIR intrinsic sources.
6123 */
6124 std::vector<Temp> args = {
6125 node, tmax, origin, dir, inv_dir,
6126 };
6127
6128 if (bld.program->gfx_level == GFX10_3) {
6129 std::vector<Temp> scalar_args;
6130 for (Temp tmp : args) {
6131 for (unsigned i = 0; i < tmp.size(); i++)
6132 scalar_args.push_back(emit_extract_vector(ctx, tmp, i, v1));
6133 }
6134 args = std::move(scalar_args);
6135 }
6136
6137 MIMG_instruction* mimg =
6138 emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, dst, resource, Operand(s4), args);
6139 mimg->dim = ac_image_1d;
6140 mimg->dmask = 0xf;
6141 mimg->unrm = true;
6142 mimg->r128 = true;
6143
6144 emit_split_vector(ctx, dst, instr->def.num_components);
6145 }
6146
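/* Collect the VGPR coordinates for an image instruction: apply the GFX9 1D
 * workaround (insert a zero Y coordinate) and the 2D-view-of-3D layer workaround,
 * then append the sample index and LOD where required.
 */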
6147 static std::vector<Temp>
6148 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6149 {
6150
6151 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6152 bool a16 = instr->src[1].ssa->bit_size == 16;
6153 RegClass rc = a16 ? v2b : v1;
6154 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6155 bool is_array = nir_intrinsic_image_array(instr);
6156 ASSERTED bool add_frag_pos =
6157 (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6158 assert(!add_frag_pos && "Input attachments should be lowered.");
6159 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6160 bool gfx9_1d = ctx->options->gfx_level == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6161 int count = image_type_to_components_count(dim, is_array);
6162 std::vector<Temp> coords;
6163 Builder bld(ctx->program, ctx->block);
6164
6165 if (gfx9_1d) {
6166 coords.emplace_back(emit_extract_vector(ctx, src0, 0, rc));
6167 coords.emplace_back(bld.copy(bld.def(rc), Operand::zero(a16 ? 2 : 4)));
6168 if (is_array)
6169 coords.emplace_back(emit_extract_vector(ctx, src0, 1, rc));
6170 } else {
6171 for (int i = 0; i < count; i++)
6172 coords.emplace_back(emit_extract_vector(ctx, src0, i, rc));
6173 }
6174
6175 bool has_lod = false;
6176 Temp lod;
6177
6178 if (instr->intrinsic == nir_intrinsic_bindless_image_load ||
6179 instr->intrinsic == nir_intrinsic_bindless_image_sparse_load ||
6180 instr->intrinsic == nir_intrinsic_bindless_image_store) {
6181 int lod_index = instr->intrinsic == nir_intrinsic_bindless_image_store ? 4 : 3;
6182 assert(instr->src[lod_index].ssa->bit_size == (a16 ? 16 : 32));
6183 has_lod =
6184 !nir_src_is_const(instr->src[lod_index]) || nir_src_as_uint(instr->src[lod_index]) != 0;
6185
6186 if (has_lod)
6187 lod = get_ssa_temp_tex(ctx, instr->src[lod_index].ssa, a16);
6188 }
6189
6190 if (ctx->program->info.image_2d_view_of_3d && dim == GLSL_SAMPLER_DIM_2D && !is_array) {
6191 /* The hw can't bind a slice of a 3D image as a 2D image, because it
6192 * ignores BASE_ARRAY if the target is 3D. The workaround is to read
6193 * BASE_ARRAY and set it as the 3rd address operand for all 2D images.
6194 */
6195 assert(ctx->options->gfx_level == GFX9);
6196 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6197 Temp rsrc_word5 = emit_extract_vector(ctx, rsrc, 5, v1);
6198 /* Extract the BASE_ARRAY field [0:12] from the descriptor. */
6199 Temp first_layer = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), rsrc_word5, Operand::c32(0u),
6200 Operand::c32(13u));
6201
6202 if (has_lod) {
6203 /* If there's a lod parameter, it matters whether the image is 3d or 2d because
6204 * the hw reads either the fourth or the third component as lod. So detect
6205 * 3d images and place the lod at the third component otherwise.
6206 * For non-3D descriptors we effectively add the lod twice to the coords,
6207 * but the hw will only read the first one; the second is ignored.
6208 */
6209 Temp rsrc_word3 = emit_extract_vector(ctx, rsrc, 3, s1);
6210 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), rsrc_word3,
6211 Operand::c32(28 | (4 << 16))); /* extract last 4 bits */
6212 Temp is_3d = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), type,
6213 Operand::c32(V_008F1C_SQ_RSRC_IMG_3D));
6214 first_layer =
6215 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), as_vgpr(ctx, lod), first_layer, is_3d);
6216 }
6217
6218 if (a16)
6219 coords.emplace_back(emit_extract_vector(ctx, first_layer, 0, v2b));
6220 else
6221 coords.emplace_back(first_layer);
6222 }
6223
6224 if (is_ms && instr->intrinsic != nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6225 assert(instr->src[2].ssa->bit_size == (a16 ? 16 : 32));
6226 coords.emplace_back(get_ssa_temp_tex(ctx, instr->src[2].ssa, a16));
6227 }
6228
6229 if (has_lod)
6230 coords.emplace_back(lod);
6231
6232 return emit_pack_v1(ctx, coords);
6233 }
6234
6235 memory_sync_info
6236 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6237 {
6238 /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6239 if (semantics & semantic_atomicrmw)
6240 return memory_sync_info(storage, semantics);
6241
6242 unsigned access = nir_intrinsic_access(instr);
6243
6244 if (access & ACCESS_VOLATILE)
6245 semantics |= semantic_volatile;
6246 if (access & ACCESS_CAN_REORDER)
6247 semantics |= semantic_can_reorder | semantic_private;
6248
6249 return memory_sync_info(storage, semantics);
6250 }
6251
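/* Create a zero-initialized operand used to pre-initialize the destination of
 * sparse (TFE) loads.
 */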
6252 Operand
6253 emit_tfe_init(Builder& bld, Temp dst)
6254 {
6255 Temp tmp = bld.tmp(dst.regClass());
6256
6257 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6258 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6259 for (unsigned i = 0; i < dst.size(); i++)
6260 vec->operands[i] = Operand::zero();
6261 vec->definitions[0] = Definition(tmp);
6262 /* Since this is fixed to an instruction's definition register, any CSE will
6263 * just create copies. Copying costs about the same as zero-initialization,
6264 * but these copies can break up clauses.
6265 */
6266 vec->definitions[0].setNoCSE(true);
6267 bld.insert(std::move(vec));
6268
6269 return Operand(tmp);
6270 }
6271
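/* Lower bindless image loads (including sparse and fragment-mask loads): buffer
 * images use MUBUF buffer_load_format_*, all other dimensions use MIMG image_load
 * or image_load_mip, with dmask trimmed to the components actually read.
 */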
6272 void
6273 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6274 {
6275 Builder bld(ctx->program, ctx->block);
6276 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6277 bool is_array = nir_intrinsic_image_array(instr);
6278 bool is_sparse = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
6279 Temp dst = get_ssa_temp(ctx, &instr->def);
6280
6281 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6282 unsigned access = nir_intrinsic_access(instr);
6283
6284 unsigned result_size = instr->def.num_components - is_sparse;
6285 unsigned expand_mask = nir_def_components_read(&instr->def) & u_bit_consecutive(0, result_size);
6286 expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6287 if (dim == GLSL_SAMPLER_DIM_BUF)
6288 expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6289 unsigned dmask = expand_mask;
6290 if (instr->def.bit_size == 64) {
6291 expand_mask &= 0x9;
6292 /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6293 dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6294 }
6295 if (is_sparse)
6296 expand_mask |= 1 << result_size;
6297
6298 bool d16 = instr->def.bit_size == 16;
6299 assert(!d16 || !is_sparse);
6300
6301 unsigned num_bytes = util_bitcount(dmask) * (d16 ? 2 : 4) + is_sparse * 4;
6302
6303 Temp tmp;
6304 if (num_bytes == dst.bytes() && dst.type() == RegType::vgpr)
6305 tmp = dst;
6306 else
6307 tmp = bld.tmp(RegClass::get(RegType::vgpr, num_bytes));
6308
6309 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6310
6311 if (dim == GLSL_SAMPLER_DIM_BUF) {
6312 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6313
6314 aco_opcode opcode;
6315 if (!d16) {
6316 switch (util_bitcount(dmask)) {
6317 case 1: opcode = aco_opcode::buffer_load_format_x; break;
6318 case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6319 case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6320 case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6321 default: unreachable(">4 channel buffer image load");
6322 }
6323 } else {
6324 switch (util_bitcount(dmask)) {
6325 case 1: opcode = aco_opcode::buffer_load_format_d16_x; break;
6326 case 2: opcode = aco_opcode::buffer_load_format_d16_xy; break;
6327 case 3: opcode = aco_opcode::buffer_load_format_d16_xyz; break;
6328 case 4: opcode = aco_opcode::buffer_load_format_d16_xyzw; break;
6329 default: unreachable(">4 channel buffer image load");
6330 }
6331 }
6332 aco_ptr<MUBUF_instruction> load{
6333 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6334 load->operands[0] = Operand(resource);
6335 load->operands[1] = Operand(vindex);
6336 load->operands[2] = Operand::c32(0);
6337 load->definitions[0] = Definition(tmp);
6338 load->idxen = true;
6339 load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6340 load->dlc =
6341 load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
6342 load->sync = sync;
6343 load->tfe = is_sparse;
6344 if (load->tfe)
6345 load->operands[3] = emit_tfe_init(bld, tmp);
6346 ctx->block->instructions.emplace_back(std::move(load));
6347 } else {
6348 std::vector<Temp> coords = get_image_coords(ctx, instr);
6349
6350 aco_opcode opcode;
6351 if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6352 opcode = aco_opcode::image_load;
6353 } else {
6354 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6355 opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6356 }
6357
6358 Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6359 MIMG_instruction* load = emit_mimg(bld, opcode, tmp, resource, Operand(s4), coords, vdata);
6360 load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6361 load->dlc =
6362 load->glc && (ctx->options->gfx_level == GFX10 || ctx->options->gfx_level == GFX10_3);
6363 load->a16 = instr->src[1].ssa->bit_size == 16;
6364 load->d16 = d16;
6365 load->dmask = dmask;
6366 load->unrm = true;
6367 load->tfe = is_sparse;
6368
6369 if (instr->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) {
6370 load->dim = is_array ? ac_image_2darray : ac_image_2d;
6371 load->da = is_array;
6372 load->sync = memory_sync_info();
6373 } else {
6374 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6375 load->dim = sdim;
6376 load->da = should_declare_array(sdim);
6377 load->sync = sync;
6378 }
6379 }
6380
6381 if (is_sparse && instr->def.bit_size == 64) {
6382 /* The result components are 64-bit but the sparse residency code is
6383 * 32-bit. So add a zero to the end so expand_vector() works correctly.
6384 */
6385 tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6386 Operand::zero());
6387 }
6388
6389 expand_vector(ctx, tmp, dst, instr->def.num_components, expand_mask, instr->def.bit_size == 64);
6390 }
6391
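/* Lower bindless image stores: components the hardware would write anyway (zero up
 * to GFX11.5, a duplicate of the first dmask component on GFX12+) are dropped from
 * dmask; buffer images go through MUBUF, everything else through MIMG.
 */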
6392 void
6393 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6394 {
6395 Builder bld(ctx->program, ctx->block);
6396 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6397 bool is_array = nir_intrinsic_image_array(instr);
6398 Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6399 bool d16 = instr->src[3].ssa->bit_size == 16;
6400
6401 /* only R64_UINT and R64_SINT supported */
6402 if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6403 data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6404 data = as_vgpr(ctx, data);
6405
6406 uint32_t num_components = d16 ? instr->src[3].ssa->num_components : data.size();
6407
6408 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6409 unsigned access = nir_intrinsic_access(instr);
6410 bool glc = ctx->options->gfx_level == GFX6 ||
6411 ((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11);
6412
6413 uint32_t dmask = BITFIELD_MASK(num_components);
6414 if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
6415 for (uint32_t i = 0; i < instr->num_components; i++) {
6416 /* components not in dmask receive:
6417 * GFX6-11.5: zero
6418 * GFX12+: first component in dmask
6419 */
6420 nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
6421 if (nir_scalar_is_undef(comp)) {
6422 dmask &= ~BITFIELD_BIT(i);
6423 } else if (ctx->options->gfx_level <= GFX11_5) {
6424 if (nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0)
6425 dmask &= ~BITFIELD_BIT(i);
6426 } else {
6427 unsigned first = dim == GLSL_SAMPLER_DIM_BUF ? 0 : ffs(dmask) - 1;
6428 if (i != first && nir_scalar_equal(nir_scalar_resolved(instr->src[3].ssa, first), comp))
6429 dmask &= ~BITFIELD_BIT(i);
6430 }
6431 }
6432
6433 /* dmask cannot be 0, at least one vgpr is always read */
6434 if (dmask == 0)
6435 dmask = 1;
6436 /* buffer store only supports consecutive components. */
6437 if (dim == GLSL_SAMPLER_DIM_BUF)
6438 dmask = BITFIELD_MASK(util_last_bit(dmask));
6439
6440 if (dmask != BITFIELD_MASK(num_components)) {
6441 uint32_t dmask_count = util_bitcount(dmask);
6442 RegClass rc = d16 ? v2b : v1;
6443 if (dmask_count == 1) {
6444 data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
6445 } else {
6446 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6447 aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
6448 uint32_t index = 0;
6449 u_foreach_bit (bit, dmask) {
6450 vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
6451 }
6452 data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
6453 vec->definitions[0] = Definition(data);
6454 bld.insert(std::move(vec));
6455 }
6456 }
6457 }
6458
6459 if (dim == GLSL_SAMPLER_DIM_BUF) {
6460 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6461 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6462 aco_opcode opcode;
6463 if (!d16) {
6464 switch (dmask) {
6465 case 0x1: opcode = aco_opcode::buffer_store_format_x; break;
6466 case 0x3: opcode = aco_opcode::buffer_store_format_xy; break;
6467 case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break;
6468 case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break;
6469 default: unreachable(">4 channel buffer image store");
6470 }
6471 } else {
6472 switch (dmask) {
6473 case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break;
6474 case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break;
6475 case 0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
6476 case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
6477 default: unreachable(">4 channel buffer image store");
6478 }
6479 }
6480 aco_ptr<MUBUF_instruction> store{
6481 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6482 store->operands[0] = Operand(rsrc);
6483 store->operands[1] = Operand(vindex);
6484 store->operands[2] = Operand::c32(0);
6485 store->operands[3] = Operand(data);
6486 store->idxen = true;
6487 store->glc = glc;
6488 store->dlc = false;
6489 store->disable_wqm = true;
6490 store->sync = sync;
6491 ctx->program->needs_exact = true;
6492 ctx->block->instructions.emplace_back(std::move(store));
6493 return;
6494 }
6495
6496 assert(data.type() == RegType::vgpr);
6497 std::vector<Temp> coords = get_image_coords(ctx, instr);
6498 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6499
6500 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6501 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6502
6503 MIMG_instruction* store =
6504 emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
6505 store->glc = glc;
6506 store->dlc = false;
6507 store->a16 = instr->src[1].ssa->bit_size == 16;
6508 store->d16 = d16;
6509 store->dmask = dmask;
6510 store->unrm = true;
6511 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6512 store->dim = sdim;
6513 store->da = should_declare_array(sdim);
6514 store->disable_wqm = true;
6515 store->sync = sync;
6516 ctx->program->needs_exact = true;
6517 return;
6518 }
6519
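/* Map a NIR atomic op to the matching 32-bit buffer, 64-bit buffer and image atomic
 * opcodes.
 */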
6520 void
6521 translate_buffer_image_atomic_op(const nir_atomic_op op, aco_opcode* buf_op, aco_opcode* buf_op64,
6522 aco_opcode* image_op)
6523 {
6524 switch (op) {
6525 case nir_atomic_op_iadd:
6526 *buf_op = aco_opcode::buffer_atomic_add;
6527 *buf_op64 = aco_opcode::buffer_atomic_add_x2;
6528 *image_op = aco_opcode::image_atomic_add;
6529 break;
6530 case nir_atomic_op_umin:
6531 *buf_op = aco_opcode::buffer_atomic_umin;
6532 *buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6533 *image_op = aco_opcode::image_atomic_umin;
6534 break;
6535 case nir_atomic_op_imin:
6536 *buf_op = aco_opcode::buffer_atomic_smin;
6537 *buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6538 *image_op = aco_opcode::image_atomic_smin;
6539 break;
6540 case nir_atomic_op_umax:
6541 *buf_op = aco_opcode::buffer_atomic_umax;
6542 *buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6543 *image_op = aco_opcode::image_atomic_umax;
6544 break;
6545 case nir_atomic_op_imax:
6546 *buf_op = aco_opcode::buffer_atomic_smax;
6547 *buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6548 *image_op = aco_opcode::image_atomic_smax;
6549 break;
6550 case nir_atomic_op_iand:
6551 *buf_op = aco_opcode::buffer_atomic_and;
6552 *buf_op64 = aco_opcode::buffer_atomic_and_x2;
6553 *image_op = aco_opcode::image_atomic_and;
6554 break;
6555 case nir_atomic_op_ior:
6556 *buf_op = aco_opcode::buffer_atomic_or;
6557 *buf_op64 = aco_opcode::buffer_atomic_or_x2;
6558 *image_op = aco_opcode::image_atomic_or;
6559 break;
6560 case nir_atomic_op_ixor:
6561 *buf_op = aco_opcode::buffer_atomic_xor;
6562 *buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6563 *image_op = aco_opcode::image_atomic_xor;
6564 break;
6565 case nir_atomic_op_xchg:
6566 *buf_op = aco_opcode::buffer_atomic_swap;
6567 *buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6568 *image_op = aco_opcode::image_atomic_swap;
6569 break;
6570 case nir_atomic_op_cmpxchg:
6571 *buf_op = aco_opcode::buffer_atomic_cmpswap;
6572 *buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6573 *image_op = aco_opcode::image_atomic_cmpswap;
6574 break;
6575 case nir_atomic_op_inc_wrap:
6576 *buf_op = aco_opcode::buffer_atomic_inc;
6577 *buf_op64 = aco_opcode::buffer_atomic_inc_x2;
6578 *image_op = aco_opcode::image_atomic_inc;
6579 break;
6580 case nir_atomic_op_dec_wrap:
6581 *buf_op = aco_opcode::buffer_atomic_dec;
6582 *buf_op64 = aco_opcode::buffer_atomic_dec_x2;
6583 *image_op = aco_opcode::image_atomic_dec;
6584 break;
6585 case nir_atomic_op_fadd:
6586 *buf_op = aco_opcode::buffer_atomic_add_f32;
6587 *buf_op64 = aco_opcode::num_opcodes;
6588 *image_op = aco_opcode::num_opcodes;
6589 break;
6590 case nir_atomic_op_fmin:
6591 *buf_op = aco_opcode::buffer_atomic_fmin;
6592 *buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6593 *image_op = aco_opcode::image_atomic_fmin;
6594 break;
6595 case nir_atomic_op_fmax:
6596 *buf_op = aco_opcode::buffer_atomic_fmax;
6597 *buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6598 *image_op = aco_opcode::image_atomic_fmax;
6599 break;
6600 default: unreachable("unsupported atomic operation");
6601 }
6602 }
6603
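/* Lower bindless image atomics to MUBUF (buffer images) or MIMG atomics; for cmpswap
 * both values are packed into the data operand and the previous value is extracted
 * from the returned pair when it is used.
 */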
6604 void
6605 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6606 {
6607 bool return_previous = !nir_def_is_unused(&instr->def);
6608 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6609 bool is_array = nir_intrinsic_image_array(instr);
6610 Builder bld(ctx->program, ctx->block);
6611
6612 const nir_atomic_op op = nir_intrinsic_atomic_op(instr);
6613 const bool cmpswap = op == nir_atomic_op_cmpxchg;
6614
6615 aco_opcode buf_op, buf_op64, image_op;
6616 translate_buffer_image_atomic_op(op, &buf_op, &buf_op64, &image_op);
6617
6618 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6619 bool is_64bit = data.bytes() == 8;
6620 assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6621
6622 if (cmpswap)
6623 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6624 get_ssa_temp(ctx, instr->src[4].ssa), data);
6625
6626 Temp dst = get_ssa_temp(ctx, &instr->def);
6627 memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6628
6629 if (dim == GLSL_SAMPLER_DIM_BUF) {
6630 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6631 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6632 // assert(ctx->options->gfx_level < GFX9 && "GFX9 stride size workaround not yet
6633 // implemented.");
6634 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6635 is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6636 mubuf->operands[0] = Operand(resource);
6637 mubuf->operands[1] = Operand(vindex);
6638 mubuf->operands[2] = Operand::c32(0);
6639 mubuf->operands[3] = Operand(data);
6640 Definition def =
6641 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6642 if (return_previous)
6643 mubuf->definitions[0] = def;
6644 mubuf->offset = 0;
6645 mubuf->idxen = true;
6646 mubuf->glc = return_previous;
6647 mubuf->dlc = false; /* Not needed for atomics */
6648 mubuf->disable_wqm = true;
6649 mubuf->sync = sync;
6650 ctx->program->needs_exact = true;
6651 ctx->block->instructions.emplace_back(std::move(mubuf));
6652 if (return_previous && cmpswap)
6653 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6654 return;
6655 }
6656
6657 std::vector<Temp> coords = get_image_coords(ctx, instr);
6658 Temp resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6659 Temp tmp = return_previous ? (cmpswap ? bld.tmp(data.regClass()) : dst) : Temp(0, v1);
6660 MIMG_instruction* mimg =
6661 emit_mimg(bld, image_op, tmp, resource, Operand(s4), coords, Operand(data));
6662 mimg->glc = return_previous;
6663 mimg->dlc = false; /* Not needed for atomics */
6664 mimg->dmask = (1 << data.size()) - 1;
6665 mimg->a16 = instr->src[1].ssa->bit_size == 16;
6666 mimg->unrm = true;
6667 ac_image_dim sdim = ac_get_image_dim(ctx->options->gfx_level, dim, is_array);
6668 mimg->dim = sdim;
6669 mimg->da = should_declare_array(sdim);
6670 mimg->disable_wqm = true;
6671 mimg->sync = sync;
6672 ctx->program->needs_exact = true;
6673 if (return_previous && cmpswap)
6674 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand::zero());
6675 return;
6676 }
6677
6678 void
6679 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6680 {
6681 Builder bld(ctx->program, ctx->block);
6682 unsigned num_components = instr->num_components;
6683
6684 Temp dst = get_ssa_temp(ctx, &instr->def);
6685 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6686
6687 unsigned access = nir_intrinsic_access(instr);
6688 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6689 unsigned size = instr->def.bit_size / 8;
6690
6691 bool allow_smem = access & ACCESS_CAN_REORDER;
6692
6693 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6694 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6695 get_memory_sync_info(instr, storage_buffer, 0));
6696 }
6697
6698 void
6699 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6700 {
6701 Builder bld(ctx->program, ctx->block);
6702 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6703 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6704 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6705 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6706
6707 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6708
6709 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6710 bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6711 ctx->program->gfx_level < GFX11;
6712
6713 unsigned write_count = 0;
6714 Temp write_datas[32];
6715 unsigned offsets[32];
6716 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6717 write_datas, offsets);
6718
6719 /* GFX6-7 are affected by a hw bug that prevents address clamping from working
6720 * correctly when the SGPR offset is used.
6721 */
6722 if (offset.type() == RegType::sgpr && ctx->options->gfx_level < GFX8)
6723 offset = as_vgpr(ctx, offset);
6724
6725 for (unsigned i = 0; i < write_count; i++) {
6726 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6727
6728 aco_ptr<MUBUF_instruction> store{
6729 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6730 store->operands[0] = Operand(rsrc);
6731 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6732 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6733 store->operands[3] = Operand(write_datas[i]);
6734 store->offset = offsets[i];
6735 store->offen = (offset.type() == RegType::vgpr);
6736 store->glc = glc;
6737 store->dlc = false;
6738 store->disable_wqm = true;
6739 store->sync = sync;
6740 ctx->program->needs_exact = true;
6741 ctx->block->instructions.emplace_back(std::move(store));
6742 }
6743 }
6744
6745 void
6746 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6747 {
6748 Builder bld(ctx->program, ctx->block);
6749 bool return_previous = !nir_def_is_unused(&instr->def);
6750 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6751
6752 const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6753 const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6754
6755 aco_opcode op32, op64, image_op;
6756 translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
6757
6758 if (cmpswap)
6759 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6760 get_ssa_temp(ctx, instr->src[3].ssa), data);
6761
6762 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6763 Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
6764 Temp dst = get_ssa_temp(ctx, &instr->def);
6765
6766 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
6767 aco_ptr<MUBUF_instruction> mubuf{
6768 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6769 mubuf->operands[0] = Operand(rsrc);
6770 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6771 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6772 mubuf->operands[3] = Operand(data);
6773 Definition def =
6774 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
6775 if (return_previous)
6776 mubuf->definitions[0] = def;
6777 mubuf->offset = 0;
6778 mubuf->offen = (offset.type() == RegType::vgpr);
6779 mubuf->glc = return_previous;
6780 mubuf->dlc = false; /* Not needed for atomics */
6781 mubuf->disable_wqm = true;
6782 mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6783 ctx->program->needs_exact = true;
6784 ctx->block->instructions.emplace_back(std::move(mubuf));
6785 if (return_previous && cmpswap)
6786 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
6787 }
6788
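/* Split a global access intrinsic into its address, constant offset and optional
 * variable offset sources.
 */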
6789 void
6790 parse_global(isel_context* ctx, nir_intrinsic_instr* intrin, Temp* address, uint32_t* const_offset,
6791 Temp* offset)
6792 {
6793 bool is_store = intrin->intrinsic == nir_intrinsic_store_global_amd;
6794 *address = get_ssa_temp(ctx, intrin->src[is_store ? 1 : 0].ssa);
6795
6796 *const_offset = nir_intrinsic_base(intrin);
6797
6798 unsigned num_src = nir_intrinsic_infos[intrin->intrinsic].num_srcs;
6799 nir_src offset_src = intrin->src[num_src - 1];
6800 if (!nir_src_is_const(offset_src) || nir_src_as_uint(offset_src))
6801 *offset = get_ssa_temp(ctx, offset_src.ssa);
6802 else
6803 *offset = Temp();
6804 }
6805
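/* Lower load_global_amd: use SMEM when the destination is uniform and the access is
 * read-only with suitable alignment, otherwise emit the load through the VMEM
 * global load path.
 */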
6806 void
6807 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6808 {
6809 Builder bld(ctx->program, ctx->block);
6810 unsigned num_components = instr->num_components;
6811 unsigned component_size = instr->def.bit_size / 8;
6812
6813 Temp addr, offset;
6814 uint32_t const_offset;
6815 parse_global(ctx, instr, &addr, &const_offset, &offset);
6816
6817 LoadEmitInfo info = {Operand(addr), get_ssa_temp(ctx, &instr->def), num_components,
6818 component_size};
6819 if (offset.id()) {
6820 info.resource = addr;
6821 info.offset = Operand(offset);
6822 }
6823 info.const_offset = const_offset;
6824 info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6825 info.align_mul = nir_intrinsic_align_mul(instr);
6826 info.align_offset = nir_intrinsic_align_offset(instr);
6827 info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6828
6829 /* Don't expand global loads when they use MUBUF or SMEM.
6830 * Global loads don't have the bounds checking that makes this safe for
6831 * buffer loads.
6832 */
6833 unsigned align = nir_intrinsic_align(instr);
6834 bool byte_align_for_smem_mubuf =
6835 can_use_byte_align_for_global_load(num_components, component_size, align, false);
6836
6837 /* VMEM stores don't update the SMEM cache, and it's difficult to prove that
6838 * it's safe to use SMEM. */
6839 bool can_use_smem =
6840 (nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE) && byte_align_for_smem_mubuf;
6841 if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->gfx_level < GFX8) ||
6842 !can_use_smem) {
6843 EmitLoadParameters params = global_load_params;
6844 params.byte_align_loads = ctx->options->gfx_level > GFX6 || byte_align_for_smem_mubuf;
6845 emit_load(ctx, bld, info, params);
6846 } else {
6847 if (info.resource.id())
6848 info.resource = bld.as_uniform(info.resource);
6849 info.offset = Operand(bld.as_uniform(info.offset));
6850 emit_load(ctx, bld, info, smem_load_params);
6851 }
6852 }
6853
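/* Lower store_global_amd: split the value into hardware-sized chunks and write each
 * with GLOBAL/FLAT stores on GFX7+ or MUBUF with addr64 on GFX6.
 */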
6854 void
6855 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6856 {
6857 Builder bld(ctx->program, ctx->block);
6858 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6859 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6860
6861 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6862 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6863 bool glc = (nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT)) &&
6864 ctx->program->gfx_level < GFX11;
6865
6866 unsigned write_count = 0;
6867 Temp write_datas[32];
6868 unsigned offsets[32];
6869 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6870 write_datas, offsets);
6871
6872 Temp addr, offset;
6873 uint32_t const_offset;
6874 parse_global(ctx, instr, &addr, &const_offset, &offset);
6875
6876 for (unsigned i = 0; i < write_count; i++) {
6877 Temp write_address = addr;
6878 uint32_t write_const_offset = const_offset;
6879 Temp write_offset = offset;
6880 lower_global_address(bld, offsets[i], &write_address, &write_const_offset, &write_offset);
6881
6882 if (ctx->options->gfx_level >= GFX7) {
6883 bool global = ctx->options->gfx_level >= GFX9;
6884 aco_opcode op;
6885 switch (write_datas[i].bytes()) {
6886 case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6887 case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6888 case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6889 case 8:
6890 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6891 break;
6892 case 12:
6893 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6894 break;
6895 case 16:
6896 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6897 break;
6898 default: unreachable("store_global not implemented for this size.");
6899 }
6900
6901 aco_ptr<FLAT_instruction> flat{
6902 create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6903 if (write_address.regClass() == s2) {
6904 assert(global && write_offset.id() && write_offset.type() == RegType::vgpr);
6905 flat->operands[0] = Operand(write_offset);
6906 flat->operands[1] = Operand(write_address);
6907 } else {
6908 assert(write_address.type() == RegType::vgpr && !write_offset.id());
6909 flat->operands[0] = Operand(write_address);
6910 flat->operands[1] = Operand(s1);
6911 }
6912 flat->operands[2] = Operand(write_datas[i]);
6913 flat->glc = glc;
6914 flat->dlc = false;
6915 assert(global || !write_const_offset);
6916 flat->offset = write_const_offset;
6917 flat->disable_wqm = true;
6918 flat->sync = sync;
6919 ctx->program->needs_exact = true;
6920 ctx->block->instructions.emplace_back(std::move(flat));
6921 } else {
6922 assert(ctx->options->gfx_level == GFX6);
6923
6924 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6925
6926 Temp rsrc = get_gfx6_global_rsrc(bld, write_address);
6927
6928 aco_ptr<MUBUF_instruction> mubuf{
6929 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6930 mubuf->operands[0] = Operand(rsrc);
6931 mubuf->operands[1] =
6932 write_address.type() == RegType::vgpr ? Operand(write_address) : Operand(v1);
6933 mubuf->operands[2] = Operand(write_offset);
6934 mubuf->operands[3] = Operand(write_datas[i]);
6935 mubuf->glc = glc;
6936 mubuf->dlc = false;
6937 mubuf->offset = write_const_offset;
6938 mubuf->addr64 = write_address.type() == RegType::vgpr;
6939 mubuf->disable_wqm = true;
6940 mubuf->sync = sync;
6941 ctx->program->needs_exact = true;
6942 ctx->block->instructions.emplace_back(std::move(mubuf));
6943 }
6944 }
6945 }
6946
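/* Global atomics: GFX7+ selects FLAT/GLOBAL atomic opcodes (glc is set when the
 * previous value is needed); GFX6 falls back to MUBUF buffer atomics through a
 * synthesized resource descriptor.
 */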
6947 void
6948 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6949 {
6950 Builder bld(ctx->program, ctx->block);
6951 bool return_previous = !nir_def_is_unused(&instr->def);
6952 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6953
6954 const nir_atomic_op nir_op = nir_intrinsic_atomic_op(instr);
6955 const bool cmpswap = nir_op == nir_atomic_op_cmpxchg;
6956
6957 if (cmpswap)
6958 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6959 get_ssa_temp(ctx, instr->src[2].ssa), data);
6960
6961 Temp dst = get_ssa_temp(ctx, &instr->def);
6962
6963 aco_opcode op32, op64;
6964
6965 Temp addr, offset;
6966 uint32_t const_offset;
6967 parse_global(ctx, instr, &addr, &const_offset, &offset);
6968 lower_global_address(bld, 0, &addr, &const_offset, &offset);
6969
6970 if (ctx->options->gfx_level >= GFX7) {
6971 bool global = ctx->options->gfx_level >= GFX9;
6972 switch (nir_op) {
6973 case nir_atomic_op_iadd:
6974 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6975 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6976 break;
6977 case nir_atomic_op_imin:
6978 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6979 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6980 break;
6981 case nir_atomic_op_umin:
6982 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6983 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6984 break;
6985 case nir_atomic_op_imax:
6986 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6987 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6988 break;
6989 case nir_atomic_op_umax:
6990 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6991 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6992 break;
6993 case nir_atomic_op_iand:
6994 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6995 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6996 break;
6997 case nir_atomic_op_ior:
6998 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6999 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
7000 break;
7001 case nir_atomic_op_ixor:
7002 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
7003 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
7004 break;
7005 case nir_atomic_op_xchg:
7006 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
7007 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
7008 break;
7009 case nir_atomic_op_cmpxchg:
7010 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
7011 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
7012 break;
7013 case nir_atomic_op_fadd:
7014 op32 = global ? aco_opcode::global_atomic_add_f32 : aco_opcode::flat_atomic_add_f32;
7015 op64 = aco_opcode::num_opcodes;
7016 break;
7017 case nir_atomic_op_fmin:
7018 op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
7019 op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
7020 break;
7021 case nir_atomic_op_fmax:
7022 op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
7023 op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
7024 break;
7025 default: unreachable("unsupported atomic operation");
7026 }
7027
7028 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
7029 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
7030 op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
7031 if (addr.regClass() == s2) {
7032 assert(global && offset.id() && offset.type() == RegType::vgpr);
7033 flat->operands[0] = Operand(offset);
7034 flat->operands[1] = Operand(addr);
7035 } else {
7036 assert(addr.type() == RegType::vgpr && !offset.id());
7037 flat->operands[0] = Operand(addr);
7038 flat->operands[1] = Operand(s1);
7039 }
7040 flat->operands[2] = Operand(data);
7041 if (return_previous)
7042 flat->definitions[0] = Definition(dst);
7043 flat->glc = return_previous;
7044 flat->dlc = false; /* Not needed for atomics */
7045 assert(global || !const_offset);
7046 flat->offset = const_offset;
7047 flat->disable_wqm = true;
7048 flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
7049 ctx->program->needs_exact = true;
7050 ctx->block->instructions.emplace_back(std::move(flat));
7051 } else {
7052 assert(ctx->options->gfx_level == GFX6);
7053
7054 UNUSED aco_opcode image_op;
7055 translate_buffer_image_atomic_op(nir_op, &op32, &op64, &image_op);
7056
7057 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
7058
7059 aco_opcode op = instr->def.bit_size == 32 ? op32 : op64;
7060
7061 aco_ptr<MUBUF_instruction> mubuf{
7062 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
7063 mubuf->operands[0] = Operand(rsrc);
7064 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
7065 mubuf->operands[2] = Operand(offset);
7066 mubuf->operands[3] = Operand(data);
7067 Definition def =
7068 return_previous ? (cmpswap ? bld.def(data.regClass()) : Definition(dst)) : Definition();
7069 if (return_previous)
7070 mubuf->definitions[0] = def;
7071 mubuf->glc = return_previous;
7072 mubuf->dlc = false;
7073 mubuf->offset = const_offset;
7074 mubuf->addr64 = addr.type() == RegType::vgpr;
7075 mubuf->disable_wqm = true;
7076 mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
7077 ctx->program->needs_exact = true;
7078 ctx->block->instructions.emplace_back(std::move(mubuf));
7079 if (return_previous && cmpswap)
7080 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), def.getTemp(), Operand::zero());
7081 }
7082 }
7083
7084 unsigned
7085 aco_storage_mode_from_nir_mem_mode(unsigned mem_mode)
7086 {
7087 unsigned storage = storage_none;
7088
7089 if (mem_mode & nir_var_shader_out)
7090 storage |= storage_vmem_output;
7091 if ((mem_mode & nir_var_mem_ssbo) || (mem_mode & nir_var_mem_global))
7092 storage |= storage_buffer;
7093 if (mem_mode & nir_var_mem_task_payload)
7094 storage |= storage_task_payload;
7095 if (mem_mode & nir_var_mem_shared)
7096 storage |= storage_shared;
7097 if (mem_mode & nir_var_image)
7098 storage |= storage_image;
7099
7100 return storage;
7101 }
7102
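/* Buffer loads (load_buffer_amd / load_typed_buffer_amd): gathers offsets, cache
 * bits and alignment into a LoadEmitInfo, then emits MTBUF loads for typed buffers
 * or MUBUF loads otherwise.
 */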
7103 void
7104 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7105 {
7106 Builder bld(ctx->program, ctx->block);
7107
7108 /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7109 bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7110 bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7111 !nir_src_is_const(intrin->src[3]) || nir_src_as_uint(intrin->src[3]);
7112 bool v_offset_zero = nir_src_is_const(intrin->src[1]) && !nir_src_as_uint(intrin->src[1]);
7113 bool s_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7114
7115 Temp dst = get_ssa_temp(ctx, &intrin->def);
7116 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7117 Temp v_offset =
7118 v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7119 Temp s_offset =
7120 s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7121 Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[3].ssa)) : Temp();
7122
7123 bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
7124 bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
7125
7126 unsigned const_offset = nir_intrinsic_base(intrin);
7127 unsigned elem_size_bytes = intrin->def.bit_size / 8u;
7128 unsigned num_components = intrin->def.num_components;
7129
7130 nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7131 memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode));
7132
7133 LoadEmitInfo info = {Operand(v_offset), dst, num_components, elem_size_bytes, descriptor};
7134 info.idx = idx;
7135 info.glc = glc;
7136 info.slc = slc;
7137 info.soffset = s_offset;
7138 info.const_offset = const_offset;
7139 info.sync = sync;
7140
7141 if (intrin->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
7142 const pipe_format format = nir_intrinsic_format(intrin);
7143 const struct ac_vtx_format_info* vtx_info =
7144 ac_get_vtx_format_info(ctx->program->gfx_level, ctx->program->family, format);
7145 const struct util_format_description* f = util_format_description(format);
7146 const unsigned align_mul = nir_intrinsic_align_mul(intrin);
7147 const unsigned align_offset = nir_intrinsic_align_offset(intrin);
7148
7149 /* Avoid splitting:
7150 * - non-array formats because that would result in incorrect code
7151     * - when the element size is the same as the component size (to reduce instruction count)
7152 */
7153 const bool can_split = f->is_array && elem_size_bytes != vtx_info->chan_byte_size;
7154
7155 info.align_mul = align_mul;
7156 info.align_offset = align_offset;
7157 info.format = format;
7158 info.component_stride = can_split ? vtx_info->chan_byte_size : 0;
7159 info.split_by_component_stride = false;
7160
7161 emit_load(ctx, bld, info, mtbuf_load_params);
7162 } else {
7163 assert(intrin->intrinsic == nir_intrinsic_load_buffer_amd);
7164
7165 if (nir_intrinsic_access(intrin) & ACCESS_USES_FORMAT_AMD) {
7166 assert(!swizzled);
7167
7168 emit_load(ctx, bld, info, mubuf_load_format_params);
7169 } else {
7170 const unsigned swizzle_element_size =
7171 swizzled ? (ctx->program->gfx_level <= GFX8 ? 4 : 16) : 0;
7172
7173 info.component_stride = swizzle_element_size;
7174 info.swizzle_component_size = swizzle_element_size ? 4 : 0;
7175 info.align_mul = MIN2(elem_size_bytes, 4);
7176 info.align_offset = 0;
7177
7178 emit_load(ctx, bld, info, mubuf_load_params);
7179 }
7180 }
7181 }
7182
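/* Buffer stores (store_buffer_amd): forwards the descriptor, offsets and cache bits
 * to store_vmem_mubuf. GS outputs are written only once, so they can be reordered.
 */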
7183 void
7184 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7185 {
7186 Builder bld(ctx->program, ctx->block);
7187
7188 /* Swizzled buffer addressing seems to be broken on GFX11 without the idxen bit. */
7189 bool swizzled = nir_intrinsic_access(intrin) & ACCESS_IS_SWIZZLED_AMD;
7190 bool idxen = (swizzled && ctx->program->gfx_level >= GFX11) ||
7191 !nir_src_is_const(intrin->src[4]) || nir_src_as_uint(intrin->src[4]);
7192 bool v_offset_zero = nir_src_is_const(intrin->src[2]) && !nir_src_as_uint(intrin->src[2]);
7193 bool s_offset_zero = nir_src_is_const(intrin->src[3]) && !nir_src_as_uint(intrin->src[3]);
7194
7195 Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7196 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[1].ssa));
7197 Temp v_offset =
7198 v_offset_zero ? Temp(0, v1) : as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[2].ssa));
7199 Temp s_offset =
7200 s_offset_zero ? Temp(0, s1) : bld.as_uniform(get_ssa_temp(ctx, intrin->src[3].ssa));
7201 Temp idx = idxen ? as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[4].ssa)) : Temp();
7202
7203 bool glc = nir_intrinsic_access(intrin) & ACCESS_COHERENT;
7204 bool slc = nir_intrinsic_access(intrin) & ACCESS_NON_TEMPORAL;
7205
7206 unsigned const_offset = nir_intrinsic_base(intrin);
7207 unsigned write_mask = nir_intrinsic_write_mask(intrin);
7208 unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7209
7210 nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7211 /* GS outputs are only written once. */
7212 const bool written_once =
7213 mem_mode == nir_var_shader_out && ctx->shader->info.stage == MESA_SHADER_GEOMETRY;
7214 memory_sync_info sync(aco_storage_mode_from_nir_mem_mode(mem_mode),
7215 written_once ? semantic_can_reorder : semantic_none);
7216
7217 store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, idx, const_offset,
7218 elem_size_bytes, write_mask, swizzled, sync, glc, slc);
7219 }
7220
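/* Scalar (SMEM) loads: a 32-bit base address is widened to 64 bits, then the
 * smallest s_load_dword* variant covering the destination size is used.
 */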
7221 void
7222 visit_load_smem(isel_context* ctx, nir_intrinsic_instr* instr)
7223 {
7224 Builder bld(ctx->program, ctx->block);
7225 Temp dst = get_ssa_temp(ctx, &instr->def);
7226 Temp base = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
7227 Temp offset = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7228
7229    /* If the base address is 32-bit, extend it to 64-bit by appending the known high 32 bits. */
7230 if (base.bytes() == 4) {
7231 base = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), base,
7232 Operand::c32(ctx->options->address32_hi));
7233 }
7234
7235 aco_opcode opcode = aco_opcode::s_load_dword;
7236 unsigned size = 1;
7237
7238 assert(dst.bytes() <= 64);
7239
7240 if (dst.bytes() > 32) {
7241 opcode = aco_opcode::s_load_dwordx16;
7242 size = 16;
7243 } else if (dst.bytes() > 16) {
7244 opcode = aco_opcode::s_load_dwordx8;
7245 size = 8;
7246 } else if (dst.bytes() > 8) {
7247 opcode = aco_opcode::s_load_dwordx4;
7248 size = 4;
7249 } else if (dst.bytes() > 4) {
7250 opcode = aco_opcode::s_load_dwordx2;
7251 size = 2;
7252 }
7253
7254 if (dst.size() != size) {
7255 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst),
7256 bld.smem(opcode, bld.def(RegType::sgpr, size), base, offset), Operand::c32(0u));
7257 } else {
7258 bld.smem(opcode, Definition(dst), base, offset);
7259 }
7260 emit_split_vector(ctx, dst, instr->def.num_components);
7261 }
7262
7263 sync_scope
7264 translate_nir_scope(mesa_scope scope)
7265 {
7266 switch (scope) {
7267 case SCOPE_NONE:
7268 case SCOPE_INVOCATION: return scope_invocation;
7269 case SCOPE_SUBGROUP: return scope_subgroup;
7270 case SCOPE_WORKGROUP: return scope_workgroup;
7271 case SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7272 case SCOPE_DEVICE: return scope_device;
7273 case SCOPE_SHADER_CALL: return scope_invocation;
7274 }
7275 unreachable("invalid scope");
7276 }
7277
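/* Translates nir_intrinsic_barrier into p_barrier. The allowed storage classes
 * depend on the HW stage (e.g. shared storage only for stages that use LDS), and
 * workgroup execution scope is only permitted where it cannot hang merged waves.
 */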
7278 void
7279 emit_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7280 {
7281 Builder bld(ctx->program, ctx->block);
7282
7283 unsigned storage_allowed = storage_buffer | storage_image;
7284 unsigned semantics = 0;
7285 sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7286 sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7287
7288 /* We use shared storage for the following:
7289 * - compute shaders expose it in their API
7290 * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7291 * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7292 * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7293 */
7294 bool shared_storage_used =
7295 ctx->stage.hw == AC_HW_COMPUTE_SHADER || ctx->stage.hw == AC_HW_LOCAL_SHADER ||
7296 ctx->stage.hw == AC_HW_HULL_SHADER ||
7297 (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER && ctx->program->gfx_level >= GFX9) ||
7298 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7299
7300 if (shared_storage_used)
7301 storage_allowed |= storage_shared;
7302
7303 /* Task payload: Task Shader output, Mesh Shader input */
7304 if (ctx->stage.has(SWStage::MS) || ctx->stage.has(SWStage::TS))
7305 storage_allowed |= storage_task_payload;
7306
7307 /* Allow VMEM output for all stages that can have outputs. */
7308 if ((ctx->stage.hw != AC_HW_COMPUTE_SHADER && ctx->stage.hw != AC_HW_PIXEL_SHADER) ||
7309 ctx->stage.has(SWStage::TS))
7310 storage_allowed |= storage_vmem_output;
7311
7312 /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7313 * They are allowed in CS, TCS, and in any NGG shader.
7314 */
7315 ASSERTED bool workgroup_scope_allowed = ctx->stage.hw == AC_HW_COMPUTE_SHADER ||
7316 ctx->stage.hw == AC_HW_HULL_SHADER ||
7317 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER;
7318
7319 unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7320 unsigned storage = aco_storage_mode_from_nir_mem_mode(nir_storage);
7321 storage &= storage_allowed;
7322
7323 unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
7324 if (nir_semantics & NIR_MEMORY_ACQUIRE)
7325 semantics |= semantic_acquire | semantic_release;
7326 if (nir_semantics & NIR_MEMORY_RELEASE)
7327 semantics |= semantic_acquire | semantic_release;
7328
7329 assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7330 assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7331
7332 bld.barrier(aco_opcode::p_barrier,
7333 memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7334 exec_scope);
7335 }
7336
7337 void
7338 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7339 {
7340 // TODO: implement sparse reads using ds_read2_b32 and nir_def_components_read()
7341 Temp dst = get_ssa_temp(ctx, &instr->def);
7342 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7343 Builder bld(ctx->program, ctx->block);
7344
7345 unsigned elem_size_bytes = instr->def.bit_size / 8;
7346 unsigned num_components = instr->def.num_components;
7347 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7348 load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7349 }
7350
7351 void
7352 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7353 {
7354 unsigned writemask = nir_intrinsic_write_mask(instr);
7355 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7356 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7357 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7358
7359 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7360 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7361 }
7362
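/* LDS atomics: picks the 32/64-bit DS opcode (with or without a returned value) and
 * folds the constant base into the DS offset field when it fits; otherwise it is
 * added to the address first.
 */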
7363 void
7364 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7365 {
7366 unsigned offset = nir_intrinsic_base(instr);
7367 Builder bld(ctx->program, ctx->block);
7368 Operand m = load_lds_size_m0(bld);
7369 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7370 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7371
7372 unsigned num_operands = 3;
7373 aco_opcode op32, op64, op32_rtn, op64_rtn;
7374 switch (nir_intrinsic_atomic_op(instr)) {
7375 case nir_atomic_op_iadd:
7376 op32 = aco_opcode::ds_add_u32;
7377 op64 = aco_opcode::ds_add_u64;
7378 op32_rtn = aco_opcode::ds_add_rtn_u32;
7379 op64_rtn = aco_opcode::ds_add_rtn_u64;
7380 break;
7381 case nir_atomic_op_imin:
7382 op32 = aco_opcode::ds_min_i32;
7383 op64 = aco_opcode::ds_min_i64;
7384 op32_rtn = aco_opcode::ds_min_rtn_i32;
7385 op64_rtn = aco_opcode::ds_min_rtn_i64;
7386 break;
7387 case nir_atomic_op_umin:
7388 op32 = aco_opcode::ds_min_u32;
7389 op64 = aco_opcode::ds_min_u64;
7390 op32_rtn = aco_opcode::ds_min_rtn_u32;
7391 op64_rtn = aco_opcode::ds_min_rtn_u64;
7392 break;
7393 case nir_atomic_op_imax:
7394 op32 = aco_opcode::ds_max_i32;
7395 op64 = aco_opcode::ds_max_i64;
7396 op32_rtn = aco_opcode::ds_max_rtn_i32;
7397 op64_rtn = aco_opcode::ds_max_rtn_i64;
7398 break;
7399 case nir_atomic_op_umax:
7400 op32 = aco_opcode::ds_max_u32;
7401 op64 = aco_opcode::ds_max_u64;
7402 op32_rtn = aco_opcode::ds_max_rtn_u32;
7403 op64_rtn = aco_opcode::ds_max_rtn_u64;
7404 break;
7405 case nir_atomic_op_iand:
7406 op32 = aco_opcode::ds_and_b32;
7407 op64 = aco_opcode::ds_and_b64;
7408 op32_rtn = aco_opcode::ds_and_rtn_b32;
7409 op64_rtn = aco_opcode::ds_and_rtn_b64;
7410 break;
7411 case nir_atomic_op_ior:
7412 op32 = aco_opcode::ds_or_b32;
7413 op64 = aco_opcode::ds_or_b64;
7414 op32_rtn = aco_opcode::ds_or_rtn_b32;
7415 op64_rtn = aco_opcode::ds_or_rtn_b64;
7416 break;
7417 case nir_atomic_op_ixor:
7418 op32 = aco_opcode::ds_xor_b32;
7419 op64 = aco_opcode::ds_xor_b64;
7420 op32_rtn = aco_opcode::ds_xor_rtn_b32;
7421 op64_rtn = aco_opcode::ds_xor_rtn_b64;
7422 break;
7423 case nir_atomic_op_xchg:
7424 op32 = aco_opcode::ds_write_b32;
7425 op64 = aco_opcode::ds_write_b64;
7426 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7427 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7428 break;
7429 case nir_atomic_op_cmpxchg:
7430 op32 = aco_opcode::ds_cmpst_b32;
7431 op64 = aco_opcode::ds_cmpst_b64;
7432 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7433 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7434 num_operands = 4;
7435 break;
7436 case nir_atomic_op_fadd:
7437 op32 = aco_opcode::ds_add_f32;
7438 op32_rtn = aco_opcode::ds_add_rtn_f32;
7439 op64 = aco_opcode::num_opcodes;
7440 op64_rtn = aco_opcode::num_opcodes;
7441 break;
7442 case nir_atomic_op_fmin:
7443 op32 = aco_opcode::ds_min_f32;
7444 op32_rtn = aco_opcode::ds_min_rtn_f32;
7445 op64 = aco_opcode::ds_min_f64;
7446 op64_rtn = aco_opcode::ds_min_rtn_f64;
7447 break;
7448 case nir_atomic_op_fmax:
7449 op32 = aco_opcode::ds_max_f32;
7450 op32_rtn = aco_opcode::ds_max_rtn_f32;
7451 op64 = aco_opcode::ds_max_f64;
7452 op64_rtn = aco_opcode::ds_max_rtn_f64;
7453 break;
7454 default: unreachable("Unhandled shared atomic intrinsic");
7455 }
7456
7457 bool return_previous = !nir_def_is_unused(&instr->def);
7458
7459 aco_opcode op;
7460 if (data.size() == 1) {
7461 assert(instr->def.bit_size == 32);
7462 op = return_previous ? op32_rtn : op32;
7463 } else {
7464 assert(instr->def.bit_size == 64);
7465 op = return_previous ? op64_rtn : op64;
7466 }
7467
7468 if (offset > 65535) {
7469 address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7470 offset = 0;
7471 }
7472
7473 aco_ptr<DS_instruction> ds;
7474 ds.reset(
7475 create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
7476 ds->operands[0] = Operand(address);
7477 ds->operands[1] = Operand(data);
7478 if (num_operands == 4) {
7479 Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7480 ds->operands[2] = Operand(data2);
7481 if (bld.program->gfx_level >= GFX11)
7482 std::swap(ds->operands[1], ds->operands[2]);
7483 }
7484 ds->operands[num_operands - 1] = m;
7485 ds->offset0 = offset;
7486 if (return_previous)
7487 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->def));
7488 ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7489
7490 if (m.isUndefined())
7491 ds->operands.pop_back();
7492
7493 ctx->block->instructions.emplace_back(std::move(ds));
7494 }
7495
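/* Paired LDS access (load/store_shared2_amd): emits ds_read2/ds_write2 variants with
 * two independent offsets; SGPR destinations are rebuilt from per-component
 * readfirstlane copies.
 */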
7496 void
7497 visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
7498 {
7499 bool is_store = instr->intrinsic == nir_intrinsic_store_shared2_amd;
7500 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[is_store].ssa));
7501 Builder bld(ctx->program, ctx->block);
7502
7503 assert(bld.program->gfx_level >= GFX7);
7504
7505 bool is64bit = (is_store ? instr->src[0].ssa->bit_size : instr->def.bit_size) == 64;
7506 uint8_t offset0 = nir_intrinsic_offset0(instr);
7507 uint8_t offset1 = nir_intrinsic_offset1(instr);
7508 bool st64 = nir_intrinsic_st64(instr);
7509
7510 Operand m = load_lds_size_m0(bld);
7511 Instruction* ds;
7512 if (is_store) {
7513 aco_opcode op = st64
7514 ? (is64bit ? aco_opcode::ds_write2st64_b64 : aco_opcode::ds_write2st64_b32)
7515 : (is64bit ? aco_opcode::ds_write2_b64 : aco_opcode::ds_write2_b32);
7516 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7517 RegClass comp_rc = is64bit ? v2 : v1;
7518 Temp data0 = emit_extract_vector(ctx, data, 0, comp_rc);
7519 Temp data1 = emit_extract_vector(ctx, data, 1, comp_rc);
7520 ds = bld.ds(op, address, data0, data1, m, offset0, offset1);
7521 } else {
7522 Temp dst = get_ssa_temp(ctx, &instr->def);
7523 Definition tmp_dst(dst.type() == RegType::vgpr ? dst : bld.tmp(is64bit ? v4 : v2));
7524 aco_opcode op = st64 ? (is64bit ? aco_opcode::ds_read2st64_b64 : aco_opcode::ds_read2st64_b32)
7525 : (is64bit ? aco_opcode::ds_read2_b64 : aco_opcode::ds_read2_b32);
7526 ds = bld.ds(op, tmp_dst, address, m, offset0, offset1);
7527 }
7528 ds->ds().sync = memory_sync_info(storage_shared);
7529 if (m.isUndefined())
7530 ds->operands.pop_back();
7531
7532 if (!is_store) {
7533 Temp dst = get_ssa_temp(ctx, &instr->def);
7534 if (dst.type() == RegType::sgpr) {
7535 emit_split_vector(ctx, ds->definitions[0].getTemp(), dst.size());
7536 Temp comp[4];
7537 /* Use scalar v_readfirstlane_b32 for better 32-bit copy propagation */
7538 for (unsigned i = 0; i < dst.size(); i++)
7539 comp[i] = bld.as_uniform(emit_extract_vector(ctx, ds->definitions[0].getTemp(), i, v1));
7540 if (is64bit) {
7541 Temp comp0 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[0], comp[1]);
7542 Temp comp1 = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), comp[2], comp[3]);
7543 ctx->allocated_vec[comp0.id()] = {comp[0], comp[1]};
7544 ctx->allocated_vec[comp1.id()] = {comp[2], comp[3]};
7545 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp0, comp1);
7546 ctx->allocated_vec[dst.id()] = {comp0, comp1};
7547 } else {
7548 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), comp[0], comp[1]);
7549 }
7550 }
7551
7552 emit_split_vector(ctx, dst, 2);
7553 }
7554 }
7555
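/* Builds the MUBUF resource descriptor used for scratch access: per-lane swizzling
 * via ADD_TID_ENABLE with an index stride matching the wave size.
 */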
7556 Temp
7557 get_scratch_resource(isel_context* ctx)
7558 {
7559 Builder bld(ctx->program, ctx->block);
7560 Temp scratch_addr = ctx->program->private_segment_buffer;
7561 if (!scratch_addr.bytes()) {
7562 Temp addr_lo =
7563 bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
7564 Temp addr_hi =
7565 bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
7566 scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
7567 } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
7568 scratch_addr =
7569 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7570 }
7571
7572 uint32_t rsrc_conf =
7573 S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
7574
7575 if (ctx->program->gfx_level >= GFX10) {
7576 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
7577 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
7578 S_008F0C_RESOURCE_LEVEL(ctx->program->gfx_level < GFX11);
7579 } else if (ctx->program->gfx_level <=
7580 GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
7581 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7582 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7583 }
7584
7585    /* Older generations need an element size of 4 bytes; the element size field was removed in GFX9. */
7586 if (ctx->program->gfx_level <= GFX8)
7587 rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
7588
7589 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
7590 Operand::c32(rsrc_conf));
7591 }
7592
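/* Scratch loads: GFX9+ uses FLAT scratch instructions, folding constant offsets into
 * the immediate where possible; older chips go through the MUBUF scratch resource
 * with the per-wave scratch offset as soffset.
 */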
7593 void
7594 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7595 {
7596 Builder bld(ctx->program, ctx->block);
7597 Temp dst = get_ssa_temp(ctx, &instr->def);
7598
7599 LoadEmitInfo info = {Operand(v1), dst, instr->def.num_components, instr->def.bit_size / 8u};
7600 info.align_mul = nir_intrinsic_align_mul(instr);
7601 info.align_offset = nir_intrinsic_align_offset(instr);
7602 info.swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 0;
7603 info.sync = memory_sync_info(storage_scratch, semantic_private);
7604 if (ctx->program->gfx_level >= GFX9) {
7605 if (nir_src_is_const(instr->src[0])) {
7606 uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7607 info.offset =
7608 bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
7609 info.const_offset = nir_src_as_uint(instr->src[0]) % max;
7610 } else {
7611 info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
7612 }
7613 EmitLoadParameters params = scratch_flat_load_params;
7614 params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
7615 emit_load(ctx, bld, info, params);
7616 } else {
7617 info.resource = get_scratch_resource(ctx);
7618 info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
7619 info.soffset = ctx->program->scratch_offset;
7620 emit_load(ctx, bld, info, scratch_mubuf_load_params);
7621 }
7622 }
7623
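/* Scratch stores mirror the load path: the data is split by write mask, then stored
 * with scratch_store_* on GFX9+ or with MUBUF stores through the scratch resource on
 * older chips.
 */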
7624 void
7625 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7626 {
7627 Builder bld(ctx->program, ctx->block);
7628 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7629 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
7630
7631 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7632 unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7633
7634 unsigned write_count = 0;
7635 Temp write_datas[32];
7636 unsigned offsets[32];
7637 unsigned swizzle_component_size = ctx->program->gfx_level <= GFX8 ? 4 : 16;
7638 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7639 &write_count, write_datas, offsets);
7640
7641 if (ctx->program->gfx_level >= GFX9) {
7642 uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
7643 offset = nir_src_is_const(instr->src[1]) ? Temp(0, s1) : offset;
7644 uint32_t base_const_offset =
7645 nir_src_is_const(instr->src[1]) ? nir_src_as_uint(instr->src[1]) : 0;
7646
7647 for (unsigned i = 0; i < write_count; i++) {
7648 aco_opcode op;
7649 switch (write_datas[i].bytes()) {
7650 case 1: op = aco_opcode::scratch_store_byte; break;
7651 case 2: op = aco_opcode::scratch_store_short; break;
7652 case 4: op = aco_opcode::scratch_store_dword; break;
7653 case 8: op = aco_opcode::scratch_store_dwordx2; break;
7654 case 12: op = aco_opcode::scratch_store_dwordx3; break;
7655 case 16: op = aco_opcode::scratch_store_dwordx4; break;
7656 default: unreachable("Unexpected store size");
7657 }
7658
7659 uint32_t const_offset = base_const_offset + offsets[i];
7660 assert(const_offset < max || offset.id() == 0);
7661
7662 Operand addr = offset.regClass() == s1 ? Operand(v1) : Operand(offset);
7663 Operand saddr = offset.regClass() == s1 ? Operand(offset) : Operand(s1);
7664 if (offset.id() == 0)
7665 saddr = bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(const_offset, max)));
7666
7667 bld.scratch(op, addr, saddr, write_datas[i], const_offset % max,
7668 memory_sync_info(storage_scratch, semantic_private));
7669 }
7670 } else {
7671 Temp rsrc = get_scratch_resource(ctx);
7672 offset = as_vgpr(ctx, offset);
7673 for (unsigned i = 0; i < write_count; i++) {
7674 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7675 Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset,
7676 write_datas[i], offsets[i], true, true);
7677 mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7678 }
7679 }
7680 }
7681
7682 ReduceOp
7683 get_reduce_op(nir_op op, unsigned bit_size)
7684 {
7685 switch (op) {
7686 #define CASEI(name) \
7687 case nir_op_##name: \
7688 return (bit_size == 32) ? name##32 \
7689 : (bit_size == 16) ? name##16 \
7690 : (bit_size == 8) ? name##8 \
7691 : name##64;
7692 #define CASEF(name) \
7693 case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7694 CASEI(iadd)
7695 CASEI(imul)
7696 CASEI(imin)
7697 CASEI(umin)
7698 CASEI(imax)
7699 CASEI(umax)
7700 CASEI(iand)
7701 CASEI(ior)
7702 CASEI(ixor)
7703 CASEF(fadd)
7704 CASEF(fmul)
7705 CASEF(fmin)
7706 CASEF(fmax)
7707 default: unreachable("unknown reduction op");
7708 #undef CASEI
7709 #undef CASEF
7710 }
7711 }
7712
7713 void
7714 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7715 {
7716 Builder bld(ctx->program, ctx->block);
7717 Definition dst(get_ssa_temp(ctx, &instr->def));
7718 assert(dst.regClass().type() != RegType::vgpr);
7719 if (src.regClass().type() == RegType::vgpr)
7720 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7721 else
7722 bld.copy(dst, src);
7723 }
7724
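/* Additive reduction of a subgroup-uniform value: iadd/fadd become value * count and
 * ixor becomes value * (count & 1), where count is supplied by the caller (popcount
 * of exec for reductions, mbcnt for scans).
 */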
7725 void
7726 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7727 {
7728 Builder bld(ctx->program, ctx->block);
7729 Temp src_tmp = get_ssa_temp(ctx, src.ssa);
7730
7731 if (op == nir_op_fadd) {
7732 src_tmp = as_vgpr(ctx, src_tmp);
7733 Temp tmp = dst.regClass() == s1 ? bld.tmp(RegClass::get(RegType::vgpr, src.ssa->bit_size / 8))
7734 : dst.getTemp();
7735
7736 if (src.ssa->bit_size == 16) {
7737 count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7738 bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7739 } else {
7740 assert(src.ssa->bit_size == 32);
7741 count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7742 bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7743 }
7744
7745 if (tmp != dst.getTemp())
7746 bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7747
7748 return;
7749 }
7750
7751 if (dst.regClass() == s1)
7752 src_tmp = bld.as_uniform(src_tmp);
7753
7754 if (op == nir_op_ixor && count.type() == RegType::sgpr)
7755 count =
7756 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7757 else if (op == nir_op_ixor)
7758 count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7759
7760 assert(dst.getTemp().type() == count.type());
7761
7762 if (nir_src_is_const(src)) {
7763 if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7764 bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7765 else if (nir_src_as_uint(src) == 1)
7766 bld.copy(dst, count);
7767 else if (nir_src_as_uint(src) == 0)
7768 bld.copy(dst, Operand::zero(dst.bytes()));
7769 else if (count.type() == RegType::vgpr)
7770 bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7771 else
7772 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7773 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX10) {
7774 bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7775 } else if (dst.bytes() <= 2 && ctx->program->gfx_level >= GFX8) {
7776 bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7777 } else if (dst.getTemp().type() == RegType::vgpr) {
7778 bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7779 } else {
7780 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7781 }
7782 }
7783
7784 bool
7785 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7786 {
7787 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7788 if (op == nir_op_imul || op == nir_op_fmul)
7789 return false;
7790
7791 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7792 Builder bld(ctx->program, ctx->block);
7793 Definition dst(get_ssa_temp(ctx, &instr->def));
7794 unsigned bit_size = instr->src[0].ssa->bit_size;
7795 if (bit_size > 32)
7796 return false;
7797
7798 Temp thread_count =
7799 bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7800 set_wqm(ctx);
7801
7802 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7803 } else {
7804 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7805 }
7806
7807 return true;
7808 }
7809
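/* Scans of a subgroup-uniform value: additive ops multiply by the per-lane prefix
 * count from mbcnt; for min/max/and/or the inclusive scan is the value itself, and
 * the exclusive scan additionally writes the operation's identity into the first
 * active lane.
 */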
7810 bool
7811 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7812 {
7813 Builder bld(ctx->program, ctx->block);
7814 Definition dst(get_ssa_temp(ctx, &instr->def));
7815 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7816 bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7817
7818 if (op == nir_op_imul || op == nir_op_fmul)
7819 return false;
7820
7821 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7822 if (instr->src[0].ssa->bit_size > 32)
7823 return false;
7824
7825 Temp packed_tid;
7826 if (inc)
7827 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7828 else
7829 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7830 set_wqm(ctx);
7831
7832 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7833 return true;
7834 }
7835
7836 assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7837 op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7838
7839 if (inc) {
7840 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7841 return true;
7842 }
7843
7844 /* Copy the source and write the reduction operation identity to the first lane. */
7845 Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7846 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7847 ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7848 if (dst.bytes() == 8) {
7849 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7850 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7851 uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7852 uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7853
7854 lo =
7855 bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_lo)), lane, lo);
7856 hi =
7857 bld.writelane(bld.def(v1), bld.copy(bld.def(s1, m0), Operand::c32(identity_hi)), lane, hi);
7858 bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7859 } else {
7860 uint32_t identity = get_reduction_identity(reduce_op, 0);
7861 bld.writelane(dst, bld.copy(bld.def(s1, m0), Operand::c32(identity)), lane,
7862 as_vgpr(ctx, src));
7863 }
7864
7865 set_wqm(ctx);
7866 return true;
7867 }
7868
7869 Temp
7870 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7871 Definition dst, Temp src)
7872 {
7873 assert(src.bytes() <= 8);
7874 assert(src.type() == RegType::vgpr);
7875
7876 Builder bld(ctx->program, ctx->block);
7877
7878 unsigned num_defs = 0;
7879 Definition defs[5];
7880 defs[num_defs++] = dst;
7881 defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7882
7883 /* scalar identity temporary */
7884 bool need_sitmp = (ctx->program->gfx_level <= GFX7 || ctx->program->gfx_level >= GFX10) &&
7885 aco_op != aco_opcode::p_reduce;
7886 if (aco_op == aco_opcode::p_exclusive_scan) {
7887 need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7888 op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7889 op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7890 op == fmul64);
7891 }
7892 if (need_sitmp)
7893 defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7894
7895 /* scc clobber */
7896 defs[num_defs++] = bld.def(s1, scc);
7897
7898 /* vcc clobber */
7899 bool clobber_vcc = false;
7900 if ((op == iadd32 || op == imul64) && ctx->program->gfx_level < GFX9)
7901 clobber_vcc = true;
7902 if ((op == iadd8 || op == iadd16) && ctx->program->gfx_level < GFX8)
7903 clobber_vcc = true;
7904 if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7905 clobber_vcc = true;
7906
7907 if (clobber_vcc)
7908 defs[num_defs++] = bld.def(bld.lm, vcc);
7909
7910 Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7911 aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7912 reduce->operands[0] = Operand(src);
7913 /* setup_reduce_temp will update these undef operands if needed */
7914 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7915 reduce->operands[2] = Operand(v1.as_linear());
7916 std::copy(defs, defs + num_defs, reduce->definitions.begin());
7917
7918 reduce->reduce_op = op;
7919 reduce->cluster_size = cluster_size;
7920 bld.insert(std::move(reduce));
7921
7922 return dst.getTemp();
7923 }
7924
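/* Derives an exclusive scan from an inclusive one by undoing the last element:
 * subtract the source for iadd, xor it out for ixor.
 */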
7925 Temp
7926 inclusive_scan_to_exclusive(isel_context* ctx, ReduceOp op, Definition dst, Temp src)
7927 {
7928 Builder bld(ctx->program, ctx->block);
7929
7930 Temp scan = emit_reduction_instr(ctx, aco_opcode::p_inclusive_scan, op, ctx->program->wave_size,
7931 bld.def(dst.regClass()), src);
7932
7933 switch (op) {
7934 case iadd8:
7935 case iadd16:
7936 case iadd32: return bld.vsub32(dst, scan, src);
7937 case ixor64:
7938 case iadd64: {
7939 Temp src00 = bld.tmp(v1);
7940 Temp src01 = bld.tmp(v1);
7941 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), scan);
7942 Temp src10 = bld.tmp(v1);
7943 Temp src11 = bld.tmp(v1);
7944 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src);
7945
7946 Temp lower = bld.tmp(v1);
7947 Temp upper = bld.tmp(v1);
7948 if (op == iadd64) {
7949 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
7950 bld.vsub32(Definition(upper), src01, src11, false, borrow);
7951 } else {
7952 bld.vop2(aco_opcode::v_xor_b32, Definition(lower), src00, src10);
7953 bld.vop2(aco_opcode::v_xor_b32, Definition(upper), src01, src11);
7954 }
7955 return bld.pseudo(aco_opcode::p_create_vector, dst, lower, upper);
7956 }
7957 case ixor8:
7958 case ixor16:
7959 case ixor32: return bld.vop2(aco_opcode::v_xor_b32, dst, scan, src);
7960 default: unreachable("Unsupported op");
7961 }
7962 }
7963
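/* Tries to emit a constant-amount lane rotation within a cluster as a single
 * instruction (DPP, DPP8, ds_swizzle or v_permlane64, depending on cluster size and
 * GFX level). Returns false and leaves dst with id 0 if no such form exists.
 */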
7964 bool
7965 emit_rotate_by_constant(isel_context* ctx, Temp& dst, Temp src, unsigned cluster_size,
7966 uint64_t delta)
7967 {
7968 Builder bld(ctx->program, ctx->block);
7969 RegClass rc = src.regClass();
7970 dst = Temp(0, rc);
7971 delta %= cluster_size;
7972
7973 if (delta == 0) {
7974 dst = bld.copy(bld.def(rc), src);
7975 } else if (delta * 2 == cluster_size && cluster_size <= 32) {
7976 dst = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, delta), true);
7977 } else if (cluster_size == 4) {
7978 unsigned res[4];
7979 for (unsigned i = 0; i < 4; i++)
7980 res[i] = (i + delta) & 0x3;
7981 uint32_t dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
7982 if (ctx->program->gfx_level >= GFX8)
7983 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_ctrl);
7984 else
7985 dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl);
7986 } else if (cluster_size == 8 && ctx->program->gfx_level >= GFX10) {
7987 uint32_t lane_sel = 0;
7988 for (unsigned i = 0; i < 8; i++)
7989 lane_sel |= ((i + delta) & 0x7) << (i * 3);
7990 dst = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(rc), src, lane_sel);
7991 } else if (cluster_size == 16 && ctx->program->gfx_level >= GFX8) {
7992 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_row_rr(16 - delta));
7993 } else if (cluster_size <= 32 && ctx->program->gfx_level >= GFX9) {
7994 uint32_t ctrl = ds_pattern_rotate(delta, ~(cluster_size - 1) & 0x1f);
7995 dst = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, ctrl);
7996 } else if (cluster_size == 64) {
7997 bool has_wf_dpp = ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX10;
7998 if (delta == 32 && ctx->program->gfx_level >= GFX11) {
7999 dst = bld.vop1(aco_opcode::v_permlane64_b32, bld.def(rc), src);
8000 } else if (delta == 1 && has_wf_dpp) {
8001 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rl1);
8002 } else if (delta == 63 && has_wf_dpp) {
8003 dst = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(rc), src, dpp_wf_rr1);
8004 }
8005 }
8006
8007 return dst.id() != 0;
8008 }
8009
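/* Interpolation at an arbitrary offset: per-quad derivatives of the barycentrics are
 * built with quad swizzles, then each component is evaluated as
 * p + ddx * pos1 + ddy * pos2.
 */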
8010 void
8011 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
8012 {
8013 Builder bld(ctx->program, ctx->block);
8014 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
8015 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
8016
8017 Temp ddx_1, ddx_2, ddy_1, ddy_2;
8018 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
8019 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
8020 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
8021
8022 /* Build DD X/Y */
8023 if (ctx->program->gfx_level >= GFX8) {
8024 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
8025 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
8026 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
8027 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
8028 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
8029 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
8030 } else {
8031 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
8032 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
8033 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
8034 ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
8035 ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1);
8036
8037 Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
8038 ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
8039 ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2);
8040 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
8041 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
8042 }
8043
8044 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
8045 aco_opcode mad =
8046 ctx->program->gfx_level >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
8047 Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
8048 Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
8049 tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
8050 tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
8051 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp1, tmp2);
8052 set_wqm(ctx, true);
8053 return;
8054 }
8055
8056 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
8057 Temp lanecount_to_mask(isel_context* ctx, Temp count);
8058 void pops_await_overlapped_waves(isel_context* ctx);
8059
8060 Temp
8061 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, enum glsl_interp_mode interp)
8062 {
8063 bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
8064 if (intrin == nir_intrinsic_load_barycentric_pixel ||
8065 intrin == nir_intrinsic_load_barycentric_at_offset) {
8066 return get_arg(ctx, linear ? ctx->args->linear_center : ctx->args->persp_center);
8067 } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
8068 return get_arg(ctx, linear ? ctx->args->linear_centroid : ctx->args->persp_centroid);
8069 } else {
8070 assert(intrin == nir_intrinsic_load_barycentric_sample);
8071 return get_arg(ctx, linear ? ctx->args->linear_sample : ctx->args->persp_sample);
8072 }
8073 }
8074
8075 void
8076 ds_ordered_count_offsets(isel_context* ctx, unsigned index_operand, unsigned wave_release,
8077 unsigned wave_done, unsigned* offset0, unsigned* offset1)
8078 {
8079 unsigned ordered_count_index = index_operand & 0x3f;
8080 unsigned count_dword = (index_operand >> 24) & 0xf;
8081
8082 assert(ctx->options->gfx_level >= GFX10);
8083 assert(count_dword >= 1 && count_dword <= 4);
8084
8085 *offset0 = ordered_count_index << 2;
8086 *offset1 = wave_release | (wave_done << 1) | ((count_dword - 1) << 6);
8087
8088 if (ctx->options->gfx_level < GFX11)
8089 *offset1 |= 3 /* GS shader type */ << 2;
8090 }
8091
8092 struct aco_export_mrt {
8093 Operand out[4];
8094 unsigned enabled_channels;
8095 unsigned target;
8096 bool compr;
8097 };
8098
8099 static void
8100 create_fs_dual_src_export_gfx11(isel_context* ctx, const struct aco_export_mrt* mrt0,
8101 const struct aco_export_mrt* mrt1)
8102 {
8103 Builder bld(ctx->program, ctx->block);
8104
8105 aco_ptr<Pseudo_instruction> exp{create_instruction<Pseudo_instruction>(
8106 aco_opcode::p_dual_src_export_gfx11, Format::PSEUDO, 8, 6)};
8107 for (unsigned i = 0; i < 4; i++) {
8108 exp->operands[i] = mrt0 ? mrt0->out[i] : Operand(v1);
8109 exp->operands[i].setLateKill(true);
8110 exp->operands[i + 4] = mrt1 ? mrt1->out[i] : Operand(v1);
8111 exp->operands[i + 4].setLateKill(true);
8112 }
8113
8114 RegClass type = RegClass(RegType::vgpr, util_bitcount(mrt0->enabled_channels));
8115 exp->definitions[0] = bld.def(type); /* mrt0 */
8116 exp->definitions[1] = bld.def(type); /* mrt1 */
8117 exp->definitions[2] = bld.def(bld.lm);
8118 exp->definitions[3] = bld.def(bld.lm);
8119 exp->definitions[4] = bld.def(bld.lm, vcc);
8120 exp->definitions[5] = bld.def(s1, scc);
8121 ctx->block->instructions.emplace_back(std::move(exp));
8122
8123 ctx->program->has_color_exports = true;
8124 }
8125
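/* Cooperative-matrix multiply-add: selects the WMMA opcode from the source/result
 * bit sizes; for 8-bit integer inputs the signedness of A/B is encoded in neg_lo.
 */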
8126 static void
8127 visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
8128 {
8129 aco_opcode opcode = aco_opcode::num_opcodes;
8130 unsigned signed_mask = 0;
8131 bool clamp = false;
8132
8133 switch (instr->src[0].ssa->bit_size) {
8134 case 16:
8135 switch (instr->def.bit_size) {
8136 case 32: opcode = aco_opcode::v_wmma_f32_16x16x16_f16; break;
8137 case 16: opcode = aco_opcode::v_wmma_f16_16x16x16_f16; break;
8138 }
8139 break;
8140 case 8:
8141 opcode = aco_opcode::v_wmma_i32_16x16x16_iu8;
8142 signed_mask = nir_intrinsic_cmat_signed_mask(instr);
8143 clamp = nir_intrinsic_saturate(instr);
8144 break;
8145 }
8146
8147 if (opcode == aco_opcode::num_opcodes)
8148 unreachable("visit_cmat_muladd: invalid bit size combination");
8149
8150 Builder bld(ctx->program, ctx->block);
8151
8152 Temp dst = get_ssa_temp(ctx, &instr->def);
8153 Operand A(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
8154 Operand B(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)));
8155 Operand C(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
8156
8157 A.setLateKill(true);
8158 B.setLateKill(true);
8159
8160 VALU_instruction& vop3p = bld.vop3p(opcode, Definition(dst), A, B, C, 0, 0)->valu();
8161 vop3p.neg_lo[0] = (signed_mask & 0x1) != 0;
8162 vop3p.neg_lo[1] = (signed_mask & 0x2) != 0;
8163 vop3p.clamp = clamp;
8164
8165 emit_split_vector(ctx, dst, instr->def.num_components);
8166 }
8167
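/* Central dispatch for NIR intrinsics: each case forwards to the matching helper. */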
8168 void
8169 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
8170 {
8171 Builder bld(ctx->program, ctx->block);
8172 switch (instr->intrinsic) {
8173 case nir_intrinsic_load_barycentric_sample:
8174 case nir_intrinsic_load_barycentric_pixel:
8175 case nir_intrinsic_load_barycentric_centroid: {
8176 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
8177 Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
8178 assert(bary.size() == 2);
8179 Temp dst = get_ssa_temp(ctx, &instr->def);
8180 bld.copy(Definition(dst), bary);
8181 emit_split_vector(ctx, dst, 2);
8182 break;
8183 }
8184 case nir_intrinsic_load_barycentric_model: {
8185 Temp model = get_arg(ctx, ctx->args->pull_model);
8186 assert(model.size() == 3);
8187 Temp dst = get_ssa_temp(ctx, &instr->def);
8188 bld.copy(Definition(dst), model);
8189 emit_split_vector(ctx, dst, 3);
8190 break;
8191 }
8192 case nir_intrinsic_load_barycentric_at_offset: {
8193 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8194 RegClass rc = RegClass(offset.type(), 1);
8195 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8196 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8197 Temp bary = get_interp_param(ctx, instr->intrinsic,
8198 (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8199 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->def), bary, pos1, pos2);
8200 break;
8201 }
8202 case nir_intrinsic_load_front_face: {
8203 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8204 Operand::zero(), get_arg(ctx, ctx->args->front_face));
8205 break;
8206 }
8207 case nir_intrinsic_load_view_index: {
8208 Temp dst = get_ssa_temp(ctx, &instr->def);
8209 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->view_index)));
8210 break;
8211 }
8212 case nir_intrinsic_load_frag_coord: {
8213 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->def), 4);
8214 break;
8215 }
8216 case nir_intrinsic_load_frag_shading_rate:
8217 emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->def));
8218 break;
8219 case nir_intrinsic_load_sample_pos: {
8220 Temp posx = get_arg(ctx, ctx->args->frag_pos[0]);
8221 Temp posy = get_arg(ctx, ctx->args->frag_pos[1]);
8222 bld.pseudo(
8223 aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->def)),
8224 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8225 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8226 break;
8227 }
8228 case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8229 case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8230 case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8231 case nir_intrinsic_load_input:
8232 case nir_intrinsic_load_input_vertex:
8233 if (ctx->program->stage == fragment_fs)
8234 visit_load_fs_input(ctx, instr);
8235 else
8236 isel_err(&instr->instr, "Shader inputs should have been lowered in NIR.");
8237 break;
8238 case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8239 case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8240 case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8241 case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8242 case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8243 case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8244 case nir_intrinsic_shared_atomic:
8245 case nir_intrinsic_shared_atomic_swap: visit_shared_atomic(ctx, instr); break;
8246 case nir_intrinsic_load_shared2_amd:
8247 case nir_intrinsic_store_shared2_amd: visit_access_shared2_amd(ctx, instr); break;
8248 case nir_intrinsic_bindless_image_load:
8249 case nir_intrinsic_bindless_image_fragment_mask_load_amd:
8250 case nir_intrinsic_bindless_image_sparse_load: visit_image_load(ctx, instr); break;
8251 case nir_intrinsic_bindless_image_store: visit_image_store(ctx, instr); break;
8252 case nir_intrinsic_bindless_image_atomic:
8253 case nir_intrinsic_bindless_image_atomic_swap: visit_image_atomic(ctx, instr); break;
8254 case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8255 case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8256 case nir_intrinsic_load_typed_buffer_amd:
8257 case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8258 case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8259 case nir_intrinsic_load_smem_amd: visit_load_smem(ctx, instr); break;
8260 case nir_intrinsic_load_global_amd: visit_load_global(ctx, instr); break;
8261 case nir_intrinsic_store_global_amd: visit_store_global(ctx, instr); break;
8262 case nir_intrinsic_global_atomic_amd:
8263 case nir_intrinsic_global_atomic_swap_amd: visit_global_atomic(ctx, instr); break;
8264 case nir_intrinsic_ssbo_atomic:
8265 case nir_intrinsic_ssbo_atomic_swap: visit_atomic_ssbo(ctx, instr); break;
8266 case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8267 case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8268 case nir_intrinsic_barrier: emit_barrier(ctx, instr); break;
8269 case nir_intrinsic_load_num_workgroups: {
8270 Temp dst = get_ssa_temp(ctx, &instr->def);
8271 if (ctx->options->load_grid_size_from_user_sgpr) {
8272 bld.copy(Definition(dst), get_arg(ctx, ctx->args->num_work_groups));
8273 } else {
8274 Temp addr = get_arg(ctx, ctx->args->num_work_groups);
8275 assert(addr.regClass() == s2);
8276 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8277 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand::zero()),
8278 bld.smem(aco_opcode::s_load_dword, bld.def(s1), addr, Operand::c32(8)));
8279 }
8280 emit_split_vector(ctx, dst, 3);
8281 break;
8282 }
8283 case nir_intrinsic_load_ray_launch_size: {
8284 Temp dst = get_ssa_temp(ctx, &instr->def);
8285 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_size)));
8286 emit_split_vector(ctx, dst, 3);
8287 break;
8288 }
8289 case nir_intrinsic_load_ray_launch_id: {
8290 Temp dst = get_ssa_temp(ctx, &instr->def);
8291 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->rt.launch_id)));
8292 emit_split_vector(ctx, dst, 3);
8293 break;
8294 }
8295 case nir_intrinsic_load_local_invocation_id: {
8296 Temp dst = get_ssa_temp(ctx, &instr->def);
8297 if (ctx->options->gfx_level >= GFX11) {
8298 Temp local_ids[3];
8299
8300 /* Thread IDs are packed in VGPR0, 10 bits per component. */
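/* Bit layout implied by the extraction below: X = bits [9:0], Y = bits [19:10],
 * Z = bits [29:20]. If all higher components are statically 1 (i.e. their IDs are
 * zero), the bits above the component are known to be zero, so the raw argument
 * (for X) or a plain right shift is enough instead of a v_bfe. */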
8301 for (uint32_t i = 0; i < 3; i++) {
8302 if (i == 0 && ctx->shader->info.workgroup_size[1] == 1 &&
8303 ctx->shader->info.workgroup_size[2] == 1 &&
8304 !ctx->shader->info.workgroup_size_variable) {
8305 local_ids[i] = get_arg(ctx, ctx->args->local_invocation_ids);
8306 } else if (i == 2 || (i == 1 && ctx->shader->info.workgroup_size[2] == 1 &&
8307 !ctx->shader->info.workgroup_size_variable)) {
8308 local_ids[i] =
8309 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand::c32(i * 10u),
8310 get_arg(ctx, ctx->args->local_invocation_ids));
8311 } else {
8312 local_ids[i] = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
8313 get_arg(ctx, ctx->args->local_invocation_ids),
8314 Operand::c32(i * 10u), Operand::c32(10u));
8315 }
8316 }
8317
8318 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), local_ids[0], local_ids[1],
8319 local_ids[2]);
8320 } else {
8321 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->local_invocation_ids)));
8322 }
8323 emit_split_vector(ctx, dst, 3);
8324 break;
8325 }
8326 case nir_intrinsic_load_workgroup_id: {
8327 Temp dst = get_ssa_temp(ctx, &instr->def);
8328 if (ctx->stage.hw == AC_HW_COMPUTE_SHADER) {
8329 const struct ac_arg* ids = ctx->args->workgroup_ids;
8330 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8331 ids[0].used ? Operand(get_arg(ctx, ids[0])) : Operand::zero(),
8332 ids[1].used ? Operand(get_arg(ctx, ids[1])) : Operand::zero(),
8333 ids[2].used ? Operand(get_arg(ctx, ids[2])) : Operand::zero());
8334 emit_split_vector(ctx, dst, 3);
8335 } else {
8336 isel_err(&instr->instr, "Unsupported stage for load_workgroup_id");
8337 }
8338 break;
8339 }
8340 case nir_intrinsic_load_local_invocation_index: {
8341 if (ctx->stage.hw == AC_HW_LOCAL_SHADER || ctx->stage.hw == AC_HW_HULL_SHADER) {
8342 if (ctx->options->gfx_level >= GFX11) {
8343 /* On GFX11, RelAutoIndex is WaveID * WaveSize + ThreadID. */
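/* The s_bfe immediate encodes the offset in bits [4:0] and the width in bits
 * [22:16], so 0u | (3u << 16) extracts the low 3 bits holding the wave id. */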
8344 Temp wave_id =
8345 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8346 get_arg(ctx, ctx->args->tcs_wave_id), Operand::c32(0u | (3u << 16)));
8347
8348 Temp temp = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), wave_id,
8349 Operand::c32(ctx->program->wave_size));
8350 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def), Operand(), Operand(temp));
8351 } else {
8352 bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
8353 get_arg(ctx, ctx->args->vs_rel_patch_id));
8354 }
8355 break;
8356 } else if (ctx->stage.hw == AC_HW_LEGACY_GEOMETRY_SHADER ||
8357 ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER) {
8358 bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), thread_id_in_threadgroup(ctx));
8359 break;
8360 } else if (ctx->program->workgroup_size <= ctx->program->wave_size) {
8361 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8362 break;
8363 }
8364
8365 Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8366
8367 /* The tg_size bits [6:11] contain the subgroup id,
8368 * we need this multiplied by the wave size, and then OR the thread id to it.
8369 */
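/* I.e. local_invocation_index = subgroup_id * wave_size + thread_id.
 * Example: subgroup 3 of a wave64 shader -> bits [6:11] of tg_size hold 3,
 * masking with 0xfc0 yields 3 * 64 = 192 in place, and OR-ing in the
 * thread id (0..63) gives indices 192..255. */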
8370 if (ctx->program->wave_size == 64) {
8371 /* After the s_and, the bits are already multiplied by 64 (left shifted by 6), so we
8372 * can just feed that to v_or. */
8373 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8374 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->tg_size));
8375 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num, id);
8376 } else {
8377 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8378 Temp tg_num =
8379 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8380 get_arg(ctx, ctx->args->tg_size), Operand::c32(0x6u | (0x6u << 16)));
8381 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->def)), tg_num,
8382 Operand::c32(0x5u), id);
8383 }
8384 break;
8385 }
8386 case nir_intrinsic_load_subgroup_invocation: {
8387 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->def));
8388 break;
8389 }
8390 case nir_intrinsic_ballot_relaxed:
8391 case nir_intrinsic_ballot: {
8392 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8393 Temp dst = get_ssa_temp(ctx, &instr->def);
8394
8395 if (instr->src[0].ssa->bit_size == 1) {
8396 assert(src.regClass() == bld.lm);
8397 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8398 src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8399 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8400 src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8401 } else {
8402 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8403 }
8404
8405 /* Make sure that all inactive lanes return zero.
8406 * Value-numbering might remove the comparison above */
8407 Definition def = dst.size() == bld.lm.size() ? Definition(dst) : bld.def(bld.lm);
8408 if (instr->intrinsic == nir_intrinsic_ballot_relaxed)
8409 src = bld.copy(def, src);
8410 else
8411 src = bld.sop2(Builder::s_and, def, bld.def(s1, scc), src, Operand(exec, bld.lm));
8412 if (dst.size() != bld.lm.size()) {
8413 /* Wave32 with ballot size set to 64 */
8414 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8415 }
8416
8417 set_wqm(ctx);
8418 break;
8419 }
8420 case nir_intrinsic_inverse_ballot: {
8421 Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
8422 Temp dst = get_ssa_temp(ctx, &instr->def);
8423
8424 assert(dst.size() == bld.lm.size());
8425 if (src.size() > dst.size()) {
8426 emit_extract_vector(ctx, src, 0, dst);
8427 } else if (src.size() < dst.size()) {
8428 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand::zero());
8429 } else {
8430 bld.copy(Definition(dst), src);
8431 }
8432 break;
8433 }
8434 case nir_intrinsic_shuffle:
8435 case nir_intrinsic_read_invocation: {
8436 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8437 assert(instr->def.bit_size != 1);
8438 if (!nir_src_is_divergent(instr->src[0])) {
8439 emit_uniform_subgroup(ctx, instr, src);
8440 } else {
8441 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8442 if (instr->intrinsic == nir_intrinsic_read_invocation ||
8443 !nir_src_is_divergent(instr->src[1]))
8444 tid = bld.as_uniform(tid);
8445 Temp dst = get_ssa_temp(ctx, &instr->def);
8446
8447 src = as_vgpr(ctx, src);
8448
8449 if (src.regClass() == v1b || src.regClass() == v2b) {
8450 Temp tmp = bld.tmp(v1);
8451 tmp = emit_bpermute(ctx, bld, tid, src);
8452 if (dst.type() == RegType::vgpr)
8453 bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8454 bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8455 else
8456 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8457 } else if (src.regClass() == v1) {
8458 Temp tmp = emit_bpermute(ctx, bld, tid, src);
8459 bld.copy(Definition(dst), tmp);
8460 } else if (src.regClass() == v2) {
8461 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8462 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8463 lo = emit_bpermute(ctx, bld, tid, lo);
8464 hi = emit_bpermute(ctx, bld, tid, hi);
8465 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8466 emit_split_vector(ctx, dst, 2);
8467 } else {
8468 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8469 }
8470 set_wqm(ctx);
8471 }
8472 break;
8473 }
8474 case nir_intrinsic_rotate: {
8475 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8476 Temp delta = get_ssa_temp(ctx, instr->src[1].ssa);
8477 Temp dst = get_ssa_temp(ctx, &instr->def);
8478 assert(nir_intrinsic_execution_scope(instr) == SCOPE_SUBGROUP);
8479 assert(instr->def.bit_size > 1 && instr->def.bit_size <= 32);
8480
8481 if (!nir_src_is_divergent(instr->src[0])) {
8482 emit_uniform_subgroup(ctx, instr, src);
8483 break;
8484 }
8485
8486 unsigned cluster_size = nir_intrinsic_cluster_size(instr);
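/* A cluster size of 0 means the whole subgroup; otherwise clamp it to the wave
 * size and round up to a power of two (e.g. 6 -> 8, and 0 -> wave_size). */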
8487 cluster_size = util_next_power_of_two(
8488 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8489
8490 if (cluster_size == 1) {
8491 bld.copy(Definition(dst), src);
8492 break;
8493 }
8494
8495 delta = bld.as_uniform(delta);
8496 src = as_vgpr(ctx, src);
8497
8498 Temp tmp;
8499 if (nir_src_is_const(instr->src[1]) &&
8500 emit_rotate_by_constant(ctx, tmp, src, cluster_size, nir_src_as_uint(instr->src[1]))) {
8501 } else if (cluster_size == 2) {
8502 Temp noswap =
8503 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), delta, Operand::c32(0));
8504 noswap = bool_to_vector_condition(ctx, noswap);
8505 Temp swapped = emit_masked_swizzle(ctx, bld, src, ds_pattern_bitmode(0x1f, 0, 0x1), true);
8506 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(src.regClass()), swapped, src, noswap);
8507 } else if (ctx->program->gfx_level >= GFX10 && cluster_size <= 16) {
8508 if (cluster_size == 4) /* The hw masking of the shift amount already does the modulo for 8/16. */
8509 delta = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), delta,
8510 Operand::c32(0x3));
8511 delta =
8512 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), delta, Operand::c32(2));
8513
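/* v_permlane16_b32 selects source lanes from a 64-bit table split across its two
 * SGPR operands, 4 bits per lane (0x76543210 is the identity for lanes 0-7). The
 * rotation is performed by rotating this selector pattern, which is why delta was
 * shifted left by 2 (multiplied by 4 bits per entry) above. */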
8514 Temp lo = bld.copy(bld.def(s1), Operand::c32(cluster_size == 4 ? 0x32103210 : 0x76543210));
8515 Temp hi;
8516
8517 if (cluster_size <= 8) {
8518 Temp shr = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8519 if (cluster_size == 4) {
8520 Temp lotolohi = bld.copy(bld.def(s1), Operand::c32(0x4444));
8521 Temp lohi =
8522 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, lotolohi);
8523 lo = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), shr, lohi);
8524 } else {
8525 delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
8526 Operand::c32(32), delta);
8527 Temp shl =
8528 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), lo, delta);
8529 lo = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), shr, shl);
8530 }
8531 Temp lotohi = bld.copy(bld.def(s1), Operand::c32(0x88888888));
8532 hi = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), lo, lotohi);
8533 } else {
8534 hi = bld.copy(bld.def(s1), Operand::c32(0xfedcba98));
8535
8536 Temp lohi = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
8537
8538 Temp shr = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8539 delta = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand::c32(64),
8540 delta);
8541 Temp shl = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), lohi, delta);
8542
8543 lohi = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), shr, shl);
8544 lo = bld.tmp(s1);
8545 hi = bld.tmp(s1);
8546 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), lohi);
8547 }
8548
8549 Builder::Result ret =
8550 bld.vop3(aco_opcode::v_permlane16_b32, bld.def(src.regClass()), src, lo, hi);
8551 ret->valu().opsel[0] = true; /* set FETCH_INACTIVE */
8552 ret->valu().opsel[1] = true; /* set BOUND_CTRL */
8553 tmp = ret;
8554 } else {
8555 /* Fallback to ds_bpermute if we can't find a special instruction. */
8556 Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
8557 Temp src_lane = bld.vadd32(bld.def(v1), tid, delta);
8558
8559 if (ctx->program->gfx_level >= GFX10 && cluster_size == 32) {
8560 /* ds_bpermute is restricted to 32 lanes on GFX10+. */
8561 Temp index_x4 =
8562 bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), src_lane);
8563 tmp = bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, src);
8564 } else {
8565 /* Technically, a full-wave rotate doesn't need this masking, but an out-of-range lane index breaks the bpermute pseudo ops. */
8566 src_lane = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), Operand::c32(cluster_size - 1),
8567 src_lane, tid);
8568 tmp = emit_bpermute(ctx, bld, src_lane, src);
8569 }
8570 }
8571
8572 tmp = emit_extract_vector(ctx, tmp, 0, dst.regClass());
8573 bld.copy(Definition(dst), tmp);
8574 set_wqm(ctx);
8575 break;
8576 }
8577 case nir_intrinsic_load_sample_id: {
8578 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->def)),
8579 get_arg(ctx, ctx->args->ancillary), Operand::c32(8u), Operand::c32(4u));
8580 break;
8581 }
8582 case nir_intrinsic_read_first_invocation: {
8583 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8584 Temp dst = get_ssa_temp(ctx, &instr->def);
8585 if (instr->def.bit_size == 1) {
8586 assert(src.regClass() == bld.lm);
8587 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8588 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8589 bool_to_vector_condition(ctx, tmp, dst);
8590 } else {
8591 emit_readfirstlane(ctx, src, dst);
8592 }
8593 set_wqm(ctx);
8594 break;
8595 }
8596 case nir_intrinsic_as_uniform: {
8597 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8598 Temp dst = get_ssa_temp(ctx, &instr->def);
8599 if (src.type() == RegType::vgpr)
8600 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
8601 else
8602 bld.copy(Definition(dst), src);
8603 break;
8604 }
8605 case nir_intrinsic_vote_all: {
8606 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8607 Temp dst = get_ssa_temp(ctx, &instr->def);
8608 assert(src.regClass() == bld.lm);
8609 assert(dst.regClass() == bld.lm);
8610
8611 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8612 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, Operand(exec, bld.lm))
8613 .def(1)
8614 .getTemp();
8615 Temp cond = bool_to_vector_condition(ctx, tmp);
8616 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8617 set_wqm(ctx);
8618 break;
8619 }
8620 case nir_intrinsic_vote_any: {
8621 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8622 Temp dst = get_ssa_temp(ctx, &instr->def);
8623 assert(src.regClass() == bld.lm);
8624 assert(dst.regClass() == bld.lm);
8625
8626 Temp tmp = bool_to_scalar_condition(ctx, src);
8627 bool_to_vector_condition(ctx, tmp, dst);
8628 set_wqm(ctx);
8629 break;
8630 }
8631 case nir_intrinsic_quad_vote_any: {
8632 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8633 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8634 bld.sop1(Builder::s_wqm, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8635 set_wqm(ctx);
8636 break;
8637 }
8638 case nir_intrinsic_quad_vote_all: {
8639 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8640 src = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
8641 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8642 src = bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), src);
8643 bld.sop1(Builder::s_not, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc), src);
8644 set_wqm(ctx);
8645 break;
8646 }
8647 case nir_intrinsic_reduce:
8648 case nir_intrinsic_inclusive_scan:
8649 case nir_intrinsic_exclusive_scan: {
8650 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8651 Temp dst = get_ssa_temp(ctx, &instr->def);
8652 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8653 unsigned cluster_size =
8654 instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
8655 cluster_size = util_next_power_of_two(
8656 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8657 const unsigned bit_size = instr->src[0].ssa->bit_size;
8658 assert(bit_size != 1);
8659
8660 if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size) {
8661 /* We use divergence analysis to assign the regclass, so check if it's
8662 * working as expected */
8663 ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8664 if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8665 expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8666 assert(instr->def.divergent == expected_divergent);
8667
8668 if (instr->intrinsic == nir_intrinsic_reduce) {
8669 if (emit_uniform_reduce(ctx, instr))
8670 break;
8671 } else if (emit_uniform_scan(ctx, instr)) {
8672 break;
8673 }
8674 }
8675
8676 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8677 ReduceOp reduce_op = get_reduce_op(op, bit_size);
8678
8679 aco_opcode aco_op;
8680 switch (instr->intrinsic) {
8681 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8682 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8683 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8684 default: unreachable("unknown reduce intrinsic");
8685 }
8686
8687 /* Avoid whole wave shift. */
8688 const bool use_inclusive_for_exclusive = aco_op == aco_opcode::p_exclusive_scan &&
8689 (op == nir_op_iadd || op == nir_op_ixor) &&
8690 dst.type() == RegType::vgpr;
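/* An exclusive iadd/ixor scan can be computed as an inclusive scan whose own input
 * is then subtracted/xor-ed back out per lane, which avoids shifting the scan
 * result across the whole wave. */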
8691 if (use_inclusive_for_exclusive)
8692 inclusive_scan_to_exclusive(ctx, reduce_op, Definition(dst), src);
8693 else
8694 emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, Definition(dst), src);
8695
8696 set_wqm(ctx);
8697 break;
8698 }
8699 case nir_intrinsic_quad_broadcast:
8700 case nir_intrinsic_quad_swap_horizontal:
8701 case nir_intrinsic_quad_swap_vertical:
8702 case nir_intrinsic_quad_swap_diagonal:
8703 case nir_intrinsic_quad_swizzle_amd: {
8704 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8705
8706 if (!instr->def.divergent) {
8707 emit_uniform_subgroup(ctx, instr, src);
8708 break;
8709 }
8710
8711 /* Quad broadcast lane. */
8712 unsigned lane = 0;
8713 /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8714 bool bool_use_valu = instr->def.bit_size == 1;
8715
8716 uint16_t dpp_ctrl = 0;
8717
8718 bool allow_fi = true;
8719 switch (instr->intrinsic) {
8720 case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8721 case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8722 case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8723 case nir_intrinsic_quad_swizzle_amd:
8724 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
8725 allow_fi &= nir_intrinsic_fetch_inactive(instr);
8726 break;
8727 case nir_intrinsic_quad_broadcast:
8728 lane = nir_src_as_const_value(instr->src[1])->u32;
8729 dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8730 bool_use_valu = false;
8731 break;
8732 default: break;
8733 }
8734
8735 Temp dst = get_ssa_temp(ctx, &instr->def);
8736
8737 /* Setup source. */
8738 if (bool_use_valu)
8739 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8740 Operand::c32(-1), src);
8741 else if (instr->def.bit_size != 1)
8742 src = as_vgpr(ctx, src);
8743
8744 if (instr->def.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8745 /* Special case for quad broadcast using SALU only. */
8746 assert(src.regClass() == bld.lm && dst.regClass() == bld.lm);
8747
8748 uint32_t half_mask = 0x11111111u << lane;
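/* 0x11111111 has one bit set per quad; shifting it by the broadcast lane selects
 * that lane's bit in every quad. After AND-ing with the source and exec, s_wqm
 * replicates the selected bit to all four lanes of each quad. */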
8749 Operand mask_tmp = bld.lm.bytes() == 4
8750 ? Operand::c32(half_mask)
8751 : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8752 Operand::c32(half_mask), Operand::c32(half_mask));
8753
8754 src =
8755 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8756 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8757 bld.sop1(Builder::s_wqm, Definition(dst), bld.def(s1, scc), src);
8758 } else if (instr->def.bit_size <= 32 || bool_use_valu) {
8759 unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->def.bit_size / 8;
8760 Definition def = (excess_bytes || bool_use_valu) ? bld.def(v1) : Definition(dst);
8761
8762 if (ctx->program->gfx_level >= GFX8)
8763 bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl, 0xf, 0xf, true, allow_fi);
8764 else
8765 bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8766
8767 if (excess_bytes)
8768 bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8769 bld.def(RegClass::get(dst.type(), excess_bytes)), def.getTemp());
8770 if (bool_use_valu)
8771 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), def.getTemp());
8772 } else if (instr->def.bit_size == 64) {
8773 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8774 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8775
8776 if (ctx->program->gfx_level >= GFX8) {
8777 lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl, 0xf, 0xf, true,
8778 allow_fi);
8779 hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl, 0xf, 0xf, true,
8780 allow_fi);
8781 } else {
8782 lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8783 hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8784 }
8785
8786 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8787 emit_split_vector(ctx, dst, 2);
8788 } else {
8789 isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8790 }
8791
8792 set_wqm(ctx);
8793 break;
8794 }
8795 case nir_intrinsic_masked_swizzle_amd: {
8796 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8797 if (!instr->def.divergent) {
8798 emit_uniform_subgroup(ctx, instr, src);
8799 break;
8800 }
8801 Temp dst = get_ssa_temp(ctx, &instr->def);
8802 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8803 bool allow_fi = nir_intrinsic_fetch_inactive(instr);
8804
8805 if (instr->def.bit_size != 1)
8806 src = as_vgpr(ctx, src);
8807
8808 if (instr->def.bit_size == 1) {
8809 assert(src.regClass() == bld.lm);
8810 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8811 Operand::c32(-1), src);
8812 src = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8813 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(dst), Operand::zero(), src);
8814 } else if (dst.regClass() == v1b) {
8815 Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8816 emit_extract_vector(ctx, tmp, 0, dst);
8817 } else if (dst.regClass() == v2b) {
8818 Temp tmp = emit_masked_swizzle(ctx, bld, src, mask, allow_fi);
8819 emit_extract_vector(ctx, tmp, 0, dst);
8820 } else if (dst.regClass() == v1) {
8821 bld.copy(Definition(dst), emit_masked_swizzle(ctx, bld, src, mask, allow_fi));
8822 } else if (dst.regClass() == v2) {
8823 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8824 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8825 lo = emit_masked_swizzle(ctx, bld, lo, mask, allow_fi);
8826 hi = emit_masked_swizzle(ctx, bld, hi, mask, allow_fi);
8827 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8828 emit_split_vector(ctx, dst, 2);
8829 } else {
8830 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8831 }
8832 set_wqm(ctx);
8833 break;
8834 }
8835 case nir_intrinsic_write_invocation_amd: {
8836 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8837 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8838 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8839 Temp dst = get_ssa_temp(ctx, &instr->def);
8840 if (dst.regClass() == v1) {
8841 /* src2 is ignored for writelane. RA assigns the same reg for dst */
8842 bld.writelane(Definition(dst), val, lane, src);
8843 } else if (dst.regClass() == v2) {
8844 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8845 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8846 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8847 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8848 Temp lo = bld.writelane(bld.def(v1), val_lo, lane, src_lo);
8849 Temp hi = bld.writelane(bld.def(v1), val_hi, lane, src_hi);
8850 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8851 emit_split_vector(ctx, dst, 2);
8852 } else {
8853 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8854 }
8855 break;
8856 }
8857 case nir_intrinsic_mbcnt_amd: {
8858 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8859 Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8860 Temp dst = get_ssa_temp(ctx, &instr->def);
8861 /* Fit a 64-bit mask to wave32 by keeping only the low lane-mask-sized part. */
8862 src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
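/* mbcnt counts the set mask bits below the current lane and adds add_src,
 * i.e. v_mbcnt_lo_u32_b32 (plus v_mbcnt_hi_u32_b32 for wave64). */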
8863 emit_mbcnt(ctx, dst, Operand(src), Operand(add_src));
8864 set_wqm(ctx);
8865 break;
8866 }
8867 case nir_intrinsic_lane_permute_16_amd: {
8868 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8869 Temp dst = get_ssa_temp(ctx, &instr->def);
8870 assert(ctx->program->gfx_level >= GFX10);
8871
8872 if (src.regClass() == s1) {
8873 bld.copy(Definition(dst), src);
8874 } else if (dst.regClass() == v1 && src.regClass() == v1) {
8875 bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8876 bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8877 bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8878 } else {
8879 isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8880 }
8881 break;
8882 }
8883 case nir_intrinsic_load_helper_invocation:
8884 case nir_intrinsic_is_helper_invocation: {
8885 /* load_helper() after demote() gets lowered to is_helper().
8886 * Otherwise, these two behave the same. */
8887 Temp dst = get_ssa_temp(ctx, &instr->def);
8888 bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8889 ctx->program->needs_exact = true;
8890 break;
8891 }
8892 case nir_intrinsic_demote:
8893 case nir_intrinsic_demote_if: {
8894 Operand cond = Operand::c32(-1u);
8895 if (instr->intrinsic == nir_intrinsic_demote_if) {
8896 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8897 assert(src.regClass() == bld.lm);
8898 cond =
8899 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8900 }
8901
8902 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8903
8904 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8905 ctx->cf_info.exec_potentially_empty_discard = true;
8906
8907 ctx->block->kind |= block_kind_uses_discard;
8908 ctx->program->needs_exact = true;
8909
8910 /* Enable WQM in order to prevent helper lanes from getting terminated. */
8911 if (ctx->shader->info.maximally_reconverges)
8912 ctx->program->needs_wqm = true;
8913
8914 break;
8915 }
8916 case nir_intrinsic_terminate:
8917 case nir_intrinsic_terminate_if:
8918 case nir_intrinsic_discard:
8919 case nir_intrinsic_discard_if: {
8920 Operand cond = Operand::c32(-1u);
8921 if (instr->intrinsic == nir_intrinsic_discard_if ||
8922 instr->intrinsic == nir_intrinsic_terminate_if) {
8923 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8924 assert(src.regClass() == bld.lm);
8925 cond =
8926 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8927
8928 ctx->cf_info.had_divergent_discard |= nir_src_is_divergent(instr->src[0]);
8929 }
8930
8931 bld.pseudo(aco_opcode::p_discard_if, cond);
8932
8933 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8934 ctx->cf_info.exec_potentially_empty_discard = true;
8935 ctx->cf_info.had_divergent_discard |= in_exec_divergent_or_in_loop(ctx);
8936 ctx->block->kind |= block_kind_uses_discard;
8937 ctx->program->needs_exact = true;
8938 break;
8939 }
8940 case nir_intrinsic_first_invocation: {
8941 bld.sop1(Builder::s_ff1_i32, Definition(get_ssa_temp(ctx, &instr->def)),
8942 Operand(exec, bld.lm));
8943 set_wqm(ctx);
8944 break;
8945 }
8946 case nir_intrinsic_last_invocation: {
8947 Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8948 bld.sop2(aco_opcode::s_sub_i32, Definition(get_ssa_temp(ctx, &instr->def)), bld.def(s1, scc),
8949 Operand::c32(ctx->program->wave_size - 1u), flbit);
8950 set_wqm(ctx);
8951 break;
8952 }
8953 case nir_intrinsic_elect: {
8954 /* p_elect is lowered in aco_insert_exec_mask.
8955 * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8956 * two p_elect with different exec masks as the same.
8957 */
8958 bld.pseudo(aco_opcode::p_elect, Definition(get_ssa_temp(ctx, &instr->def)),
8959 Operand(exec, bld.lm));
8960 set_wqm(ctx);
8961 break;
8962 }
8963 case nir_intrinsic_shader_clock: {
8964 Temp dst = get_ssa_temp(ctx, &instr->def);
8965 if (nir_intrinsic_memory_scope(instr) == SCOPE_SUBGROUP &&
8966 ctx->options->gfx_level >= GFX10_3) {
8967 /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
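/* Here: ((20 - 1) << 11) | 29 = 0x981d, i.e. read 20 bits of hwreg 29
 * (SHADER_CYCLES) starting at bit offset 0. */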
8968 Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8969 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8970 } else if (nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE &&
8971 ctx->options->gfx_level >= GFX11) {
8972 bld.sop1(aco_opcode::s_sendmsg_rtn_b64, Definition(dst),
8973 Operand::c32(sendmsg_rtn_get_realtime));
8974 } else {
8975 aco_opcode opcode = nir_intrinsic_memory_scope(instr) == SCOPE_DEVICE
8976 ? aco_opcode::s_memrealtime
8977 : aco_opcode::s_memtime;
8978 bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8979 }
8980 emit_split_vector(ctx, dst, 2);
8981 break;
8982 }
8983 case nir_intrinsic_load_vertex_id_zero_base: {
8984 Temp dst = get_ssa_temp(ctx, &instr->def);
8985 bld.copy(Definition(dst), get_arg(ctx, ctx->args->vertex_id));
8986 break;
8987 }
8988 case nir_intrinsic_load_first_vertex: {
8989 Temp dst = get_ssa_temp(ctx, &instr->def);
8990 bld.copy(Definition(dst), get_arg(ctx, ctx->args->base_vertex));
8991 break;
8992 }
8993 case nir_intrinsic_load_base_instance: {
8994 Temp dst = get_ssa_temp(ctx, &instr->def);
8995 bld.copy(Definition(dst), get_arg(ctx, ctx->args->start_instance));
8996 break;
8997 }
8998 case nir_intrinsic_load_instance_id: {
8999 Temp dst = get_ssa_temp(ctx, &instr->def);
9000 bld.copy(Definition(dst), get_arg(ctx, ctx->args->instance_id));
9001 break;
9002 }
9003 case nir_intrinsic_load_draw_id: {
9004 Temp dst = get_ssa_temp(ctx, &instr->def);
9005 bld.copy(Definition(dst), get_arg(ctx, ctx->args->draw_id));
9006 break;
9007 }
9008 case nir_intrinsic_load_invocation_id: {
9009 Temp dst = get_ssa_temp(ctx, &instr->def);
9010
9011 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
9012 if (ctx->options->gfx_level >= GFX10)
9013 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
9014 get_arg(ctx, ctx->args->gs_invocation_id));
9015 else
9016 bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_invocation_id));
9017 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
9018 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->tcs_rel_ids),
9019 Operand::c32(8u), Operand::c32(5u));
9020 } else {
9021 unreachable("Unsupported stage for load_invocation_id");
9022 }
9023
9024 break;
9025 }
9026 case nir_intrinsic_load_primitive_id: {
9027 Temp dst = get_ssa_temp(ctx, &instr->def);
9028
9029 switch (ctx->shader->info.stage) {
9030 case MESA_SHADER_GEOMETRY:
9031 bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
9032 break;
9033 case MESA_SHADER_TESS_CTRL:
9034 bld.copy(Definition(dst), get_arg(ctx, ctx->args->tcs_patch_id));
9035 break;
9036 case MESA_SHADER_TESS_EVAL:
9037 bld.copy(Definition(dst), get_arg(ctx, ctx->args->tes_patch_id));
9038 break;
9039 default:
9040 if (ctx->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && !ctx->stage.has(SWStage::GS)) {
9041 /* In case of NGG, the GS threads always have the primitive ID
9042 * even if there is no SW GS. */
9043 bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_prim_id));
9044 break;
9045 } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
9046 bld.copy(Definition(dst), get_arg(ctx, ctx->args->vs_prim_id));
9047 break;
9048 }
9049 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
9050 }
9051
9052 break;
9053 }
9054 case nir_intrinsic_sendmsg_amd: {
9055 unsigned imm = nir_intrinsic_base(instr);
9056 Temp m0_content = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9057 bld.sopp(aco_opcode::s_sendmsg, bld.m0(m0_content), -1, imm);
9058 break;
9059 }
9060 case nir_intrinsic_load_gs_wave_id_amd: {
9061 Temp dst = get_ssa_temp(ctx, &instr->def);
9062 if (ctx->args->merged_wave_info.used)
9063 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
9064 get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(2u), Operand::c32(8u),
9065 Operand::zero());
9066 else if (ctx->args->gs_wave_id.used)
9067 bld.copy(Definition(dst), get_arg(ctx, ctx->args->gs_wave_id));
9068 else
9069 unreachable("Shader doesn't have GS wave ID.");
9070 break;
9071 }
9072 case nir_intrinsic_is_subgroup_invocation_lt_amd: {
9073 Temp src = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9074 bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), lanecount_to_mask(ctx, src));
9075 break;
9076 }
9077 case nir_intrinsic_gds_atomic_add_amd: {
9078 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
9079 Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
9080 Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
9081 Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
9082 bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
9083 true);
9084 break;
9085 }
9086 case nir_intrinsic_load_sbt_base_amd: {
9087 Temp dst = get_ssa_temp(ctx, &instr->def);
9088 Temp addr = get_arg(ctx, ctx->args->rt.sbt_descriptors);
9089 assert(addr.regClass() == s2);
9090 bld.copy(Definition(dst), Operand(addr));
9091 break;
9092 }
9093 case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
9094 case nir_intrinsic_load_rt_dynamic_callable_stack_base_amd:
9095 bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
9096 get_arg(ctx, ctx->args->rt.dynamic_callable_stack_base));
9097 break;
9098 case nir_intrinsic_load_resume_shader_address_amd: {
9099 bld.pseudo(aco_opcode::p_resume_shader_address, Definition(get_ssa_temp(ctx, &instr->def)),
9100 bld.def(s1, scc), Operand::c32(nir_intrinsic_call_idx(instr)));
9101 break;
9102 }
9103 case nir_intrinsic_overwrite_vs_arguments_amd: {
9104 ctx->arg_temps[ctx->args->vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9105 ctx->arg_temps[ctx->args->instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9106 break;
9107 }
9108 case nir_intrinsic_overwrite_tes_arguments_amd: {
9109 ctx->arg_temps[ctx->args->tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9110 ctx->arg_temps[ctx->args->tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9111 ctx->arg_temps[ctx->args->tes_rel_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9112 ctx->arg_temps[ctx->args->tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[2].ssa);
9113 break;
9114 }
9115 case nir_intrinsic_load_scalar_arg_amd:
9116 case nir_intrinsic_load_vector_arg_amd: {
9117 assert(nir_intrinsic_base(instr) < ctx->args->arg_count);
9118 Temp dst = get_ssa_temp(ctx, &instr->def);
9119 Temp src = ctx->arg_temps[nir_intrinsic_base(instr)];
9120 assert(src.id());
9121 assert(src.type() == (instr->intrinsic == nir_intrinsic_load_scalar_arg_amd ? RegType::sgpr
9122 : RegType::vgpr));
9123 bld.copy(Definition(dst), src);
9124 emit_split_vector(ctx, dst, dst.size());
9125 break;
9126 }
9127 case nir_intrinsic_ordered_xfb_counter_add_amd: {
9128 Temp dst = get_ssa_temp(ctx, &instr->def);
9129 Temp ordered_id = get_ssa_temp(ctx, instr->src[0].ssa);
9130 Temp counter = get_ssa_temp(ctx, instr->src[1].ssa);
9131
9132 Temp gds_base = bld.copy(bld.def(v1), Operand::c32(0u));
9133 unsigned offset0, offset1;
9134 Instruction* ds_instr;
9135 Operand m;
9136
9137 /* Lock a GDS mutex. */
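/* ds_ordered_count processes waves in the order given by the ordered id in m0, so
 * the GS-register adds between the two ordered_count calls are effectively
 * serialized across waves, which is what makes this act like a mutex. */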
9138 ds_ordered_count_offsets(ctx, 1 << 24u, false, false, &offset0, &offset1);
9139 m = bld.m0(bld.as_uniform(ordered_id));
9140 ds_instr =
9141 bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
9142 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
9143
9144 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9145 aco_opcode::p_create_vector, Format::PSEUDO, instr->num_components, 1)};
9146 unsigned write_mask = nir_intrinsic_write_mask(instr);
9147
9148 for (unsigned i = 0; i < instr->num_components; i++) {
9149 if (write_mask & (1 << i)) {
9150 Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9151
9152 ds_instr = bld.ds(aco_opcode::ds_add_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9153 i * 4, 0u, true);
9154 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9155
9156 vec->operands[i] = Operand(ds_instr->definitions[0].getTemp());
9157 } else {
9158 vec->operands[i] = Operand::zero();
9159 }
9160 }
9161
9162 vec->definitions[0] = Definition(dst);
9163 ctx->block->instructions.emplace_back(std::move(vec));
9164
9165 /* Unlock a GDS mutex. */
9166 ds_ordered_count_offsets(ctx, 1 << 24u, true, true, &offset0, &offset1);
9167 m = bld.m0(bld.as_uniform(ordered_id));
9168 ds_instr =
9169 bld.ds(aco_opcode::ds_ordered_count, bld.def(v1), gds_base, m, offset0, offset1, true);
9170 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
9171
9172 emit_split_vector(ctx, dst, instr->num_components);
9173 break;
9174 }
9175 case nir_intrinsic_xfb_counter_sub_amd: {
9176 unsigned write_mask = nir_intrinsic_write_mask(instr);
9177 Temp counter = get_ssa_temp(ctx, instr->src[0].ssa);
9178
9179 u_foreach_bit (i, write_mask) {
9180 Temp chan_counter = emit_extract_vector(ctx, counter, i, v1);
9181 Instruction* ds_instr;
9182
9183 ds_instr = bld.ds(aco_opcode::ds_sub_gs_reg_rtn, bld.def(v1), Operand(), chan_counter,
9184 i * 4, 0u, true);
9185 ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
9186 }
9187 break;
9188 }
9189 case nir_intrinsic_export_amd:
9190 case nir_intrinsic_export_row_amd: {
9191 unsigned flags = nir_intrinsic_flags(instr);
9192 unsigned target = nir_intrinsic_base(instr);
9193 unsigned write_mask = nir_intrinsic_write_mask(instr);
9194
9195 /* Mark vertex export block. */
9196 if (target == V_008DFC_SQ_EXP_POS || target <= V_008DFC_SQ_EXP_NULL)
9197 ctx->block->kind |= block_kind_export_end;
9198
9199 if (target < V_008DFC_SQ_EXP_MRTZ)
9200 ctx->program->has_color_exports = true;
9201
9202 const bool row_en = instr->intrinsic == nir_intrinsic_export_row_amd;
9203
9204 aco_ptr<Export_instruction> exp{
9205 create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4 + row_en, 0)};
9206
9207 exp->dest = target;
9208 exp->enabled_mask = write_mask;
9209 exp->compressed = flags & AC_EXP_FLAG_COMPRESSED;
9210
9211 /* ACO may reorder position/mrt export instructions and then marks the done bit on
9212 * the last export itself. So don't respect the NIR AC_EXP_FLAG_DONE for position/mrt
9213 * exports here and leave it to ACO.
9214 */
9215 if (target == V_008DFC_SQ_EXP_PRIM)
9216 exp->done = flags & AC_EXP_FLAG_DONE;
9217 else
9218 exp->done = false;
9219
9220 /* ACO may reorder mrt export instructions and then sets the valid mask on the last
9221 * export itself. So don't respect the NIR AC_EXP_FLAG_VALID_MASK for mrt exports
9222 * here and leave it to ACO.
9223 */
9224 if (target > V_008DFC_SQ_EXP_NULL)
9225 exp->valid_mask = flags & AC_EXP_FLAG_VALID_MASK;
9226 else
9227 exp->valid_mask = false;
9228
9229 exp->row_en = row_en;
9230
9231 /* Compressed export uses two bits for a channel. */
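/* Example: a compressed export with write_mask = 0xf yields channel_mask = 0x3,
 * since operand 0 carries the packed x/y pair and operand 1 the packed z/w pair. */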
9232 uint32_t channel_mask =
9233 exp->compressed ? (write_mask & 0x3 ? 1 : 0) | (write_mask & 0xc ? 2 : 0) : write_mask;
9234
9235 Temp value = get_ssa_temp(ctx, instr->src[0].ssa);
9236 for (unsigned i = 0; i < 4; i++) {
9237 exp->operands[i] = channel_mask & BITFIELD_BIT(i)
9238 ? Operand(emit_extract_vector(ctx, value, i, v1))
9239 : Operand(v1);
9240 }
9241
9242 if (row_en) {
9243 Temp row = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
9244 /* Hack to prevent the RA from moving the source into m0 and then back to a normal SGPR. */
9245 row = bld.copy(bld.def(s1, m0), row);
9246 exp->operands[4] = bld.m0(row);
9247 }
9248
9249 ctx->block->instructions.emplace_back(std::move(exp));
9250 break;
9251 }
9252 case nir_intrinsic_export_dual_src_blend_amd: {
9253 Temp val0 = get_ssa_temp(ctx, instr->src[0].ssa);
9254 Temp val1 = get_ssa_temp(ctx, instr->src[1].ssa);
9255 unsigned write_mask = nir_intrinsic_write_mask(instr);
9256
9257 struct aco_export_mrt mrt0, mrt1;
9258 for (unsigned i = 0; i < 4; i++) {
9259 mrt0.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val0, i, v1))
9260 : Operand(v1);
9261
9262 mrt1.out[i] = write_mask & BITFIELD_BIT(i) ? Operand(emit_extract_vector(ctx, val1, i, v1))
9263 : Operand(v1);
9264 }
9265 mrt0.enabled_channels = mrt1.enabled_channels = write_mask;
9266
9267 create_fs_dual_src_export_gfx11(ctx, &mrt0, &mrt1);
9268
9269 ctx->block->kind |= block_kind_export_end;
9270 break;
9271 }
9272 case nir_intrinsic_strict_wqm_coord_amd: {
9273 Temp dst = get_ssa_temp(ctx, &instr->def);
9274 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
9275 Temp tmp = bld.tmp(RegClass::get(RegType::vgpr, dst.bytes()));
9276 unsigned begin_size = nir_intrinsic_base(instr);
9277
9278 unsigned num_src = 1;
9279 auto it = ctx->allocated_vec.find(src.id());
9280 if (it != ctx->allocated_vec.end())
9281 num_src = src.bytes() / it->second[0].bytes();
9282
9283 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9284 aco_opcode::p_create_vector, Format::PSEUDO, num_src + !!begin_size, 1)};
9285
9286 if (begin_size)
9287 vec->operands[0] = Operand(RegClass::get(RegType::vgpr, begin_size));
9288 for (unsigned i = 0; i < num_src; i++) {
9289 Temp comp = it != ctx->allocated_vec.end() ? it->second[i] : src;
9290 vec->operands[i + !!begin_size] = Operand(comp);
9291 }
9292
9293 vec->definitions[0] = Definition(tmp);
9294 ctx->block->instructions.emplace_back(std::move(vec));
9295
9296 bld.pseudo(aco_opcode::p_start_linear_vgpr, Definition(dst), tmp);
9297 break;
9298 }
9299 case nir_intrinsic_load_lds_ngg_scratch_base_amd: {
9300 Temp dst = get_ssa_temp(ctx, &instr->def);
9301 bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9302 Operand::c32(aco_symbol_lds_ngg_scratch_base));
9303 break;
9304 }
9305 case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: {
9306 Temp dst = get_ssa_temp(ctx, &instr->def);
9307 bld.sop1(aco_opcode::p_load_symbol, Definition(dst),
9308 Operand::c32(aco_symbol_lds_ngg_gs_out_vertex_base));
9309 break;
9310 }
9311 case nir_intrinsic_store_scalar_arg_amd: {
9312 ctx->arg_temps[nir_intrinsic_base(instr)] =
9313 bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
9314 break;
9315 }
9316 case nir_intrinsic_store_vector_arg_amd: {
9317 ctx->arg_temps[nir_intrinsic_base(instr)] =
9318 as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
9319 break;
9320 }
9321 case nir_intrinsic_begin_invocation_interlock: {
9322 pops_await_overlapped_waves(ctx);
9323 break;
9324 }
9325 case nir_intrinsic_end_invocation_interlock: {
9326 if (ctx->options->gfx_level < GFX11)
9327 bld.pseudo(aco_opcode::p_pops_gfx9_ordered_section_done);
9328 break;
9329 }
9330 case nir_intrinsic_cmat_muladd_amd: visit_cmat_muladd(ctx, instr); break;
9331 default:
9332 isel_err(&instr->instr, "Unimplemented intrinsic instr");
9333 abort();
9334
9335 break;
9336 }
9337 }
9338
9339 void
9340 get_const_vec(nir_def* vec, nir_const_value* cv[4])
9341 {
9342 if (vec->parent_instr->type != nir_instr_type_alu)
9343 return;
9344 nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9345 if (vec_instr->op != nir_op_vec(vec->num_components))
9346 return;
9347
9348 for (unsigned i = 0; i < vec->num_components; i++) {
9349 cv[i] =
9350 vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9351 }
9352 }
9353
9354 void
9355 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9356 {
9357 assert(instr->op != nir_texop_samples_identical);
9358
9359 Builder bld(ctx->program, ctx->block);
9360 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9361 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9362 has_sample_index = false, has_clamped_lod = false, has_wqm_coord = false;
9363 Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9364 offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(),
9365 coord = Temp(), wqm_coord = Temp();
9366 std::vector<Temp> coords;
9367 std::vector<Temp> derivs;
9368 nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9369
9370 for (unsigned i = 0; i < instr->num_srcs; i++) {
9371 switch (instr->src[i].src_type) {
9372 case nir_tex_src_texture_handle:
9373 resource = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9374 break;
9375 case nir_tex_src_sampler_handle:
9376 sampler = bld.as_uniform(get_ssa_temp(ctx, instr->src[i].src.ssa));
9377 break;
9378 default: break;
9379 }
9380 }
9381
9382 bool tg4_integer_workarounds = ctx->options->gfx_level <= GFX8 && instr->op == nir_texop_tg4 &&
9383 (instr->dest_type & (nir_type_int | nir_type_uint));
9384 bool tg4_integer_cube_workaround =
9385 tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9386
9387 bool a16 = false, g16 = false;
9388
9389 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord);
9390 if (coord_idx > 0)
9391 a16 = instr->src[coord_idx].src.ssa->bit_size == 16;
9392
9393 int ddx_idx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
9394 if (ddx_idx > 0)
9395 g16 = instr->src[ddx_idx].src.ssa->bit_size == 16;
9396
9397 for (unsigned i = 0; i < instr->num_srcs; i++) {
9398 switch (instr->src[i].src_type) {
9399 case nir_tex_src_coord: {
9400 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9401 coord = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9402 break;
9403 }
9404 case nir_tex_src_backend1: {
9405 assert(instr->src[i].src.ssa->bit_size == 32);
9406 wqm_coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9407 has_wqm_coord = true;
9408 break;
9409 }
9410 case nir_tex_src_bias:
9411 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9412 /* Doesn't need get_ssa_temp_tex because we pack it into its own dword anyway. */
9413 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9414 has_bias = true;
9415 break;
9416 case nir_tex_src_lod: {
9417 if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9418 level_zero = true;
9419 } else {
9420 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9421 lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9422 has_lod = true;
9423 }
9424 break;
9425 }
9426 case nir_tex_src_min_lod:
9427 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9428 clamped_lod = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9429 has_clamped_lod = true;
9430 break;
9431 case nir_tex_src_comparator:
9432 if (instr->is_shadow) {
9433 assert(instr->src[i].src.ssa->bit_size == 32);
9434 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9435 has_compare = true;
9436 }
9437 break;
9438 case nir_tex_src_offset:
9439 case nir_tex_src_backend2:
9440 assert(instr->src[i].src.ssa->bit_size == 32);
9441 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9442 get_const_vec(instr->src[i].src.ssa, const_offset);
9443 has_offset = true;
9444 break;
9445 case nir_tex_src_ddx:
9446 assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9447 ddx = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9448 has_ddx = true;
9449 break;
9450 case nir_tex_src_ddy:
9451 assert(instr->src[i].src.ssa->bit_size == (g16 ? 16 : 32));
9452 ddy = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, g16);
9453 has_ddy = true;
9454 break;
9455 case nir_tex_src_ms_index:
9456 assert(instr->src[i].src.ssa->bit_size == (a16 ? 16 : 32));
9457 sample_index = get_ssa_temp_tex(ctx, instr->src[i].src.ssa, a16);
9458 has_sample_index = true;
9459 break;
9460 case nir_tex_src_texture_offset:
9461 case nir_tex_src_sampler_offset:
9462 default: break;
9463 }
9464 }
9465
9466 if (has_wqm_coord) {
9467 assert(instr->op == nir_texop_tex || instr->op == nir_texop_txb ||
9468 instr->op == nir_texop_lod);
9469 assert(wqm_coord.regClass().is_linear_vgpr());
9470 assert(!a16 && !g16);
9471 }
9472
9473 if (instr->op == nir_texop_tg4 && !has_lod && !instr->is_gather_implicit_lod)
9474 level_zero = true;
9475
9476 if (has_offset) {
9477 assert(instr->op != nir_texop_txf);
9478
9479 aco_ptr<Instruction> tmp_instr;
9480 Temp acc, pack = Temp();
9481
9482 uint32_t pack_const = 0;
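/* Constant offset components are 6-bit values packed one per byte into a single
 * dword, e.g. offsets (1, -1) pack to (1 & 0x3f) | ((-1 & 0x3f) << 8) = 0x3f01. */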
9483 for (unsigned i = 0; i < offset.size(); i++) {
9484 if (!const_offset[i])
9485 continue;
9486 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9487 }
9488
9489 if (offset.type() == RegType::sgpr) {
9490 for (unsigned i = 0; i < offset.size(); i++) {
9491 if (const_offset[i])
9492 continue;
9493
9494 acc = emit_extract_vector(ctx, offset, i, s1);
9495 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9496 Operand::c32(0x3Fu));
9497
9498 if (i) {
9499 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9500 Operand::c32(8u * i));
9501 }
9502
9503 if (pack == Temp()) {
9504 pack = acc;
9505 } else {
9506 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9507 }
9508 }
9509
9510 if (pack_const && pack != Temp())
9511 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9512 Operand::c32(pack_const), pack);
9513 } else {
9514 for (unsigned i = 0; i < offset.size(); i++) {
9515 if (const_offset[i])
9516 continue;
9517
9518 acc = emit_extract_vector(ctx, offset, i, v1);
9519 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9520
9521 if (i) {
9522 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9523 }
9524
9525 if (pack == Temp()) {
9526 pack = acc;
9527 } else {
9528 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9529 }
9530 }
9531
9532 if (pack_const && pack != Temp())
9533 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9534 }
9535 if (pack == Temp())
9536 offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9537 else
9538 offset = pack;
9539 }
9540
9541 std::vector<Temp> unpacked_coord;
9542 if (coord != Temp())
9543 unpacked_coord.push_back(coord);
9544 if (has_sample_index)
9545 unpacked_coord.push_back(sample_index);
9546 if (has_lod)
9547 unpacked_coord.push_back(lod);
9548 if (has_clamped_lod)
9549 unpacked_coord.push_back(clamped_lod);
9550
9551 coords = emit_pack_v1(ctx, unpacked_coord);
9552
9553 /* pack derivatives */
9554 if (has_ddx || has_ddy) {
9555 assert(a16 == g16 || ctx->options->gfx_level >= GFX10);
9556 std::array<Temp, 2> ddxddy = {ddx, ddy};
9557 for (Temp tmp : ddxddy) {
9558 if (tmp == Temp())
9559 continue;
9560 std::vector<Temp> unpacked = {tmp};
9561 for (Temp derv : emit_pack_v1(ctx, unpacked))
9562 derivs.push_back(derv);
9563 }
9564 has_derivs = true;
9565 }
9566
9567 unsigned dim = 0;
9568 bool da = false;
9569 if (instr->sampler_dim != GLSL_SAMPLER_DIM_BUF) {
9570 dim = ac_get_sampler_dim(ctx->options->gfx_level, instr->sampler_dim, instr->is_array);
9571 da = should_declare_array((ac_image_dim)dim);
9572 }
9573
9574 /* Build tex instruction */
9575 unsigned dmask = nir_def_components_read(&instr->def) & 0xf;
9576 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9577 dmask = u_bit_consecutive(0, util_last_bit(dmask));
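/* For sparse loads, bit 4 (0x10) stands for the extra residency dword returned via
 * TFE, and at least one data component has to be enabled, hence the MAX2 below. */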
9578 if (instr->is_sparse)
9579 dmask = MAX2(dmask, 1) | 0x10;
9580 bool d16 = instr->def.bit_size == 16;
9581 Temp dst = get_ssa_temp(ctx, &instr->def);
9582 Temp tmp_dst = dst;
9583
9584 /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9585 if (instr->op == nir_texop_tg4) {
9586 assert(instr->def.num_components == (4 + instr->is_sparse));
9587 if (instr->is_shadow)
9588 dmask = 1;
9589 else
9590 dmask = 1 << instr->component;
9591 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9592 tmp_dst = bld.tmp(instr->is_sparse ? v5 : (d16 ? v2 : v4));
9593 } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9594 tmp_dst = bld.tmp(v1);
9595 } else if (util_bitcount(dmask) != instr->def.num_components || dst.type() == RegType::sgpr) {
9596 unsigned bytes = util_bitcount(dmask) * instr->def.bit_size / 8;
9597 tmp_dst = bld.tmp(RegClass::get(RegType::vgpr, bytes));
9598 }
9599
9600 Temp tg4_compare_cube_wa64 = Temp();
9601
9602 if (tg4_integer_workarounds) {
9603 Temp half_texel[2];
9604 if (instr->sampler_dim == GLSL_SAMPLER_DIM_RECT) {
9605 half_texel[0] = half_texel[1] = bld.copy(bld.def(v1), Operand::c32(0xbf000000 /*-0.5*/));
9606 } else {
9607 Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9608 Temp size = bld.tmp(v2);
9609 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, size, resource,
9610 Operand(s4), std::vector<Temp>{tg4_lod});
9611 tex->dim = dim;
9612 tex->dmask = 0x3;
9613 tex->da = da;
9614 emit_split_vector(ctx, size, size.size());
9615
9616 for (unsigned i = 0; i < 2; i++) {
9617 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9618 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9619 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9620 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9621 Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9622 }
9623
9624 if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9625 /* In vulkan, whether the sampler uses unnormalized
9626 * coordinates or not is a dynamic property of the
9627 * sampler. Hence, to figure out whether or not we
9628 * need to divide by the texture size, we need to test
9629 * the sampler at runtime. This tests the bit set by
9630 * radv_init_sampler().
9631 */
9632 unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9633 Temp dword0 = emit_extract_vector(ctx, sampler, 0, s1);
9634 Temp not_needed =
9635 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), dword0, Operand::c32(bit_idx));
9636
9637 not_needed = bool_to_vector_condition(ctx, not_needed);
9638 half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9639 Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9640 half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9641 Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9642 }
9643 }
9644
9645 Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9646 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9647
9648 if (tg4_integer_cube_workaround) {
9649 /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9650 Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9651 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9652 aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9653 split->operands[0] = Operand(resource);
9654 for (unsigned i = 0; i < resource.size(); i++) {
9655 desc[i] = bld.tmp(s1);
9656 split->definitions[i] = Definition(desc[i]);
9657 }
9658 ctx->block->instructions.emplace_back(std::move(split));
9659
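/* Extract the 6-bit DATA_FORMAT field (bits [25:20] of resource word 1) to check
 * whether the view uses the 8_8_8_8 format. */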
9660 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9661 Operand::c32(20u | (6u << 16)));
9662 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9663 Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9664
9665 Temp nfmt;
9666 if (instr->dest_type & nir_type_uint) {
9667 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9668 Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9669 Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9670 } else {
9671 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9672 Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9673 Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9674 }
9675 tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9676 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9677
9678 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9679 Operand::c32(26u));
9680
9681 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9682 Operand::c32(C_008F14_NUM_FORMAT));
9683 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9684
9685 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9686 aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9687 for (unsigned i = 0; i < resource.size(); i++)
9688 vec->operands[i] = Operand(desc[i]);
9689 resource = bld.tmp(resource.regClass());
9690 vec->definitions[0] = Definition(resource);
9691 ctx->block->instructions.emplace_back(std::move(vec));
9692
9693 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9694 tg4_compare_cube_wa64);
9695 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9696 tg4_compare_cube_wa64);
9697 }
9698 coords[0] = new_coords[0];
9699 coords[1] = new_coords[1];
9700 }
9701
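        /* Buffer "textures" don't go through MIMG at all: emit a typed MUBUF
         * buffer_load_format_* with idxen, sized according to the dmask. */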
9702 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9703 // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9704 // ac_build_buffer_load_format_gfx9_safe()
9705
9706 assert(coords.size() == 1);
9707 aco_opcode op;
9708 if (d16) {
9709 switch (util_last_bit(dmask & 0xf)) {
9710 case 1: op = aco_opcode::buffer_load_format_d16_x; break;
9711 case 2: op = aco_opcode::buffer_load_format_d16_xy; break;
9712 case 3: op = aco_opcode::buffer_load_format_d16_xyz; break;
9713 case 4: op = aco_opcode::buffer_load_format_d16_xyzw; break;
9714 default: unreachable("Tex instruction loads more than 4 components.");
9715 }
9716 } else {
9717 switch (util_last_bit(dmask & 0xf)) {
9718 case 1: op = aco_opcode::buffer_load_format_x; break;
9719 case 2: op = aco_opcode::buffer_load_format_xy; break;
9720 case 3: op = aco_opcode::buffer_load_format_xyz; break;
9721 case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9722 default: unreachable("Tex instruction loads more than 4 components.");
9723 }
9724 }
9725
9726 aco_ptr<MUBUF_instruction> mubuf{
9727 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9728 mubuf->operands[0] = Operand(resource);
9729 mubuf->operands[1] = Operand(coords[0]);
9730 mubuf->operands[2] = Operand::c32(0);
9731 mubuf->definitions[0] = Definition(tmp_dst);
9732 mubuf->idxen = true;
9733 mubuf->tfe = instr->is_sparse;
9734 if (mubuf->tfe)
9735 mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9736 ctx->block->instructions.emplace_back(std::move(mubuf));
9737
9738 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9739 return;
9740 }
9741
9742 /* gather MIMG address components */
9743 std::vector<Temp> args;
9744 if (has_wqm_coord) {
9745 args.emplace_back(wqm_coord);
9746 if (!(ctx->block->kind & block_kind_top_level))
9747 ctx->unended_linear_vgprs.push_back(wqm_coord);
9748 }
9749 if (has_offset)
9750 args.emplace_back(offset);
9751 if (has_bias)
9752 args.emplace_back(emit_pack_v1(ctx, {bias})[0]);
9753 if (has_compare)
9754 args.emplace_back(compare);
9755 if (has_derivs)
9756 args.insert(args.end(), derivs.begin(), derivs.end());
9757
9758 args.insert(args.end(), coords.begin(), coords.end());
9759
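        /* Texel fetches (txf/txf_ms and the fragment fetch intrinsics) bypass the
         * sampler: they are emitted as image_load/image_load_mip with unnormalized
         * coordinates (unrm). */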
9760 if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9761 instr->op == nir_texop_fragment_mask_fetch_amd || instr->op == nir_texop_txf_ms) {
9762 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9763 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9764 ? aco_opcode::image_load
9765 : aco_opcode::image_load_mip;
9766 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9767 MIMG_instruction* tex = emit_mimg(bld, op, tmp_dst, resource, Operand(s4), args, vdata);
9768 if (instr->op == nir_texop_fragment_mask_fetch_amd)
9769 tex->dim = da ? ac_image_2darray : ac_image_2d;
9770 else
9771 tex->dim = dim;
9772 tex->dmask = dmask & 0xf;
9773 tex->unrm = true;
9774 tex->da = da;
9775 tex->tfe = instr->is_sparse;
9776 tex->d16 = d16;
9777 tex->a16 = a16;
9778
9779 if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9780 /* Use 0x76543210 if the image doesn't have FMASK. */
9781 assert(dmask == 1 && dst.bytes() == 4);
9782 assert(dst.id() != tmp_dst.id());
9783
9784 if (dst.regClass() == s1) {
9785 Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9786 emit_extract_vector(ctx, resource, 1, s1));
9787 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), bld.as_uniform(tmp_dst),
9788 Operand::c32(0x76543210), bld.scc(is_not_null));
9789 } else {
9790 Temp is_not_null = bld.tmp(bld.lm);
9791 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9792 emit_extract_vector(ctx, resource, 1, s1));
9793 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9794 bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9795 }
9796 } else {
9797 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, dmask);
9798 }
9799 return;
9800 }
9801
9802 bool separate_g16 = ctx->options->gfx_level >= GFX10 && g16;
9803
9804 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
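        /* The image_sample_* variant is selected by combining suffixes:
         * _c = depth compare, _o = offsets, _cl = LOD clamp, _d = explicit
         * derivatives, _b = LOD bias, _l = explicit LOD, _lz = level zero,
         * _g16 = 16-bit derivatives. */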
9805 aco_opcode opcode = aco_opcode::image_sample;
9806 if (has_offset) { /* image_sample_*_o */
9807 if (has_clamped_lod) {
9808 if (has_compare) {
9809 opcode = aco_opcode::image_sample_c_cl_o;
9810 if (separate_g16)
9811 opcode = aco_opcode::image_sample_c_d_cl_o_g16;
9812 else if (has_derivs)
9813 opcode = aco_opcode::image_sample_c_d_cl_o;
9814 if (has_bias)
9815 opcode = aco_opcode::image_sample_c_b_cl_o;
9816 } else {
9817 opcode = aco_opcode::image_sample_cl_o;
9818 if (separate_g16)
9819 opcode = aco_opcode::image_sample_d_cl_o_g16;
9820 else if (has_derivs)
9821 opcode = aco_opcode::image_sample_d_cl_o;
9822 if (has_bias)
9823 opcode = aco_opcode::image_sample_b_cl_o;
9824 }
9825 } else if (has_compare) {
9826 opcode = aco_opcode::image_sample_c_o;
9827 if (separate_g16)
9828 opcode = aco_opcode::image_sample_c_d_o_g16;
9829 else if (has_derivs)
9830 opcode = aco_opcode::image_sample_c_d_o;
9831 if (has_bias)
9832 opcode = aco_opcode::image_sample_c_b_o;
9833 if (level_zero)
9834 opcode = aco_opcode::image_sample_c_lz_o;
9835 if (has_lod)
9836 opcode = aco_opcode::image_sample_c_l_o;
9837 } else {
9838 opcode = aco_opcode::image_sample_o;
9839 if (separate_g16)
9840 opcode = aco_opcode::image_sample_d_o_g16;
9841 else if (has_derivs)
9842 opcode = aco_opcode::image_sample_d_o;
9843 if (has_bias)
9844 opcode = aco_opcode::image_sample_b_o;
9845 if (level_zero)
9846 opcode = aco_opcode::image_sample_lz_o;
9847 if (has_lod)
9848 opcode = aco_opcode::image_sample_l_o;
9849 }
9850 } else if (has_clamped_lod) { /* image_sample_*_cl */
9851 if (has_compare) {
9852 opcode = aco_opcode::image_sample_c_cl;
9853 if (separate_g16)
9854 opcode = aco_opcode::image_sample_c_d_cl_g16;
9855 else if (has_derivs)
9856 opcode = aco_opcode::image_sample_c_d_cl;
9857 if (has_bias)
9858 opcode = aco_opcode::image_sample_c_b_cl;
9859 } else {
9860 opcode = aco_opcode::image_sample_cl;
9861 if (separate_g16)
9862 opcode = aco_opcode::image_sample_d_cl_g16;
9863 else if (has_derivs)
9864 opcode = aco_opcode::image_sample_d_cl;
9865 if (has_bias)
9866 opcode = aco_opcode::image_sample_b_cl;
9867 }
9868 } else { /* no offset */
9869 if (has_compare) {
9870 opcode = aco_opcode::image_sample_c;
9871 if (separate_g16)
9872 opcode = aco_opcode::image_sample_c_d_g16;
9873 else if (has_derivs)
9874 opcode = aco_opcode::image_sample_c_d;
9875 if (has_bias)
9876 opcode = aco_opcode::image_sample_c_b;
9877 if (level_zero)
9878 opcode = aco_opcode::image_sample_c_lz;
9879 if (has_lod)
9880 opcode = aco_opcode::image_sample_c_l;
9881 } else {
9882 opcode = aco_opcode::image_sample;
9883 if (separate_g16)
9884 opcode = aco_opcode::image_sample_d_g16;
9885 else if (has_derivs)
9886 opcode = aco_opcode::image_sample_d;
9887 if (has_bias)
9888 opcode = aco_opcode::image_sample_b;
9889 if (level_zero)
9890 opcode = aco_opcode::image_sample_lz;
9891 if (has_lod)
9892 opcode = aco_opcode::image_sample_l;
9893 }
9894 }
9895
9896 if (instr->op == nir_texop_tg4) {
9897 /* GFX11 supports implicit LOD, but the extension is unsupported. */
9898 assert(level_zero || ctx->options->gfx_level < GFX11);
9899
9900 if (has_offset) { /* image_gather4_*_o */
9901 if (has_compare) {
9902 opcode = aco_opcode::image_gather4_c_o;
9903 if (level_zero)
9904 opcode = aco_opcode::image_gather4_c_lz_o;
9905 if (has_lod)
9906 opcode = aco_opcode::image_gather4_c_l_o;
9907 if (has_bias)
9908 opcode = aco_opcode::image_gather4_c_b_o;
9909 } else {
9910 opcode = aco_opcode::image_gather4_o;
9911 if (level_zero)
9912 opcode = aco_opcode::image_gather4_lz_o;
9913 if (has_lod)
9914 opcode = aco_opcode::image_gather4_l_o;
9915 if (has_bias)
9916 opcode = aco_opcode::image_gather4_b_o;
9917 }
9918 } else {
9919 if (has_compare) {
9920 opcode = aco_opcode::image_gather4_c;
9921 if (level_zero)
9922 opcode = aco_opcode::image_gather4_c_lz;
9923 if (has_lod)
9924 opcode = aco_opcode::image_gather4_c_l;
9925 if (has_bias)
9926 opcode = aco_opcode::image_gather4_c_b;
9927 } else {
9928 opcode = aco_opcode::image_gather4;
9929 if (level_zero)
9930 opcode = aco_opcode::image_gather4_lz;
9931 if (has_lod)
9932 opcode = aco_opcode::image_gather4_l;
9933 if (has_bias)
9934 opcode = aco_opcode::image_gather4_b;
9935 }
9936 }
9937 } else if (instr->op == nir_texop_lod) {
9938 opcode = aco_opcode::image_get_lod;
9939 }
9940
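        /* Without an explicit LOD or derivatives, fragment shaders sample with
         * implicitly computed derivatives, which requires whole quad mode so that
         * helper invocations provide valid neighboring coordinates. */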
9941 bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9942 !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9943 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9944
9945 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9946 MIMG_instruction* tex = emit_mimg(bld, opcode, tmp_dst, resource, Operand(sampler), args, vdata);
9947 tex->dim = dim;
9948 tex->dmask = dmask & 0xf;
9949 tex->da = da;
9950 tex->unrm = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
9951 tex->tfe = instr->is_sparse;
9952 tex->d16 = d16;
9953 tex->a16 = a16;
9954 if (implicit_derivs)
9955 set_wqm(ctx, true);
9956
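        /* The cube workaround changed NUM_FORMAT to *SCALED, so the gathered values
         * come back as floats; convert them back to integers, but only when the
         * workaround condition (8_8_8_8 data format) was actually true. */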
9957 if (tg4_integer_cube_workaround) {
9958 assert(tmp_dst.id() != dst.id());
9959 assert(tmp_dst.size() == dst.size());
9960
9961 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9962 Temp val[4];
9963 for (unsigned i = 0; i < 4; i++) {
9964 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9965 Temp cvt_val;
9966 if (instr->dest_type & nir_type_uint)
9967 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9968 else
9969 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9970 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9971 tg4_compare_cube_wa64);
9972 }
9973
9974 Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9975 if (instr->is_sparse)
9976 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9977 val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9978 else
9979 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9980 val[3]);
9981 }
9982 unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9983 expand_vector(ctx, tmp_dst, dst, instr->def.num_components, mask);
9984 }
9985
9986 Operand
9987 get_phi_operand(isel_context* ctx, nir_def* ssa, RegClass rc, bool logical)
9988 {
9989 Temp tmp = get_ssa_temp(ctx, ssa);
9990 if (ssa->parent_instr->type == nir_instr_type_undef) {
9991 return Operand(rc);
9992 } else if (logical && ssa->bit_size == 1 &&
9993 ssa->parent_instr->type == nir_instr_type_load_const) {
9994 bool val = nir_instr_as_load_const(ssa->parent_instr)->value[0].b;
9995 return Operand::c32_or_c64(val ? -1 : 0, ctx->program->lane_mask == s2);
9996 } else {
9997 return Operand(tmp);
9998 }
9999 }
10000
10001 void
10002 visit_phi(isel_context* ctx, nir_phi_instr* instr)
10003 {
10004 aco_ptr<Pseudo_instruction> phi;
10005 Temp dst = get_ssa_temp(ctx, &instr->def);
10006 assert(instr->def.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
10007
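         /* Values living in VGPRs or with divergent definitions need a logical phi
          * over the logical predecessors; uniform SGPR values can use a linear phi
          * over the linear CFG. Merge blocks always get a logical phi here (a single
          * defined source may still be turned into a linear phi further below). */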
10008 bool logical = !dst.is_linear() || instr->def.divergent;
10009 logical |= (ctx->block->kind & block_kind_merge) != 0;
10010 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
10011
10012 /* we want a sorted list of sources, since the predecessor list is also sorted */
10013 std::map<unsigned, nir_def*> phi_src;
10014 nir_foreach_phi_src (src, instr)
10015 phi_src[src->pred->index] = src->src.ssa;
10016
10017 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
10018 unsigned num_operands = 0;
10019 Operand* const operands = (Operand*)alloca(
10020 (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
10021 unsigned num_defined = 0;
10022 unsigned cur_pred_idx = 0;
10023 for (std::pair<unsigned, nir_def*> src : phi_src) {
10024 if (cur_pred_idx < preds.size()) {
10025 /* handle missing preds (IF merges with discard/break) and extra preds
10026 * (loop exit with discard) */
10027 unsigned block = ctx->cf_info.nir_to_aco[src.first];
10028 unsigned skipped = 0;
10029 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
10030 skipped++;
10031 if (cur_pred_idx + skipped < preds.size()) {
10032 for (unsigned i = 0; i < skipped; i++)
10033 operands[num_operands++] = Operand(dst.regClass());
10034 cur_pred_idx += skipped;
10035 } else {
10036 continue;
10037 }
10038 }
10039 /* Handle missing predecessors at the end. This shouldn't happen with loop
10040 * headers and we can't ignore these sources for loop header phis. */
10041 if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
10042 continue;
10043 cur_pred_idx++;
10044 Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
10045 operands[num_operands++] = op;
10046 num_defined += !op.isUndefined();
10047 }
10048 /* handle block_kind_continue_or_break at loop exit blocks */
10049 while (cur_pred_idx++ < preds.size())
10050 operands[num_operands++] = Operand(dst.regClass());
10051
10052 /* If the loop ends with a break, still add a linear continue edge in case
10053 * that break is divergent or continue_or_break is used. We'll either remove
10054 * this operand later in visit_loop() if it's not necessary or replace the
10055 * undef with something correct. */
10056 if (!logical && ctx->block->kind & block_kind_loop_header) {
10057 nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
10058 nir_block* last = nir_loop_last_block(loop);
10059 if (last->successors[0] != instr->instr.block)
10060 operands[num_operands++] = Operand(RegClass());
10061 }
10062
10063 /* we can use a linear phi in some cases if one src is undef */
10064 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
10065 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
10066 num_operands, 1));
10067
10068 Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
10069 Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
10070 assert(invert->kind & block_kind_invert);
10071
10072 unsigned then_block = invert->linear_preds[0];
10073
10074 Block* insert_block = NULL;
10075 for (unsigned i = 0; i < num_operands; i++) {
10076 Operand op = operands[i];
10077 if (op.isUndefined())
10078 continue;
10079 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
10080 phi->operands[0] = op;
10081 break;
10082 }
10083 assert(insert_block); /* should be handled by the "num_defined == 0" case above */
10084 phi->operands[1] = Operand(dst.regClass());
10085 phi->definitions[0] = Definition(dst);
10086 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
10087 return;
10088 }
10089
10090 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
10091 for (unsigned i = 0; i < num_operands; i++)
10092 phi->operands[i] = operands[i];
10093 phi->definitions[0] = Definition(dst);
10094 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
10095 }
10096
10097 void
10098 visit_undef(isel_context* ctx, nir_undef_instr* instr)
10099 {
10100 Temp dst = get_ssa_temp(ctx, &instr->def);
10101
10102 assert(dst.type() == RegType::sgpr);
10103
10104 if (dst.size() == 1) {
10105 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
10106 } else {
10107 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
10108 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
10109 for (unsigned i = 0; i < dst.size(); i++)
10110 vec->operands[i] = Operand::zero();
10111 vec->definitions[0] = Definition(dst);
10112 ctx->block->instructions.emplace_back(std::move(vec));
10113 }
10114 }
10115
10116 void
10117 begin_loop(isel_context* ctx, loop_context* lc)
10118 {
10119 // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
10120 append_logical_end(ctx->block);
10121 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
10122 Builder bld(ctx->program, ctx->block);
10123 bld.branch(aco_opcode::p_branch, bld.def(s2));
10124 unsigned loop_preheader_idx = ctx->block->index;
10125
10126 lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
10127
10128 ctx->program->next_loop_depth++;
10129
10130 Block* loop_header = ctx->program->create_and_insert_block();
10131 loop_header->kind |= block_kind_loop_header;
10132 add_edge(loop_preheader_idx, loop_header);
10133 ctx->block = loop_header;
10134
10135 append_logical_start(ctx->block);
10136
10137 lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
10138 lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
10139 lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
10140 lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
10141 lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
10142 }
10143
10144 void
10145 end_loop(isel_context* ctx, loop_context* lc)
10146 {
10147    // TODO: what if a loop ends with an unconditional or uniformly branched continue
10148 // and this branch is never taken?
10149 if (!ctx->cf_info.has_branch) {
10150 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10151 Builder bld(ctx->program, ctx->block);
10152 append_logical_end(ctx->block);
10153
10154 if (ctx->cf_info.exec_potentially_empty_discard ||
10155 ctx->cf_info.exec_potentially_empty_break) {
10156 /* Discards can result in code running with an empty exec mask.
10157 * This would result in divergent breaks not ever being taken. As a
10158 * workaround, break the loop when the loop mask is empty instead of
10159 * always continuing. */
10160 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
10161 unsigned block_idx = ctx->block->index;
10162
10163 /* create helper blocks to avoid critical edges */
10164 Block* break_block = ctx->program->create_and_insert_block();
10165 break_block->kind = block_kind_uniform;
10166 bld.reset(break_block);
10167 bld.branch(aco_opcode::p_branch, bld.def(s2));
10168 add_linear_edge(block_idx, break_block);
10169 add_linear_edge(break_block->index, &lc->loop_exit);
10170
10171 Block* continue_block = ctx->program->create_and_insert_block();
10172 continue_block->kind = block_kind_uniform;
10173 bld.reset(continue_block);
10174 bld.branch(aco_opcode::p_branch, bld.def(s2));
10175 add_linear_edge(block_idx, continue_block);
10176 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10177
10178 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10179 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10180 ctx->block = &ctx->program->blocks[block_idx];
10181 } else {
10182 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10183 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10184 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10185 else
10186 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10187 }
10188
10189 bld.reset(ctx->block);
10190 bld.branch(aco_opcode::p_branch, bld.def(s2));
10191 }
10192
10193 ctx->cf_info.has_branch = false;
10194 ctx->program->next_loop_depth--;
10195
10196    // TODO: if the loop does not have a single exit, we must add one
10197 /* emit loop successor block */
10198 ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10199 append_logical_start(ctx->block);
10200
10201 #if 0
10202 // TODO: check if it is beneficial to not branch on continues
10203 /* trim linear phis in loop header */
10204 for (auto&& instr : loop_entry->instructions) {
10205 if (instr->opcode == aco_opcode::p_linear_phi) {
10206 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10207 new_phi->definitions[0] = instr->definitions[0];
10208 for (unsigned i = 0; i < new_phi->operands.size(); i++)
10209 new_phi->operands[i] = instr->operands[i];
10210 /* check that the remaining operands are all the same */
10211 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10212 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
10213 instr.swap(new_phi);
10214 } else if (instr->opcode == aco_opcode::p_phi) {
10215 continue;
10216 } else {
10217 break;
10218 }
10219 }
10220 #endif
10221
10222 ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10223 ctx->cf_info.parent_loop.exit = lc->exit_old;
10224 ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10225 ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10226 ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10227 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10228 ctx->cf_info.exec_potentially_empty_discard = false;
10229 }
10230
10231 void
10232 emit_loop_jump(isel_context* ctx, bool is_break)
10233 {
10234 Builder bld(ctx->program, ctx->block);
10235 Block* logical_target;
10236 append_logical_end(ctx->block);
10237 unsigned idx = ctx->block->index;
10238
10239 if (is_break) {
10240 logical_target = ctx->cf_info.parent_loop.exit;
10241 add_logical_edge(idx, logical_target);
10242 ctx->block->kind |= block_kind_break;
10243
10244 if (!ctx->cf_info.parent_if.is_divergent &&
10245 !ctx->cf_info.parent_loop.has_divergent_continue) {
10246 /* uniform break - directly jump out of the loop */
10247 ctx->block->kind |= block_kind_uniform;
10248 ctx->cf_info.has_branch = true;
10249 bld.branch(aco_opcode::p_branch, bld.def(s2));
10250 add_linear_edge(idx, logical_target);
10251 return;
10252 }
10253 ctx->cf_info.parent_loop.has_divergent_branch = true;
10254 } else {
10255 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10256 add_logical_edge(idx, logical_target);
10257 ctx->block->kind |= block_kind_continue;
10258
10259 if (!ctx->cf_info.parent_if.is_divergent) {
10260 /* uniform continue - directly jump to the loop header */
10261 ctx->block->kind |= block_kind_uniform;
10262 ctx->cf_info.has_branch = true;
10263 bld.branch(aco_opcode::p_branch, bld.def(s2));
10264 add_linear_edge(idx, logical_target);
10265 return;
10266 }
10267
10268 /* for potential uniform breaks after this continue,
10269 we must ensure that they are handled correctly */
10270 ctx->cf_info.parent_loop.has_divergent_continue = true;
10271 ctx->cf_info.parent_loop.has_divergent_branch = true;
10272 }
10273
10274 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10275 ctx->cf_info.exec_potentially_empty_break = true;
10276 ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10277 }
10278
10279 /* remove critical edges from linear CFG */
10280 bld.branch(aco_opcode::p_branch, bld.def(s2));
10281 Block* break_block = ctx->program->create_and_insert_block();
10282 break_block->kind |= block_kind_uniform;
10283 add_linear_edge(idx, break_block);
10284 /* the loop_header pointer might be invalidated by this point */
10285 if (!is_break)
10286 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10287 add_linear_edge(break_block->index, logical_target);
10288 bld.reset(break_block);
10289 bld.branch(aco_opcode::p_branch, bld.def(s2));
10290
10291 Block* continue_block = ctx->program->create_and_insert_block();
10292 add_linear_edge(idx, continue_block);
10293 append_logical_start(continue_block);
10294 ctx->block = continue_block;
10295 }
10296
10297 void
10298 emit_loop_break(isel_context* ctx)
10299 {
10300 emit_loop_jump(ctx, true);
10301 }
10302
10303 void
10304 emit_loop_continue(isel_context* ctx)
10305 {
10306 emit_loop_jump(ctx, false);
10307 }
10308
10309 void
10310 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10311 {
10312    /* visit_block() would usually do this but divergent jumps update ctx->block */
10313 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10314
10315 switch (instr->type) {
10316 case nir_jump_break: emit_loop_break(ctx); break;
10317 case nir_jump_continue: emit_loop_continue(ctx); break;
10318 default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10319 }
10320 }
10321
10322 void
10323 visit_block(isel_context* ctx, nir_block* block)
10324 {
10325 if (ctx->block->kind & block_kind_top_level) {
10326 Builder bld(ctx->program, ctx->block);
10327 for (Temp tmp : ctx->unended_linear_vgprs)
10328 bld.pseudo(aco_opcode::p_end_linear_vgpr, tmp);
10329 ctx->unended_linear_vgprs.clear();
10330 }
10331
10332 ctx->block->instructions.reserve(ctx->block->instructions.size() +
10333 exec_list_length(&block->instr_list) * 2);
10334 nir_foreach_instr (instr, block) {
10335 switch (instr->type) {
10336 case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10337 case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10338 case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10339 case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10340 case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10341 case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
10342 case nir_instr_type_deref: break;
10343 case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10344 default: isel_err(instr, "Unknown NIR instr type");
10345 }
10346 }
10347
10348 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10349 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10350 }
10351
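      /* Compute the value that header_phi receives along the linear continue edge:
       * starting from the phi definition in block `first` (the loop header), the
       * value is propagated block by block up to `last`, inserting linear phis
       * wherever the linear predecessors disagree. `vals` is caller-provided
       * scratch space with one slot per block. Used by visit_loop() to fill in the
       * continue operand when the loop body doesn't end in a continue. */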
10352 static Operand
10353 create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10354 aco_ptr<Instruction>& header_phi, Operand* vals)
10355 {
10356 vals[0] = Operand(header_phi->definitions[0].getTemp());
10357 RegClass rc = vals[0].regClass();
10358
10359 unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10360
10361 unsigned next_pred = 1;
10362
10363 for (unsigned idx = first + 1; idx <= last; idx++) {
10364 Block& block = ctx->program->blocks[idx];
10365 if (block.loop_nest_depth != loop_nest_depth) {
10366 vals[idx - first] = vals[idx - 1 - first];
10367 continue;
10368 }
10369
10370 if ((block.kind & block_kind_continue) && block.index != last) {
10371 vals[idx - first] = header_phi->operands[next_pred];
10372 next_pred++;
10373 continue;
10374 }
10375
10376 bool all_same = true;
10377 for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10378 all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10379
10380 Operand val;
10381 if (all_same) {
10382 val = vals[block.linear_preds[0] - first];
10383 } else {
10384 aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10385 aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10386 for (unsigned i = 0; i < block.linear_preds.size(); i++)
10387 phi->operands[i] = vals[block.linear_preds[i] - first];
10388 val = Operand(ctx->program->allocateTmp(rc));
10389 phi->definitions[0] = Definition(val.getTemp());
10390 block.instructions.emplace(block.instructions.begin(), std::move(phi));
10391 }
10392 vals[idx - first] = val;
10393 }
10394
10395 return vals[last - first];
10396 }
10397
10398 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10399 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10400 static void end_uniform_if(isel_context* ctx, if_context* ic);
10401
10402 static void
10403 visit_loop(isel_context* ctx, nir_loop* loop)
10404 {
10405 assert(!nir_loop_has_continue_construct(loop));
10406 loop_context lc;
10407 begin_loop(ctx, &lc);
10408
10409 bool unreachable = visit_cf_list(ctx, &loop->body);
10410
10411 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10412
10413 /* Fixup phis in loop header from unreachable blocks.
10414 * has_branch/has_divergent_branch also indicates if the loop ends with a
10415 * break/continue instruction, but we don't emit those if unreachable=true */
10416 if (unreachable) {
10417 assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10418 bool linear = ctx->cf_info.has_branch;
10419 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10420 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10421 if ((logical && instr->opcode == aco_opcode::p_phi) ||
10422 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
10423 /* the last operand should be the one that needs to be removed */
10424 instr->operands.pop_back();
10425 } else if (!is_phi(instr)) {
10426 break;
10427 }
10428 }
10429 }
10430
10431    /* Fixup linear phis in the loop header that expect a continue. This fixup and
10432     * the previous one shouldn't both happen at once, because a break in the
10433     * merge block would get CSE'd. */
10434 if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
10435 unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10436 Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10437 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10438 if (instr->opcode == aco_opcode::p_linear_phi) {
10439 if (ctx->cf_info.has_branch)
10440 instr->operands.pop_back();
10441 else
10442 instr->operands.back() =
10443 create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10444 } else if (!is_phi(instr)) {
10445 break;
10446 }
10447 }
10448 }
10449
10450 /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
10451 * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
10452 */
10453 if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10454 Builder bld(ctx->program, ctx->block);
10455 Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10456 if_context ic;
10457 begin_uniform_if_then(ctx, &ic, cond);
10458 emit_loop_break(ctx);
10459 begin_uniform_if_else(ctx, &ic);
10460 end_uniform_if(ctx, &ic);
10461 }
10462
10463 end_loop(ctx, &lc);
10464 }
10465
10466 static void
10467 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
10468 nir_selection_control sel_ctrl = nir_selection_control_none)
10469 {
10470 ic->cond = cond;
10471
10472 append_logical_end(ctx->block);
10473 ctx->block->kind |= block_kind_branch;
10474
10475 /* branch to linear then block */
10476 assert(cond.regClass() == ctx->program->lane_mask);
10477 aco_ptr<Pseudo_branch_instruction> branch;
10478 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10479 Format::PSEUDO_BRANCH, 1, 1));
10480 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10481 branch->operands[0] = Operand(cond);
10482 branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10483 sel_ctrl == nir_selection_control_divergent_always_taken;
10484 ctx->block->instructions.push_back(std::move(branch));
10485
10486 ic->BB_if_idx = ctx->block->index;
10487 ic->BB_invert = Block();
10488 /* Invert blocks are intentionally not marked as top level because they
10489 * are not part of the logical cfg. */
10490 ic->BB_invert.kind |= block_kind_invert;
10491 ic->BB_endif = Block();
10492 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10493
10494 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10495 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10496 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10497 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10498 ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10499 ctx->cf_info.parent_if.is_divergent = true;
10500
10501 /* divergent branches use cbranch_execz */
10502 ctx->cf_info.exec_potentially_empty_discard = false;
10503 ctx->cf_info.exec_potentially_empty_break = false;
10504 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10505
10506 /** emit logical then block */
10507 ctx->program->next_divergent_if_logical_depth++;
10508 Block* BB_then_logical = ctx->program->create_and_insert_block();
10509 add_edge(ic->BB_if_idx, BB_then_logical);
10510 ctx->block = BB_then_logical;
10511 append_logical_start(BB_then_logical);
10512 }
10513
10514 static void
10515 begin_divergent_if_else(isel_context* ctx, if_context* ic,
10516 nir_selection_control sel_ctrl = nir_selection_control_none)
10517 {
10518 Block* BB_then_logical = ctx->block;
10519 append_logical_end(BB_then_logical);
10520 /* branch from logical then block to invert block */
10521 aco_ptr<Pseudo_branch_instruction> branch;
10522 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10523 Format::PSEUDO_BRANCH, 0, 1));
10524 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10525 BB_then_logical->instructions.emplace_back(std::move(branch));
10526 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10527 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10528 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10529 BB_then_logical->kind |= block_kind_uniform;
10530 assert(!ctx->cf_info.has_branch);
10531 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10532 ctx->cf_info.parent_loop.has_divergent_branch = false;
10533 ctx->program->next_divergent_if_logical_depth--;
10534
10535 /** emit linear then block */
10536 Block* BB_then_linear = ctx->program->create_and_insert_block();
10537 BB_then_linear->kind |= block_kind_uniform;
10538 add_linear_edge(ic->BB_if_idx, BB_then_linear);
10539 /* branch from linear then block to invert block */
10540 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10541 Format::PSEUDO_BRANCH, 0, 1));
10542 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10543 BB_then_linear->instructions.emplace_back(std::move(branch));
10544 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10545
10546 /** emit invert merge block */
10547 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10548 ic->invert_idx = ctx->block->index;
10549
10550 /* branch to linear else block (skip else) */
10551 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10552 Format::PSEUDO_BRANCH, 0, 1));
10553 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10554 branch->selection_control_remove = sel_ctrl == nir_selection_control_flatten ||
10555 sel_ctrl == nir_selection_control_divergent_always_taken;
10556 ctx->block->instructions.push_back(std::move(branch));
10557
10558 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10559 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10560 ic->exec_potentially_empty_break_depth_old = std::min(
10561 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10562 /* divergent branches use cbranch_execz */
10563 ctx->cf_info.exec_potentially_empty_discard = false;
10564 ctx->cf_info.exec_potentially_empty_break = false;
10565 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10566
10567 ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10568 ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10569
10570 /** emit logical else block */
10571 ctx->program->next_divergent_if_logical_depth++;
10572 Block* BB_else_logical = ctx->program->create_and_insert_block();
10573 add_logical_edge(ic->BB_if_idx, BB_else_logical);
10574 add_linear_edge(ic->invert_idx, BB_else_logical);
10575 ctx->block = BB_else_logical;
10576 append_logical_start(BB_else_logical);
10577 }
10578
10579 static void
10580 end_divergent_if(isel_context* ctx, if_context* ic)
10581 {
10582 Block* BB_else_logical = ctx->block;
10583 append_logical_end(BB_else_logical);
10584
10585 /* branch from logical else block to endif block */
10586 aco_ptr<Pseudo_branch_instruction> branch;
10587 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10588 Format::PSEUDO_BRANCH, 0, 1));
10589 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10590 BB_else_logical->instructions.emplace_back(std::move(branch));
10591 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10592 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10593 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10594 BB_else_logical->kind |= block_kind_uniform;
10595 ctx->program->next_divergent_if_logical_depth--;
10596
10597 assert(!ctx->cf_info.has_branch);
10598 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10599
10600 /** emit linear else block */
10601 Block* BB_else_linear = ctx->program->create_and_insert_block();
10602 BB_else_linear->kind |= block_kind_uniform;
10603 add_linear_edge(ic->invert_idx, BB_else_linear);
10604
10605 /* branch from linear else block to endif block */
10606 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10607 Format::PSEUDO_BRANCH, 0, 1));
10608 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10609 BB_else_linear->instructions.emplace_back(std::move(branch));
10610 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10611
10612 /** emit endif merge block */
10613 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10614 append_logical_start(ctx->block);
10615
10616 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10617 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10618 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10619 ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10620 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10621 if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10622 !ctx->cf_info.parent_if.is_divergent) {
10623 ctx->cf_info.exec_potentially_empty_break = false;
10624 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10625 }
10626 /* uniform control flow never has an empty exec-mask */
10627 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10628 ctx->cf_info.exec_potentially_empty_discard = false;
10629 ctx->cf_info.exec_potentially_empty_break = false;
10630 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10631 }
10632 ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10633 }
10634
10635 static void
10636 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10637 {
10638 assert(cond.regClass() == s1);
10639
10640 append_logical_end(ctx->block);
10641 ctx->block->kind |= block_kind_uniform;
10642
10643 aco_ptr<Pseudo_branch_instruction> branch;
10644 aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10645 branch.reset(
10646 create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10647 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10648 branch->operands[0] = Operand(cond);
10649 branch->operands[0].setFixed(scc);
10650 ctx->block->instructions.emplace_back(std::move(branch));
10651
10652 ic->BB_if_idx = ctx->block->index;
10653 ic->BB_endif = Block();
10654 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10655
10656 ctx->cf_info.has_branch = false;
10657 ctx->cf_info.parent_loop.has_divergent_branch = false;
10658
10659 ic->had_divergent_discard_old = ctx->cf_info.had_divergent_discard;
10660
10661 /** emit then block */
10662 ctx->program->next_uniform_if_depth++;
10663 Block* BB_then = ctx->program->create_and_insert_block();
10664 add_edge(ic->BB_if_idx, BB_then);
10665 append_logical_start(BB_then);
10666 ctx->block = BB_then;
10667 }
10668
10669 static void
10670 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10671 {
10672 Block* BB_then = ctx->block;
10673
10674 ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10675 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10676
10677 if (!ic->uniform_has_then_branch) {
10678 append_logical_end(BB_then);
10679 /* branch from then block to endif block */
10680 aco_ptr<Pseudo_branch_instruction> branch;
10681 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10682 Format::PSEUDO_BRANCH, 0, 1));
10683 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10684 BB_then->instructions.emplace_back(std::move(branch));
10685 add_linear_edge(BB_then->index, &ic->BB_endif);
10686 if (!ic->then_branch_divergent)
10687 add_logical_edge(BB_then->index, &ic->BB_endif);
10688 BB_then->kind |= block_kind_uniform;
10689 }
10690
10691 ctx->cf_info.has_branch = false;
10692 ctx->cf_info.parent_loop.has_divergent_branch = false;
10693
10694 ic->had_divergent_discard_then = ctx->cf_info.had_divergent_discard;
10695 ctx->cf_info.had_divergent_discard = ic->had_divergent_discard_old;
10696
10697 /** emit else block */
10698 Block* BB_else = ctx->program->create_and_insert_block();
10699 add_edge(ic->BB_if_idx, BB_else);
10700 append_logical_start(BB_else);
10701 ctx->block = BB_else;
10702 }
10703
10704 static void
10705 end_uniform_if(isel_context* ctx, if_context* ic)
10706 {
10707 Block* BB_else = ctx->block;
10708
10709 if (!ctx->cf_info.has_branch) {
10710 append_logical_end(BB_else);
10711       /* branch from else block to endif block */
10712 aco_ptr<Pseudo_branch_instruction> branch;
10713 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10714 Format::PSEUDO_BRANCH, 0, 1));
10715 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10716 BB_else->instructions.emplace_back(std::move(branch));
10717 add_linear_edge(BB_else->index, &ic->BB_endif);
10718 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10719 add_logical_edge(BB_else->index, &ic->BB_endif);
10720 BB_else->kind |= block_kind_uniform;
10721 }
10722
10723 ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10724 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10725 ctx->cf_info.had_divergent_discard |= ic->had_divergent_discard_then;
10726
10727 /** emit endif merge block */
10728 ctx->program->next_uniform_if_depth--;
10729 if (!ctx->cf_info.has_branch) {
10730 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10731 append_logical_start(ctx->block);
10732 }
10733 }
10734
10735 static bool
10736 visit_if(isel_context* ctx, nir_if* if_stmt)
10737 {
10738 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10739 Builder bld(ctx->program, ctx->block);
10740 aco_ptr<Pseudo_branch_instruction> branch;
10741 if_context ic;
10742
10743 if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10744 /**
10745 * Uniform conditionals are represented in the following way*) :
10746 *
10747 * The linear and logical CFG:
10748 * BB_IF
10749 * / \
10750 * BB_THEN (logical) BB_ELSE (logical)
10751 * \ /
10752 * BB_ENDIF
10753 *
10754 * *) Exceptions may be due to break and continue statements within loops
10755 * If a break/continue happens within uniform control flow, it branches
10756 * to the loop exit/entry block. Otherwise, it branches to the next
10757 * merge block.
10758 **/
10759
10760 assert(cond.regClass() == ctx->program->lane_mask);
10761 cond = bool_to_scalar_condition(ctx, cond);
10762
10763 begin_uniform_if_then(ctx, &ic, cond);
10764 visit_cf_list(ctx, &if_stmt->then_list);
10765
10766 begin_uniform_if_else(ctx, &ic);
10767 visit_cf_list(ctx, &if_stmt->else_list);
10768
10769 end_uniform_if(ctx, &ic);
10770 } else { /* non-uniform condition */
10771 /**
10772 * To maintain a logical and linear CFG without critical edges,
10773 * non-uniform conditionals are represented in the following way*) :
10774 *
10775 * The linear CFG:
10776 * BB_IF
10777 * / \
10778 * BB_THEN (logical) BB_THEN (linear)
10779 * \ /
10780 * BB_INVERT (linear)
10781 * / \
10782 * BB_ELSE (logical) BB_ELSE (linear)
10783 * \ /
10784 * BB_ENDIF
10785 *
10786 * The logical CFG:
10787 * BB_IF
10788 * / \
10789 * BB_THEN (logical) BB_ELSE (logical)
10790 * \ /
10791 * BB_ENDIF
10792 *
10793 * *) Exceptions may be due to break and continue statements within loops
10794 **/
10795
10796 begin_divergent_if_then(ctx, &ic, cond, if_stmt->control);
10797 visit_cf_list(ctx, &if_stmt->then_list);
10798
10799 begin_divergent_if_else(ctx, &ic, if_stmt->control);
10800 visit_cf_list(ctx, &if_stmt->else_list);
10801
10802 end_divergent_if(ctx, &ic);
10803 }
10804
10805 return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10806 }
10807
10808 static bool
10809 visit_cf_list(isel_context* ctx, struct exec_list* list)
10810 {
10811 foreach_list_typed (nir_cf_node, node, node, list) {
10812 switch (node->type) {
10813 case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10814 case nir_cf_node_if:
10815 if (!visit_if(ctx, nir_cf_node_as_if(node)))
10816 return true;
10817 break;
10818 case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10819 default: unreachable("unimplemented cf list type");
10820 }
10821 }
10822 return false;
10823 }
10824
10825 static void
10826 export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt)
10827 {
10828 Builder bld(ctx->program, ctx->block);
10829
10830 bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3],
10831 mrt->enabled_channels, mrt->target, mrt->compr);
10832
10833 ctx->program->has_color_exports = true;
10834 }
10835
10836 static bool
10837 export_fs_mrt_color(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
10838 unsigned slot, struct aco_export_mrt* mrt)
10839 {
10840 unsigned col_format = (info->spi_shader_col_format >> (slot * 4)) & 0xf;
10841
10842 if (col_format == V_028714_SPI_SHADER_ZERO)
10843 return false;
10844
10845 Builder bld(ctx->program, ctx->block);
10846 Operand values[4];
10847
10848 for (unsigned i = 0; i < 4; ++i) {
10849 values[i] = Operand(colors[i]);
10850 }
10851
10852 unsigned enabled_channels = 0;
10853 aco_opcode compr_op = aco_opcode::num_opcodes;
10854 bool compr = false;
10855 bool is_16bit = colors[0].regClass() == v2b;
10856 bool is_int8 = (info->color_is_int8 >> slot) & 1;
10857 bool is_int10 = (info->color_is_int10 >> slot) & 1;
10858 bool enable_mrt_output_nan_fixup = (ctx->options->enable_mrt_output_nan_fixup >> slot) & 1;
10859
10860 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10861 if (enable_mrt_output_nan_fixup && !is_16bit &&
10862 (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10863 col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10864 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10865 for (unsigned i = 0; i < 4; i++) {
10866 Temp is_not_nan =
10867 bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]);
10868 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i],
10869 is_not_nan);
10870 }
10871 }
10872
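         /* The 32_* formats export raw 32-bit channels; the 16-bit formats pack two
          * components per dword with v_cvt_pk* and use the compressed (COMPR) export
          * path, which GFX11 later replaces with enabled_channels = 0x3. */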
10873 switch (col_format) {
10874 case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10875
10876 case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10877
10878 case V_028714_SPI_SHADER_32_AR:
10879 if (ctx->options->gfx_level >= GFX10) {
10880 /* Special case: on GFX10, the outputs are different for 32_AR */
10881 enabled_channels = 0x3;
10882 values[1] = values[3];
10883 values[3] = Operand(v1);
10884 } else {
10885 enabled_channels = 0x9;
10886 }
10887 break;
10888
10889 case V_028714_SPI_SHADER_FP16_ABGR:
10890 for (int i = 0; i < 2; i++) {
10891 if (is_16bit) {
10892 values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), values[i * 2],
10893 values[i * 2 + 1]);
10894 } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) {
10895 values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), values[i * 2],
10896 values[i * 2 + 1]);
10897 } else {
10898 values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), values[i * 2],
10899 values[i * 2 + 1]);
10900 }
10901 }
10902 values[2] = Operand(v1);
10903 values[3] = Operand(v1);
10904 enabled_channels = 0xf;
10905 compr = true;
10906 break;
10907
10908 case V_028714_SPI_SHADER_UNORM16_ABGR:
10909 if (is_16bit && ctx->options->gfx_level >= GFX9) {
10910 compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10911 } else {
10912 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10913 }
10914 break;
10915
10916 case V_028714_SPI_SHADER_SNORM16_ABGR:
10917 if (is_16bit && ctx->options->gfx_level >= GFX9) {
10918 compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10919 } else {
10920 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10921 }
10922 break;
10923
10924 case V_028714_SPI_SHADER_UINT16_ABGR:
10925 compr_op = aco_opcode::v_cvt_pk_u16_u32;
10926 if (is_int8 || is_int10) {
10927 /* clamp */
10928 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10929
10930 for (unsigned i = 0; i < 4; i++) {
10931 uint32_t max = i == 3 && is_int10 ? 3 : max_rgb;
10932
10933 values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]);
10934 }
10935 } else if (is_16bit) {
10936 for (unsigned i = 0; i < 4; i++) {
10937 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10938 values[i] = Operand(tmp);
10939 }
10940 }
10941 break;
10942
10943 case V_028714_SPI_SHADER_SINT16_ABGR:
10944 compr_op = aco_opcode::v_cvt_pk_i16_i32;
10945 if (is_int8 || is_int10) {
10946 /* clamp */
10947 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10948 uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
10949
10950 for (unsigned i = 0; i < 4; i++) {
10951 uint32_t max = i == 3 && is_int10 ? 1 : max_rgb;
10952 uint32_t min = i == 3 && is_int10 ? -2u : min_rgb;
10953
10954 values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]);
10955 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]);
10956 }
10957 } else if (is_16bit) {
10958 for (unsigned i = 0; i < 4; i++) {
10959 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10960 values[i] = Operand(tmp);
10961 }
10962 }
10963 break;
10964
10965 case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
10966
10967 case V_028714_SPI_SHADER_ZERO:
10968 default: return false;
10969 }
10970
10971 if (compr_op != aco_opcode::num_opcodes) {
10972 values[0] = bld.vop3(compr_op, bld.def(v1), values[0], values[1]);
10973 values[1] = bld.vop3(compr_op, bld.def(v1), values[2], values[3]);
10974 values[2] = Operand(v1);
10975 values[3] = Operand(v1);
10976 enabled_channels = 0xf;
10977 compr = true;
10978 } else if (!compr) {
10979 for (int i = 0; i < 4; i++)
10980 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10981 }
10982
10983 if (ctx->program->gfx_level >= GFX11) {
10984 /* GFX11 doesn't use COMPR for exports, but the channel mask should be
10985 * 0x3 instead.
10986 */
10987 enabled_channels = compr ? 0x3 : enabled_channels;
10988 compr = false;
10989 }
10990
10991 for (unsigned i = 0; i < 4; i++)
10992 mrt->out[i] = values[i];
10993 mrt->target = V_008DFC_SQ_EXP_MRT;
10994 mrt->enabled_channels = enabled_channels;
10995 mrt->compr = compr;
10996
10997 return true;
10998 }
10999
11000 static void
11001 export_fs_mrtz(isel_context* ctx, Temp depth, Temp stencil, Temp samplemask, Temp alpha)
11002 {
11003 Builder bld(ctx->program, ctx->block);
11004 unsigned enabled_channels = 0;
11005 bool compr = false;
11006 Operand values[4];
11007
11008 for (unsigned i = 0; i < 4; ++i) {
11009 values[i] = Operand(v1);
11010 }
11011
11012    /* Both stencil and sample mask only need 16 bits. */
11013 if (!depth.id() && !alpha.id() && (stencil.id() || samplemask.id())) {
11014 compr = ctx->program->gfx_level < GFX11; /* COMPR flag */
11015
11016 if (stencil.id()) {
11017 /* Stencil should be in X[23:16]. */
11018 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), stencil);
11019 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3;
11020 }
11021
11022 if (samplemask.id()) {
11023 /* SampleMask should be in Y[15:0]. */
11024 values[1] = Operand(samplemask);
11025 enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc;
11026 }
11027 } else {
11028 if (depth.id()) {
11029 values[0] = Operand(depth);
11030 enabled_channels |= 0x1;
11031 }
11032
11033 if (stencil.id()) {
11034 values[1] = Operand(stencil);
11035 enabled_channels |= 0x2;
11036 }
11037
11038 if (samplemask.id()) {
11039 values[2] = Operand(samplemask);
11040 enabled_channels |= 0x4;
11041 }
11042
11043 if (alpha.id()) {
11044 assert(ctx->program->gfx_level >= GFX11);
11045 values[3] = Operand(alpha);
11046 enabled_channels |= 0x8;
11047 }
11048 }
11049
11050    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
11051 * writemask component.
11052 */
11053 if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND &&
11054 ctx->options->family != CHIP_HAINAN) {
11055 enabled_channels |= 0x1;
11056 }
11057
11058 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
11059 V_008DFC_SQ_EXP_MRTZ, compr);
11060 }
11061
11062 static void
11063 create_fs_null_export(isel_context* ctx)
11064 {
11065 /* FS must always have at least one export.
11066 * So when there are none, we add a null export.
11067 */
11068
11069 Builder bld(ctx->program, ctx->block);
11070 /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */
11071 unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
11072 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
11073 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
11074
11075 ctx->program->has_color_exports = true;
11076 }
11077
11078 static void
11079 create_fs_jump_to_epilog(isel_context* ctx)
11080 {
11081 Builder bld(ctx->program, ctx->block);
11082 std::vector<Operand> exports;
11083 unsigned vgpr = 256; /* VGPR 0 */
11084
11085 if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
11086 exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u], PhysReg{vgpr++}));
11087
11088 if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
11089 exports.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u], PhysReg{vgpr++}));
11090
11091 if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11092 exports.emplace_back(
11093 Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u], PhysReg{vgpr++}));
11094
11095 PhysReg exports_start(vgpr);
11096
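/* Each color attachment occupies a block of 4 consecutive VGPRs starting at
 * exports_start; channels missing from the write mask are passed as undefined.
 */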
11097 for (unsigned slot = FRAG_RESULT_DATA0; slot < FRAG_RESULT_DATA7 + 1; ++slot) {
11098 unsigned color_index = slot - FRAG_RESULT_DATA0;
11099 unsigned color_type = (ctx->output_color_types >> (color_index * 2)) & 0x3;
11100 unsigned write_mask = ctx->outputs.mask[slot];
11101
11102 if (!write_mask)
11103 continue;
11104
11105 PhysReg color_start(exports_start.reg() + color_index * 4);
11106
11107 for (unsigned i = 0; i < 4; i++) {
11108 if (!(write_mask & BITFIELD_BIT(i))) {
11109 exports.emplace_back(Operand(v1));
11110 continue;
11111 }
11112
11113 PhysReg chan_reg = color_start.advance(i * 4u);
11114 Operand chan(ctx->outputs.temps[slot * 4u + i]);
11115
11116 if (color_type == ACO_TYPE_FLOAT16) {
11117 chan = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), chan);
11118 } else if (color_type == ACO_TYPE_INT16 || color_type == ACO_TYPE_UINT16) {
11119 bool sign_ext = color_type == ACO_TYPE_INT16;
11120 Temp tmp = convert_int(ctx, bld, chan.getTemp(), 16, 32, sign_ext);
11121 chan = Operand(tmp);
11122 }
11123
11124 chan.setFixed(chan_reg);
11125 exports.emplace_back(chan);
11126 }
11127 }
11128
11129 Temp continue_pc = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.ps.epilog_pc));
11130
11131 aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
11132 aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + exports.size(), 0)};
11133 jump->operands[0] = Operand(continue_pc);
11134 for (unsigned i = 0; i < exports.size(); i++) {
11135 jump->operands[i + 1] = exports[i];
11136 }
11137 ctx->block->instructions.emplace_back(std::move(jump));
11138 }
11139
11140 PhysReg
11141 get_arg_reg(const struct ac_shader_args* args, struct ac_arg arg)
11142 {
11143 assert(arg.used);
11144 enum ac_arg_regfile file = args->args[arg.arg_index].file;
11145 unsigned reg = args->args[arg.arg_index].offset;
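/* In ACO's PhysReg numbering, SGPRs occupy [0, 255] and VGPRs start at 256. */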
11146 return PhysReg(file == AC_ARG_SGPR ? reg : reg + 256);
11147 }
11148
11149 static Operand
11150 get_arg_for_end(isel_context* ctx, struct ac_arg arg)
11151 {
11152 return Operand(get_arg(ctx, arg), get_arg_reg(ctx->args, arg));
11153 }
11154
11155 static Temp
11156 get_tcs_out_current_patch_data_offset(isel_context* ctx)
11157 {
11158 Builder bld(ctx->program, ctx->block);
11159
11160 const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 4u;
11161 const unsigned pervertex_output_patch_size =
11162 ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
11163 const unsigned output_patch_stride =
11164 pervertex_output_patch_size + ctx->program->info.tcs.num_linked_patch_outputs * 4u;
11165
11166 Temp tcs_rel_ids = get_arg(ctx, ctx->args->tcs_rel_ids);
11167 Temp rel_patch_id =
11168 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand::c32(0u), Operand::c32(8u));
11169 Temp patch_offset = bld.v_mul_imm(bld.def(v1), rel_patch_id, output_patch_stride, false);
11170
11171 Temp tcs_offchip_layout = get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout);
11172
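/* Decode tcs_offchip_layout. The s_bfe src1 constants below encode the bit
 * offset in the low bits and the field width in bits [22:16], so this reads
 * patch_control_points from bits [5:0], num_patches from bits [11:6] and the
 * LS/HS vertex stride from bits [19:12].
 */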
11173 Temp patch_control_points = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11174 tcs_offchip_layout, Operand::c32(0x3f));
11175
11176 Temp num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11177 tcs_offchip_layout, Operand::c32(0x60006));
11178
11179 Temp lshs_vertex_stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11180 tcs_offchip_layout, Operand::c32(0x8000c));
11181
11182 Temp input_patch_size =
11183 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), patch_control_points, lshs_vertex_stride);
11184
11185 Temp output_patch0_offset =
11186 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, input_patch_size);
11187
11188 Temp output_patch_offset =
11189 bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11190 Operand::c32(pervertex_output_patch_size), output_patch0_offset);
11191
11192 return bld.nuw().vadd32(bld.def(v1), patch_offset, output_patch_offset);
11193 }
11194
11195 static Temp
11196 get_patch_base(isel_context* ctx)
11197 {
11198 Builder bld(ctx->program, ctx->block);
11199
11200 const unsigned output_vertex_size = ctx->program->info.tcs.num_linked_outputs * 16u;
11201 const unsigned pervertex_output_patch_size =
11202 ctx->program->info.tcs.tcs_vertices_out * output_vertex_size;
11203
11204 Temp num_patches =
11205 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11206 get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout), Operand::c32(0x60006));
11207
11208 return bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches,
11209 Operand::c32(pervertex_output_patch_size));
11210 }
11211
11212 static void
11213 passthrough_all_args(isel_context* ctx, std::vector<Operand>& regs)
11214 {
11215 struct ac_arg arg;
11216 arg.used = true;
11217
11218 for (arg.arg_index = 0; arg.arg_index < ctx->args->arg_count; arg.arg_index++)
11219 regs.emplace_back(get_arg_for_end(ctx, arg));
11220 }
11221
11222 static void
11223 build_end_with_regs(isel_context* ctx, std::vector<Operand>& regs)
11224 {
11225 aco_ptr<Pseudo_instruction> end{create_instruction<Pseudo_instruction>(
11226 aco_opcode::p_end_with_regs, Format::PSEUDO, regs.size(), 0)};
11227
11228 for (unsigned i = 0; i < regs.size(); i++)
11229 end->operands[i] = regs[i];
11230
11231 ctx->block->instructions.emplace_back(std::move(end));
11232
11233 ctx->block->kind |= block_kind_end_with_regs;
11234 }
11235
11236 static void
11237 create_tcs_jump_to_epilog(isel_context* ctx)
11238 {
11239 Builder bld(ctx->program, ctx->block);
11240
11241 PhysReg vgpr_start(256); /* VGPR 0 */
11242 PhysReg sgpr_start(0); /* SGPR 0 */
11243
11244 /* SGPRs */
11245 Operand ring_offsets = Operand(get_arg(ctx, ctx->args->ring_offsets));
11246 ring_offsets.setFixed(sgpr_start);
11247
11248 Operand tess_offchip_offset = Operand(get_arg(ctx, ctx->args->tess_offchip_offset));
11249 tess_offchip_offset.setFixed(sgpr_start.advance(8u));
11250
11251 Operand tcs_factor_offset = Operand(get_arg(ctx, ctx->args->tcs_factor_offset));
11252 tcs_factor_offset.setFixed(sgpr_start.advance(12u));
11253
11254 Operand tcs_offchip_layout = Operand(get_arg(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11255 tcs_offchip_layout.setFixed(sgpr_start.advance(16u));
11256
11257 Operand patch_base = Operand(get_patch_base(ctx));
11258 patch_base.setFixed(sgpr_start.advance(20u));
11259
11260 /* VGPRs */
11261 Operand tcs_out_current_patch_data_offset = Operand(get_tcs_out_current_patch_data_offset(ctx));
11262 tcs_out_current_patch_data_offset.setFixed(vgpr_start);
11263
11264 Operand invocation_id =
11265 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11266 Operand::c32(8u), Operand::c32(5u));
11267 invocation_id.setFixed(vgpr_start.advance(4u));
11268
11269 Operand rel_patch_id =
11270 bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11271 Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11272 rel_patch_id.setFixed(vgpr_start.advance(8u));
11273
11274 Temp continue_pc =
11275 convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.tcs.epilog_pc));
11276
11277 aco_ptr<Pseudo_instruction> jump{
11278 create_instruction<Pseudo_instruction>(aco_opcode::p_jump_to_epilog, Format::PSEUDO, 9, 0)};
11279 jump->operands[0] = Operand(continue_pc);
11280 jump->operands[1] = ring_offsets;
11281 jump->operands[2] = tess_offchip_offset;
11282 jump->operands[3] = tcs_factor_offset;
11283 jump->operands[4] = tcs_offchip_layout;
11284 jump->operands[5] = patch_base;
11285 jump->operands[6] = tcs_out_current_patch_data_offset;
11286 jump->operands[7] = invocation_id;
11287 jump->operands[8] = rel_patch_id;
11288 ctx->block->instructions.emplace_back(std::move(jump));
11289 }
11290
11291 static void
11292 create_tcs_end_for_epilog(isel_context* ctx)
11293 {
11294 std::vector<Operand> regs;
11295
11296 regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tcs_offchip_layout));
11297 regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.tcs.tes_offchip_addr));
11298 regs.emplace_back(get_arg_for_end(ctx, ctx->args->tess_offchip_offset));
11299 regs.emplace_back(get_arg_for_end(ctx, ctx->args->tcs_factor_offset));
11300
11301 Builder bld(ctx->program, ctx->block);
11302
11303 /* Leave a hole corresponding to the two input VGPRs. This ensures that
11304 * the invocation_id output does not alias the tcs_rel_ids input,
11305 * which saves a V_MOV on gfx9.
11306 */
11307 unsigned vgpr = 256 + ctx->args->num_vgprs_used;
11308
11309 Temp rel_patch_id =
11310 bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11311 Operand::c32(0u), Operand::c32(8u), Operand::c32(0u));
11312 regs.emplace_back(Operand(rel_patch_id, PhysReg{vgpr++}));
11313
11314 Temp invocation_id =
11315 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11316 Operand::c32(8u), Operand::c32(5u));
11317 regs.emplace_back(Operand(invocation_id, PhysReg{vgpr++}));
11318
11319 if (ctx->program->info.tcs.pass_tessfactors_by_reg) {
11320 vgpr++; /* skip the tess factor LDS offset */
11321
11322 unsigned slot = VARYING_SLOT_TESS_LEVEL_OUTER;
11323 u_foreach_bit (i, ctx->outputs.mask[slot]) {
11324 regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11325 }
11326 vgpr += 4;
11327
11328 slot = VARYING_SLOT_TESS_LEVEL_INNER;
11329 u_foreach_bit (i, ctx->outputs.mask[slot]) {
11330 regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11331 }
11332 } else {
11333 Temp patch0_patch_data_offset =
11334 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11335 get_arg(ctx, ctx->program->info.tcs.vs_state_bits), Operand::c32(0xe000a));
11336
11337 Temp tf_lds_offset =
11338 bld.v_mul24_imm(bld.def(v1), rel_patch_id, ctx->program->info.tcs.patch_stride);
11339 tf_lds_offset = bld.nuw().vadd32(bld.def(v1), tf_lds_offset, patch0_patch_data_offset);
11340
11341 regs.emplace_back(Operand(tf_lds_offset, PhysReg{vgpr}));
11342 }
11343
11344 build_end_with_regs(ctx, regs);
11345 }
11346
11347 static void
11348 create_fs_end_for_epilog(isel_context* ctx)
11349 {
11350 Builder bld(ctx->program, ctx->block);
11351
11352 std::vector<Operand> regs;
11353
11354 regs.emplace_back(get_arg_for_end(ctx, ctx->program->info.ps.alpha_reference));
11355
11356 unsigned vgpr = 256;
11357
11358 for (unsigned slot = FRAG_RESULT_DATA0; slot <= FRAG_RESULT_DATA7; slot++) {
11359 unsigned index = slot - FRAG_RESULT_DATA0;
11360 unsigned type = (ctx->output_color_types >> (index * 2)) & 0x3;
11361 unsigned write_mask = ctx->outputs.mask[slot];
11362
11363 if (!write_mask)
11364 continue;
11365
11366 if (type == ACO_TYPE_ANY32) {
11367 u_foreach_bit (i, write_mask) {
11368 regs.emplace_back(Operand(ctx->outputs.temps[slot * 4 + i], PhysReg{vgpr + i}));
11369 }
11370 } else {
11371 for (unsigned i = 0; i < 2; i++) {
11372 unsigned mask = (write_mask >> (i * 2)) & 0x3;
11373 if (!mask)
11374 continue;
11375
11376 unsigned chan = slot * 4 + i * 2;
11377 Operand lo = mask & 0x1 ? Operand(ctx->outputs.temps[chan]) : Operand(v2b);
11378 Operand hi = mask & 0x2 ? Operand(ctx->outputs.temps[chan + 1]) : Operand(v2b);
11379
11380 Temp dst = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi);
11381 regs.emplace_back(Operand(dst, PhysReg{vgpr + i}));
11382 }
11383 }
11384 vgpr += 4;
11385 }
11386
11387 if (ctx->outputs.mask[FRAG_RESULT_DEPTH])
11388 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4], PhysReg{vgpr++}));
11389
11390 if (ctx->outputs.mask[FRAG_RESULT_STENCIL])
11391 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4], PhysReg{vgpr++}));
11392
11393 if (ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11394 regs.emplace_back(Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4], PhysReg{vgpr++}));
11395
11396 build_end_with_regs(ctx, regs);
11397
11398 /* Finally, make sure WQM mode is exited. */
11399 ctx->program->needs_exact = true;
11400 }
11401
11402 Pseudo_instruction*
11403 add_startpgm(struct isel_context* ctx)
11404 {
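/* Count startpgm definitions. SGPR arguments whose register offset is not
 * aligned to their size cannot be claimed as a single multi-dword definition,
 * so they contribute one s1 definition per dword and are re-assembled below.
 */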
11405 unsigned def_count = 0;
11406 for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11407 if (ctx->args->args[i].skip)
11408 continue;
11409 unsigned align = MIN2(4, util_next_power_of_two(ctx->args->args[i].size));
11410 if (ctx->args->args[i].file == AC_ARG_SGPR && ctx->args->args[i].offset % align)
11411 def_count += ctx->args->args[i].size;
11412 else
11413 def_count++;
11414 }
11415
11416 Pseudo_instruction* startpgm =
11417 create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, def_count);
11418 ctx->block->instructions.emplace_back(startpgm);
11419 for (unsigned i = 0, arg = 0; i < ctx->args->arg_count; i++) {
11420 if (ctx->args->args[i].skip)
11421 continue;
11422
11423 enum ac_arg_regfile file = ctx->args->args[i].file;
11424 unsigned size = ctx->args->args[i].size;
11425 unsigned reg = ctx->args->args[i].offset;
11426 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11427
11428 if (file == AC_ARG_SGPR && reg % MIN2(4, util_next_power_of_two(size))) {
11429 Temp elems[16];
11430 for (unsigned j = 0; j < size; j++) {
11431 elems[j] = ctx->program->allocateTmp(s1);
11432 startpgm->definitions[arg++] = Definition(elems[j].id(), PhysReg{reg + j}, s1);
11433 }
11434 ctx->arg_temps[i] = create_vec_from_array(ctx, elems, size, RegType::sgpr, 4);
11435 } else {
11436 Temp dst = ctx->program->allocateTmp(type);
11437 Definition def(dst);
11438 def.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11439 ctx->arg_temps[i] = dst;
11440 startpgm->definitions[arg++] = def;
11441
11442 if (ctx->args->args[i].pending_vmem) {
11443 assert(file == AC_ARG_VGPR);
11444 ctx->program->args_pending_vmem.push_back(def);
11445 }
11446 }
11447 }
11448
11449 /* epilog has no scratch */
11450 if (ctx->args->scratch_offset.used) {
11451 if (ctx->program->gfx_level < GFX9) {
11452 /* Stash these in the program so that they can be accessed later when
11453 * handling spilling.
11454 */
11455 if (ctx->args->ring_offsets.used)
11456 ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11457
11458 ctx->program->scratch_offset = get_arg(ctx, ctx->args->scratch_offset);
11459 } else if (ctx->program->gfx_level <= GFX10_3 && ctx->program->stage != raytracing_cs) {
11460 /* Manually initialize scratch. For RT stages, scratch initialization is done in the prolog.
11461 */
11462 Operand scratch_offset = Operand(get_arg(ctx, ctx->args->scratch_offset));
11463 scratch_offset.setLateKill(true);
11464
11465 Operand scratch_addr = ctx->args->ring_offsets.used
11466 ? Operand(get_arg(ctx, ctx->args->ring_offsets))
11467 : Operand(s2);
11468
11469 Builder bld(ctx->program, ctx->block);
11470 bld.pseudo(aco_opcode::p_init_scratch, bld.def(s2), bld.def(s1, scc), scratch_addr,
11471 scratch_offset);
11472 }
11473 }
11474
11475 return startpgm;
11476 }
11477
11478 void
11479 fix_ls_vgpr_init_bug(isel_context* ctx)
11480 {
11481 Builder bld(ctx->program, ctx->block);
11482 constexpr unsigned hs_idx = 1u;
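/* merged_wave_info packs an 8-bit lane count per merged stage; the s_bfe
 * below extracts the HS count from bits [15:8] (width 8, offset hs_idx * 8).
 */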
11483 Builder::Result hs_thread_count =
11484 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11485 get_arg(ctx, ctx->args->merged_wave_info), Operand::c32((8u << 16) | (hs_idx * 8u)));
11486 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11487
11488 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11489
11490 Temp instance_id =
11491 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->vertex_id),
11492 get_arg(ctx, ctx->args->instance_id), ls_has_nonzero_hs_threads);
11493 Temp vs_rel_patch_id =
11494 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_rel_ids),
11495 get_arg(ctx, ctx->args->vs_rel_patch_id), ls_has_nonzero_hs_threads);
11496 Temp vertex_id =
11497 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->tcs_patch_id),
11498 get_arg(ctx, ctx->args->vertex_id), ls_has_nonzero_hs_threads);
11499
11500 ctx->arg_temps[ctx->args->instance_id.arg_index] = instance_id;
11501 ctx->arg_temps[ctx->args->vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11502 ctx->arg_temps[ctx->args->vertex_id.arg_index] = vertex_id;
11503 }
11504
11505 void
11506 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11507 {
11508 /* Split all arguments except for the first (ring_offsets) and the last
11509 * (exec) so that the dead channels don't stay live throughout the program.
11510 */
11511 for (int i = 1; i < startpgm->definitions.size(); i++) {
11512 if (startpgm->definitions[i].regClass().size() > 1) {
11513 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11514 startpgm->definitions[i].regClass().size());
11515 }
11516 }
11517 }
11518
11519 void
11520 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11521 {
11522 Program* program = ctx->program;
11523
11524 unsigned float_controls = shader->info.float_controls_execution_mode;
11525
11526 program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11527 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11528 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11529 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11530 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11531
11532 program->next_fp_mode.must_flush_denorms32 =
11533 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11534 program->next_fp_mode.must_flush_denorms16_64 =
11535 float_controls &
11536 (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11537
11538 program->next_fp_mode.care_about_round32 =
11539 float_controls &
11540 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11541
11542 program->next_fp_mode.care_about_round16_64 =
11543 float_controls &
11544 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11545 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11546
11547 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11548 * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11549 if (program->next_fp_mode.must_flush_denorms16_64)
11550 program->next_fp_mode.denorm16_64 = 0;
11551 else
11552 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11553
11554 /* preserving fp32 denorms is expensive, so only do it if asked */
11555 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11556 program->next_fp_mode.denorm32 = fp_denorm_keep;
11557 else
11558 program->next_fp_mode.denorm32 = 0;
11559
11560 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11561 program->next_fp_mode.round32 = fp_round_tz;
11562 else
11563 program->next_fp_mode.round32 = fp_round_ne;
11564
11565 if (float_controls &
11566 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11567 program->next_fp_mode.round16_64 = fp_round_tz;
11568 else
11569 program->next_fp_mode.round16_64 = fp_round_ne;
11570
11571 ctx->block->fp_mode = program->next_fp_mode;
11572 }
11573
11574 void
11575 cleanup_cfg(Program* program)
11576 {
11577 /* create linear_succs/logical_succs */
11578 for (Block& BB : program->blocks) {
11579 for (unsigned idx : BB.linear_preds)
11580 program->blocks[idx].linear_succs.emplace_back(BB.index);
11581 for (unsigned idx : BB.logical_preds)
11582 program->blocks[idx].logical_succs.emplace_back(BB.index);
11583 }
11584 }
11585
11586 void
11587 finish_program(isel_context* ctx)
11588 {
11589 cleanup_cfg(ctx->program);
11590
11591 /* Insert a single p_end_wqm instruction after the last derivative calculation */
11592 if (ctx->program->stage == fragment_fs && ctx->program->needs_wqm && ctx->program->needs_exact) {
11593 /* Find the next top-level block in the CFG */
11594 while (!(ctx->program->blocks[ctx->wqm_block_idx].kind & block_kind_top_level)) {
11595 ctx->wqm_block_idx++;
11596 ctx->wqm_instruction_idx = 0;
11597 }
11598
11599 std::vector<aco_ptr<Instruction>>* instrs =
11600 &ctx->program->blocks[ctx->wqm_block_idx].instructions;
11601 auto it = instrs->begin() + ctx->wqm_instruction_idx;
11602
11603 /* Delay the transition to Exact to help optimizations and scheduling */
11604 while (it != instrs->end()) {
11605 aco_ptr<Instruction>& instr = *it;
11606 /* End WQM before: */
11607 if (instr->isVMEM() || instr->isFlatLike() || instr->isDS() || instr->isEXP() ||
11608 instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
11609 instr->opcode == aco_opcode::p_jump_to_epilog ||
11610 instr->opcode == aco_opcode::p_logical_start)
11611 break;
11612
11613 ++it;
11614
11615 /* End WQM after: */
11616 if (instr->opcode == aco_opcode::p_logical_end ||
11617 instr->opcode == aco_opcode::p_discard_if ||
11618 instr->opcode == aco_opcode::p_demote_to_helper ||
11619 instr->opcode == aco_opcode::p_end_with_regs)
11620 break;
11621 }
11622
11623 Builder bld(ctx->program);
11624 bld.reset(instrs, it);
11625 bld.pseudo(aco_opcode::p_end_wqm);
11626 }
11627 }
11628
11629 Temp
11630 lanecount_to_mask(isel_context* ctx, Temp count)
11631 {
11632 assert(count.regClass() == s1);
11633
11634 Builder bld(ctx->program, ctx->block);
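/* s_bfm_b64 creates a bitfield mask: here, a mask with the low `count` bits set. */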
11635 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11636 Temp cond;
11637
11638 if (ctx->program->wave_size == 64) {
11639 /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11640 Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11641 Operand::c32(6u /* log2(64) */));
11642 cond =
11643 bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11644 } else {
11645 /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
11646 * the register */
11647 cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11648 }
11649
11650 return cond;
11651 }
11652
11653 Temp
11654 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11655 {
11656 Builder bld(ctx->program, ctx->block);
11657
11658 /* lanecount_to_mask() only cares about s0.u[6:0], so we need neither s_bfe nor s_and here */
11659 Temp count = i == 0 ? get_arg(ctx, ctx->args->merged_wave_info)
11660 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11661 get_arg(ctx, ctx->args->merged_wave_info), Operand::c32(i * 8u));
11662
11663 return lanecount_to_mask(ctx, count);
11664 }
11665
11666 static void
11667 insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
11668 {
11669 unsigned src_count = ctx.args->arg_count;
11670 Pseudo_instruction* ret =
11671 create_instruction<Pseudo_instruction>(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
11672 ctx.block->instructions.emplace_back(ret);
11673
11674 for (unsigned i = 0; i < src_count; i++) {
11675 enum ac_arg_regfile file = ctx.args->args[i].file;
11676 unsigned size = ctx.args->args[i].size;
11677 unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
11678 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11679 Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
11680 : Operand(PhysReg{reg}, type);
11681 ret->operands[i] = op;
11682 }
11683
11684 Builder bld(ctx.program, ctx.block);
11685 bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
11686 }
11687
11688 void
11689 select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
11690 const struct ac_shader_args* args)
11691 {
11692 for (unsigned i = 0; i < shader_count; i++) {
11693 if (i) {
11694 ctx.block = ctx.program->create_and_insert_block();
11695 ctx.block->kind = block_kind_top_level | block_kind_resume;
11696 }
11697
11698 nir_shader* nir = shaders[i];
11699 init_context(&ctx, nir);
11700 setup_fp_mode(&ctx, nir);
11701
11702 Pseudo_instruction* startpgm = add_startpgm(&ctx);
11703 append_logical_start(ctx.block);
11704 split_arguments(&ctx, startpgm);
11705 visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
11706 append_logical_end(ctx.block);
11707 ctx.block->kind |= block_kind_uniform;
11708
11709 /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
11710 * shader without shader calls.
11711 */
11712 if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
11713 insert_rt_jump_next(ctx, args);
11714
11715 cleanup_context(&ctx);
11716 }
11717
11718 ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
11719 finish_program(&ctx);
11720 }
11721
11722 void
11723 pops_await_overlapped_waves(isel_context* ctx)
11724 {
11725 ctx->program->has_pops_overlapped_waves_wait = true;
11726
11727 Builder bld(ctx->program, ctx->block);
11728
11729 if (ctx->program->gfx_level >= GFX11) {
11730 /* GFX11+ - waiting for the export from the overlapped waves.
11731 * Await the export_ready event (bit wait_event_imm_dont_wait_export_ready clear).
11732 */
11733 bld.sopp(aco_opcode::s_wait_event, -1, 0);
11734 return;
11735 }
11736
11737 /* Pre-GFX11 - sleep loop polling the exiting wave ID. */
11738
11739 const Temp collision = get_arg(ctx, ctx->args->pops_collision_wave_id);
11740
11741 /* Check if there's an overlap in the current wave - otherwise, the wait may result in a hang. */
11742 const Temp did_overlap =
11743 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), collision, Operand::c32(31));
11744 if_context did_overlap_if_context;
11745 begin_uniform_if_then(ctx, &did_overlap_if_context, did_overlap);
11746 bld.reset(ctx->block);
11747
11748 /* Set the packer register - after this, pops_exiting_wave_id can be polled. */
11749 if (ctx->program->gfx_level >= GFX10) {
11750 /* 2 packer ID bits on GFX10-10.3. */
11751 const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11752 collision, Operand::c32(0x2001c));
11753 /* POPS_PACKER register: bit 0 - POPS enabled for this wave, bits 2:1 - packer ID. */
11754 const Temp packer_id_hwreg_bits = bld.sop2(aco_opcode::s_lshl1_add_u32, bld.def(s1),
11755 bld.def(s1, scc), packer_id, Operand::c32(1));
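/* s_setreg_b32 simm16 = ((size - 1) << 11) | (offset << 6) | hwreg_id;
 * this writes 3 bits at offset 0 of hwreg 25 (POPS_PACKER).
 */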
11756 bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((3 - 1) << 11) | 25);
11757 } else {
11758 /* 1 packer ID bit on GFX9. */
11759 const Temp packer_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11760 collision, Operand::c32(0x1001c));
11761 /* MODE register: bit 24 - wave is associated with packer 0, bit 25 - with packer 1.
11762 * Packer index to packer bits: 0 to 0b01, 1 to 0b10.
11763 */
11764 const Temp packer_id_hwreg_bits =
11765 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), packer_id, Operand::c32(1));
11766 bld.sopk(aco_opcode::s_setreg_b32, packer_id_hwreg_bits, ((2 - 1) << 11) | (24 << 6) | 1);
11767 }
11768
11769 Temp newest_overlapped_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11770 collision, Operand::c32(0xa0010));
11771 if (ctx->program->gfx_level < GFX10) {
11772 /* On GFX9, the newest overlapped wave ID value passed to the shader is smaller than the
11773 * actual wave ID by 1 in case of wraparound.
11774 */
11775 const Temp current_wave_id = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
11776 collision, Operand::c32(0x3ff));
11777 const Temp newest_overlapped_wave_id_wrapped = bld.sopc(
11778 aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), newest_overlapped_wave_id, current_wave_id);
11779 newest_overlapped_wave_id =
11780 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), newest_overlapped_wave_id,
11781 newest_overlapped_wave_id_wrapped);
11782 }
11783
11784 /* The wave IDs are the low 10 bits of a monotonically increasing wave counter.
11785 * The overlapped and the exiting wave IDs can't be larger than the current wave ID, and they are
11786 * no more than 1023 values behind the current wave ID.
11787 * Remap the overlapped and the exiting wave IDs from wrapping to monotonic so an unsigned
11788 * comparison can be used: the wave `current - 1023` becomes 0, it's followed by a piece growing
11789 * away from 0, then a piece increasing until UINT32_MAX, and the current wave is UINT32_MAX.
11790 * To do that, subtract `current - 1023`, which with wrapping arithmetic is (current + 1), and
11791 * `a - (b + 1)` is `a + ~b`.
11792 * Note that if the 10-bit current wave ID is 1023 (thus 1024 will be subtracted), the wave
11793 * `current - 1023` will become `UINT32_MAX - 1023` rather than 0, but all the possible wave IDs
11794 * will still grow monotonically in the 32-bit value, and the unsigned comparison will behave as
11795 * expected.
11796 */
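/* Illustrative example (arbitrary values): if the 10-bit current wave ID is
 * 100, wave_id_offset below is ~100, so an overlapped wave ID of 98 remaps to
 * 98 + ~100 = 98 - 101 = UINT32_MAX - 2, the current wave remaps to
 * UINT32_MAX, and the oldest possible wave (current - 1023, i.e. the 10-bit
 * value 101) remaps to 0.
 */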
11797 const Temp wave_id_offset = bld.sop2(aco_opcode::s_nand_b32, bld.def(s1), bld.def(s1, scc),
11798 collision, Operand::c32(0x3ff));
11799 newest_overlapped_wave_id = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11800 newest_overlapped_wave_id, wave_id_offset);
11801
11802 /* Await the overlapped waves. */
11803
11804 loop_context wait_loop_context;
11805 begin_loop(ctx, &wait_loop_context);
11806 bld.reset(ctx->block);
11807
11808 const Temp exiting_wave_id = bld.pseudo(aco_opcode::p_pops_gfx9_add_exiting_wave_id, bld.def(s1),
11809 bld.def(s1, scc), wave_id_offset);
11810 /* If the exiting (not exited) wave ID is larger than the newest overlapped wave ID (after
11811 * remapping both to monotonically increasing unsigned integers), the newest overlapped wave has
11812 * exited the ordered section.
11813 */
11814 const Temp newest_overlapped_wave_exited = bld.sopc(aco_opcode::s_cmp_lt_u32, bld.def(s1, scc),
11815 newest_overlapped_wave_id, exiting_wave_id);
11816 if_context newest_overlapped_wave_exited_if_context;
11817 begin_uniform_if_then(ctx, &newest_overlapped_wave_exited_if_context,
11818 newest_overlapped_wave_exited);
11819 emit_loop_break(ctx);
11820 begin_uniform_if_else(ctx, &newest_overlapped_wave_exited_if_context);
11821 end_uniform_if(ctx, &newest_overlapped_wave_exited_if_context);
11822 bld.reset(ctx->block);
11823
11824 /* Sleep before rechecking to let overlapped waves run for some time. */
11825 bld.sopp(aco_opcode::s_sleep, -1, ctx->program->gfx_level >= GFX10 ? UINT16_MAX : 3);
11826
11827 end_loop(ctx, &wait_loop_context);
11828 bld.reset(ctx->block);
11829
11830 /* Indicate the wait has been done to subsequent compilation stages. */
11831 bld.pseudo(aco_opcode::p_pops_gfx9_overlapped_wave_wait_done);
11832
11833 begin_uniform_if_else(ctx, &did_overlap_if_context);
11834 end_uniform_if(ctx, &did_overlap_if_context);
11835 bld.reset(ctx->block);
11836 }
11837
11838 static void
11839 create_merged_jump_to_epilog(isel_context* ctx)
11840 {
11841 Builder bld(ctx->program, ctx->block);
11842 std::vector<Operand> regs;
11843
11844 for (unsigned i = 0; i < ctx->args->arg_count; i++) {
11845 if (!ctx->args->args[i].preserved)
11846 continue;
11847
11848 const enum ac_arg_regfile file = ctx->args->args[i].file;
11849 const unsigned reg = ctx->args->args[i].offset;
11850
11851 Operand op(ctx->arg_temps[i]);
11852 op.setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11853 regs.emplace_back(op);
11854 }
11855
11856 Temp continue_pc =
11857 convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->program->info.next_stage_pc));
11858
11859 aco_ptr<Pseudo_instruction> jump{create_instruction<Pseudo_instruction>(
11860 aco_opcode::p_jump_to_epilog, Format::PSEUDO, 1 + regs.size(), 0)};
11861 jump->operands[0] = Operand(continue_pc);
11862 for (unsigned i = 0; i < regs.size(); i++) {
11863 jump->operands[i + 1] = regs[i];
11864 }
11865 ctx->block->instructions.emplace_back(std::move(jump));
11866 }
11867
11868 static void
11869 create_end_for_merged_shader(isel_context* ctx)
11870 {
11871 std::vector<Operand> regs;
11872
11873 unsigned max_args;
11874 if (ctx->stage.sw == SWStage::VS) {
11875 assert(ctx->args->vertex_id.used);
11876 max_args = ctx->args->vertex_id.arg_index;
11877 } else {
11878 assert(ctx->stage.sw == SWStage::TES);
11879 assert(ctx->args->tes_u.used);
11880 max_args = ctx->args->tes_u.arg_index;
11881 }
11882
11883 struct ac_arg arg;
11884 arg.used = true;
11885
11886 for (arg.arg_index = 0; arg.arg_index < max_args; arg.arg_index++)
11887 regs.emplace_back(get_arg_for_end(ctx, arg));
11888
11889 build_end_with_regs(ctx, regs);
11890 }
11891
11892 void
11893 select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, const bool need_endpgm,
11894 const bool need_barrier, if_context* ic_merged_wave_info,
11895 const bool check_merged_wave_info, const bool endif_merged_wave_info)
11896 {
11897 init_context(&ctx, nir);
11898 setup_fp_mode(&ctx, nir);
11899
11900 Program* program = ctx.program;
11901
11902 if (need_startpgm) {
11903 /* Needs to be after init_context() for FS. */
11904 Pseudo_instruction* startpgm = add_startpgm(&ctx);
11905 append_logical_start(ctx.block);
11906
11907 if (ctx.options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs &&
11908 !program->info.vs.has_prolog)
11909 fix_ls_vgpr_init_bug(&ctx);
11910
11911 split_arguments(&ctx, startpgm);
11912
11913 if (!program->info.vs.has_prolog &&
11914 (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11915 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11916 }
11917 }
11918
11919 if (program->gfx_level == GFX10 && program->stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER &&
11920 !program->stage.has(SWStage::GS)) {
11921 /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11922 * s_sendmsg(GS_ALLOC_REQ).
11923 */
11924 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11925 }
11926
11927 if (check_merged_wave_info) {
11928 const unsigned i =
11929 nir->info.stage == MESA_SHADER_VERTEX || nir->info.stage == MESA_SHADER_TESS_EVAL ? 0 : 1;
11930 const Temp cond = merged_wave_info_to_mask(&ctx, i);
11931 begin_divergent_if_then(&ctx, ic_merged_wave_info, cond);
11932 }
11933
11934 if (need_barrier) {
11935 const sync_scope scope = ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq &&
11936 program->wave_size % nir->info.tess.tcs_vertices_out == 0
11937 ? scope_subgroup
11938 : scope_workgroup;
11939
11940 Builder(ctx.program, ctx.block)
11941 .barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
11942 scope);
11943 }
11944
11945 nir_function_impl* func = nir_shader_get_entrypoint(nir);
11946 visit_cf_list(&ctx, &func->body);
11947
11948 if (ctx.program->info.has_epilog) {
11949 if (ctx.stage == fragment_fs) {
11950 if (ctx.options->is_opengl)
11951 create_fs_end_for_epilog(&ctx);
11952 else
11953 create_fs_jump_to_epilog(&ctx);
11954
11955 /* FS epilogs always have at least one color/null export. */
11956 ctx.program->has_color_exports = true;
11957 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
11958 assert(ctx.stage == tess_control_hs || ctx.stage == vertex_tess_control_hs);
11959 if (ctx.options->is_opengl)
11960 create_tcs_end_for_epilog(&ctx);
11961 else
11962 create_tcs_jump_to_epilog(&ctx);
11963 }
11964 }
11965
11966 if (endif_merged_wave_info) {
11967 begin_divergent_if_else(&ctx, ic_merged_wave_info);
11968 end_divergent_if(&ctx, ic_merged_wave_info);
11969 }
11970
11971 bool is_first_stage_of_merged_shader = false;
11972
11973 if (ctx.program->info.merged_shader_compiled_separately &&
11974 (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES)) {
11975 assert(program->gfx_level >= GFX9);
11976 if (ctx.options->is_opengl)
11977 create_end_for_merged_shader(&ctx);
11978 else
11979 create_merged_jump_to_epilog(&ctx);
11980
11981 is_first_stage_of_merged_shader = true;
11982 }
11983
11984 cleanup_context(&ctx);
11985
11986 if (need_endpgm) {
11987 program->config->float_mode = program->blocks[0].fp_mode.val;
11988
11989 append_logical_end(ctx.block);
11990 ctx.block->kind |= block_kind_uniform;
11991
11992 if ((!program->info.has_epilog && !is_first_stage_of_merged_shader) ||
11993 (nir->info.stage == MESA_SHADER_TESS_CTRL && program->gfx_level >= GFX9)) {
11994 Builder(program, ctx.block).sopp(aco_opcode::s_endpgm);
11995 }
11996
11997 finish_program(&ctx);
11998 }
11999 }
12000
12001 void
12002 select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader* const* shaders)
12003 {
12004 if_context ic_merged_wave_info;
12005 const bool ngg_gs = ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.has(SWStage::GS);
12006
12007 for (unsigned i = 0; i < shader_count; i++) {
12008 nir_shader* nir = shaders[i];
12009
12010 /* We always need to insert p_startpgm at the beginning of the first shader. */
12011 const bool need_startpgm = i == 0;
12012
12013 /* Need to handle program end for last shader stage. */
12014 const bool need_endpgm = i == shader_count - 1;
12015
12016 /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
12017 nir_function_impl* func = nir_shader_get_entrypoint(nir);
12018 const bool empty_shader =
12019 nir_cf_list_is_empty_block(&func->body) &&
12020 ((nir->info.stage == MESA_SHADER_VERTEX &&
12021 (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
12022 (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
12023
12024 /* See if we need to emit a check of the merged wave info SGPR. */
12025 const bool check_merged_wave_info =
12026 ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
12027 const bool endif_merged_wave_info =
12028 ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
12029
12030 /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
12031 const bool tcs_skip_barrier =
12032 ctx.stage == vertex_tess_control_hs && ctx.tcs_temp_only_inputs == nir->info.inputs_read;
12033
12034 /* A barrier is usually needed at the beginning of the second shader, with exceptions. */
12035 const bool need_barrier = i != 0 && !ngg_gs && !tcs_skip_barrier;
12036
12037 select_shader(ctx, nir, need_startpgm, need_endpgm, need_barrier, &ic_merged_wave_info,
12038 check_merged_wave_info, endif_merged_wave_info);
12039
12040 if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
12041 /* Special handling when TCS input and output patch size is the same.
12042 * Outputs of the previous stage are inputs to the next stage.
12043 */
12044 ctx.inputs = ctx.outputs;
12045 ctx.outputs = shader_io_state();
12046 }
12047 }
12048 }
12049
12050 Temp
12051 get_tess_ring_descriptor(isel_context* ctx, const struct aco_tcs_epilog_info* einfo,
12052 bool is_tcs_factor_ring)
12053 {
12054 Builder bld(ctx->program, ctx->block);
12055
12056 if (!ctx->options->is_opengl) {
12057 Temp ring_offsets = get_arg(ctx, ctx->args->ring_offsets);
12058 uint32_t tess_ring_offset =
12059 is_tcs_factor_ring ? 5 /* RING_HS_TESS_FACTOR */ : 6 /* RING_HS_TESS_OFFCHIP */;
12060 return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ring_offsets,
12061 Operand::c32(tess_ring_offset * 16u));
12062 }
12063
12064 Temp addr = get_arg(ctx, einfo->tcs_out_lds_layout);
12065 /* TCS only receives the high 13 bits of the address. */
12066 addr = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), addr,
12067 Operand::c32(0xfff80000));
12068
12069 if (is_tcs_factor_ring) {
12070 addr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr,
12071 Operand::c32(einfo->tess_offchip_ring_size));
12072 }
12073
12074 uint32_t rsrc3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
12075 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
12076
12077 if (ctx->options->gfx_level >= GFX11) {
12078 rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX11_FORMAT_32_FLOAT) |
12079 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW);
12080 } else if (ctx->options->gfx_level >= GFX10) {
12081 rsrc3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
12082 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
12083 } else {
12084 rsrc3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
12085 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
12086 }
12087
12088 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr,
12089 Operand::c32(ctx->options->address32_hi), Operand::c32(0xffffffff),
12090 Operand::c32(rsrc3));
12091 }
12092
12093 void
12094 store_tess_factor_to_tess_ring(isel_context* ctx, Temp tess_ring_desc, Temp factors[],
12095 unsigned factor_comps, Temp sbase, Temp voffset, Temp num_patches,
12096 unsigned patch_offset)
12097 {
12098 Builder bld(ctx->program, ctx->block);
12099
12100 Temp soffset = sbase;
12101 if (patch_offset) {
12102 Temp offset =
12103 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), num_patches, Operand::c32(patch_offset));
12104 soffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), soffset, offset);
12105 }
12106
12107 Temp data = factor_comps == 1
12108 ? factors[0]
12109 : create_vec_from_array(ctx, factors, factor_comps, RegType::vgpr, 4);
12110
12111 emit_single_mubuf_store(ctx, tess_ring_desc, voffset, soffset, Temp(), data, 0,
12112 memory_sync_info(storage_vmem_output), true, false, false);
12113 }
12114
12115 void
12116 emit_polygon_stipple(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12117 {
12118 Builder bld(ctx->program, ctx->block);
12119
12120 /* Use the fixed-point gl_FragCoord input.
12121 * Since the stipple pattern is 32x32 and repeats, taking the low 5 bits
12122 * of each coordinate is enough to index into it.
12123 */
12124 Temp pos_fixed_pt = get_arg(ctx, ctx->args->pos_fixed_pt);
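/* pos_fixed_pt packs x in bits [15:0] and y in bits [31:16]: addr0 below is
 * x & 31, and addr1 extracts bits [20:16], i.e. y & 31.
 */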
12125 Temp addr0 = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x1f), pos_fixed_pt);
12126 Temp addr1 = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), pos_fixed_pt, Operand::c32(16u),
12127 Operand::c32(5u));
12128
12129 /* Load the buffer descriptor. */
12130 Temp list = get_arg(ctx, finfo->internal_bindings);
12131 list = convert_pointer_to_64_bit(ctx, list);
12132 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), list,
12133 Operand::c32(finfo->poly_stipple_buf_offset));
12134
12135 /* The stipple pattern is 32x32, each row has 32 bits. */
12136 Temp offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr1);
12137 Temp row = bld.mubuf(aco_opcode::buffer_load_dword, bld.def(v1), desc, offset, Operand::c32(0u),
12138 0, true);
12139 Temp bit = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), row, addr0, Operand::c32(1u));
12140 Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), bit);
12141 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
12142
12143 ctx->block->kind |= block_kind_uses_discard;
12144 ctx->program->needs_exact = true;
12145 }
12146
12147 void
12148 overwrite_interp_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12149 {
12150 Builder bld(ctx->program, ctx->block);
12151
12152 if (finfo->bc_optimize_for_persp || finfo->bc_optimize_for_linear) {
12153 /* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
12154 * The hw doesn't compute CENTROID if the whole wave only
12155 * contains fully-covered quads.
12156 */
12157 Temp bc_optimize = get_arg(ctx, ctx->args->prim_mask);
12158
12159 /* enabled when bit 31 is set */
12160 Temp cond =
12161 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), bc_optimize, Operand::c32(31u));
12162
12163 /* scale the 1-bit scc result to the wave-size mask used by v_cndmask */
12164 cond = bool_to_vector_condition(ctx, cond);
12165
12166 if (finfo->bc_optimize_for_persp) {
12167 Temp center = get_arg(ctx, ctx->args->persp_center);
12168 Temp centroid = get_arg(ctx, ctx->args->persp_centroid);
12169
12170 Temp dst = bld.tmp(v2);
12171 select_vec2(ctx, dst, cond, center, centroid);
12172 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = dst;
12173 }
12174
12175 if (finfo->bc_optimize_for_linear) {
12176 Temp center = get_arg(ctx, ctx->args->linear_center);
12177 Temp centroid = get_arg(ctx, ctx->args->linear_centroid);
12178
12179 Temp dst = bld.tmp(v2);
12180 select_vec2(ctx, dst, cond, center, centroid);
12181 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = dst;
12182 }
12183 }
12184
12185 if (finfo->force_persp_sample_interp) {
12186 Temp persp_sample = get_arg(ctx, ctx->args->persp_sample);
12187 ctx->arg_temps[ctx->args->persp_center.arg_index] = persp_sample;
12188 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_sample;
12189 }
12190
12191 if (finfo->force_linear_sample_interp) {
12192 Temp linear_sample = get_arg(ctx, ctx->args->linear_sample);
12193 ctx->arg_temps[ctx->args->linear_center.arg_index] = linear_sample;
12194 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_sample;
12195 }
12196
12197 if (finfo->force_persp_center_interp) {
12198 Temp persp_center = get_arg(ctx, ctx->args->persp_center);
12199 ctx->arg_temps[ctx->args->persp_sample.arg_index] = persp_center;
12200 ctx->arg_temps[ctx->args->persp_centroid.arg_index] = persp_center;
12201 }
12202
12203 if (finfo->force_linear_center_interp) {
12204 Temp linear_center = get_arg(ctx, ctx->args->linear_center);
12205 ctx->arg_temps[ctx->args->linear_sample.arg_index] = linear_center;
12206 ctx->arg_temps[ctx->args->linear_centroid.arg_index] = linear_center;
12207 }
12208 }
12209
12210 void
12211 overwrite_samplemask_arg(isel_context* ctx, const struct aco_ps_prolog_info* finfo)
12212 {
12213 Builder bld(ctx->program, ctx->block);
12214
12215 /* Section 15.2.2 (Shader Inputs) of the OpenGL 4.5 (Core Profile) spec
12216 * says:
12217 *
12218 * "When per-sample shading is active due to the use of a fragment
12219 * input qualified by sample or due to the use of the gl_SampleID
12220 * or gl_SamplePosition variables, only the bit for the current
12221 * sample is set in gl_SampleMaskIn. When state specifies multiple
12222 * fragment shader invocations for a given fragment, the sample
12223 * mask for any single fragment shader invocation may specify a
12224 * subset of the covered samples for the fragment. In this case,
12225 * the bit corresponding to each covered sample will be set in
12226 * exactly one fragment shader invocation."
12227 *
12228 * The samplemask loaded by hardware is always the coverage of the
12229 * entire pixel/fragment, so mask bits out based on the sample ID.
12230 */
12231 if (finfo->samplemask_log_ps_iter) {
12232 Temp ancillary = get_arg(ctx, ctx->args->ancillary);
12233 Temp sampleid = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ancillary, Operand::c32(8u),
12234 Operand::c32(4u));
12235 Temp samplemask = get_arg(ctx, ctx->args->sample_coverage);
12236
12237 uint32_t ps_iter_mask = ac_get_ps_iter_mask(1 << finfo->samplemask_log_ps_iter);
12238 Temp iter_mask = bld.copy(bld.def(v1), Operand::c32(ps_iter_mask));
12239
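/* Roughly speaking, ps_iter_mask has a bit set for each sample owned by the
 * invocation shading sample 0; shifting it left by this invocation's sample ID
 * and ANDing it with the hardware coverage keeps only the samples this
 * invocation is responsible for.
 */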
12240 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sampleid, iter_mask);
12241 samplemask = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), samplemask, mask);
12242
12243 ctx->arg_temps[ctx->args->sample_coverage.arg_index] = samplemask;
12244 }
12245 }
12246
12247 Temp
12248 get_interp_color(isel_context* ctx, int interp_vgpr, unsigned attr_index, unsigned comp)
12249 {
12250 Builder bld(ctx->program, ctx->block);
12251
12252 Temp dst = bld.tmp(v1);
12253
12254 Temp prim_mask = get_arg(ctx, ctx->args->prim_mask);
12255
12256 if (interp_vgpr != -1) {
12257 /* interp args each take 2 VGPRs */
12258 int arg_index = ctx->args->persp_sample.arg_index + interp_vgpr / 2;
12259 Temp interp_ij = ctx->arg_temps[arg_index];
12260
12261 emit_interp_instr(ctx, attr_index, comp, interp_ij, dst, prim_mask);
12262 } else {
12263 emit_interp_mov_instr(ctx, attr_index, comp, 0, dst, prim_mask);
12264 }
12265
12266 return dst;
12267 }
12268
12269 void
12270 interpolate_color_args(isel_context* ctx, const struct aco_ps_prolog_info* finfo,
12271 std::vector<Operand>& regs)
12272 {
12273 if (!finfo->colors_read)
12274 return;
12275
12276 Builder bld(ctx->program, ctx->block);
12277
12278 unsigned vgpr = 256 + ctx->args->num_vgprs_used;
12279
12280 if (finfo->color_two_side) {
12281 Temp face = get_arg(ctx, ctx->args->front_face);
12282 Temp is_face_positive =
12283 bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), face);
12284
12285 u_foreach_bit (i, finfo->colors_read) {
12286 unsigned color_index = i / 4;
12287 unsigned front_index = finfo->color_attr_index[color_index];
12288 int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
12289
12290 /* If BCOLOR0 is used, BCOLOR1 is at offset "num_inputs + 1",
12291 * otherwise it's at offset "num_inputs".
12292 */
12293 unsigned back_index = finfo->num_interp_inputs;
12294 if (color_index == 1 && finfo->colors_read & 0xf)
12295 back_index++;
12296
12297 Temp front = get_interp_color(ctx, interp_vgpr, front_index, i % 4);
12298 Temp back = get_interp_color(ctx, interp_vgpr, back_index, i % 4);
12299
12300 Temp color =
12301 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), back, front, is_face_positive);
12302
12303 regs.emplace_back(Operand(color, PhysReg{vgpr++}));
12304 }
12305 } else {
12306 u_foreach_bit (i, finfo->colors_read) {
12307 unsigned color_index = i / 4;
12308 unsigned attr_index = finfo->color_attr_index[color_index];
12309 int interp_vgpr = finfo->color_interp_vgpr_index[color_index];
12310 Temp color = get_interp_color(ctx, interp_vgpr, attr_index, i % 4);
12311
12312 regs.emplace_back(Operand(color, PhysReg{vgpr++}));
12313 }
12314 }
12315 }
12316
12317 void
12318 emit_clamp_alpha_test(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4],
12319 unsigned color_index)
12320 {
12321 Builder bld(ctx->program, ctx->block);
12322
12323 if (info->clamp_color) {
12324 for (unsigned i = 0; i < 4; i++) {
12325 if (colors[i].regClass() == v2b) {
12326 colors[i] = bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
12327 Operand::c16(0x3c00), colors[i]);
12328 } else {
12329 assert(colors[i].regClass() == v1);
12330 colors[i] = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
12331 Operand::c32(0x3f800000u), colors[i]);
12332 }
12333 }
12334 }
12335
12336 if (info->alpha_to_one) {
12337 if (colors[3].regClass() == v2b)
12338 colors[3] = bld.copy(bld.def(v2b), Operand::c16(0x3c00));
12339 else
12340 colors[3] = bld.copy(bld.def(v1), Operand::c32(0x3f800000u));
12341 }
12342
12343 if (color_index == 0 && info->alpha_func != COMPARE_FUNC_ALWAYS) {
12344 Operand cond = Operand::c32(-1u);
12345 if (info->alpha_func != COMPARE_FUNC_NEVER) {
12346 aco_opcode opcode = aco_opcode::num_opcodes;
12347
12348 switch (info->alpha_func) {
12349 case COMPARE_FUNC_LESS: opcode = aco_opcode::v_cmp_ngt_f32; break;
12350 case COMPARE_FUNC_EQUAL: opcode = aco_opcode::v_cmp_neq_f32; break;
12351 case COMPARE_FUNC_LEQUAL: opcode = aco_opcode::v_cmp_nge_f32; break;
12352 case COMPARE_FUNC_GREATER: opcode = aco_opcode::v_cmp_nlt_f32; break;
12353 case COMPARE_FUNC_NOTEQUAL: opcode = aco_opcode::v_cmp_nlg_f32; break;
12354 case COMPARE_FUNC_GEQUAL: opcode = aco_opcode::v_cmp_nle_f32; break;
12355 default: unreachable("invalid alpha func");
12356 }
12357
12358 Temp ref = get_arg(ctx, info->alpha_reference);
12359
12360 Temp alpha = colors[3].regClass() == v2b
12361 ? bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), colors[3])
12362 : colors[3];
12363
12364 /* true if not pass */
12365 cond = bld.vopc(opcode, bld.def(bld.lm), ref, alpha);
12366 }
12367
12368 bld.pseudo(aco_opcode::p_discard_if, cond);
12369 ctx->block->kind |= block_kind_uses_discard;
12370 ctx->program->needs_exact = true;
12371 }
12372 }
12373
12374 } /* end namespace */
12375
12376 void
12377 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
12378 ac_shader_config* config, const struct aco_compiler_options* options,
12379 const struct aco_shader_info* info, const struct ac_shader_args* args)
12380 {
12381 isel_context ctx =
12382 setup_isel_context(program, shader_count, shaders, config, options, info, args);
12383
12384 if (ctx.stage == raytracing_cs)
12385 return select_program_rt(ctx, shader_count, shaders, args);
12386
12387 if (shader_count >= 2) {
12388 select_program_merged(ctx, shader_count, shaders);
12389 } else {
12390 bool need_barrier = false, check_merged_wave_info = false, endif_merged_wave_info = false;
12391 if_context ic_merged_wave_info;
12392
12393 /* Handle separate compilation of VS+TCS and {VS,TES}+GS on GFX9+. */
12394 if (ctx.program->info.merged_shader_compiled_separately) {
12395 assert(ctx.program->gfx_level >= GFX9);
12396 if (ctx.stage.sw == SWStage::VS || ctx.stage.sw == SWStage::TES) {
12397 check_merged_wave_info = endif_merged_wave_info = true;
12398 } else {
12399 const bool ngg_gs =
12400 ctx.stage.hw == AC_HW_NEXT_GEN_GEOMETRY_SHADER && ctx.stage.sw == SWStage::GS;
12401 assert(ctx.stage == tess_control_hs || ctx.stage == geometry_gs || ngg_gs);
12402 check_merged_wave_info = endif_merged_wave_info = !ngg_gs;
12403 need_barrier = !ngg_gs;
12404 }
12405 }
12406
12407 select_shader(ctx, shaders[0], true, true, need_barrier, &ic_merged_wave_info,
12408 check_merged_wave_info, endif_merged_wave_info);
12409 }
12410 }
12411
12412 void
12413 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
12414 const struct aco_compiler_options* options,
12415 const struct aco_shader_info* info, const struct ac_shader_args* args)
12416 {
12417 assert(options->gfx_level == GFX8);
12418
12419 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12420 config);
12421
12422 isel_context ctx = {};
12423 ctx.program = program;
12424 ctx.args = args;
12425 ctx.options = options;
12426 ctx.stage = program->stage;
12427
12428 ctx.block = ctx.program->create_and_insert_block();
12429 ctx.block->kind = block_kind_top_level;
12430
12431 program->workgroup_size = 1; /* XXX */
12432
12433 add_startpgm(&ctx);
12434 append_logical_start(ctx.block);
12435
12436 Builder bld(ctx.program, ctx.block);
12437
12438 /* Load the buffer descriptor from TMA. */
12439 bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
12440 Operand::zero());
12441
12442 /* Store TTMP0-TTMP1. */
12443 bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
12444 Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
12445
12446 uint32_t hw_regs_idx[] = {
12447 2, /* HW_REG_STATUS */
12448 3, /* HW_REG_TRAP_STS */
12449 4, /* HW_REG_HW_ID */
12450 7, /* HW_REG_IB_STS */
12451 };
12452
12453 /* Store some hardware registers. */
12454 for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
12455 /* "((size - 1) << 11) | register" */
12456 bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
12457 ((20 - 1) << 11) | hw_regs_idx[i]);
12458
12459 bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
12460 Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
12461 }
12462
12463 program->config->float_mode = program->blocks[0].fp_mode.val;
12464
12465 append_logical_end(ctx.block);
12466 ctx.block->kind |= block_kind_uniform;
12467 bld.sopp(aco_opcode::s_endpgm);
12468
12469 finish_program(&ctx);
12470 }
12471
12472 Operand
12473 get_arg_fixed(const struct ac_shader_args* args, struct ac_arg arg)
12474 {
12475 enum ac_arg_regfile file = args->args[arg.arg_index].file;
12476 unsigned size = args->args[arg.arg_index].size;
12477 RegClass rc = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
12478 return Operand(get_arg_reg(args, arg), rc);
12479 }
12480
12481 unsigned
12482 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
12483 {
12484 unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
12485
12486 unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
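   /* Descriptors are fetched in power-of-two groups of at most 4 (s_load_dwordx16/x8/x4),
    * so e.g. count == 7 needs 7/4 + bitcount(3) = 3 loads. On GFX10+ they are wrapped in an
    * s_clause below so they issue back to back.
    */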
12487 if (bld.program->gfx_level >= GFX10 && num_loads > 1)
12488 bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);
12489
12490 for (unsigned i = 0; i < count;) {
12491 unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
12492
12493 if (size == 4)
12494 bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
12495 Operand::c32((start + i) * 16u));
12496 else if (size == 2)
12497 bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
12498 Operand::c32((start + i) * 16u));
12499 else
12500 bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
12501 Operand::c32((start + i) * 16u));
12502
12503 dest = dest.advance(size * 16u);
12504 i += size;
12505 }
12506
12507 return count;
12508 }
12509
12510 Operand
12511 calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
12512 const struct aco_vs_prolog_info* pinfo, unsigned index,
12513 Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
12514 PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
12515 {
12516 bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
12517 get_arg_fixed(args, pinfo->inputs), Operand::c32(8u + index * 8u));
12518
12519 wait_imm lgkm_imm;
12520 lgkm_imm.lgkm = 0;
12521 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->gfx_level));
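   /* The two loaded dwords pack the constants for dividing instance_id by the nontrivial
    * divisor: dword0 provides a pre-shift (byte 0), an addend (byte 1) and a final shift
    * (byte 2), dword1 the multiplier. The sequence below computes roughly
    *   fetch_index = (((instance_id >> byte0) + byte1) * multiplier >> 32) >> byte2
    */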
12522
12523 Definition fetch_index_def(tmp_vgpr0, v1);
12524 Operand fetch_index(tmp_vgpr0, v1);
12525
12526 Operand div_info(tmp_sgpr, s1);
12527 if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX11) {
12528 /* use SDWA */
12529 if (bld.program->gfx_level < GFX9) {
12530 bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
12531 div_info = Operand(tmp_vgpr1, v1);
12532 }
12533
12534 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12535
12536 Instruction* instr;
12537 if (bld.program->gfx_level >= GFX9)
12538 instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
12539 else
12540 instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
12541 div_info, fetch_index)
12542 .instr;
12543 instr->sdwa().sel[0] = SubdwordSel::ubyte1;
12544
12545 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
12546 fetch_index);
12547
12548 instr =
12549 bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
12550 instr->sdwa().sel[0] = SubdwordSel::ubyte2;
12551 } else {
12552 Operand tmp_op(tmp_vgpr1, v1);
12553 Definition tmp_def(tmp_vgpr1, v1);
12554
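      /* Same division sequence without SDWA: v_bfe_u32 extracts the addend (byte 1) and the
       * final shift amount (byte 2) from div_info instead.
       */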
12555 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
12556
12557 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
12558 bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
12559
12560 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
12561 Operand(tmp_sgpr.advance(4), s1));
12562
12563 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
12564 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
12565 }
12566
12567 bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
12568
12569 return fetch_index;
12570 }
12571
12572 void
12573 select_rt_prolog(Program* program, ac_shader_config* config,
12574 const struct aco_compiler_options* options, const struct aco_shader_info* info,
12575 const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
12576 {
12577 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12578 config);
12579 Block* block = program->create_and_insert_block();
12580 block->kind = block_kind_top_level;
12581 program->workgroup_size = info->workgroup_size;
12582 program->wave_size = info->workgroup_size;
12583 calc_min_waves(program);
12584 Builder bld(program, block);
12585 block->instructions.reserve(32);
12586 unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
12587 unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
12588
12589 /* Inputs:
12590 * Ring offsets: s[0-1]
12591 * Indirect descriptor sets: s[2]
12592 * Push constants pointer: s[3]
12593 * SBT descriptors: s[4-5]
12594 * Traversal shader address: s[6-7]
12595 * Ray launch size address: s[8-9]
12596 * Dynamic callable stack base: s[10]
12597 * Workgroup IDs (xyz): s[11], s[12], s[13]
12598 * Scratch offset: s[14]
12599 * Local invocation IDs: v[0-2]
12600 */
12601 PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
12602 PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
12603 PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
12604 PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
12605 PhysReg in_wg_id_x = get_arg_reg(in_args, in_args->workgroup_ids[0]);
12606 PhysReg in_wg_id_y = get_arg_reg(in_args, in_args->workgroup_ids[1]);
12607 PhysReg in_wg_id_z = get_arg_reg(in_args, in_args->workgroup_ids[2]);
12608 PhysReg in_scratch_offset;
12609 if (options->gfx_level < GFX11)
12610 in_scratch_offset = get_arg_reg(in_args, in_args->scratch_offset);
12611 PhysReg in_local_ids[2] = {
12612 get_arg_reg(in_args, in_args->local_invocation_ids),
12613 get_arg_reg(in_args, in_args->local_invocation_ids).advance(4),
12614 };
12615
12616 /* Outputs:
12617 * Callee shader PC: s[0-1]
12618 * Indirect descriptor sets: s[2]
12619 * Push constants pointer: s[3]
12620 * SBT descriptors: s[4-5]
12621 * Traversal shader address: s[6-7]
12622 * Ray launch sizes (xyz): s[8], s[9], s[10]
12623 * Scratch offset (<GFX9 only): s[11]
12624 * Ring offsets (<GFX9 only): s[12-13]
12625 * Ray launch IDs: v[0-2]
12626 * Stack pointer: v[3]
12627 * Shader VA: v[4-5]
12628 * Shader Record Ptr: v[6-7]
12629 */
12630 PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
12631 PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_size);
12632 PhysReg out_launch_size_y = out_launch_size_x.advance(4);
12633 PhysReg out_launch_size_z = out_launch_size_y.advance(4);
12634 PhysReg out_launch_ids[3];
12635 for (unsigned i = 0; i < 3; i++)
12636 out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_id).advance(i * 4);
12637 PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
12638 PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
12639
12640 /* Temporaries: */
12641 num_sgprs = align(num_sgprs, 2);
12642 PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
12643 num_sgprs += 2;
12644 PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
12645 num_sgprs += 2;
12646
12647 PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++};
12648
12649 /* Confirm some assumptions about register aliasing */
12650 assert(in_ring_offsets == out_uniform_shader_addr);
12651 assert(get_arg_reg(in_args, in_args->push_constants) ==
12652 get_arg_reg(out_args, out_args->push_constants));
12653 assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
12654 get_arg_reg(out_args, out_args->rt.sbt_descriptors));
12655 assert(in_launch_size_addr == out_launch_size_x);
12656 assert(in_stack_base == out_launch_size_z);
12657 assert(in_local_ids[0] == out_launch_ids[0]);
12658
12659 /* load raygen sbt */
12660 bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
12661 Operand::c32(0u));
12662
12663 /* init scratch */
12664 if (options->gfx_level < GFX9) {
12665 /* copy ring offsets to a temporary location */
12666 bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_ring_offsets, s2),
12667 Operand(in_ring_offsets, s2));
12668 } else if (options->gfx_level < GFX11) {
12669 hw_init_scratch(bld, Definition(in_ring_offsets, s1), Operand(in_ring_offsets, s2),
12670 Operand(in_scratch_offset, s1));
12671 }
12672
12673 /* set stack ptr */
12674 bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
12675
12676 /* load raygen address */
12677 bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
12678 Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
12679
12680 /* load ray launch sizes */
12681 bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
12682 Operand(in_launch_size_addr, s2), Operand::c32(8u));
12683 bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
12684 Operand(in_launch_size_addr, s2), Operand::c32(0u));
12685
12686 /* calculate ray launch ids */
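   /* launch_id.xy = wg_id.xy * workgroup dims (8x8 for wave64, 8x4 for wave32) + local_id.xy;
    * launch_id.z = wg_id.z.
    */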
12687 if (options->gfx_level >= GFX11) {
12688 /* Thread IDs are packed in VGPR0, 10 bits per component. */
12689 bld.vop3(aco_opcode::v_bfe_u32, Definition(in_local_ids[1], v1), Operand(in_local_ids[0], v1),
12690 Operand::c32(10u), Operand::c32(3u));
12691 bld.vop2(aco_opcode::v_and_b32, Definition(in_local_ids[0], v1), Operand::c32(0x7),
12692 Operand(in_local_ids[0], v1));
12693 }
12694 /* Do this backwards to reduce some RAW hazards on GFX11+ */
12695 bld.vop1(aco_opcode::v_mov_b32, Definition(out_launch_ids[2], v1), Operand(in_wg_id_z, s1));
12696 bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[1], v1), Operand(in_wg_id_y, s1),
12697 Operand::c32(program->workgroup_size == 32 ? 4 : 8), Operand(in_local_ids[1], v1));
12698 bld.vop3(aco_opcode::v_mad_u32_u24, Definition(out_launch_ids[0], v1), Operand(in_wg_id_x, s1),
12699 Operand::c32(8), Operand(in_local_ids[0], v1));
12700
12701 if (options->gfx_level < GFX9) {
12702 /* write scratch/ring offsets to outputs, if needed */
12703 bld.sop1(aco_opcode::s_mov_b32,
12704 Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
12705 Operand(in_scratch_offset, s1));
12706 bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
12707 Operand(tmp_ring_offsets, s2));
12708 }
12709
12710 /* calculate shader record ptr: SBT + RADV_RT_HANDLE_SIZE */
12711 if (options->gfx_level < GFX9) {
12712 bld.vop2_e64(aco_opcode::v_add_co_u32, Definition(out_record_ptr, v1), Definition(vcc, s2),
12713 Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12714 } else {
12715 bld.vop2_e64(aco_opcode::v_add_u32, Definition(out_record_ptr, v1),
12716 Operand(tmp_raygen_sbt, s1), Operand::c32(32u));
12717 }
12718 bld.vop1(aco_opcode::v_mov_b32, Definition(out_record_ptr.advance(4), v1),
12719 Operand(tmp_raygen_sbt.advance(4), s1));
12720
12721 /* For 1D dispatches converted into 2D ones, we need to fix up the launch IDs.
12722 * The 1D launch ID is: id = local_invocation_index + (wg_id.x * wg_size).
12723 * After the shift below, in_wg_id_x holds wg_id.x * wg_size.
12724 */
12725 bld.sop2(aco_opcode::s_lshl_b32, Definition(in_wg_id_x, s1), Definition(scc, s1),
12726 Operand(in_wg_id_x, s1), Operand::c32(program->workgroup_size == 32 ? 5 : 6));
12727
12728 /* Calculate and add local_invocation_index */
12729 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(tmp_invocation_idx, v1), Operand::c32(-1u),
12730 Operand(in_wg_id_x, s1));
12731 if (program->wave_size == 64) {
12732 if (program->gfx_level <= GFX7)
12733 bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(tmp_invocation_idx, v1),
12734 Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12735 else
12736 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(tmp_invocation_idx, v1),
12737 Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
12738 }
12739
12740 /* Make fixup operations a no-op if this is not a converted 2D dispatch. */
12741 bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1),
12742 Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1));
12743 bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm),
12744 Operand::c32_or_c64(-1u, program->wave_size == 64),
12745 Operand::c32_or_c64(0, program->wave_size == 64), Operand(scc, s1));
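   /* vcc is all ones when launch_size.y != ACO_RT_CONVERTED_2D_LAUNCH_SIZE (i.e. not a
    * converted dispatch); v_cndmask then keeps the original launch IDs, otherwise it selects
    * the 1D invocation index for x and zero for y.
    */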
12746 bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[0], v1),
12747 Operand(tmp_invocation_idx, v1), Operand(out_launch_ids[0], v1), Operand(vcc, bld.lm));
12748 bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
12749 Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
12750
12751 /* jump to raygen */
12752 bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
12753
12754 program->config->float_mode = program->blocks[0].fp_mode.val;
12755 program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12756 program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12757 }
12758
12759 void
12760 select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, ac_shader_config* config,
12761 const struct aco_compiler_options* options, const struct aco_shader_info* info,
12762 const struct ac_shader_args* args)
12763 {
12764 assert(pinfo->num_attributes > 0);
12765
12766 /* This should be enough for any shader/stage. */
12767 unsigned max_user_sgprs = options->gfx_level >= GFX9 ? 32 : 16;
12768
12769 init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
12770 config);
12771 program->dev.vgpr_limit = 256;
12772
12773 Block* block = program->create_and_insert_block();
12774 block->kind = block_kind_top_level;
12775
12776 program->workgroup_size = 64;
12777 calc_min_waves(program);
12778
12779 Builder bld(program, block);
12780
12781 block->instructions.reserve(16 + pinfo->num_attributes * 4);
12782
12783 bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
12784
12785 uint32_t attrib_mask = BITFIELD_MASK(pinfo->num_attributes);
12786 bool has_nontrivial_divisors = pinfo->nontrivial_divisors;
12787
12788 wait_imm lgkm_imm;
12789 lgkm_imm.lgkm = 0;
12790
12791 /* choose sgprs */
12792 PhysReg vertex_buffers(align(max_user_sgprs + 14, 2));
12793 PhysReg prolog_input = vertex_buffers.advance(8);
12794 PhysReg desc(
12795 align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
12796
12797 Operand start_instance = get_arg_fixed(args, args->start_instance);
12798 Operand instance_id = get_arg_fixed(args, args->instance_id);
12799
12800 PhysReg attributes_start(256 + args->num_vgprs_used);
12801 /* choose vgprs that won't be used for anything else until the last attribute load */
12802 PhysReg vertex_index(attributes_start.reg() + pinfo->num_attributes * 4 - 1);
12803 PhysReg instance_index(attributes_start.reg() + pinfo->num_attributes * 4 - 2);
12804 PhysReg start_instance_vgpr(attributes_start.reg() + pinfo->num_attributes * 4 - 3);
12805 PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + pinfo->num_attributes * 4 - 4);
12806 PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + pinfo->num_attributes * 4);
12807
12808 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
12809 get_arg_fixed(args, args->vertex_buffers));
12810 if (options->address32_hi >= 0xffff8000 || options->address32_hi <= 0x7fff) {
12811 bld.sopk(aco_opcode::s_movk_i32, Definition(vertex_buffers.advance(4), s1),
12812 options->address32_hi & 0xFFFF);
12813 } else {
12814 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
12815 Operand::c32((unsigned)options->address32_hi));
12816 }
12817
12818 /* calculate vgpr requirements */
12819 unsigned num_vgprs = attributes_start.reg() - 256;
12820 num_vgprs += pinfo->num_attributes * 4;
12821 if (has_nontrivial_divisors && program->gfx_level <= GFX8)
12822 num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
12823 unsigned num_sgprs = 0;
12824
12825 const struct ac_vtx_format_info* vtx_info_table =
12826 ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10);
12827
12828 for (unsigned loc = 0; loc < pinfo->num_attributes;) {
12829 unsigned num_descs =
12830 load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, pinfo->num_attributes - loc);
12831 num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
12832
12833 if (loc == 0) {
12834 /* perform setup while we load the descriptors */
12835 if (pinfo->is_ngg || pinfo->next_stage != MESA_SHADER_VERTEX) {
12836 Operand count = get_arg_fixed(args, args->merged_wave_info);
12837 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
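            /* s_bfm_b64 cannot produce an all-ones mask for a count of 64, so if bit 6 of the
             * lane count is set (i.e. 64 lanes), select a full exec mask instead.
             */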
12838 if (program->wave_size == 64) {
12839 bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12840 Operand::c32(6u /* log2(64) */));
12841 bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12842 Operand(exec, s2), Operand(scc, s1));
12843 }
12844 }
12845
12846 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
12847 if (info->hw_stage == AC_HW_HULL_SHADER && options->has_ls_vgpr_init_bug) {
12848 /* We don't want load_vb_descs() to write vcc. */
12849 assert(program->dev.sgpr_limit <= vcc.reg());
12850
12851 bld.sop2(aco_opcode::s_bfe_u32, Definition(vcc, s1), Definition(scc, s1),
12852 get_arg_fixed(args, args->merged_wave_info), Operand::c32((8u << 16) | 8u));
12853 bld.sop2(Builder::s_cselect, Definition(vcc, bld.lm), Operand::c32(-1), Operand::zero(),
12854 Operand(scc, s1));
12855
12856 /* These copies are ordered so that vertex_id=tcs_patch_id doesn't overwrite vertex_id
12857 * before instance_id=vertex_id. */
12858 ac_arg src_args[] = {args->vertex_id, args->tcs_rel_ids, args->tcs_patch_id};
12859 ac_arg dst_args[] = {args->instance_id, args->vs_rel_patch_id, args->vertex_id};
12860 for (unsigned i = 0; i < 3; i++) {
12861 bld.vop2(aco_opcode::v_cndmask_b32, Definition(get_arg_reg(args, dst_args[i]), v1),
12862 get_arg_fixed(args, src_args[i]), get_arg_fixed(args, dst_args[i]),
12863 Operand(vcc, bld.lm));
12864 }
12865 }
12866
12867 bool needs_instance_index =
12868 pinfo->instance_rate_inputs &
12869 ~(pinfo->zero_divisors | pinfo->nontrivial_divisors); /* divisor is 1 */
12870 bool needs_start_instance = pinfo->instance_rate_inputs & pinfo->zero_divisors;
12871 bool needs_vertex_index = ~pinfo->instance_rate_inputs & attrib_mask;
12872 if (needs_vertex_index)
12873 bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->base_vertex),
12874 get_arg_fixed(args, args->vertex_id), false, Operand(s2), true);
12875 if (needs_instance_index)
12876 bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12877 Operand(s2), true);
12878 if (needs_start_instance)
12879 bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12880 }
12881
12882 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
12883
12884 for (unsigned i = 0; i < num_descs;) {
12885 PhysReg dest(attributes_start.reg() + loc * 4u);
12886
12887 /* calculate index */
12888 Operand fetch_index = Operand(vertex_index, v1);
12889 if (pinfo->instance_rate_inputs & (1u << loc)) {
12890 if (!(pinfo->zero_divisors & (1u << loc))) {
12891 fetch_index = instance_id;
12892 if (pinfo->nontrivial_divisors & (1u << loc)) {
12893 unsigned index = util_bitcount(pinfo->nontrivial_divisors & BITFIELD_MASK(loc));
12894 fetch_index = calc_nontrivial_instance_id(
12895 bld, args, pinfo, index, instance_id, start_instance, prolog_input,
12896 nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12897 } else {
12898 fetch_index = Operand(instance_index, v1);
12899 }
12900 } else {
12901 fetch_index = Operand(start_instance_vgpr, v1);
12902 }
12903 }
12904
12905 /* perform load */
12906 PhysReg cur_desc = desc.advance(i * 16);
12907 if ((pinfo->misaligned_mask & (1u << loc))) {
12908 const struct ac_vtx_format_info* vtx_info = &vtx_info_table[pinfo->formats[loc]];
12909
12910 assert(vtx_info->has_hw_format & 0x1);
12911 unsigned dfmt = vtx_info->hw_format[0] & 0xf;
12912 unsigned nfmt = vtx_info->hw_format[0] >> 4;
12913
12914 for (unsigned j = 0; j < vtx_info->num_channels; j++) {
12915 bool post_shuffle = pinfo->post_shuffle & (1u << loc);
12916 unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12917
12918 /* Use MUBUF to work around hangs for byte-aligned dword loads. The Vulkan spec
12919 * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
12920 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
12921 * care).
12922 */
12923 if (dfmt == V_008F0C_BUF_DATA_FORMAT_32)
12924 bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
12925 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
12926 false, true);
12927 else if (vtx_info->chan_byte_size == 8)
12928 bld.mtbuf(aco_opcode::tbuffer_load_format_xy,
12929 Definition(dest.advance(j * 8u), v2), Operand(cur_desc, s4),
12930 fetch_index, Operand::c32(0u), dfmt, nfmt, offset, false, true);
12931 else
12932 bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12933 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt,
12934 offset, false, true);
12935 }
12936 uint32_t one =
12937 nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
12938 ? 1u
12939 : 0x3f800000u;
12940 /* 22.1.1. Attribute Location and Component Assignment of Vulkan 1.3 specification:
12941 * For 64-bit data types, no default attribute values are provided. Input variables must
12942 * not use more components than provided by the attribute.
12943 */
12944 for (unsigned j = vtx_info->num_channels; vtx_info->chan_byte_size != 8 && j < 4; j++) {
12945 bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
12946 Operand::c32(j == 3 ? one : 0u));
12947 }
12948
12949 unsigned slots = vtx_info->chan_byte_size == 8 && vtx_info->num_channels > 2 ? 2 : 1;
12950 loc += slots;
12951 i += slots;
12952 } else {
12953 bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12954 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
12955 loc++;
12956 i++;
12957 }
12958 }
12959 }
12960
12961 if (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi) {
12962 wait_imm vm_imm;
12963 vm_imm.vm = 0;
12964 bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->gfx_level));
12965 }
12966
12967 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
12968 * so we may need to fix it up. */
12969 u_foreach_bit (loc, (pinfo->alpha_adjust_lo | pinfo->alpha_adjust_hi)) {
12970 PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
12971
12972 unsigned alpha_adjust = (pinfo->alpha_adjust_lo >> loc) & 0x1;
12973 alpha_adjust |= ((pinfo->alpha_adjust_hi >> loc) & 0x1) << 1;
12974
12975 if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED)
12976 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
12977
12978 /* For the integer-like cases, do a natural sign extension.
12979 *
12980 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
12981 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
12982 * exponent.
12983 */
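      /* e.g. 1/3 = 0x3eaaaaab (exp 0x7d), 2/3 = 0x3f2aaaab (exp 0x7e), 1.0 = 0x3f800000
       * (exp 0x7f): the low two exponent bits are 1, 2 and 3 respectively.
       */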
12984 unsigned offset = alpha_adjust == AC_ALPHA_ADJUST_SNORM ? 23u : 0u;
12985 bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
12986 Operand::c32(offset), Operand::c32(2u));
12987
12988 /* Convert back to the right type. */
12989 if (alpha_adjust == AC_ALPHA_ADJUST_SNORM) {
12990 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12991 bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
12992 Operand(alpha, v1));
12993 } else if (alpha_adjust == AC_ALPHA_ADJUST_SSCALED) {
12994 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12995 }
12996 }
12997
12998 block->kind |= block_kind_uniform;
12999
13000 /* continue on to the main shader */
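   /* Without nontrivial divisors, the prolog input argument is the main shader's PC itself;
    * otherwise it points to a buffer whose first two dwords hold the PC, followed by the
    * divisor constants read by calc_nontrivial_instance_id().
    */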
13001 Operand continue_pc = get_arg_fixed(args, pinfo->inputs);
13002 if (has_nontrivial_divisors) {
13003 bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
13004 get_arg_fixed(args, pinfo->inputs), Operand::c32(0u));
13005 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->gfx_level));
13006 continue_pc = Operand(prolog_input, s2);
13007 }
13008
13009 bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
13010
13011 program->config->float_mode = program->blocks[0].fp_mode.val;
13012 /* addition on GFX6-8 requires a carry-out (we use VCC) */
13013 program->needs_vcc = program->gfx_level <= GFX8;
13014 program->config->num_vgprs = std::min<uint16_t>(get_vgpr_alloc(program, num_vgprs), 256);
13015 program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
13016 }
13017
13018 void
13019 select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
13020 const struct aco_compiler_options* options, const struct aco_shader_info* info,
13021 const struct ac_shader_args* args)
13022 {
13023 const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo;
13024 isel_context ctx =
13025 setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13026
13027 ctx.block->fp_mode = program->next_fp_mode;
13028
13029 add_startpgm(&ctx);
13030 append_logical_start(ctx.block);
13031
13032 Builder bld(ctx.program, ctx.block);
13033
13034 Temp colors[MAX_DRAW_BUFFERS][4];
13035 for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13036 if (!einfo->colors[i].used)
13037 continue;
13038
13039 Temp color = get_arg(&ctx, einfo->colors[i]);
13040 unsigned col_types = (einfo->color_types >> (i * 2)) & 0x3;
13041
13042 emit_split_vector(&ctx, color, col_types == ACO_TYPE_ANY32 ? 4 : 8);
13043 for (unsigned c = 0; c < 4; ++c) {
13044 colors[i][c] = emit_extract_vector(&ctx, color, c, col_types == ACO_TYPE_ANY32 ? v1 : v2b);
13045 }
13046
13047 emit_clamp_alpha_test(&ctx, einfo, colors[i], i);
13048 }
13049
13050 bool has_mrtz_depth = einfo->depth.used;
13051 bool has_mrtz_stencil = einfo->stencil.used;
13052 bool has_mrtz_samplemask = einfo->samplemask.used;
13053 bool has_mrtz_alpha = einfo->alpha_to_coverage_via_mrtz && einfo->colors[0].used;
13054 bool has_mrtz_export =
13055 has_mrtz_depth || has_mrtz_stencil || has_mrtz_samplemask || has_mrtz_alpha;
13056 if (has_mrtz_export) {
13057 Temp depth = has_mrtz_depth ? get_arg(&ctx, einfo->depth) : Temp();
13058 Temp stencil = has_mrtz_stencil ? get_arg(&ctx, einfo->stencil) : Temp();
13059 Temp samplemask = has_mrtz_samplemask ? get_arg(&ctx, einfo->samplemask) : Temp();
13060 Temp alpha = has_mrtz_alpha ? colors[0][3] : Temp();
13061
13062 export_fs_mrtz(&ctx, depth, stencil, samplemask, alpha);
13063 }
13064
13065 /* Export all color render targets */
13066 struct aco_export_mrt mrts[MAX_DRAW_BUFFERS];
13067 unsigned mrt_num = 0;
13068
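   /* With broadcast_last_cbuf, color 0 is replicated to every color buffer up to that index;
    * otherwise each used color goes to its own MRT. Exports are compacted, so each target is
    * offset by the number of MRTs emitted so far.
    */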
13069 if (einfo->broadcast_last_cbuf) {
13070 for (unsigned i = 0; i <= einfo->broadcast_last_cbuf; i++) {
13071 struct aco_export_mrt* mrt = &mrts[mrt_num];
13072 if (export_fs_mrt_color(&ctx, einfo, colors[0], i, mrt))
13073 mrt->target += mrt_num++;
13074 }
13075 } else {
13076 for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) {
13077 struct aco_export_mrt* mrt = &mrts[mrt_num];
13078 if (export_fs_mrt_color(&ctx, einfo, colors[i], i, mrt))
13079 mrt->target += mrt_num++;
13080 }
13081 }
13082
13083 if (mrt_num) {
13084 if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) {
13085 assert(mrt_num == 2);
13086 create_fs_dual_src_export_gfx11(&ctx, &mrts[0], &mrts[1]);
13087 } else {
13088 for (unsigned i = 0; i < mrt_num; i++)
13089 export_mrt(&ctx, &mrts[i]);
13090 }
13091 } else if (!has_mrtz_export && !einfo->skip_null_export) {
13092 create_fs_null_export(&ctx);
13093 }
13094
13095 program->config->float_mode = program->blocks[0].fp_mode.val;
13096
13097 append_logical_end(ctx.block);
13098 ctx.block->kind |= block_kind_export_end;
13099 bld.reset(ctx.block);
13100 bld.sopp(aco_opcode::s_endpgm);
13101
13102 finish_program(&ctx);
13103 }
13104
13105 void
13106 select_tcs_epilog(Program* program, void* pinfo, ac_shader_config* config,
13107 const struct aco_compiler_options* options, const struct aco_shader_info* info,
13108 const struct ac_shader_args* args)
13109 {
13110 const struct aco_tcs_epilog_info* einfo = (const struct aco_tcs_epilog_info*)pinfo;
13111 isel_context ctx =
13112 setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::TCS);
13113
13114 ctx.block->fp_mode = program->next_fp_mode;
13115
13116 add_startpgm(&ctx);
13117 append_logical_start(ctx.block);
13118
13119 Builder bld(ctx.program, ctx.block);
13120
13121 /* Add a barrier before loading tess factors from LDS. */
13122 if (!einfo->pass_tessfactors_by_reg) {
13123 /* So that waitcnt insertion generates s_waitcnt lgkmcnt(0) for the outstanding LDS accesses. */
13124 program->pending_lds_access = true;
13125
13126 sync_scope scope = einfo->tcs_out_patch_fits_subgroup ? scope_subgroup : scope_workgroup;
13127 bld.barrier(aco_opcode::p_barrier, memory_sync_info(storage_shared, semantic_acqrel, scope),
13128 scope);
13129 }
13130
13131 Temp invocation_id = get_arg(&ctx, einfo->invocation_id);
13132
13133 Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), invocation_id);
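   /* Only invocation 0 of each patch stores the tess factors. */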
13134
13135 if_context ic_invoc_0;
13136 begin_divergent_if_then(&ctx, &ic_invoc_0, cond);
13137
13138 int outer_comps, inner_comps;
13139 switch (einfo->primitive_mode) {
13140 case TESS_PRIMITIVE_ISOLINES:
13141 outer_comps = 2;
13142 inner_comps = 0;
13143 break;
13144 case TESS_PRIMITIVE_TRIANGLES:
13145 outer_comps = 3;
13146 inner_comps = 1;
13147 break;
13148 case TESS_PRIMITIVE_QUADS:
13149 outer_comps = 4;
13150 inner_comps = 2;
13151 break;
13152 default: unreachable("invalid primitive mode"); return;
13153 }
13154
13155 bld.reset(ctx.block);
13156
13157 unsigned tess_lvl_out_loc =
13158 ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_OUTER) * 16;
13159 unsigned tess_lvl_in_loc =
13160 ac_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER) * 16;
13161
13162 Temp outer[4];
13163 Temp inner[2];
13164 if (einfo->pass_tessfactors_by_reg) {
13165 for (int i = 0; i < outer_comps; i++)
13166 outer[i] = get_arg(&ctx, einfo->tess_lvl_out[i]);
13167
13168 for (int i = 0; i < inner_comps; i++)
13169 inner[i] = get_arg(&ctx, einfo->tess_lvl_in[i]);
13170 } else {
13171 Temp addr = get_arg(&ctx, einfo->tcs_out_current_patch_data_offset);
13172 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2), addr);
13173
13174 Temp data = program->allocateTmp(RegClass(RegType::vgpr, outer_comps));
13175 load_lds(&ctx, 4, outer_comps, data, addr, tess_lvl_out_loc, 4);
13176 for (int i = 0; i < outer_comps; i++)
13177 outer[i] = emit_extract_vector(&ctx, data, i, v1);
13178
13179 if (inner_comps) {
13180 data = program->allocateTmp(RegClass(RegType::vgpr, inner_comps));
13181 load_lds(&ctx, 4, inner_comps, data, addr, tess_lvl_in_loc, 4);
13182 for (int i = 0; i < inner_comps; i++)
13183 inner[i] = emit_extract_vector(&ctx, data, i, v1);
13184 }
13185 }
13186
13187 Temp tess_factor_ring_desc = get_tess_ring_descriptor(&ctx, einfo, true);
13188 Temp tess_factor_ring_base = get_arg(&ctx, args->tcs_factor_offset);
13189 Temp rel_patch_id = get_arg(&ctx, einfo->rel_patch_id);
13190 unsigned tess_factor_ring_const_offset = 0;
13191
13192 if (program->gfx_level <= GFX8) {
13193 /* Store the dynamic HS control word. */
13194 cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand::zero(), rel_patch_id);
13195
13196 if_context ic_patch_0;
13197 begin_divergent_if_then(&ctx, &ic_patch_0, cond);
13198
13199 bld.reset(ctx.block);
13200
13201 Temp data = bld.copy(bld.def(v1), Operand::c32(0x80000000u));
13202
13203 emit_single_mubuf_store(&ctx, tess_factor_ring_desc, Temp(0, v1), tess_factor_ring_base,
13204 Temp(), data, 0, memory_sync_info(), true, false, false);
13205
13206 tess_factor_ring_const_offset += 4;
13207
13208 begin_divergent_if_else(&ctx, &ic_patch_0);
13209 end_divergent_if(&ctx, &ic_patch_0);
13210 }
13211
13212 bld.reset(ctx.block);
13213
13214 Temp tess_factor_ring_offset =
13215 bld.v_mul_imm(bld.def(v1), rel_patch_id, (inner_comps + outer_comps) * 4, false);
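   /* Each patch occupies (outer_comps + inner_comps) dwords in the tess factor ring. */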
13216
13217 switch (einfo->primitive_mode) {
13218 case TESS_PRIMITIVE_ISOLINES: {
13219 /* For isolines, the hardware expects tess factors in the reverse order. */
13220 Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), outer[1], outer[0]);
13221 emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
13222 tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
13223 memory_sync_info(), true, false, false);
13224 break;
13225 }
13226 case TESS_PRIMITIVE_TRIANGLES: {
13227 Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
13228 inner[0]);
13229 emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
13230 tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
13231 memory_sync_info(), true, false, false);
13232 break;
13233 }
13234 case TESS_PRIMITIVE_QUADS: {
13235 Temp data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4), outer[0], outer[1], outer[2],
13236 outer[3]);
13237 emit_single_mubuf_store(&ctx, tess_factor_ring_desc, tess_factor_ring_offset,
13238 tess_factor_ring_base, Temp(), data, tess_factor_ring_const_offset,
13239 memory_sync_info(), true, false, false);
13240
13241 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), inner[0], inner[1]);
13242 emit_single_mubuf_store(
13243 &ctx, tess_factor_ring_desc, tess_factor_ring_offset, tess_factor_ring_base, Temp(), data,
13244 tess_factor_ring_const_offset + 16, memory_sync_info(), true, false, false);
13245 break;
13246 }
13247 default: unreachable("invalid primitive mode"); break;
13248 }
13249
13250 if (einfo->tes_reads_tessfactors) {
13251 Temp layout = get_arg(&ctx, einfo->tcs_offchip_layout);
13252 Temp num_patches, patch_base;
13253
13254 if (ctx.options->is_opengl) {
13255 num_patches = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), layout,
13256 Operand::c32(0x3f));
13257 num_patches = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), num_patches,
13258 Operand::c32(1));
13259
13260 patch_base = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), layout,
13261 Operand::c32(16));
13262 } else {
13263 num_patches = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), layout,
13264 Operand::c32(0x60006));
13265
13266 patch_base = get_arg(&ctx, einfo->patch_base);
13267 }
13268
13269 Temp tess_ring_desc = get_tess_ring_descriptor(&ctx, einfo, false);
13270 Temp tess_ring_base = get_arg(&ctx, args->tess_offchip_offset);
13271
13272 Temp sbase =
13273 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), tess_ring_base, patch_base);
13274
13275 Temp voffset =
13276 bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4), rel_patch_id);
13277
13278 store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, outer, outer_comps, sbase, voffset,
13279 num_patches, tess_lvl_out_loc);
13280
13281 if (inner_comps) {
13282 store_tess_factor_to_tess_ring(&ctx, tess_ring_desc, inner, inner_comps, sbase, voffset,
13283 num_patches, tess_lvl_in_loc);
13284 }
13285 }
13286
13287 begin_divergent_if_else(&ctx, &ic_invoc_0);
13288 end_divergent_if(&ctx, &ic_invoc_0);
13289
13290 program->config->float_mode = program->blocks[0].fp_mode.val;
13291
13292 append_logical_end(ctx.block);
13293
13294 bld.reset(ctx.block);
13295 bld.sopp(aco_opcode::s_endpgm);
13296
13297 finish_program(&ctx);
13298 }
13299
13300 void
13301 select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
13302 const struct aco_compiler_options* options, const struct aco_shader_info* info,
13303 const struct ac_shader_args* args)
13304 {
13305 const struct aco_ps_prolog_info* finfo = (const struct aco_ps_prolog_info*)pinfo;
13306 isel_context ctx =
13307 setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS);
13308
13309 ctx.block->fp_mode = program->next_fp_mode;
13310
13311 add_startpgm(&ctx);
13312 append_logical_start(ctx.block);
13313
13314 if (finfo->poly_stipple)
13315 emit_polygon_stipple(&ctx, finfo);
13316
13317 overwrite_interp_args(&ctx, finfo);
13318
13319 overwrite_samplemask_arg(&ctx, finfo);
13320
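   /* Collect the registers handed to the main PS: every existing argument is passed through,
    * then the interpolated color inputs are appended.
    */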
13321 std::vector<Operand> regs;
13322 passthrough_all_args(&ctx, regs);
13323
13324 interpolate_color_args(&ctx, finfo, regs);
13325
13326 program->config->float_mode = program->blocks[0].fp_mode.val;
13327
13328 append_logical_end(ctx.block);
13329
13330 build_end_with_regs(&ctx, regs);
13331
13332 /* Compute all end args in WQM mode if required by the main part. */
13333 if (finfo->needs_wqm)
13334 set_wqm(&ctx, true);
13335
13336 /* Finally, exit WQM mode. */
13337 program->needs_exact = true;
13338
13339 finish_program(&ctx);
13340 }
13341
13342 } // namespace aco
13343