1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include "aco_instruction_selection.h"
27
28 #include "aco_builder.h"
29 #include "aco_ir.h"
30
31 #include "common/ac_exp_param.h"
32 #include "common/sid.h"
33 #include "vulkan/radv_descriptor_set.h"
34
35 #include "util/fast_idiv_by_const.h"
36 #include "util/memstream.h"
37
38 #include <array>
39 #include <functional>
40 #include <map>
41 #include <numeric>
42 #include <stack>
43 #include <utility>
44 #include <vector>
45
46 namespace aco {
47 namespace {
48
49 #define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
50
51 static void
52 _isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr,
53 const char* msg)
54 {
55 char* out;
56 size_t outsize;
57 struct u_memstream mem;
58 u_memstream_open(&mem, &out, &outsize);
59 FILE* const memf = u_memstream_get(&mem);
60
61 fprintf(memf, "%s: ", msg);
62 nir_print_instr(instr, memf);
63 u_memstream_close(&mem);
64
65 _aco_err(ctx->program, file, line, out);
66 free(out);
67 }
68
69 struct if_context {
70 Temp cond;
71
72 bool divergent_old;
73 bool exec_potentially_empty_discard_old;
74 bool exec_potentially_empty_break_old;
75 uint16_t exec_potentially_empty_break_depth_old;
76
77 unsigned BB_if_idx;
78 unsigned invert_idx;
79 bool uniform_has_then_branch;
80 bool then_branch_divergent;
81 Block BB_invert;
82 Block BB_endif;
83 };
84
85 struct loop_context {
86 Block loop_exit;
87
88 unsigned header_idx_old;
89 Block* exit_old;
90 bool divergent_cont_old;
91 bool divergent_branch_old;
92 bool divergent_if_old;
93 };
94
95 static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list);
96
97 static void
98 add_logical_edge(unsigned pred_idx, Block* succ)
99 {
100 succ->logical_preds.emplace_back(pred_idx);
101 }
102
103 static void
104 add_linear_edge(unsigned pred_idx, Block* succ)
105 {
106 succ->linear_preds.emplace_back(pred_idx);
107 }
108
109 static void
110 add_edge(unsigned pred_idx, Block* succ)
111 {
112 add_logical_edge(pred_idx, succ);
113 add_linear_edge(pred_idx, succ);
114 }
115
116 static void
117 append_logical_start(Block* b)
118 {
119 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
120 }
121
122 static void
123 append_logical_end(Block* b)
124 {
125 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
126 }
127
128 Temp
129 get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def)
130 {
131 uint32_t id = ctx->first_temp_id + def->index;
132 return Temp(id, ctx->program->temp_rc[id]);
133 }
134
135 Temp
136 emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero())
137 {
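/* mbcnt counts, for each lane, the set bits of the mask at positions below the lane's own
 * index; with mask = exec this gives every lane its index among the active lanes
 * (the first active lane gets 0, the next active lane 1, and so on). */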
138 Builder bld(ctx->program, ctx->block);
139 assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec));
140 assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes());
141
142 if (ctx->program->wave_size == 32) {
143 Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask;
144 return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base);
145 }
146
147 Operand mask_lo = Operand::c32(-1u);
148 Operand mask_hi = Operand::c32(-1u);
149
150 if (mask.isTemp()) {
151 RegClass rc = RegClass(mask.regClass().type(), 1);
152 Builder::Result mask_split =
153 bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask);
154 mask_lo = Operand(mask_split.def(0).getTemp());
155 mask_hi = Operand(mask_split.def(1).getTemp());
156 } else if (mask.physReg() == exec) {
157 mask_lo = Operand(exec_lo, s1);
158 mask_hi = Operand(exec_hi, s1);
159 }
160
161 Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base);
162
163 if (ctx->program->chip_class <= GFX7)
164 return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo);
165 else
166 return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo);
167 }
168
169 Temp
170 emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false)
171 {
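/* Wraps src in p_wqm so it is computed in Whole Quad Mode. Outside fragment shaders
 * WQM is not meaningful, so this reduces to a plain copy. */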
172 if (!dst.id())
173 dst = bld.tmp(src.regClass());
174
175 assert(src.size() == dst.size());
176
177 if (bld.program->stage != fragment_fs) {
178 if (!dst.id())
179 return src;
180
181 bld.copy(Definition(dst), src);
182 return dst;
183 }
184
185 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
186 bld.program->needs_wqm |= program_needs_wqm;
187 return dst;
188 }
189
190 static Temp
191 emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
192 {
193 if (index.regClass() == s1)
194 return bld.readlane(bld.def(s1), data, index);
195
196 if (ctx->options->chip_class <= GFX7) {
197 /* GFX6-7: there is no bpermute instruction */
198 Operand index_op(index);
199 Operand input_data(data);
200 index_op.setLateKill(true);
201 input_data.setLateKill(true);
202
203 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc),
204 index_op, input_data);
205 } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
206
207 /* GFX10 wave64 mode: emulate full-wave bpermute */
208 Temp index_is_lo =
209 bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index);
210 Builder::Result index_is_lo_split =
211 bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
212 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc),
213 index_is_lo_split.def(1).getTemp());
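/* same_half: per-lane mask of whether the requested source lane lies in the same
 * 32-lane half as the current lane (the low half uses index_is_lo directly, the high
 * half uses its inverse). */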
214 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
215 index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
216 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
217 Operand input_data(data);
218
219 index_x4.setLateKill(true);
220 input_data.setLateKill(true);
221 same_half.setLateKill(true);
222
223 /* We need one pair of shared VGPRs:
224 * note that these have twice the allocation granularity of normal VGPRs. */
225 ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule;
226
227 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
228 index_x4, input_data, same_half);
229 } else {
230 /* GFX8-9 or GFX10 wave32: bpermute works normally */
231 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index);
232 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
233 }
234 }
235
236 static Temp
237 emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
238 {
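/* mask uses the ds_swizzle_b32 bit-mask encoding: within each group of 32 lanes, data is
 * read from lane ((id & and_mask) | or_mask) ^ xor_mask. On GFX8+, a few of these
 * patterns can be expressed as cheaper DPP controls instead of an LDS swizzle. */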
239 if (ctx->options->chip_class >= GFX8) {
240 unsigned and_mask = mask & 0x1f;
241 unsigned or_mask = (mask >> 5) & 0x1f;
242 unsigned xor_mask = (mask >> 10) & 0x1f;
243
244 uint16_t dpp_ctrl = 0xffff;
245
246 // TODO: we could use DPP8 for some swizzles
247 if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
248 unsigned res[4] = {0, 1, 2, 3};
249 for (unsigned i = 0; i < 4; i++)
250 res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
251 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
252 } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
253 dpp_ctrl = dpp_row_rr(8);
254 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
255 dpp_ctrl = dpp_row_mirror;
256 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
257 dpp_ctrl = dpp_row_half_mirror;
258 }
259
260 if (dpp_ctrl != 0xffff)
261 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
262 }
263
264 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
265 }
266
267 Temp
268 as_vgpr(isel_context* ctx, Temp val)
269 {
270 if (val.type() == RegType::sgpr) {
271 Builder bld(ctx->program, ctx->block);
272 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
273 }
274 assert(val.type() == RegType::vgpr);
275 return val;
276 }
277
278 // assumes a != 0xffffffff
279 void
280 emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b)
281 {
282 assert(b != 0);
283 Builder bld(ctx->program, ctx->block);
284
285 if (util_is_power_of_two_or_zero(b)) {
286 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a);
287 return;
288 }
289
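/* Non-power-of-two divisor: use the precomputed-reciprocal sequence from
 * util_compute_fast_udiv_info, roughly
 *   dst = mul_hi((a >> pre_shift) + increment, multiplier) >> post_shift,
 * skipping whichever steps are no-ops for this divisor. */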
290 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
291
292 assert(info.multiplier <= 0xffffffff);
293
294 bool pre_shift = info.pre_shift != 0;
295 bool increment = info.increment != 0;
296 bool multiply = true;
297 bool post_shift = info.post_shift != 0;
298
299 if (!pre_shift && !increment && !multiply && !post_shift) {
300 bld.copy(Definition(dst), a);
301 return;
302 }
303
304 Temp pre_shift_dst = a;
305 if (pre_shift) {
306 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
307 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift),
308 a);
309 }
310
311 Temp increment_dst = pre_shift_dst;
312 if (increment) {
313 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
314 bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst);
315 }
316
317 Temp multiply_dst = increment_dst;
318 if (multiply) {
319 multiply_dst = post_shift ? bld.tmp(v1) : dst;
320 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
321 bld.copy(bld.def(v1), Operand::c32(info.multiplier)));
322 }
323
324 if (post_shift) {
325 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift),
326 multiply_dst);
327 }
328 }
329
330 void
331 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
332 {
333 Builder bld(ctx->program, ctx->block);
334 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx));
335 }
336
337 Temp
338 emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
339 {
340 /* no need to extract the whole vector */
341 if (src.regClass() == dst_rc) {
342 assert(idx == 0);
343 return src;
344 }
345
346 assert(src.bytes() > (idx * dst_rc.bytes()));
347 Builder bld(ctx->program, ctx->block);
348 auto it = ctx->allocated_vec.find(src.id());
349 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
350 if (it->second[idx].regClass() == dst_rc) {
351 return it->second[idx];
352 } else {
353 assert(!dst_rc.is_subdword());
354 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
355 return bld.copy(bld.def(dst_rc), it->second[idx]);
356 }
357 }
358
359 if (dst_rc.is_subdword())
360 src = as_vgpr(ctx, src);
361
362 if (src.bytes() == dst_rc.bytes()) {
363 assert(idx == 0);
364 return bld.copy(bld.def(dst_rc), src);
365 } else {
366 Temp dst = bld.tmp(dst_rc);
367 emit_extract_vector(ctx, src, idx, dst);
368 return dst;
369 }
370 }
371
372 void
373 emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
374 {
375 if (num_components == 1)
376 return;
377 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
378 return;
379 RegClass rc;
380 if (num_components > vec_src.size()) {
381 if (vec_src.type() == RegType::sgpr) {
382 /* should still help get_alu_src() */
383 emit_split_vector(ctx, vec_src, vec_src.size());
384 return;
385 }
386 /* sub-dword split */
387 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
388 } else {
389 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
390 }
391 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
392 aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
393 split->operands[0] = Operand(vec_src);
394 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
395 for (unsigned i = 0; i < num_components; i++) {
396 elems[i] = ctx->program->allocateTmp(rc);
397 split->definitions[i] = Definition(elems[i]);
398 }
399 ctx->block->instructions.emplace_back(std::move(split));
400 ctx->allocated_vec.emplace(vec_src.id(), elems);
401 }
402
403 /* This vector expansion uses a mask to determine which elements in the new vector
404 * come from the original vector. The other elements are undefined. */
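/* For example, expanding a two-component source into a four-component dst with
 * mask 0b0101 places the source components at dst positions 0 and 2. */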
405 void
406 expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
407 {
408 emit_split_vector(ctx, vec_src, util_bitcount(mask));
409
410 if (vec_src == dst)
411 return;
412
413 Builder bld(ctx->program, ctx->block);
414 if (num_components == 1) {
415 if (dst.type() == RegType::sgpr)
416 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
417 else
418 bld.copy(Definition(dst), vec_src);
419 return;
420 }
421
422 unsigned component_size = dst.size() / num_components;
423 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
424
425 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
426 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
427 vec->definitions[0] = Definition(dst);
428 unsigned k = 0;
429 for (unsigned i = 0; i < num_components; i++) {
430 if (mask & (1 << i)) {
431 Temp src =
432 emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
433 if (dst.type() == RegType::sgpr)
434 src = bld.as_uniform(src);
435 vec->operands[i] = Operand(src);
436 } else {
437 vec->operands[i] = Operand::zero(component_size == 2 ? 8 : 4);
438 }
439 elems[i] = vec->operands[i].getTemp();
440 }
441 ctx->block->instructions.emplace_back(std::move(vec));
442 ctx->allocated_vec.emplace(dst.id(), elems);
443 }
444
445 /* adjust misaligned small bit size loads */
446 void
447 byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst)
448 {
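/* Shift the loaded dwords right by 8 * (offset & 0x3) bits so the wanted bytes end up
 * at bit 0 of dst. */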
449 Builder bld(ctx->program, ctx->block);
450 Operand shift;
451 Temp select = Temp();
452 if (offset.isConstant()) {
453 assert(offset.constantValue() && offset.constantValue() < 4);
454 shift = Operand::c32(offset.constantValue() * 8);
455 } else {
456 /* bit_offset = 8 * (offset & 0x3) */
457 Temp tmp =
458 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u));
459 select = bld.tmp(s1);
460 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp,
461 Operand::c32(3u));
462 }
463
464 if (vec.size() == 1) {
465 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
466 } else if (vec.size() == 2) {
467 Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
468 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
469 if (tmp == dst)
470 emit_split_vector(ctx, dst, 2);
471 else
472 emit_extract_vector(ctx, tmp, 0, dst);
473 } else if (vec.size() == 3 || vec.size() == 4) {
474 Temp lo = bld.tmp(s2), hi;
475 if (vec.size() == 3) {
476 /* this can happen if we use VMEM for a uniform load */
477 hi = bld.tmp(s1);
478 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
479 } else {
480 hi = bld.tmp(s2);
481 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
482 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero());
483 }
484 if (select != Temp())
485 hi =
486 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select));
487 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
488 Temp mid = bld.tmp(s1);
489 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
490 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
491 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
492 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
493 emit_split_vector(ctx, dst, 2);
494 }
495 }
496
497 void
498 byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
499 {
500 Builder bld(ctx->program, ctx->block);
501 if (offset.isTemp()) {
502 Temp tmp[4] = {vec, vec, vec, vec};
503
504 if (vec.size() == 4) {
505 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
506 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
507 Definition(tmp[2]), Definition(tmp[3]), vec);
508 } else if (vec.size() == 3) {
509 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
510 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]),
511 Definition(tmp[2]), vec);
512 } else if (vec.size() == 2) {
513 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
514 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
515 }
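/* v_alignbyte_b32 extracts a dword from the 64-bit concatenation {tmp[i + 1], tmp[i]}
 * starting at the given byte offset, i.e. it shifts the vector down by offset bytes. */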
516 for (unsigned i = 0; i < dst.size(); i++)
517 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
518
519 vec = tmp[0];
520 if (dst.size() == 2)
521 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
522
523 offset = Operand::zero();
524 }
525
526 unsigned num_components = vec.bytes() / component_size;
527 if (vec.regClass() == dst.regClass()) {
528 assert(offset.constantValue() == 0);
529 bld.copy(Definition(dst), vec);
530 emit_split_vector(ctx, dst, num_components);
531 return;
532 }
533
534 emit_split_vector(ctx, vec, num_components);
535 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
536 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
537
538 assert(offset.constantValue() % component_size == 0);
539 unsigned skip = offset.constantValue() / component_size;
540 for (unsigned i = skip; i < num_components; i++)
541 elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
542
543 if (dst.type() == RegType::vgpr) {
544 /* if dst is vgpr - split the src and create a shrunk version starting at the byte offset. */
545 num_components = dst.bytes() / component_size;
546 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(
547 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
548 for (unsigned i = 0; i < num_components; i++)
549 create_vec->operands[i] = Operand(elems[i]);
550 create_vec->definitions[0] = Definition(dst);
551 bld.insert(std::move(create_vec));
552
553 } else if (skip) {
554 /* if dst is sgpr - split the src, but move the original to sgpr. */
555 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
556 byte_align_scalar(ctx, vec, offset, dst);
557 } else {
558 assert(dst.size() == vec.size());
559 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
560 }
561
562 ctx->allocated_vec.emplace(dst.id(), elems);
563 }
564
565 Temp
566 bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2))
567 {
568 Builder bld(ctx->program, ctx->block);
569 if (!dst.id())
570 dst = bld.tmp(bld.lm);
571
572 assert(val.regClass() == s1);
573 assert(dst.regClass() == bld.lm);
574
575 return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(),
576 bld.scc(val));
577 }
578
579 Temp
580 bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1))
581 {
582 Builder bld(ctx->program, ctx->block);
583 if (!dst.id())
584 dst = bld.tmp(s1);
585
586 assert(val.regClass() == bld.lm);
587 assert(dst.regClass() == s1);
588
589 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
590 Temp tmp = bld.tmp(s1);
591 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
592 return emit_wqm(bld, tmp, dst);
593 }
594
595 /**
596 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than
597 * src_bits and dst_bits are truncated.
598 *
599 * Sign extension may be applied using the sign_extend parameter. The position of the input sign
600 * bit is indicated by src_bits in this case.
601 *
602 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined.
603 */
604 Temp
605 convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits,
606 bool sign_extend, Temp dst = Temp())
607 {
608 assert(!(sign_extend && dst_bits < src_bits) &&
609 "Shrinking integers is not supported for signed inputs");
610
611 if (!dst.id()) {
612 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
613 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
614 else
615 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
616 }
617
618 assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8);
619 assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8);
620
621 if (dst.bytes() == src.bytes() && dst_bits < src_bits) {
622 /* Copy the raw value, leaving an undefined value in the upper bits for
623 * the caller to handle appropriately */
624 return bld.copy(Definition(dst), src);
625 } else if (dst.bytes() < src.bytes()) {
626 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero());
627 }
628
629 Temp tmp = dst;
630 if (dst_bits == 64)
631 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
632
633 if (tmp == src) {
634 } else if (src.regClass() == s1) {
635 assert(src_bits < 32);
636 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(),
637 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend));
638 } else {
639 assert(src_bits < 32);
640 bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), Operand::c32(src_bits),
641 Operand::c32((unsigned)sign_extend));
642 }
643
644 if (dst_bits == 64) {
645 if (sign_extend && dst.regClass() == s2) {
646 Temp high =
647 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u));
648 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
649 } else if (sign_extend && dst.regClass() == v2) {
650 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp);
651 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
652 } else {
653 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
654 }
655 }
656
657 return dst;
658 }
659
660 enum sgpr_extract_mode {
661 sgpr_extract_sext,
662 sgpr_extract_zext,
663 sgpr_extract_undef,
664 };
665
666 Temp
667 extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode)
668 {
669 Temp vec = get_ssa_temp(ctx, src->src.ssa);
670 unsigned src_size = src->src.ssa->bit_size;
671 unsigned swizzle = src->swizzle[0];
672
673 if (vec.size() > 1) {
674 assert(src_size == 16);
675 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
676 swizzle = swizzle & 1;
677 }
678
679 Builder bld(ctx->program, ctx->block);
680 Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
681
682 if (mode == sgpr_extract_undef && swizzle == 0)
683 bld.copy(Definition(tmp), vec);
684 else
685 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec),
686 Operand::c32(swizzle), Operand::c32(src_size),
687 Operand::c32((mode == sgpr_extract_sext)));
688
689 if (dst.regClass() == s2)
690 convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
691
692 return dst;
693 }
694
695 Temp
696 get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1)
697 {
698 if (src.src.ssa->num_components == 1 && size == 1)
699 return get_ssa_temp(ctx, src.src.ssa);
700
701 Temp vec = get_ssa_temp(ctx, src.src.ssa);
702 unsigned elem_size = src.src.ssa->bit_size / 8u;
703 bool identity_swizzle = true;
704
705 for (unsigned i = 0; identity_swizzle && i < size; i++) {
706 if (src.swizzle[i] != i)
707 identity_swizzle = false;
708 }
709 if (identity_swizzle)
710 return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size));
711
712 assert(elem_size > 0);
713 assert(vec.bytes() % elem_size == 0);
714
715 if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) {
716 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
717 return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src,
718 sgpr_extract_undef);
719 }
720
721 bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr;
722 if (as_uniform)
723 vec = as_vgpr(ctx, vec);
724
725 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword()
726 : RegClass(vec.type(), elem_size / 4);
727 if (size == 1) {
728 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
729 } else {
730 assert(size <= 4);
731 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
732 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(
733 aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
734 for (unsigned i = 0; i < size; ++i) {
735 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
736 vec_instr->operands[i] = Operand{elems[i]};
737 }
738 Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4));
739 vec_instr->definitions[0] = Definition(dst);
740 ctx->block->instructions.emplace_back(std::move(vec_instr));
741 ctx->allocated_vec.emplace(dst.id(), elems);
742 return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst;
743 }
744 }
745
746 Temp
747 get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src)
748 {
749 /* returns v2b or v1 for vop3p usage.
750 * The source expects exactly 2 16bit components
751 * which are within the same dword
752 */
753 assert(src.src.ssa->bit_size == 16);
754 assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1);
755
756 Temp tmp = get_ssa_temp(ctx, src.src.ssa);
757 if (tmp.size() == 1)
758 return tmp;
759
760 /* the size is larger than 1 dword: check the swizzle */
761 unsigned dword = src.swizzle[0] >> 1;
762
763 /* extract a full dword if possible */
764 if (tmp.bytes() >= (dword + 1) * 4) {
765 return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1));
766 } else {
767 /* This must be a swizzled access to %a.zz where %a is v6b */
768 assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0);
769 assert(tmp.regClass() == v6b && dword == 1);
770 return emit_extract_vector(ctx, tmp, dword * 2, v2b);
771 }
772 }
773
774 uint32_t
775 get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx)
776 {
777 nir_ssa_scalar scalar =
778 nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]};
779 return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config);
780 }
781
782 Temp
783 convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false)
784 {
785 if (ptr.size() == 2)
786 return ptr;
787 Builder bld(ctx->program, ctx->block);
788 if (ptr.type() == RegType::vgpr && !non_uniform)
789 ptr = bld.as_uniform(ptr);
790 return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr,
791 Operand::c32((unsigned)ctx->options->address32_hi));
792 }
793
794 void
795 emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
796 bool writes_scc, uint8_t uses_ub = 0)
797 {
798 aco_ptr<SOP2_instruction> sop2{
799 create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
800 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
801 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
802 sop2->definitions[0] = Definition(dst);
803 if (instr->no_unsigned_wrap)
804 sop2->definitions[0].setNUW(true);
805 if (writes_scc)
806 sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1);
807
808 for (int i = 0; i < 2; i++) {
809 if (uses_ub & (1 << i)) {
810 uint32_t src_ub = get_alu_src_ub(ctx, instr, i);
811 if (src_ub <= 0xffff)
812 sop2->operands[i].set16bit(true);
813 else if (src_ub <= 0xffffff)
814 sop2->operands[i].set24bit(true);
815 }
816 }
817
818 ctx->block->instructions.emplace_back(std::move(sop2));
819 }
820
821 void
822 emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst,
823 bool commutative, bool swap_srcs = false, bool flush_denorms = false,
824 bool nuw = false, uint8_t uses_ub = 0)
825 {
826 Builder bld(ctx->program, ctx->block);
827 bld.is_precise = instr->exact;
828
829 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
830 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
831 if (src1.type() == RegType::sgpr) {
832 if (commutative && src0.type() == RegType::vgpr) {
833 Temp t = src0;
834 src0 = src1;
835 src1 = t;
836 } else {
837 src1 = as_vgpr(ctx, src1);
838 }
839 }
840
841 Operand op[2] = {Operand(src0), Operand(src1)};
842
843 for (int i = 0; i < 2; i++) {
844 if (uses_ub & (1 << i)) {
845 uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i);
846 if (src_ub <= 0xffff)
847 op[i].set16bit(true);
848 else if (src_ub <= 0xffffff)
849 op[i].set24bit(true);
850 }
851 }
852
853 if (flush_denorms && ctx->program->chip_class < GFX9) {
854 assert(dst.size() == 1);
855 Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]);
856 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
857 } else {
858 if (nuw) {
859 bld.nuw().vop2(opc, Definition(dst), op[0], op[1]);
860 } else {
861 bld.vop2(opc, Definition(dst), op[0], op[1]);
862 }
863 }
864 }
865
866 void
867 emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
868 {
869 Builder bld(ctx->program, ctx->block);
870 bld.is_precise = instr->exact;
871
872 Temp src0 = get_alu_src(ctx, instr->src[0]);
873 Temp src1 = get_alu_src(ctx, instr->src[1]);
874
875 if (src1.type() == RegType::sgpr) {
876 assert(src0.type() == RegType::vgpr);
877 std::swap(src0, src1);
878 }
879
880 Temp src00 = bld.tmp(src0.type(), 1);
881 Temp src01 = bld.tmp(src0.type(), 1);
882 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
883 Temp src10 = bld.tmp(v1);
884 Temp src11 = bld.tmp(v1);
885 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
886 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
887 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
888 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
889 }
890
891 void
892 emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
893 bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false)
894 {
895 assert(num_sources == 2 || num_sources == 3);
896 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
897 bool has_sgpr = false;
898 for (unsigned i = 0; i < num_sources; i++) {
899 src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 1 - i : i]);
900 if (has_sgpr)
901 src[i] = as_vgpr(ctx, src[i]);
902 else
903 has_sgpr = src[i].type() == RegType::sgpr;
904 }
905
906 Builder bld(ctx->program, ctx->block);
907 bld.is_precise = instr->exact;
908 if (flush_denorms && ctx->program->chip_class < GFX9) {
909 Temp tmp;
910 if (num_sources == 3)
911 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]);
912 else
913 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]);
914 if (dst.size() == 1)
915 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp);
916 else
917 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp);
918 } else if (num_sources == 3) {
919 bld.vop3(op, Definition(dst), src[0], src[1], src[2]);
920 } else {
921 bld.vop3(op, Definition(dst), src[0], src[1]);
922 }
923 }
924
925 Builder::Result
926 emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst,
927 bool swap_srcs = false)
928 {
929 Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]);
930 Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]);
931 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
932 src1 = as_vgpr(ctx, src1);
933 assert(instr->dest.dest.ssa.num_components == 2);
934
935 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */
936 unsigned opsel_lo =
937 (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1);
938 unsigned opsel_hi =
939 (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1);
940
941 Builder bld(ctx->program, ctx->block);
942 bld.is_precise = instr->exact;
943 Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi);
944 emit_split_vector(ctx, dst, 2);
945 return res;
946 }
947
948 void
949 emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp)
950 {
951 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)};
952 bool has_sgpr = false;
953 for (unsigned i = 0; i < 3; i++) {
954 src[i] = get_alu_src(ctx, instr->src[i]);
955 if (has_sgpr)
956 src[i] = as_vgpr(ctx, src[i]);
957 else
958 has_sgpr = src[i].type() == RegType::sgpr;
959 }
960
961 Builder bld(ctx->program, ctx->block);
962 bld.is_precise = instr->exact;
963 bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p().clamp = clamp;
964 }
965
966 void
967 emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
968 {
969 Builder bld(ctx->program, ctx->block);
970 bld.is_precise = instr->exact;
971 if (dst.type() == RegType::sgpr)
972 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
973 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
974 else
975 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
976 }
977
978 void
979 emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
980 {
981 Temp src0 = get_alu_src(ctx, instr->src[0]);
982 Temp src1 = get_alu_src(ctx, instr->src[1]);
983 assert(src0.size() == src1.size());
984
985 aco_ptr<Instruction> vopc;
986 if (src1.type() == RegType::sgpr) {
987 if (src0.type() == RegType::vgpr) {
988 /* to swap the operands, we might also have to change the opcode */
989 switch (op) {
990 case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break;
991 case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break;
992 case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break;
993 case aco_opcode::v_cmp_ge_i16: op = aco_opcode::v_cmp_le_i16; break;
994 case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break;
995 case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break;
996 case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break;
997 case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break;
998 case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break;
999 case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break;
1000 case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break;
1001 case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break;
1002 case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break;
1003 case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break;
1004 case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break;
1005 case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break;
1006 case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break;
1007 case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break;
1008 default: /* eq and ne are commutative */ break;
1009 }
1010 Temp t = src0;
1011 src0 = src1;
1012 src1 = t;
1013 } else {
1014 src1 = as_vgpr(ctx, src1);
1015 }
1016 }
1017
1018 Builder bld(ctx->program, ctx->block);
1019 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
1020 }
1021
1022 void
1023 emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst)
1024 {
1025 Temp src0 = get_alu_src(ctx, instr->src[0]);
1026 Temp src1 = get_alu_src(ctx, instr->src[1]);
1027 Builder bld(ctx->program, ctx->block);
1028
1029 assert(dst.regClass() == bld.lm);
1030 assert(src0.type() == RegType::sgpr);
1031 assert(src1.type() == RegType::sgpr);
1032 assert(src0.regClass() == src1.regClass());
1033
1034 /* Emit the SALU comparison instruction */
1035 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
1036 /* Turn the result into a per-lane bool */
1037 bool_to_vector_condition(ctx, cmp, dst);
1038 }
1039
1040 void
1041 emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op,
1042 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes,
1043 aco_opcode s64_op = aco_opcode::num_opcodes)
1044 {
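/* Use the scalar (SALU) comparison only if the result is uniform and both sources live in
 * SGPRs; otherwise emit the VALU comparison, which produces a lane mask. */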
1045 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op
1046 : instr->src[0].src.ssa->bit_size == 32 ? s32_op
1047 : aco_opcode::num_opcodes;
1048 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op
1049 : instr->src[0].src.ssa->bit_size == 32 ? v32_op
1050 : v16_op;
1051 bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) ||
1052 get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr ||
1053 get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr;
1054 aco_opcode op = use_valu ? v_op : s_op;
1055 assert(op != aco_opcode::num_opcodes);
1056 assert(dst.regClass() == ctx->program->lane_mask);
1057
1058 if (use_valu)
1059 emit_vopc_instruction(ctx, instr, op, dst);
1060 else
1061 emit_sopc_instruction(ctx, instr, op, dst);
1062 }
1063
1064 void
1065 emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op,
1066 Temp dst)
1067 {
1068 Builder bld(ctx->program, ctx->block);
1069 Temp src0 = get_alu_src(ctx, instr->src[0]);
1070 Temp src1 = get_alu_src(ctx, instr->src[1]);
1071
1072 assert(dst.regClass() == bld.lm);
1073 assert(src0.regClass() == bld.lm);
1074 assert(src1.regClass() == bld.lm);
1075
1076 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
1077 }
1078
1079 void
1080 emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst)
1081 {
1082 Builder bld(ctx->program, ctx->block);
1083 Temp cond = get_alu_src(ctx, instr->src[0]);
1084 Temp then = get_alu_src(ctx, instr->src[1]);
1085 Temp els = get_alu_src(ctx, instr->src[2]);
1086
1087 assert(cond.regClass() == bld.lm);
1088
1089 if (dst.type() == RegType::vgpr) {
1090 aco_ptr<Instruction> bcsel;
1091 if (dst.size() == 1) {
1092 then = as_vgpr(ctx, then);
1093 els = as_vgpr(ctx, els);
1094
1095 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
1096 } else if (dst.size() == 2) {
1097 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1098 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
1099 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1100 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
1101
1102 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
1103 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
1104
1105 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1106 } else {
1107 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1108 }
1109 return;
1110 }
1111
1112 if (instr->dest.dest.ssa.bit_size == 1) {
1113 assert(dst.regClass() == bld.lm);
1114 assert(then.regClass() == bld.lm);
1115 assert(els.regClass() == bld.lm);
1116 }
1117
1118 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
1119 if (dst.regClass() == s1 || dst.regClass() == s2) {
1120 assert((then.regClass() == s1 || then.regClass() == s2) &&
1121 els.regClass() == then.regClass());
1122 assert(dst.size() == then.size());
1123 aco_opcode op =
1124 dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
1125 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
1126 } else {
1127 isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
1128 }
1129 return;
1130 }
1131
1132 /* divergent boolean bcsel
1133 * this implements bcsel on bools: dst = s0 ? s1 : s2
1134 * which is computed as: dst = (s0 & s1) | (~s0 & s2) */
1135 assert(instr->dest.dest.ssa.bit_size == 1);
1136
1137 if (cond.id() != then.id())
1138 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
1139
1140 if (cond.id() == els.id())
1141 bld.copy(Definition(dst), then);
1142 else
1143 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
1144 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
1145 }
1146
1147 void
1148 emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op,
1149 uint32_t undo)
1150 {
1151 /* multiply by 16777216 to handle denormals */
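/* The v_cmp_class_f32 mask (1u << 7) | (1u << 4) tests for positive and negative denormals. */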
1152 Temp is_denormal =
1153 bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val),
1154 bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4))));
1155 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val);
1156 scaled = bld.vop1(op, bld.def(v1), scaled);
1157 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled);
1158
1159 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
1160
1161 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
1162 }
1163
1164 void
1165 emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1166 {
1167 if (ctx->block->fp_mode.denorm32 == 0) {
1168 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
1169 return;
1170 }
1171
1172 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
1173 }
1174
1175 void
1176 emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1177 {
1178 if (ctx->block->fp_mode.denorm32 == 0) {
1179 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
1180 return;
1181 }
1182
1183 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
1184 }
1185
1186 void
1187 emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1188 {
1189 if (ctx->block->fp_mode.denorm32 == 0) {
1190 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
1191 return;
1192 }
1193
1194 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
1195 }
1196
1197 void
1198 emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1199 {
1200 if (ctx->block->fp_mode.denorm32 == 0) {
1201 bld.vop1(aco_opcode::v_log_f32, dst, val);
1202 return;
1203 }
1204
1205 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
1206 }
1207
1208 Temp
1209 emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1210 {
1211 if (ctx->options->chip_class >= GFX7)
1212 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
1213
1214 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
1215 /* TODO: create more efficient code! */
1216 if (val.type() == RegType::sgpr)
1217 val = as_vgpr(ctx, val);
1218
1219 /* Split the input value. */
1220 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
1221 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
1222
1223 /* Extract the exponent and compute the unbiased value. */
1224 Temp exponent =
1225 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u));
1226 exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u));
1227
1228 /* Extract the fractional part. */
1229 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
1230 Operand::c32(0x000fffffu));
1231 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
1232
1233 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
1234 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi),
1235 fract_mask);
1236
1237 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
1238 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
1239 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
1240 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
1241 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
1242
1243 /* Get the sign bit. */
1244 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi);
1245
1246 /* Decide the operation to apply depending on the unbiased exponent. */
1247 Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent,
1248 Operand::zero());
1249 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo,
1250 bld.copy(bld.def(v1), Operand::zero()), exp_lt0);
1251 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1252 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u));
1253 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1254 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1255
1256 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1257 }
1258
1259 Temp
1260 emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val)
1261 {
1262 if (ctx->options->chip_class >= GFX7)
1263 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1264
1265 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1266 * lowered at NIR level for precision reasons). */
1267 Temp src0 = as_vgpr(ctx, val);
1268
1269 Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */
1270 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u),
1271 Operand::c32(0x3fefffffu));
1272
1273 Temp isnan =
1274 bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
1275 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1276 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1277
1278 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1279 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1280 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1281 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1282
1283 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1284 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1285
1286 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1287
1288 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1289 add->vop3().neg[1] = true;
1290
1291 return add->definitions[0].getTemp();
1292 }
1293
1294 Temp
1295 uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1)
1296 {
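/* Saturating 32-bit add: GFX8+ can use the VALU clamp bit; before GFX8, add with carry-out
 * and select 0xffffffff whenever the addition overflows. */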
1297 if (bld.program->chip_class < GFX8) {
1298 Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true);
1299 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1),
1300 add.def(1).getTemp());
1301 }
1302
1303 Builder::Result add(NULL);
1304 if (bld.program->chip_class >= GFX9) {
1305 add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1);
1306 } else {
1307 add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1);
1308 }
1309 add.instr->vop3().clamp = 1;
1310 return dst.getTemp();
1311 }
1312
1313 void
1314 visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
1315 {
1316 if (!instr->dest.dest.is_ssa) {
1317 isel_err(&instr->instr, "nir alu dst not in ssa");
1318 abort();
1319 }
1320 Builder bld(ctx->program, ctx->block);
1321 bld.is_precise = instr->exact;
1322 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1323 switch (instr->op) {
1324 case nir_op_vec2:
1325 case nir_op_vec3:
1326 case nir_op_vec4:
1327 case nir_op_vec5: {
1328 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
1329 unsigned num = instr->dest.dest.ssa.num_components;
1330 for (unsigned i = 0; i < num; ++i)
1331 elems[i] = get_alu_src(ctx, instr->src[i]);
1332
1333 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1334 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
1335 aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1336 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1337 for (unsigned i = 0; i < num; ++i) {
1338 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1339 elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc);
1340 vec->operands[i] = Operand{elems[i]};
1341 }
1342 vec->definitions[0] = Definition(dst);
1343 ctx->block->instructions.emplace_back(std::move(vec));
1344 ctx->allocated_vec.emplace(dst.id(), elems);
1345 } else {
1346 bool use_s_pack = ctx->program->chip_class >= GFX9;
1347 Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1));
1348
1349 std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed;
1350 uint32_t const_vals[NIR_MAX_VEC_COMPONENTS] = {};
1351 for (unsigned i = 0; i < num; i++) {
1352 unsigned packed_size = use_s_pack ? 16 : 32;
1353 unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size;
1354 unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size;
1355 if (nir_src_is_const(instr->src[i].src)) {
1356 const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset;
1357 continue;
1358 }
1359
1360 if (offset != packed_size - instr->dest.dest.ssa.bit_size)
1361 elems[i] =
1362 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1363
1364 if (offset)
1365 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1366 Operand::c32(offset));
1367
1368 if (packed[idx].id())
1369 packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i],
1370 packed[idx]);
1371 else
1372 packed[idx] = elems[i];
1373 }
1374
1375 if (use_s_pack) {
1376 for (unsigned i = 0; i < dst.size(); i++) {
1377 bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id();
1378
1379 if (packed[i * 2].id() && packed[i * 2 + 1].id())
1380 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1381 packed[i * 2 + 1]);
1382 else if (packed[i * 2 + 1].id())
1383 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1),
1384 Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]);
1385 else if (packed[i * 2].id())
1386 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2],
1387 Operand::c32(const_vals[i * 2 + 1]));
1388
1389 if (same)
1390 const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16);
1391 else
1392 const_vals[i] = 0;
1393 }
1394 }
1395
1396 for (unsigned i = 0; i < dst.size(); i++) {
1397 if (const_vals[i] && packed[i].id())
1398 packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
1399 Operand::c32(const_vals[i]), packed[i]);
1400 else if (!packed[i].id())
1401 packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i]));
1402 }
1403
1404 if (dst.size() == 1)
1405 bld.copy(Definition(dst), packed[0]);
1406 else if (dst.size() == 2)
1407 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]);
1408 else
1409 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1],
1410 packed[2]);
1411 }
1412 break;
1413 }
1414 case nir_op_mov: {
1415 Temp src = get_alu_src(ctx, instr->src[0]);
1416 if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) {
1417 /* use size() instead of bytes() for 8/16-bit */
1418 assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov");
1419 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1420 } else {
1421 assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov");
1422 bld.copy(Definition(dst), src);
1423 }
1424 break;
1425 }
1426 case nir_op_inot: {
1427 Temp src = get_alu_src(ctx, instr->src[0]);
1428 if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1429 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1430 } else if (dst.regClass() == v2) {
1431 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1432 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1433 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1434 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1435 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1436 } else if (dst.type() == RegType::sgpr) {
1437 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1438 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1439 } else {
1440 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1441 }
1442 break;
1443 }
1444 case nir_op_iabs: {
1445 Temp src = get_alu_src(ctx, instr->src[0]);
1446 if (dst.regClass() == s1) {
1447 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src);
1448 } else if (dst.regClass() == v1) {
1449 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src,
1450 bld.vsub32(bld.def(v1), Operand::zero(), src));
1451 } else {
1452 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1453 }
1454 break;
1455 }
1456 case nir_op_isign: {
1457 Temp src = get_alu_src(ctx, instr->src[0]);
1458 if (dst.regClass() == s1) {
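/* isign(x) = min(max(x, -1), 1) */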
1459 Temp tmp =
1460 bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1));
1461 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u));
1462 } else if (dst.regClass() == s2) {
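/* neg = x >> 63 (0 or -1); OR'ing in (x != 0) turns that into -1, 0 or 1. */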
1463 Temp neg =
1464 bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u));
1465 Temp neqz;
1466 if (ctx->program->chip_class >= GFX8)
1467 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero());
1468 else
1469 neqz =
1470 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero())
1471 .def(1)
1472 .getTemp();
1473 /* SCC gets zero-extended to 64 bit */
1474 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1475 } else if (dst.regClass() == v1) {
1476 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u));
1477 } else if (dst.regClass() == v2) {
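/* neg = x >> 63 (0 or -1, computed from the high dword); result = x > 0 ? 1 : neg. */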
1478 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1479 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper);
1480 Temp gtz =
1481 bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
1482 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz);
1483 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz);
1484 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1485 } else {
1486 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1487 }
1488 break;
1489 }
1490 case nir_op_imax: {
1491 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1492 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst);
1493 } else if (dst.regClass() == v2b) {
1494 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true);
1495 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1496 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst);
1497 } else if (dst.regClass() == v1) {
1498 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1499 } else if (dst.regClass() == s1) {
1500 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1501 } else {
1502 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1503 }
1504 break;
1505 }
1506 case nir_op_umax: {
1507 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1508 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst);
1509 } else if (dst.regClass() == v2b) {
1510 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true);
1511 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1512 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst);
1513 } else if (dst.regClass() == v1) {
1514 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1515 } else if (dst.regClass() == s1) {
1516 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1517 } else {
1518 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1519 }
1520 break;
1521 }
1522 case nir_op_imin: {
1523 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1524 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst);
1525 } else if (dst.regClass() == v2b) {
1526 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true);
1527 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1528 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst);
1529 } else if (dst.regClass() == v1) {
1530 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1531 } else if (dst.regClass() == s1) {
1532 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1533 } else {
1534 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1535 }
1536 break;
1537 }
1538 case nir_op_umin: {
1539 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1540 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst);
1541 } else if (dst.regClass() == v2b) {
1542 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true);
1543 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1544 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst);
1545 } else if (dst.regClass() == v1) {
1546 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1547 } else if (dst.regClass() == s1) {
1548 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1549 } else {
1550 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1551 }
1552 break;
1553 }
1554 case nir_op_ior: {
1555 if (instr->dest.dest.ssa.bit_size == 1) {
1556 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1557 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1558 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1559 } else if (dst.regClass() == v2) {
1560 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1561 } else if (dst.regClass() == s1) {
1562 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1563 } else if (dst.regClass() == s2) {
1564 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1565 } else {
1566 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1567 }
1568 break;
1569 }
1570 case nir_op_iand: {
1571 if (instr->dest.dest.ssa.bit_size == 1) {
1572 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1573 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1574 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1575 } else if (dst.regClass() == v2) {
1576 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1577 } else if (dst.regClass() == s1) {
1578 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1579 } else if (dst.regClass() == s2) {
1580 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1581 } else {
1582 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1583 }
1584 break;
1585 }
1586 case nir_op_ixor: {
1587 if (instr->dest.dest.ssa.bit_size == 1) {
1588 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1589 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) {
1590 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1591 } else if (dst.regClass() == v2) {
1592 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1593 } else if (dst.regClass() == s1) {
1594 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1595 } else if (dst.regClass() == s2) {
1596 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1597 } else {
1598 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1599 }
1600 break;
1601 }
1602 case nir_op_ushr: {
1603 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1604 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true);
1605 } else if (dst.regClass() == v2b) {
1606 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true);
1607 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1608 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true);
1609 } else if (dst.regClass() == v1) {
1610 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1611 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1612 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1613 get_alu_src(ctx, instr->src[0]));
1614 } else if (dst.regClass() == v2) {
1615 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst);
1616 } else if (dst.regClass() == s2) {
1617 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1618 } else if (dst.regClass() == s1) {
1619 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1620 } else {
1621 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1622 }
1623 break;
1624 }
1625 case nir_op_ishl: {
1626 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1627 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true);
1628 } else if (dst.regClass() == v2b) {
1629 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true);
1630 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1631 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true);
1632 } else if (dst.regClass() == v1) {
1633 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false,
1634 false, 2);
1635 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1636 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1637 get_alu_src(ctx, instr->src[0]));
1638 } else if (dst.regClass() == v2) {
1639 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst);
1640 } else if (dst.regClass() == s1) {
1641 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1);
1642 } else if (dst.regClass() == s2) {
1643 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1644 } else {
1645 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1646 }
1647 break;
1648 }
1649 case nir_op_ishr: {
1650 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) {
1651 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true);
1652 } else if (dst.regClass() == v2b) {
1653 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true);
1654 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1655 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, true);
1656 } else if (dst.regClass() == v1) {
1657 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1658 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1659 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]),
1660 get_alu_src(ctx, instr->src[0]));
1661 } else if (dst.regClass() == v2) {
1662 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst);
1663 } else if (dst.regClass() == s1) {
1664 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1665 } else if (dst.regClass() == s2) {
1666 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1667 } else {
1668 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1669 }
1670 break;
1671 }
1672 case nir_op_find_lsb: {
1673 Temp src = get_alu_src(ctx, instr->src[0]);
1674 if (src.regClass() == s1) {
1675 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1676 } else if (src.regClass() == v1) {
1677 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1678 } else if (src.regClass() == s2) {
1679 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1680 } else {
1681 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1682 }
1683 break;
1684 }
1685 case nir_op_ufind_msb:
1686 case nir_op_ifind_msb: {
1687 Temp src = get_alu_src(ctx, instr->src[0]);
1688 if (src.regClass() == s1 || src.regClass() == s2) {
1689 aco_opcode op = src.regClass() == s2
1690 ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64
1691 : aco_opcode::s_flbit_i32_i64)
1692 : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32
1693 : aco_opcode::s_flbit_i32);
1694 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
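/* s_flbit counts down from the top bit, so convert the count into an LSB-based
 * bit index. The subtraction borrows exactly when flbit found no bit (-1),
 * in which case the cselect below produces -1. */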
1695
1696 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1697 Operand::c32(src.size() * 32u - 1u), msb_rev);
1698 Temp msb = sub.def(0).getTemp();
1699 Temp carry = sub.def(1).getTemp();
1700
1701 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb,
1702 bld.scc(carry));
1703 } else if (src.regClass() == v1) {
1704 aco_opcode op =
1705 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1706 Temp msb_rev = bld.tmp(v1);
1707 emit_vop1_instruction(ctx, instr, op, msb_rev);
1708 Temp msb = bld.tmp(v1);
1709 Temp carry =
1710 bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp();
1711 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1712 } else if (src.regClass() == v2) {
1713 aco_opcode op =
1714 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1715
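/* ffbh each half; the saturating +32 rebases the low half's count onto the full
 * 64-bit value (and keeps "not found" at -1), then the high half's count is
 * preferred whenever it found a bit. */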
1716 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1717 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1718
1719 lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)),
1720 bld.vop1(op, bld.def(v1), lo));
1721 hi = bld.vop1(op, bld.def(v1), hi);
1722 Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi);
1723
1724 Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi);
1725
1726 Temp msb = bld.tmp(v1);
1727 Temp carry =
1728 bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp();
1729 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry);
1730 } else {
1731 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1732 }
1733 break;
1734 }
1735 case nir_op_bitfield_reverse: {
1736 if (dst.regClass() == s1) {
1737 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1738 } else if (dst.regClass() == v1) {
1739 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1740 } else {
1741 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1742 }
1743 break;
1744 }
1745 case nir_op_iadd: {
1746 if (dst.regClass() == s1) {
1747 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1748 break;
1749 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1750 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst);
1751 break;
1752 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1753 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true);
1754 break;
1755 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1756 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst);
1757 break;
1758 }
1759
1760 Temp src0 = get_alu_src(ctx, instr->src[0]);
1761 Temp src1 = get_alu_src(ctx, instr->src[1]);
1762 if (dst.type() == RegType::vgpr && dst.bytes() <= 4) {
1763 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1764 break;
1765 }
1766
1767 assert(src0.size() == 2 && src1.size() == 2);
1768 Temp src00 = bld.tmp(src0.type(), 1);
1769 Temp src01 = bld.tmp(dst.type(), 1);
1770 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1771 Temp src10 = bld.tmp(src1.type(), 1);
1772 Temp src11 = bld.tmp(dst.type(), 1);
1773 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1774
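/* 64-bit addition: add the low dwords, then the high dwords with the carry
 * from the first add. */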
1775 if (dst.regClass() == s2) {
1776 Temp carry = bld.tmp(s1);
1777 Temp dst0 =
1778 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1779 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1780 bld.scc(carry));
1781 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1782 } else if (dst.regClass() == v2) {
1783 Temp dst0 = bld.tmp(v1);
1784 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1785 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1786 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1787 } else {
1788 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1789 }
1790 break;
1791 }
1792 case nir_op_uadd_sat: {
1793 Temp src0 = get_alu_src(ctx, instr->src[0]);
1794 Temp src1 = get_alu_src(ctx, instr->src[1]);
1795 if (dst.regClass() == s1) {
1796 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1797 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1);
1798 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp,
1799 bld.scc(carry));
1800 } else if (dst.regClass() == v2b) {
1801 Instruction* add_instr;
1802 if (ctx->program->chip_class >= GFX10) {
1803 add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr;
1804 } else {
1805 if (src1.type() == RegType::sgpr)
1806 std::swap(src0, src1);
1807 add_instr =
1808 bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr;
1809 }
1810 add_instr->vop3().clamp = 1;
1811 } else if (dst.regClass() == v1) {
1812 uadd32_sat(bld, Definition(dst), src0, src1);
1813 } else {
1814 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1815 }
1816 break;
1817 }
1818 case nir_op_iadd_sat: {
1819 Temp src0 = get_alu_src(ctx, instr->src[0]);
1820 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1821 if (dst.regClass() == v2b) {
1822 Instruction* add_instr =
1823 bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr;
1824 add_instr->vop3().clamp = 1;
1825 } else if (dst.regClass() == v1) {
1826 Instruction* add_instr =
1827 bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr;
1828 add_instr->vop3().clamp = 1;
1829 } else {
1830 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1831 }
1832 break;
1833 }
1834 case nir_op_uadd_carry: {
1835 Temp src0 = get_alu_src(ctx, instr->src[0]);
1836 Temp src1 = get_alu_src(ctx, instr->src[1]);
1837 if (dst.regClass() == s1) {
1838 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1839 break;
1840 }
1841 if (dst.regClass() == v1) {
1842 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1843 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1844 carry);
1845 break;
1846 }
1847
1848 Temp src00 = bld.tmp(src0.type(), 1);
1849 Temp src01 = bld.tmp(dst.type(), 1);
1850 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1851 Temp src10 = bld.tmp(src1.type(), 1);
1852 Temp src11 = bld.tmp(dst.type(), 1);
1853 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1854 if (dst.regClass() == s2) {
1855 Temp carry = bld.tmp(s1);
1856 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1857 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1858 bld.scc(carry))
1859 .def(1)
1860 .getTemp();
1861 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1862 } else if (dst.regClass() == v2) {
1863 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1864 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1865 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1866 Operand::c32(1u), carry);
1867 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero());
1868 } else {
1869 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1870 }
1871 break;
1872 }
1873 case nir_op_isub: {
1874 if (dst.regClass() == s1) {
1875 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1876 break;
1877 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1878 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst);
1879 break;
1880 }
1881
1882 Temp src0 = get_alu_src(ctx, instr->src[0]);
1883 Temp src1 = get_alu_src(ctx, instr->src[1]);
1884 if (dst.regClass() == v1) {
1885 bld.vsub32(Definition(dst), src0, src1);
1886 break;
1887 } else if (dst.bytes() <= 2) {
1888 if (ctx->program->chip_class >= GFX10)
1889 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1);
1890 else if (src1.type() == RegType::sgpr)
1891 bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0));
1892 else if (ctx->program->chip_class >= GFX8)
1893 bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1));
1894 else
1895 bld.vsub32(Definition(dst), src0, src1);
1896 break;
1897 }
1898
1899 Temp src00 = bld.tmp(src0.type(), 1);
1900 Temp src01 = bld.tmp(dst.type(), 1);
1901 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1902 Temp src10 = bld.tmp(src1.type(), 1);
1903 Temp src11 = bld.tmp(dst.type(), 1);
1904 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1905 if (dst.regClass() == s2) {
1906 Temp borrow = bld.tmp(s1);
1907 Temp dst0 =
1908 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1909 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11,
1910 bld.scc(borrow));
1911 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1912 } else if (dst.regClass() == v2) {
1913 Temp lower = bld.tmp(v1);
1914 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1915 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1916 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1917 } else {
1918 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1919 }
1920 break;
1921 }
1922 case nir_op_usub_borrow: {
1923 Temp src0 = get_alu_src(ctx, instr->src[0]);
1924 Temp src1 = get_alu_src(ctx, instr->src[1]);
1925 if (dst.regClass() == s1) {
1926 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1927 break;
1928 } else if (dst.regClass() == v1) {
1929 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1930 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u),
1931 borrow);
1932 break;
1933 }
1934
1935 Temp src00 = bld.tmp(src0.type(), 1);
1936 Temp src01 = bld.tmp(dst.type(), 1);
1937 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1938 Temp src10 = bld.tmp(src1.type(), 1);
1939 Temp src11 = bld.tmp(dst.type(), 1);
1940 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1941 if (dst.regClass() == s2) {
1942 Temp borrow = bld.tmp(s1);
1943 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1944 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11,
1945 bld.scc(borrow))
1946 .def(1)
1947 .getTemp();
1948 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1949 } else if (dst.regClass() == v2) {
1950 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1951 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1952 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
1953 Operand::c32(1u), borrow);
1954 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero());
1955 } else {
1956 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1957 }
1958 break;
1959 }
1960 case nir_op_imul: {
1961 if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
1962 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst);
1963 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
1964 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true);
1965 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
1966 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst);
1967 } else if (dst.type() == RegType::vgpr) {
1968 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1969 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1970
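/* If both operands are known to fit in 24 bits, v_mul_u32_u24 produces the same
 * low 32 bits and is typically cheaper than a full v_mul_lo_u32. */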
1971 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
1972 bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff;
1973 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst,
1974 true /* commutative */, false, false, nuw_16bit);
1975 } else if (nir_src_is_const(instr->src[0].src)) {
1976 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]),
1977 nir_src_as_uint(instr->src[0].src), false);
1978 } else if (nir_src_is_const(instr->src[1].src)) {
1979 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]),
1980 nir_src_as_uint(instr->src[1].src), false);
1981 } else {
1982 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst);
1983 }
1984 } else if (dst.regClass() == s1) {
1985 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1986 } else {
1987 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
1988 }
1989 break;
1990 }
1991 case nir_op_umul_high: {
1992 if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1993 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false);
1994 } else if (dst.bytes() == 4) {
1995 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0);
1996 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1);
1997
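/* Without s_mul_hi (pre-GFX9), do the multiply in a VGPR and copy the result
 * back to an SGPR if needed. */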
1998 Temp tmp = dst.regClass() == s1 ? bld.tmp(v1) : dst;
1999 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) {
2000 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true);
2001 } else {
2002 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp);
2003 }
2004
2005 if (dst.regClass() == s1)
2006 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2007 } else {
2008 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2009 }
2010 break;
2011 }
2012 case nir_op_imul_high: {
2013 if (dst.regClass() == v1) {
2014 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst);
2015 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
2016 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false);
2017 } else if (dst.regClass() == s1) {
2018 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
2019 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
2020 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2021 } else {
2022 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2023 }
2024 break;
2025 }
2026 case nir_op_fmul: {
2027 if (dst.regClass() == v2b) {
2028 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
2029 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2030 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst);
2031 } else if (dst.regClass() == v1) {
2032 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
2033 } else if (dst.regClass() == v2) {
2034 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst);
2035 } else {
2036 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2037 }
2038 break;
2039 }
2040 case nir_op_fadd: {
2041 if (dst.regClass() == v2b) {
2042 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
2043 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2044 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2045 } else if (dst.regClass() == v1) {
2046 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
2047 } else if (dst.regClass() == v2) {
2048 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst);
2049 } else {
2050 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2051 }
2052 break;
2053 }
2054 case nir_op_fsub: {
2055 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2056 Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst);
2057 VOP3P_instruction& sub = add->vop3p();
2058 sub.neg_lo[1] = true;
2059 sub.neg_hi[1] = true;
2060 break;
2061 }
2062
2063 Temp src0 = get_alu_src(ctx, instr->src[0]);
2064 Temp src1 = get_alu_src(ctx, instr->src[1]);
2065 if (dst.regClass() == v2b) {
2066 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2067 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
2068 else
2069 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
2070 } else if (dst.regClass() == v1) {
2071 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
2072 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
2073 else
2074 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
2075 } else if (dst.regClass() == v2) {
2076 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0),
2077 as_vgpr(ctx, src1));
2078 add->vop3().neg[1] = true;
2079 } else {
2080 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2081 }
2082 break;
2083 }
2084 case nir_op_fmax: {
2085 if (dst.regClass() == v2b) {
2086 // TODO: check fp_mode.must_flush_denorms16_64
2087 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
2088 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2089 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst);
2090 } else if (dst.regClass() == v1) {
2091 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false,
2092 ctx->block->fp_mode.must_flush_denorms32);
2093 } else if (dst.regClass() == v2) {
2094 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst,
2095 ctx->block->fp_mode.must_flush_denorms16_64);
2096 } else {
2097 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2098 }
2099 break;
2100 }
2101 case nir_op_fmin: {
2102 if (dst.regClass() == v2b) {
2103 // TODO: check fp_mode.must_flush_denorms16_64
2104 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
2105 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2106 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true);
2107 } else if (dst.regClass() == v1) {
2108 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false,
2109 ctx->block->fp_mode.must_flush_denorms32);
2110 } else if (dst.regClass() == v2) {
2111 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst,
2112 ctx->block->fp_mode.must_flush_denorms16_64);
2113 } else {
2114 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2115 }
2116 break;
2117 }
2118 case nir_op_sdot_4x8_iadd: {
2119 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false);
2120 break;
2121 }
2122 case nir_op_sdot_4x8_iadd_sat: {
2123 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true);
2124 break;
2125 }
2126 case nir_op_udot_4x8_uadd: {
2127 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false);
2128 break;
2129 }
2130 case nir_op_udot_4x8_uadd_sat: {
2131 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true);
2132 break;
2133 }
2134 case nir_op_sdot_2x16_iadd: {
2135 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false);
2136 break;
2137 }
2138 case nir_op_sdot_2x16_iadd_sat: {
2139 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true);
2140 break;
2141 }
2142 case nir_op_udot_2x16_uadd: {
2143 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false);
2144 break;
2145 }
2146 case nir_op_udot_2x16_uadd_sat: {
2147 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true);
2148 break;
2149 }
2150 case nir_op_cube_face_coord_amd: {
2151 Temp in = get_alu_src(ctx, instr->src[0], 3);
2152 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2153 emit_extract_vector(ctx, in, 2, v1)};
2154 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
2155 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
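/* Remap the face coordinates into [0, 1]: coord * (1 / ma) + 0.5. */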
2156 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
2157 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
2158 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2159 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma));
2160 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/),
2161 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma));
2162 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
2163 break;
2164 }
2165 case nir_op_cube_face_index_amd: {
2166 Temp in = get_alu_src(ctx, instr->src[0], 3);
2167 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1),
2168 emit_extract_vector(ctx, in, 2, v1)};
2169 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
2170 break;
2171 }
2172 case nir_op_bcsel: {
2173 emit_bcsel(ctx, instr, dst);
2174 break;
2175 }
2176 case nir_op_frsq: {
2177 if (dst.regClass() == v2b) {
2178 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
2179 } else if (dst.regClass() == v1) {
2180 Temp src = get_alu_src(ctx, instr->src[0]);
2181 emit_rsq(ctx, bld, Definition(dst), src);
2182 } else if (dst.regClass() == v2) {
2183 /* Lowered at NIR level for precision reasons. */
2184 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
2185 } else {
2186 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2187 }
2188 break;
2189 }
2190 case nir_op_fneg: {
2191 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2192 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2193 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0xBC00),
2194 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2195 emit_split_vector(ctx, dst, 2);
2196 break;
2197 }
2198 Temp src = get_alu_src(ctx, instr->src[0]);
2199 if (dst.regClass() == v2b) {
2200 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src));
2201 } else if (dst.regClass() == v1) {
2202 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u),
2203 as_vgpr(ctx, src));
2204 } else if (dst.regClass() == v2) {
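/* Negate by flipping the sign bit of the high dword (after a *1.0 multiply
 * if denormals have to be flushed). */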
2205 if (ctx->block->fp_mode.must_flush_denorms16_64)
2206 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2207 as_vgpr(ctx, src));
2208 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2209 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2210 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper);
2211 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2212 } else {
2213 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2214 }
2215 break;
2216 }
2217 case nir_op_fabs: {
2218 Temp src = get_alu_src(ctx, instr->src[0]);
2219 if (dst.regClass() == v2b) {
2220 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst),
2221 Operand::c16(0x3c00), as_vgpr(ctx, src))
2222 .instr;
2223 mul->vop3().abs[1] = true;
2224 } else if (dst.regClass() == v1) {
2225 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst),
2226 Operand::c32(0x3f800000u), as_vgpr(ctx, src))
2227 .instr;
2228 mul->vop3().abs[1] = true;
2229 } else if (dst.regClass() == v2) {
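/* Clear the sign bit of the high dword (again flushing denormals first if required). */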
2230 if (ctx->block->fp_mode.must_flush_denorms16_64)
2231 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000),
2232 as_vgpr(ctx, src));
2233 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
2234 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2235 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper);
2236 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2237 } else {
2238 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2239 }
2240 break;
2241 }
2242 case nir_op_fsat: {
2243 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) {
2244 Temp src = get_alu_src_vop3p(ctx, instr->src[0]);
2245 Instruction* vop3p =
2246 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00),
2247 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1);
2248 vop3p->vop3p().clamp = true;
2249 emit_split_vector(ctx, dst, 2);
2250 break;
2251 }
2252 Temp src = get_alu_src(ctx, instr->src[0]);
2253 if (dst.regClass() == v2b) {
2254 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00),
2255 src);
2256 } else if (dst.regClass() == v1) {
2257 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(),
2258 Operand::c32(0x3f800000u), src);
2259 /* apparently, it is not necessary to flush denorms if this instruction is used with these
2260 * operands */
2261 // TODO: confirm that this holds under any circumstances
2262 } else if (dst.regClass() == v2) {
2263 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero());
2264 add->vop3().clamp = true;
2265 } else {
2266 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2267 }
2268 break;
2269 }
2270 case nir_op_flog2: {
2271 if (dst.regClass() == v2b) {
2272 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2273 } else if (dst.regClass() == v1) {
2274 Temp src = get_alu_src(ctx, instr->src[0]);
2275 emit_log2(ctx, bld, Definition(dst), src);
2276 } else {
2277 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2278 }
2279 break;
2280 }
2281 case nir_op_frcp: {
2282 if (dst.regClass() == v2b) {
2283 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2284 } else if (dst.regClass() == v1) {
2285 Temp src = get_alu_src(ctx, instr->src[0]);
2286 emit_rcp(ctx, bld, Definition(dst), src);
2287 } else if (dst.regClass() == v2) {
2288 /* Lowered at NIR level for precision reasons. */
2289 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2290 } else {
2291 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2292 }
2293 break;
2294 }
2295 case nir_op_fexp2: {
2296 if (dst.regClass() == v2b) {
2297 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2298 } else if (dst.regClass() == v1) {
2299 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2300 } else {
2301 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2302 }
2303 break;
2304 }
2305 case nir_op_fsqrt: {
2306 if (dst.regClass() == v2b) {
2307 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2308 } else if (dst.regClass() == v1) {
2309 Temp src = get_alu_src(ctx, instr->src[0]);
2310 emit_sqrt(ctx, bld, Definition(dst), src);
2311 } else if (dst.regClass() == v2) {
2312 /* Lowered at NIR level for precision reasons. */
2313 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2314 } else {
2315 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2316 }
2317 break;
2318 }
2319 case nir_op_ffract: {
2320 if (dst.regClass() == v2b) {
2321 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2322 } else if (dst.regClass() == v1) {
2323 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2324 } else if (dst.regClass() == v2) {
2325 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2326 } else {
2327 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2328 }
2329 break;
2330 }
2331 case nir_op_ffloor: {
2332 if (dst.regClass() == v2b) {
2333 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2334 } else if (dst.regClass() == v1) {
2335 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2336 } else if (dst.regClass() == v2) {
2337 Temp src = get_alu_src(ctx, instr->src[0]);
2338 emit_floor_f64(ctx, bld, Definition(dst), src);
2339 } else {
2340 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2341 }
2342 break;
2343 }
2344 case nir_op_fceil: {
2345 if (dst.regClass() == v2b) {
2346 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2347 } else if (dst.regClass() == v1) {
2348 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2349 } else if (dst.regClass() == v2) {
2350 if (ctx->options->chip_class >= GFX7) {
2351 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2352 } else {
2353 /* GFX6 doesn't support V_CEIL_F64, lower it. */
2354 /* trunc = trunc(src0)
2355 * if (src0 > 0.0 && src0 != trunc)
2356 * trunc += 1.0
2357 */
2358 Temp src0 = get_alu_src(ctx, instr->src[0]);
2359 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2360 Temp tmp0 =
2361 bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero());
2362 Temp tmp1 =
2363 bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2364 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc),
2365 tmp0, tmp1);
2366 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
2367 bld.copy(bld.def(v1), Operand::zero()),
2368 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond);
2369 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
2370 bld.copy(bld.def(v1), Operand::zero()), add);
2371 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2372 }
2373 } else {
2374 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2375 }
2376 break;
2377 }
2378 case nir_op_ftrunc: {
2379 if (dst.regClass() == v2b) {
2380 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2381 } else if (dst.regClass() == v1) {
2382 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2383 } else if (dst.regClass() == v2) {
2384 Temp src = get_alu_src(ctx, instr->src[0]);
2385 emit_trunc_f64(ctx, bld, Definition(dst), src);
2386 } else {
2387 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2388 }
2389 break;
2390 }
2391 case nir_op_fround_even: {
2392 if (dst.regClass() == v2b) {
2393 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2394 } else if (dst.regClass() == v1) {
2395 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2396 } else if (dst.regClass() == v2) {
2397 if (ctx->options->chip_class >= GFX7) {
2398 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2399 } else {
2400 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
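/* Use the 2^52 trick: adding and then subtracting 2^52 with the sign of the
 * source (built via v_bfi) rounds away the fractional bits with the default
 * round-to-nearest-even mode. Sources too large to have a fractional part
 * (|x| >= 2^52) are passed through unchanged. */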
2401 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2402 Temp src0 = get_alu_src(ctx, instr->src[0]);
2403 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2404
2405 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1),
2406 bld.copy(bld.def(s1), Operand::c32(-2u)));
2407 Temp bfi =
2408 bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask,
2409 bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi));
2410 Temp tmp =
2411 bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0,
2412 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2413 Instruction* sub =
2414 bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp,
2415 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi));
2416 sub->vop3().neg[1] = true;
2417 tmp = sub->definitions[0].getTemp();
2418
2419 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u),
2420 Operand::c32(0x432fffffu));
2421 Instruction* vop3 =
2422 bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2423 vop3->vop3().abs[0] = true;
2424 Temp cond = vop3->definitions[0].getTemp();
2425
2426 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2427 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2428 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo,
2429 as_vgpr(ctx, src0_lo), cond);
2430 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi,
2431 as_vgpr(ctx, src0_hi), cond);
2432
2433 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2434 }
2435 } else {
2436 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2437 }
2438 break;
2439 }
2440 case nir_op_fsin:
2441 case nir_op_fcos: {
2442 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2443 aco_ptr<Instruction> norm;
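/* The HW sin/cos opcodes take their argument in revolutions, so the input is
 * scaled by 1/(2*PI) first (0x3118 / 0x3e22f983 below). */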
2444 if (dst.regClass() == v2b) {
2445 Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3118u));
2446 Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2447 aco_opcode opcode =
2448 instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2449 bld.vop1(opcode, Definition(dst), tmp);
2450 } else if (dst.regClass() == v1) {
2451 Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u));
2452 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2453
2454 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2455 if (ctx->options->chip_class < GFX9)
2456 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2457
2458 aco_opcode opcode =
2459 instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2460 bld.vop1(opcode, Definition(dst), tmp);
2461 } else {
2462 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2463 }
2464 break;
2465 }
2466 case nir_op_ldexp: {
2467 if (dst.regClass() == v2b) {
2468 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2469 } else if (dst.regClass() == v1) {
2470 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst);
2471 } else if (dst.regClass() == v2) {
2472 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst);
2473 } else {
2474 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2475 }
2476 break;
2477 }
2478 case nir_op_frexp_sig: {
2479 if (dst.regClass() == v2b) {
2480 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst);
2481 } else if (dst.regClass() == v1) {
2482 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst);
2483 } else if (dst.regClass() == v2) {
2484 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst);
2485 } else {
2486 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2487 }
2488 break;
2489 }
2490 case nir_op_frexp_exp: {
2491 if (instr->src[0].src.ssa->bit_size == 16) {
2492 Temp src = get_alu_src(ctx, instr->src[0]);
2493 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
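/* The f16 exponent fits in a signed byte, so take the low byte of the i16
 * result and sign-extend it to 32 bits. */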
2494 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero());
2495 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2496 } else if (instr->src[0].src.ssa->bit_size == 32) {
2497 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst);
2498 } else if (instr->src[0].src.ssa->bit_size == 64) {
2499 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst);
2500 } else {
2501 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2502 }
2503 break;
2504 }
2505 case nir_op_fsign: {
2506 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2507 if (dst.regClass() == v2b) {
2508 assert(ctx->program->chip_class >= GFX9);
2509 /* replace negative zero with positive zero */
2510 src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src);
2511 src =
2512 bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u));
2513 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2514 } else if (dst.regClass() == v1) {
2515 src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
2516 src =
2517 bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
2518 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2519 } else if (dst.regClass() == v2) {
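/* Build the result in the high dword: +1.0 for x > 0, -1.0 for x < 0, and the
 * source's own high dword for +/-0 (NaN ends up as -1.0); the low dword is
 * always zero. */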
2520 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)),
2521 Operand::zero(), src);
2522 Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
2523 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp,
2524 emit_extract_vector(ctx, src, 1, v1), cond);
2525
2526 cond =
2527 bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src);
2528 tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u));
2529 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2530
2531 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
2532 } else {
2533 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2534 }
2535 break;
2536 }
2537 case nir_op_f2f16:
2538 case nir_op_f2f16_rtne: {
2539 Temp src = get_alu_src(ctx, instr->src[0]);
2540 if (instr->src[0].src.ssa->bit_size == 64)
2541 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2542 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2543 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2544 * keep value numbering and the scheduler simpler.
2545 */
2546 bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2547 else
2548 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2549 break;
2550 }
2551 case nir_op_f2f16_rtz: {
2552 Temp src = get_alu_src(ctx, instr->src[0]);
2553 if (instr->src[0].src.ssa->bit_size == 64)
2554 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
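/* If the float mode is already round-towards-zero, a plain conversion suffices;
 * otherwise use v_cvt_pkrtz, which rounds towards zero regardless of the mode. */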
2555 if (ctx->block->fp_mode.round16_64 == fp_round_tz)
2556 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2557 else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
2558 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero());
2559 else
2560 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src));
2561 break;
2562 }
2563 case nir_op_f2f32: {
2564 if (instr->src[0].src.ssa->bit_size == 16) {
2565 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2566 } else if (instr->src[0].src.ssa->bit_size == 64) {
2567 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2568 } else {
2569 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2570 }
2571 break;
2572 }
2573 case nir_op_f2f64: {
2574 Temp src = get_alu_src(ctx, instr->src[0]);
2575 if (instr->src[0].src.ssa->bit_size == 16)
2576 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2577 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2578 break;
2579 }
2580 case nir_op_i2f16: {
2581 assert(dst.regClass() == v2b);
2582 Temp src = get_alu_src(ctx, instr->src[0]);
2583 const unsigned input_size = instr->src[0].src.ssa->bit_size;
2584 if (input_size <= 16) {
2585 /* Expand the integer to the size expected by the int→float converter used below */
2586 unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2587 if (input_size != target_size) {
2588 src = convert_int(ctx, bld, src, input_size, target_size, true);
2589 }
2590 } else if (input_size == 64) {
2591 /* Truncate down to 32 bits; if any of the upper bits are relevant,
2592 * the value does not fall into the single-precision float range
2593 * anyway. SPIR-V does not mandate any specific behavior for such
2594 * large inputs.
2595 */
2596 src = convert_int(ctx, bld, src, 64, 32, false);
2597 }
2598
2599 if (ctx->program->chip_class >= GFX8 && input_size <= 16) {
2600 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2601 } else {
2602 /* Convert to f32 and then down to f16. This is needed to handle
2603 * inputs slightly outside the range [INT16_MIN, INT16_MAX],
2604 * which are representable via f16 but wouldn't be converted
2605 * correctly by v_cvt_f16_i16.
2606 *
2607 * This is also the fallback-path taken on GFX7 and earlier, which
2608 * do not support direct f16⟷i16 conversions.
2609 */
2610 src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src);
2611 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2612 }
2613 break;
2614 }
2615 case nir_op_i2f32: {
2616 assert(dst.size() == 1);
2617 Temp src = get_alu_src(ctx, instr->src[0]);
2618 const unsigned input_size = instr->src[0].src.ssa->bit_size;
2619 if (input_size <= 32) {
2620 if (input_size <= 16) {
2621 /* Sign-extend to 32-bits */
2622 src = convert_int(ctx, bld, src, input_size, 32, true);
2623 }
2624 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2625 } else {
2626 assert(input_size == 64);
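/* Convert each dword to f64 (low half unsigned, high half signed), scale the
 * high half by 2^32, add them and convert the f64 sum to f32. */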
2627 RegClass rc = RegClass(src.type(), 1);
2628 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2629 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2630 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2631 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2632 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2633 upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2634 bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2635 }
2636
2637 break;
2638 }
2639 case nir_op_i2f64: {
2640 if (instr->src[0].src.ssa->bit_size <= 32) {
2641 Temp src = get_alu_src(ctx, instr->src[0]);
2642 if (instr->src[0].src.ssa->bit_size <= 16)
2643 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2644 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2645 } else if (instr->src[0].src.ssa->bit_size == 64) {
2646 Temp src = get_alu_src(ctx, instr->src[0]);
2647 RegClass rc = RegClass(src.type(), 1);
2648 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2649 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2650 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2651 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2652 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2653 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2654
2655 } else {
2656 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2657 }
2658 break;
2659 }
2660 case nir_op_u2f16: {
2661 assert(dst.regClass() == v2b);
2662 Temp src = get_alu_src(ctx, instr->src[0]);
2663 const unsigned input_size = instr->src[0].src.ssa->bit_size;
2664 if (input_size <= 16) {
2665 /* Expand integer to the size expected by the uint→float converter used below */
2666 unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32);
2667 if (input_size != target_size) {
2668 src = convert_int(ctx, bld, src, input_size, target_size, false);
2669 }
2670 } else if (input_size == 64) {
2671 /* Truncate down to 32 bits; if any of the upper bits are non-zero,
2672 * the value does not fall into the single-precision float range
2673 * anyway. SPIR-V does not mandate any specific behavior for such
2674 * large inputs.
2675 */
2676 src = convert_int(ctx, bld, src, 64, 32, false);
2677 }
2678
2679 if (ctx->program->chip_class >= GFX8) {
2680 /* The largest input that still converts to a finite float16 is 65519;
2681 * converting anything larger is UB, so only the lower 16 bits matter. */
2682 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2683 } else {
2684 /* GFX7 and earlier do not support direct f16⟷u16 conversions */
2685 src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src);
2686 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2687 }
2688 break;
2689 }
2690 case nir_op_u2f32: {
2691 assert(dst.size() == 1);
2692 Temp src = get_alu_src(ctx, instr->src[0]);
2693 const unsigned input_size = instr->src[0].src.ssa->bit_size;
2694 if (input_size == 8) {
2695 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2696 } else if (input_size <= 32) {
2697 if (input_size == 16)
2698 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2699 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2700 } else {
2701 assert(input_size == 64);
2702 RegClass rc = RegClass(src.type(), 1);
2703 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2704 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2705 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2706 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2707 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2708 upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper);
2709 bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper);
2710 }
2711 break;
2712 }
2713 case nir_op_u2f64: {
2714 if (instr->src[0].src.ssa->bit_size <= 32) {
2715 Temp src = get_alu_src(ctx, instr->src[0]);
2716 if (instr->src[0].src.ssa->bit_size <= 16)
2717 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2718 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2719 } else if (instr->src[0].src.ssa->bit_size == 64) {
2720 Temp src = get_alu_src(ctx, instr->src[0]);
2721 RegClass rc = RegClass(src.type(), 1);
2722 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2723 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2724 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2725 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2726 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u));
2727 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2728 } else {
2729 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2730 }
2731 break;
2732 }
2733 case nir_op_f2i8:
2734 case nir_op_f2i16: {
2735 if (instr->src[0].src.ssa->bit_size == 16) {
2736 if (ctx->program->chip_class >= GFX8) {
2737 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2738 } else {
2739 /* GFX7 and earlier do not support direct f16⟷i16 conversions */
2740 Temp tmp = bld.tmp(v1);
2741 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2742 tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp);
2743 tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2744 (dst.type() == RegType::sgpr) ? Temp() : dst);
2745 if (dst.type() == RegType::sgpr) {
2746 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2747 }
2748 }
2749 } else if (instr->src[0].src.ssa->bit_size == 32) {
2750 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2751 } else {
2752 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2753 }
2754 break;
2755 }
2756 case nir_op_f2u8:
2757 case nir_op_f2u16: {
2758 if (instr->src[0].src.ssa->bit_size == 16) {
2759 if (ctx->program->chip_class >= GFX8) {
2760 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2761 } else {
2762 /* GFX7 and earlier do not support direct f16⟷u16 conversions */
2763 Temp tmp = bld.tmp(v1);
2764 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp);
2765 tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp);
2766 tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false,
2767 (dst.type() == RegType::sgpr) ? Temp() : dst);
2768 if (dst.type() == RegType::sgpr) {
2769 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
2770 }
2771 }
2772 } else if (instr->src[0].src.ssa->bit_size == 32) {
2773 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2774 } else {
2775 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2776 }
2777 break;
2778 }
2779 case nir_op_f2i32: {
2780 Temp src = get_alu_src(ctx, instr->src[0]);
2781 if (instr->src[0].src.ssa->bit_size == 16) {
2782 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2783 if (dst.type() == RegType::vgpr) {
2784 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2785 } else {
2786 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2787 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2788 }
2789 } else if (instr->src[0].src.ssa->bit_size == 32) {
2790 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2791 } else if (instr->src[0].src.ssa->bit_size == 64) {
2792 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2793 } else {
2794 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2795 }
2796 break;
2797 }
2798 case nir_op_f2u32: {
2799 Temp src = get_alu_src(ctx, instr->src[0]);
2800 if (instr->src[0].src.ssa->bit_size == 16) {
2801 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2802 if (dst.type() == RegType::vgpr) {
2803 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2804 } else {
2805 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2806 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2807 }
2808 } else if (instr->src[0].src.ssa->bit_size == 32) {
2809 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2810 } else if (instr->src[0].src.ssa->bit_size == 64) {
2811 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2812 } else {
2813 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2814 }
2815 break;
2816 }
2817 case nir_op_f2i64: {
2818 Temp src = get_alu_src(ctx, instr->src[0]);
2819 if (instr->src[0].src.ssa->bit_size == 16)
2820 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2821
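 /* There is no hardware instruction that converts a float directly to a 64-bit
  * integer, so the conversion is done manually: decompose the value into sign,
  * exponent and mantissa, shift the (implicit-one) mantissa into place and
  * clamp to the extreme 64-bit values when the exponent is out of range. */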
2822 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2823 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2824 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent,
2825 Operand::c32(64u));
2826 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2827 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src);
2828 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2829 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa);
2830 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2831 Temp new_exponent = bld.tmp(v1);
2832 Temp borrow =
2833 bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp();
2834 if (ctx->program->chip_class >= GFX8)
2835 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2836 else
2837 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2838 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu));
2839 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2840 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2841 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower,
2842 Operand::c32(0xffffffffu), borrow);
2843 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2844 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2845 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2846 Temp new_lower = bld.tmp(v1);
2847 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2848 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2849 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2850
2851 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2852 if (src.type() == RegType::vgpr)
2853 src = bld.as_uniform(src);
2854 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2855 Operand::c32(0x80017u));
2856 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2857 Operand::c32(126u));
2858 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2859 exponent);
2860 exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc),
2861 Operand::c32(64u), exponent);
2862 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2863 Operand::c32(0x7fffffu), src);
2864 Temp sign =
2865 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u));
2866 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2867 Operand::c32(0x800000u), mantissa);
2868 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2869 Operand::c32(7u));
2870 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2871 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2872 Operand::c32(63u), exponent);
2873 mantissa =
2874 bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2875 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent,
2876 Operand::c32(0xffffffffu)); // exp >= 64
2877 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu));
2878 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2879 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2880 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2881 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2882 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2883 Temp borrow = bld.tmp(s1);
2884 lower =
2885 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2886 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign,
2887 bld.scc(borrow));
2888 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2889
2890 } else if (instr->src[0].src.ssa->bit_size == 64) {
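 /* Split the double into two 32-bit halves: floor(x * 2^-32) yields the upper dword
  * (0x3df00000 is the high word of 2^-32), and fma(floor, -2^32, trunc(x)) recovers
  * the lower dword (0xc1f00000 is the high word of -2^32). */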
2891 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2892 Operand::c32(0x3df00000u));
2893 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2894 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2895 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2896 Operand::c32(0xc1f00000u));
2897 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2898 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2899 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2900 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2901 if (dst.type() == RegType::sgpr) {
2902 lower = bld.as_uniform(lower);
2903 upper = bld.as_uniform(upper);
2904 }
2905 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2906
2907 } else {
2908 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2909 }
2910 break;
2911 }
2912 case nir_op_f2u64: {
2913 Temp src = get_alu_src(ctx, instr->src[0]);
2914 if (instr->src[0].src.ssa->bit_size == 16)
2915 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2916
2917 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
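 /* Build the 64-bit result from exponent and mantissa: for exponents below 24 the
  * mantissa is shifted right, otherwise it is shifted left by (exponent - 24);
  * values that do not fit into 64 bits produce all ones. */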
2918 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2919 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)),
2920 Operand::c32(64u), exponent);
2921 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent);
2922 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src);
2923 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa);
2924 Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent);
2925 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2926 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa);
2927 Temp new_exponent = bld.tmp(v1);
2928 Temp cond_small =
2929 bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp();
2930 if (ctx->program->chip_class >= GFX8)
2931 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2932 else
2933 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2934 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2935 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2936 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2937 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(),
2938 cond_small);
2939 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower,
2940 exponent_in_range);
2941 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper,
2942 exponent_in_range);
2943 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2944
2945 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2946 if (src.type() == RegType::vgpr)
2947 src = bld.as_uniform(src);
2948 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src,
2949 Operand::c32(0x80017u));
2950 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent,
2951 Operand::c32(126u));
2952 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(),
2953 exponent);
2954 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
2955 Operand::c32(0x7fffffu), src);
2956 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
2957 Operand::c32(0x800000u), mantissa);
2958 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2959 Operand::c32(24u), exponent);
2960 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa,
2961 exponent_small);
2962 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa);
2963 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
2964 exponent, Operand::c32(24u));
2965 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa,
2966 exponent_large);
2967 Temp cond =
2968 bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent);
2969 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa,
2970 Operand::c32(0xffffffffu), cond);
2971 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2972 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2973 Temp cond_small =
2974 bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u));
2975 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2976 upper =
2977 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small);
2978 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2979
2980 } else if (instr->src[0].src.ssa->bit_size == 64) {
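 /* Same 2^-32 split as in the 64-bit f2i64 case above, but both halves are
  * converted with v_cvt_u32_f64. */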
2981 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2982 Operand::c32(0x3df00000u));
2983 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2984 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2985 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(),
2986 Operand::c32(0xc1f00000u));
2987 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2988 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2989 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2990 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2991 if (dst.type() == RegType::sgpr) {
2992 lower = bld.as_uniform(lower);
2993 upper = bld.as_uniform(upper);
2994 }
2995 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2996
2997 } else {
2998 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
2999 }
3000 break;
3001 }
3002 case nir_op_b2f16: {
3003 Temp src = get_alu_src(ctx, instr->src[0]);
3004 assert(src.regClass() == bld.lm);
3005
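 /* 0x3c00 is 1.0 in half precision; select between 0.0 and 1.0 based on the boolean. */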
3006 if (dst.regClass() == s1) {
3007 src = bool_to_scalar_condition(ctx, src);
3008 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src);
3009 } else if (dst.regClass() == v2b) {
3010 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u));
3011 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, src);
3012 } else {
3013 unreachable("Wrong destination register class for nir_op_b2f16.");
3014 }
3015 break;
3016 }
3017 case nir_op_b2f32: {
3018 Temp src = get_alu_src(ctx, instr->src[0]);
3019 assert(src.regClass() == bld.lm);
3020
3021 if (dst.regClass() == s1) {
3022 src = bool_to_scalar_condition(ctx, src);
3023 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src);
3024 } else if (dst.regClass() == v1) {
3025 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(),
3026 Operand::c32(0x3f800000u), src);
3027 } else {
3028 unreachable("Wrong destination register class for nir_op_b2f32.");
3029 }
3030 break;
3031 }
3032 case nir_op_b2f64: {
3033 Temp src = get_alu_src(ctx, instr->src[0]);
3034 assert(src.regClass() == bld.lm);
3035
3036 if (dst.regClass() == s2) {
3037 src = bool_to_scalar_condition(ctx, src);
3038 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u),
3039 Operand::zero(), bld.scc(src));
3040 } else if (dst.regClass() == v2) {
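 /* 0x3FF00000 is the upper dword of the double-precision constant 1.0;
  * the lower dword is zero. */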
3041 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
3042 Temp upper =
3043 bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src);
3044 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper);
3045 } else {
3046 unreachable("Wrong destination register class for nir_op_b2f64.");
3047 }
3048 break;
3049 }
3050 case nir_op_i2i8:
3051 case nir_op_i2i16:
3052 case nir_op_i2i32:
3053 case nir_op_i2i64: {
3054 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3055 /* no need to do the extract in get_alu_src() */
3056 sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3057 ? sgpr_extract_sext
3058 : sgpr_extract_undef;
3059 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3060 } else {
3061 const unsigned input_bitsize = instr->src[0].src.ssa->bit_size;
3062 const unsigned output_bitsize = instr->dest.dest.ssa.bit_size;
3063 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize,
3064 output_bitsize > input_bitsize, dst);
3065 }
3066 break;
3067 }
3068 case nir_op_u2u8:
3069 case nir_op_u2u16:
3070 case nir_op_u2u32:
3071 case nir_op_u2u64: {
3072 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
3073 /* no need to do the extract in get_alu_src() */
3074 sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size
3075 ? sgpr_extract_zext
3076 : sgpr_extract_undef;
3077 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
3078 } else {
3079 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size,
3080 instr->dest.dest.ssa.bit_size, false, dst);
3081 }
3082 break;
3083 }
3084 case nir_op_b2b32:
3085 case nir_op_b2i8:
3086 case nir_op_b2i16:
3087 case nir_op_b2i32:
3088 case nir_op_b2i64: {
3089 Temp src = get_alu_src(ctx, instr->src[0]);
3090 assert(src.regClass() == bld.lm);
3091
3092 Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
3093 if (tmp.regClass() == s1) {
3094 bool_to_scalar_condition(ctx, src, tmp);
3095 } else if (tmp.type() == RegType::vgpr) {
3096 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u),
3097 src);
3098 } else {
3099 unreachable("Invalid register class for b2i32");
3100 }
3101
3102 if (tmp != dst)
3103 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero());
3104 break;
3105 }
3106 case nir_op_b2b1:
3107 case nir_op_i2b1: {
3108 Temp src = get_alu_src(ctx, instr->src[0]);
3109 assert(dst.regClass() == bld.lm);
3110
3111 if (src.type() == RegType::vgpr) {
3112 assert(src.regClass() == v1 || src.regClass() == v2);
3113 assert(dst.regClass() == bld.lm);
3114 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
3115 Definition(dst), Operand::zero(), src)
3116 .def(0)
3117 .setHint(vcc);
3118 } else {
3119 assert(src.regClass() == s1 || src.regClass() == s2);
3120 Temp tmp;
3121 if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
3122 tmp =
3123 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src)
3124 .def(1)
3125 .getTemp();
3126 } else {
3127 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
3128 bld.scc(bld.def(s1)), Operand::zero(), src);
3129 }
3130 bool_to_vector_condition(ctx, tmp, dst);
3131 }
3132 break;
3133 }
3134 case nir_op_unpack_64_2x32:
3135 case nir_op_unpack_32_2x16:
3136 case nir_op_unpack_64_4x16:
3137 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3138 emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2);
3139 break;
3140 case nir_op_pack_64_2x32_split: {
3141 Temp src0 = get_alu_src(ctx, instr->src[0]);
3142 Temp src1 = get_alu_src(ctx, instr->src[1]);
3143
3144 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3145 break;
3146 }
3147 case nir_op_unpack_64_2x32_split_x:
3148 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3149 get_alu_src(ctx, instr->src[0]));
3150 break;
3151 case nir_op_unpack_64_2x32_split_y:
3152 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3153 get_alu_src(ctx, instr->src[0]));
3154 break;
3155 case nir_op_unpack_32_2x16_split_x:
3156 if (dst.type() == RegType::vgpr) {
3157 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()),
3158 get_alu_src(ctx, instr->src[0]));
3159 } else {
3160 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3161 }
3162 break;
3163 case nir_op_unpack_32_2x16_split_y:
3164 if (dst.type() == RegType::vgpr) {
3165 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst),
3166 get_alu_src(ctx, instr->src[0]));
3167 } else {
3168 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc),
3169 get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u),
3170 Operand::zero());
3171 }
3172 break;
3173 case nir_op_pack_32_2x16_split: {
3174 Temp src0 = get_alu_src(ctx, instr->src[0]);
3175 Temp src1 = get_alu_src(ctx, instr->src[1]);
3176 if (dst.regClass() == v1) {
3177 src0 = emit_extract_vector(ctx, src0, 0, v2b);
3178 src1 = emit_extract_vector(ctx, src1, 0, v2b);
3179 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3180 } else {
3181 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0,
3182 Operand::c32(0xFFFFu));
3183 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1,
3184 Operand::c32(16u));
3185 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
3186 }
3187 break;
3188 }
3189 case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break;
3190 case nir_op_pack_half_2x16_split: {
3191 if (dst.regClass() == v1) {
3192 nir_const_value* val = nir_src_as_const_value(instr->src[1].src);
3193 if (val && val->u32 == 0 && ctx->program->chip_class <= GFX9) {
3194 /* upper bits zero on GFX6-GFX9 */
3195 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), get_alu_src(ctx, instr->src[0]));
3196 } else if (!ctx->block->fp_mode.care_about_round16_64 ||
3197 ctx->block->fp_mode.round16_64 == fp_round_tz) {
3198 if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9)
3199 emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst);
3200 else
3201 emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false);
3202 } else {
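 /* The packed conversion always rounds towards zero, so when a different rounding
  * mode must be honored, convert each half separately and pack the results. */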
3203 Temp src0 =
3204 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0]));
3205 Temp src1 =
3206 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1]));
3207 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
3208 }
3209 } else {
3210 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3211 }
3212 break;
3213 }
3214 case nir_op_unpack_half_2x16_split_x_flush_to_zero:
3215 case nir_op_unpack_half_2x16_split_x: {
3216 Temp src = get_alu_src(ctx, instr->src[0]);
3217 if (src.regClass() == v1)
3218 src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src);
3219 if (dst.regClass() == v1) {
3220 assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3221 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero));
3222 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3223 } else {
3224 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3225 }
3226 break;
3227 }
3228 case nir_op_unpack_half_2x16_split_y_flush_to_zero:
3229 case nir_op_unpack_half_2x16_split_y: {
3230 Temp src = get_alu_src(ctx, instr->src[0]);
3231 if (src.regClass() == s1)
3232 src =
3233 bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(16u));
3234 else
3235 src =
3236 bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp();
3237 if (dst.regClass() == v1) {
3238 assert(ctx->block->fp_mode.must_flush_denorms16_64 ==
3239 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero));
3240 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src);
3241 } else {
3242 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3243 }
3244 break;
3245 }
3246 case nir_op_sad_u8x4: {
3247 assert(dst.regClass() == v1);
3248 emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
3249 break;
3250 }
3251 case nir_op_fquantize2f16: {
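 /* Quantize by converting f32 -> f16 -> f32; values that are half-float denormals
  * are flushed to zero (keeping the sign when signed zero must be preserved). */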
3252 Temp src = get_alu_src(ctx, instr->src[0]);
3253 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
3254 Temp f32, cmp_res;
3255
3256 if (ctx->program->chip_class >= GFX8) {
3257 Temp mask = bld.copy(
3258 bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */
3259 cmp_res =
3260 bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
3261 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3262 } else {
3263 /* 0x38800000 is the smallest normal half-float value (2^-14) encoded as a 32-bit float,
3264 * so compare |result| against it and flush to 0 if it's smaller.
3265 */
3266 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
3267 Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u));
3268 Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest);
3269 tmp0->vop3().abs[0] = true;
3270 Temp tmp1 =
3271 bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), f32);
3272 cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc),
3273 tmp0->definitions[0].getTemp(), tmp1);
3274 }
3275
3276 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) {
3277 Temp copysign_0 =
3278 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src));
3279 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
3280 } else {
3281 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res);
3282 }
3283 break;
3284 }
3285 case nir_op_bfm: {
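 /* bfm(bits, offset) builds a mask of 'bits' consecutive ones starting at 'offset':
  * ((1u << bits) - 1u) << offset. */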
3286 Temp bits = get_alu_src(ctx, instr->src[0]);
3287 Temp offset = get_alu_src(ctx, instr->src[1]);
3288
3289 if (dst.regClass() == s1) {
3290 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
3291 } else if (dst.regClass() == v1) {
3292 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
3293 } else {
3294 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3295 }
3296 break;
3297 }
3298 case nir_op_bitfield_select: {
3299
3300 /* dst = (insert & bitmask) | (base & ~bitmask) */
3301 if (dst.regClass() == s1) {
3302 Temp bitmask = get_alu_src(ctx, instr->src[0]);
3303 Temp insert = get_alu_src(ctx, instr->src[1]);
3304 Temp base = get_alu_src(ctx, instr->src[2]);
3305 aco_ptr<Instruction> sop2;
3306 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
3307 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
3308 Operand lhs;
3309 if (const_insert && const_bitmask) {
3310 lhs = Operand::c32(const_insert->u32 & const_bitmask->u32);
3311 } else {
3312 insert =
3313 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
3314 lhs = Operand(insert);
3315 }
3316
3317 Operand rhs;
3318 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
3319 if (const_base && const_bitmask) {
3320 rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32);
3321 } else {
3322 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
3323 rhs = Operand(base);
3324 }
3325
3326 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
3327
3328 } else if (dst.regClass() == v1) {
3329 emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3);
3330 } else {
3331 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3332 }
3333 break;
3334 }
3335 case nir_op_ubfe:
3336 case nir_op_ibfe: {
3337 if (dst.bytes() != 4)
3338 unreachable("Unsupported BFE bit size");
3339
3340 if (dst.type() == RegType::sgpr) {
3341 Temp base = get_alu_src(ctx, instr->src[0]);
3342
3343 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
3344 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
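 /* s_bfe takes a packed operand: the extract offset in the low bits and the
  * field width starting at bit 16, hence the (bits << 16) | offset encoding below. */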
3345 if (const_offset && const_bits) {
3346 uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f);
3347 aco_opcode opcode =
3348 instr->op == nir_op_ubfe ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
3349 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract));
3350 break;
3351 }
3352
3353 Temp offset = get_alu_src(ctx, instr->src[1]);
3354 Temp bits = get_alu_src(ctx, instr->src[2]);
3355 if (instr->op == nir_op_ubfe) {
3356 Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset);
3357 Temp masked =
3358 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask);
3359 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset);
3360 } else {
3361 Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16)
3362 : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1),
3363 bld.def(s1, scc), bits, Operand::c32(16u));
3364 Operand offset_op = const_offset
3365 ? Operand::c32(const_offset->u32 & 0x1fu)
3366 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3367 offset, Operand::c32(0x1fu));
3368
3369 Temp extract =
3370 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op);
3371 bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract);
3372 }
3373
3374 } else {
3375 aco_opcode opcode =
3376 instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32;
3377 emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3);
3378 }
3379 break;
3380 }
3381 case nir_op_extract_u8:
3382 case nir_op_extract_i8:
3383 case nir_op_extract_u16:
3384 case nir_op_extract_i16: {
3385 bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8;
3386 unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2;
3387 uint32_t bits = comp == 4 ? 8 : 16;
3388 unsigned index = nir_src_as_uint(instr->src[1].src);
3389 if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3390 assert(index == 0);
3391 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3392 } else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) {
3393 Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa);
3394 unsigned swizzle = instr->src[0].swizzle[0];
3395 if (vec.size() > 1) {
3396 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
3397 swizzle = swizzle & 1;
3398 }
3399 index += swizzle * instr->dest.dest.ssa.bit_size / bits;
3400 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec),
3401 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3402 } else {
3403 Temp src = get_alu_src(ctx, instr->src[0]);
3404 Definition def(dst);
3405 if (dst.bytes() == 8) {
3406 src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1));
3407 index %= comp;
3408 def = bld.def(src.type(), 1);
3409 }
3410 assert(def.bytes() <= 4);
3411 if (def.regClass() == s1) {
3412 bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src),
3413 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed));
3414 } else {
3415 src = emit_extract_vector(ctx, src, 0, def.regClass());
3416 bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index),
3417 Operand::c32(bits), Operand::c32(is_signed));
3418 }
3419 if (dst.size() == 2)
3420 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3421 Operand::zero());
3422 }
3423 break;
3424 }
3425 case nir_op_insert_u8:
3426 case nir_op_insert_u16: {
3427 unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2;
3428 uint32_t bits = comp == 4 ? 8 : 16;
3429 unsigned index = nir_src_as_uint(instr->src[1].src);
3430 if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) {
3431 assert(index == 0);
3432 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
3433 } else {
3434 Temp src = get_alu_src(ctx, instr->src[0]);
3435 Definition def(dst);
3436 bool swap = false;
3437 if (dst.bytes() == 8) {
3438 src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1));
3439 swap = index >= comp;
3440 index %= comp;
3441 def = bld.def(src.type(), 1);
3442 }
3443 if (def.regClass() == s1) {
3444 bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src),
3445 Operand::c32(index), Operand::c32(bits));
3446 } else {
3447 src = emit_extract_vector(ctx, src, 0, def.regClass());
3448 bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index),
3449 Operand::c32(bits));
3450 }
3451 if (dst.size() == 2 && swap)
3452 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(),
3453 def.getTemp());
3454 else if (dst.size() == 2)
3455 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(),
3456 Operand::zero());
3457 }
3458 break;
3459 }
3460 case nir_op_bit_count: {
3461 Temp src = get_alu_src(ctx, instr->src[0]);
3462 if (src.regClass() == s1) {
3463 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
3464 } else if (src.regClass() == v1) {
3465 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero());
3466 } else if (src.regClass() == v2) {
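 /* 64-bit popcount: count the low dword first and feed the result into the
  * accumulator operand of the second v_bcnt_u32_b32. */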
3467 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1),
3468 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
3469 emit_extract_vector(ctx, src, 0, v1), Operand::zero()));
3470 } else if (src.regClass() == s2) {
3471 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
3472 } else {
3473 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
3474 }
3475 break;
3476 }
3477 case nir_op_flt: {
3478 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32,
3479 aco_opcode::v_cmp_lt_f64);
3480 break;
3481 }
3482 case nir_op_fge: {
3483 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32,
3484 aco_opcode::v_cmp_ge_f64);
3485 break;
3486 }
3487 case nir_op_feq: {
3488 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32,
3489 aco_opcode::v_cmp_eq_f64);
3490 break;
3491 }
3492 case nir_op_fneu: {
3493 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32,
3494 aco_opcode::v_cmp_neq_f64);
3495 break;
3496 }
3497 case nir_op_ilt: {
3498 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32,
3499 aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3500 break;
3501 }
3502 case nir_op_ige: {
3503 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32,
3504 aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3505 break;
3506 }
3507 case nir_op_ieq: {
3508 if (instr->src[0].src.ssa->bit_size == 1)
3509 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3510 else
3511 emit_comparison(
3512 ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32,
3513 aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3514 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3515 break;
3516 }
3517 case nir_op_ine: {
3518 if (instr->src[0].src.ssa->bit_size == 1)
3519 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3520 else
3521 emit_comparison(
3522 ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32,
3523 aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3524 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3525 break;
3526 }
3527 case nir_op_ult: {
3528 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32,
3529 aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3530 break;
3531 }
3532 case nir_op_uge: {
3533 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32,
3534 aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3535 break;
3536 }
3537 case nir_op_fddx:
3538 case nir_op_fddy:
3539 case nir_op_fddx_fine:
3540 case nir_op_fddy_fine:
3541 case nir_op_fddx_coarse:
3542 case nir_op_fddy_coarse: {
3543 if (!nir_src_is_divergent(instr->src[0].src)) {
3544 /* Source is the same in all lanes, so the derivative is zero.
3545 * This also avoids emitting invalid IR.
3546 */
3547 bld.copy(Definition(dst), Operand::zero());
3548 break;
3549 }
3550
3551 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
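 /* Derivatives are computed by subtracting the value of a neighbouring lane within
  * each quad; the quad_perm controls below select which lanes are read. */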
3552 uint16_t dpp_ctrl1, dpp_ctrl2;
3553 if (instr->op == nir_op_fddx_fine) {
3554 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3555 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3556 } else if (instr->op == nir_op_fddy_fine) {
3557 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3558 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3559 } else {
3560 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3561 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3562 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3563 else
3564 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3565 }
3566
3567 Temp tmp;
3568 if (ctx->program->chip_class >= GFX8) {
3569 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3570 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3571 } else {
3572 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3573 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3574 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3575 }
3576 emit_wqm(bld, tmp, dst, true);
3577 break;
3578 }
3579 default: isel_err(&instr->instr, "Unknown NIR ALU instr");
3580 }
3581 }
3582
3583 void
3584 visit_load_const(isel_context* ctx, nir_load_const_instr* instr)
3585 {
3586 Temp dst = get_ssa_temp(ctx, &instr->def);
3587
3588 // TODO: we really want to have the resulting type, as this would allow for 64-bit literals,
3589 // which get truncated, dropping the lsb if double and the msb if int
3590 // for now, we only use s_mov_b64 with 64bit inline constants
3591 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3592 assert(dst.type() == RegType::sgpr);
3593
3594 Builder bld(ctx->program, ctx->block);
3595
3596 if (instr->def.bit_size == 1) {
3597 assert(dst.regClass() == bld.lm);
3598 int val = instr->value[0].b ? -1 : 0;
3599 Operand op = bld.lm.size() == 1 ? Operand::c32(val) : Operand::c64(val);
3600 bld.copy(Definition(dst), op);
3601 } else if (instr->def.bit_size == 8) {
3602 bld.copy(Definition(dst), Operand::c32(instr->value[0].u8));
3603 } else if (instr->def.bit_size == 16) {
3604 /* sign-extend to use s_movk_i32 instead of a literal */
3605 bld.copy(Definition(dst), Operand::c32(instr->value[0].i16));
3606 } else if (dst.size() == 1) {
3607 bld.copy(Definition(dst), Operand::c32(instr->value[0].u32));
3608 } else {
3609 assert(dst.size() != 1);
3610 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3611 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3612 if (instr->def.bit_size == 64)
3613 for (unsigned i = 0; i < dst.size(); i++)
3614 vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32);
3615 else {
3616 for (unsigned i = 0; i < dst.size(); i++)
3617 vec->operands[i] = Operand::c32(instr->value[i].u32);
3618 }
3619 vec->definitions[0] = Definition(dst);
3620 ctx->block->instructions.emplace_back(std::move(vec));
3621 }
3622 }
3623
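/* Widen each bit of the mask to 'multiplier' bits,
 * e.g. widen_mask(0b101, 2) == 0b110011. */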
3624 uint32_t
3625 widen_mask(uint32_t mask, unsigned multiplier)
3626 {
3627 uint32_t new_mask = 0;
3628 for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
3629 if (mask & (1u << i))
3630 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
3631 return new_mask;
3632 }
3633
3634 struct LoadEmitInfo {
3635 Operand offset;
3636 Temp dst;
3637 unsigned num_components;
3638 unsigned component_size;
3639 Temp resource = Temp(0, s1);
3640 unsigned component_stride = 0;
3641 unsigned const_offset = 0;
3642 unsigned align_mul = 0;
3643 unsigned align_offset = 0;
3644
3645 bool glc = false;
3646 bool slc = false;
3647 unsigned swizzle_component_size = 0;
3648 memory_sync_info sync;
3649 Temp soffset = Temp(0, s1);
3650 };
3651
3652 struct EmitLoadParameters {
3653 using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset,
3654 unsigned bytes_needed, unsigned align, unsigned const_offset,
3655 Temp dst_hint);
3656
3657 Callback callback;
3658 bool byte_align_loads;
3659 bool supports_8bit_16bit_loads;
3660 unsigned max_const_offset_plus_one;
3661 };
3662
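/* Lower a (possibly unaligned or oversized) load into one or more hardware loads via
 * params.callback, then split and recombine the results into info.dst with the
 * requested component size. Also handles constant offsets that exceed the
 * instruction's range and byte alignment on hardware without sub-dword loads. */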
3663 void
3664 emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info,
3665 const EmitLoadParameters& params)
3666 {
3667 unsigned load_size = info.num_components * info.component_size;
3668 unsigned component_size = info.component_size;
3669
3670 unsigned num_vals = 0;
3671 Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp));
3672
3673 unsigned const_offset = info.const_offset;
3674
3675 const unsigned align_mul = info.align_mul ? info.align_mul : component_size;
3676 unsigned align_offset = (info.align_offset + const_offset) % align_mul;
3677
3678 unsigned bytes_read = 0;
3679 while (bytes_read < load_size) {
3680 unsigned bytes_needed = load_size - bytes_read;
3681
3682 /* add buffer for unaligned loads */
3683 int byte_align = 0;
3684 if (params.byte_align_loads) {
3685 byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3686 }
3687
3688 if (byte_align) {
3689 if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3690 !params.supports_8bit_16bit_loads) {
3691 if (info.component_stride) {
3692 assert(params.supports_8bit_16bit_loads && "unimplemented");
3693 bytes_needed = 2;
3694 byte_align = 0;
3695 } else {
3696 bytes_needed += byte_align == -1 ? 4 - info.align_mul : byte_align;
3697 bytes_needed = align(bytes_needed, 4);
3698 }
3699 } else {
3700 byte_align = 0;
3701 }
3702 }
3703
3704 if (info.swizzle_component_size)
3705 bytes_needed = MIN2(bytes_needed, info.swizzle_component_size);
3706 if (info.component_stride)
3707 bytes_needed = MIN2(bytes_needed, info.component_size);
3708
3709 bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3710
3711 /* reduce constant offset */
3712 Operand offset = info.offset;
3713 unsigned reduced_const_offset = const_offset;
3714 bool remove_const_offset_completely = need_to_align_offset;
3715 if (const_offset &&
3716 (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) {
3717 unsigned to_add = const_offset;
3718 if (remove_const_offset_completely) {
3719 reduced_const_offset = 0;
3720 } else {
3721 to_add =
3722 const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one;
3723 reduced_const_offset %= params.max_const_offset_plus_one;
3724 }
3725 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3726 if (offset.isConstant()) {
3727 offset = Operand::c32(offset.constantValue() + to_add);
3728 } else if (offset_tmp.regClass() == s1) {
3729 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp,
3730 Operand::c32(to_add));
3731 } else if (offset_tmp.regClass() == v1) {
3732 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add));
3733 } else {
3734 Temp lo = bld.tmp(offset_tmp.type(), 1);
3735 Temp hi = bld.tmp(offset_tmp.type(), 1);
3736 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3737
3738 if (offset_tmp.regClass() == s2) {
3739 Temp carry = bld.tmp(s1);
3740 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo,
3741 Operand::c32(to_add));
3742 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3743 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3744 } else {
3745 Temp new_lo = bld.tmp(v1);
3746 Temp carry =
3747 bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp();
3748 hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry);
3749 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3750 }
3751 }
3752 }
3753
3754 /* align offset down if needed */
3755 Operand aligned_offset = offset;
3756 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3757 if (need_to_align_offset) {
3758 align = 4;
3759 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3760 if (offset.isConstant()) {
3761 aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu);
3762 } else if (offset_tmp.regClass() == s1) {
3763 aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
3764 Operand::c32(0xfffffffcu), offset_tmp);
3765 } else if (offset_tmp.regClass() == s2) {
3766 aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
3767 Operand::c64(0xfffffffffffffffcllu), offset_tmp);
3768 } else if (offset_tmp.regClass() == v1) {
3769 aligned_offset =
3770 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp);
3771 } else if (offset_tmp.regClass() == v2) {
3772 Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3773 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3774 lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo);
3775 aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3776 }
3777 }
3778 Temp aligned_offset_tmp =
3779 aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset);
3780
3781 Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3782 reduced_const_offset, byte_align ? Temp() : info.dst);
3783
3784 /* the callback wrote directly to dst */
3785 if (val == info.dst) {
3786 assert(num_vals == 0);
3787 emit_split_vector(ctx, info.dst, info.num_components);
3788 return;
3789 }
3790
3791 /* shift result right if needed */
3792 if (params.byte_align_loads && info.component_size < 4) {
3793 Operand byte_align_off = Operand::c32(byte_align);
3794 if (byte_align == -1) {
3795 if (offset.isConstant())
3796 byte_align_off = Operand::c32(offset.constantValue() % 4u);
3797 else if (offset.size() == 2)
3798 byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0,
3799 RegClass(offset.getTemp().type(), 1)));
3800 else
3801 byte_align_off = offset;
3802 }
3803
3804 assert(val.bytes() >= load_size && "unimplemented");
3805 if (val.type() == RegType::sgpr)
3806 byte_align_scalar(ctx, val, byte_align_off, info.dst);
3807 else
3808 byte_align_vector(ctx, val, byte_align_off, info.dst, component_size);
3809 return;
3810 }
3811
3812 /* add result to list and advance */
3813 if (info.component_stride) {
3814 assert(val.bytes() == info.component_size && "unimplemented");
3815 const_offset += info.component_stride;
3816 align_offset = (align_offset + info.component_stride) % align_mul;
3817 } else {
3818 const_offset += val.bytes();
3819 align_offset = (align_offset + val.bytes()) % align_mul;
3820 }
3821 bytes_read += val.bytes();
3822 vals[num_vals++] = val;
3823 }
3824
3825 /* create array of components */
3826 unsigned components_split = 0;
3827 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3828 bool has_vgprs = false;
3829 for (unsigned i = 0; i < num_vals;) {
3830 Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp));
3831 unsigned num_tmps = 0;
3832 unsigned tmp_size = 0;
3833 RegType reg_type = RegType::sgpr;
3834 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3835 if (vals[i].type() == RegType::vgpr)
3836 reg_type = RegType::vgpr;
3837 tmp_size += vals[i].bytes();
3838 tmp[num_tmps++] = vals[i++];
3839 }
3840 if (num_tmps > 1) {
3841 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3842 aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3843 for (unsigned j = 0; j < num_tmps; j++)
3844 vec->operands[j] = Operand(tmp[j]);
3845 tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3846 vec->definitions[0] = Definition(tmp[0]);
3847 bld.insert(std::move(vec));
3848 }
3849
3850 if (tmp[0].bytes() % component_size) {
3851 /* trim tmp[0] */
3852 assert(i == num_vals);
3853 RegClass new_rc =
3854 RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3855 tmp[0] =
3856 bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero());
3857 }
3858
3859 RegClass elem_rc = RegClass::get(reg_type, component_size);
3860
3861 unsigned start = components_split;
3862
3863 if (tmp_size == elem_rc.bytes()) {
3864 allocated_vec[components_split++] = tmp[0];
3865 } else {
3866 assert(tmp_size % elem_rc.bytes() == 0);
3867 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3868 aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3869 for (auto& def : split->definitions) {
3870 Temp component = bld.tmp(elem_rc);
3871 allocated_vec[components_split++] = component;
3872 def = Definition(component);
3873 }
3874 split->operands[0] = Operand(tmp[0]);
3875 bld.insert(std::move(split));
3876 }
3877
3878 /* try to p_as_uniform early so we can create more optimizable code and
3879 * also update allocated_vec */
3880 for (unsigned j = start; j < components_split; j++) {
3881 if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr)
3882 allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3883 has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3884 }
3885 }
3886
3887 /* concatenate components and p_as_uniform() result if needed */
3888 if (info.dst.type() == RegType::vgpr || !has_vgprs)
3889 ctx->allocated_vec.emplace(info.dst.id(), allocated_vec);
3890
3891 int padding_bytes =
3892 MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0);
3893
3894 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3895 aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)};
3896 for (unsigned i = 0; i < info.num_components; i++)
3897 vec->operands[i] = Operand(allocated_vec[i]);
3898 if (padding_bytes)
3899 vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3900 if (info.dst.type() == RegType::sgpr && has_vgprs) {
3901 Temp tmp = bld.tmp(RegType::vgpr, info.dst.size());
3902 vec->definitions[0] = Definition(tmp);
3903 bld.insert(std::move(vec));
3904 bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp);
3905 } else {
3906 vec->definitions[0] = Definition(info.dst);
3907 bld.insert(std::move(vec));
3908 }
3909 }
3910
3911 Operand
3912 load_lds_size_m0(Builder& bld)
3913 {
3914 /* m0 does not need to be initialized on GFX9+ */
3915 if (bld.program->chip_class >= GFX9)
3916 return Operand(s1);
3917
3918 return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu)));
3919 }
3920
3921 Temp
3922 lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3923 unsigned align, unsigned const_offset, Temp dst_hint)
3924 {
3925 offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3926
3927 Operand m = load_lds_size_m0(bld);
3928
3929 bool large_ds_read = bld.program->chip_class >= GFX7;
3930 bool usable_read2 = bld.program->chip_class >= GFX7;
3931
3932 bool read2 = false;
3933 unsigned size = 0;
3934 aco_opcode op;
3935 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3936 size = 16;
3937 op = aco_opcode::ds_read_b128;
3938 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3939 size = 16;
3940 read2 = true;
3941 op = aco_opcode::ds_read2_b64;
3942 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3943 size = 12;
3944 op = aco_opcode::ds_read_b96;
3945 } else if (bytes_needed >= 8 && align % 8 == 0) {
3946 size = 8;
3947 op = aco_opcode::ds_read_b64;
3948 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) {
3949 size = 8;
3950 read2 = true;
3951 op = aco_opcode::ds_read2_b32;
3952 } else if (bytes_needed >= 4 && align % 4 == 0) {
3953 size = 4;
3954 op = aco_opcode::ds_read_b32;
3955 } else if (bytes_needed >= 2 && align % 2 == 0) {
3956 size = 2;
3957 op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16;
3958 } else {
3959 size = 1;
3960 op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8;
3961 }
3962
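 /* ds_read2* encodes two 8-bit offsets in units of the element size, so its usable
  * constant-offset range is much smaller than the 16-bit byte offset of plain ds_read. */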
3963 unsigned const_offset_unit = read2 ? size / 2u : 1u;
3964 unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536;
3965
3966 if (const_offset > (const_offset_range - const_offset_unit)) {
3967 unsigned excess = const_offset - (const_offset % const_offset_range);
3968 offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess));
3969 const_offset -= excess;
3970 }
3971
3972 const_offset /= const_offset_unit;
3973
3974 RegClass rc = RegClass::get(RegType::vgpr, size);
3975 Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3976 Instruction* instr;
3977 if (read2)
3978 instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3979 else
3980 instr = bld.ds(op, Definition(val), offset, m, const_offset);
3981 instr->ds().sync = info.sync;
3982
3983 if (m.isUndefined())
3984 instr->operands.pop_back();
3985
3986 return val;
3987 }
3988
3989 const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX};
3990
3991 Temp
3992 smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
3993 unsigned align, unsigned const_offset, Temp dst_hint)
3994 {
3995 unsigned size = 0;
3996 aco_opcode op;
3997 if (bytes_needed <= 4) {
3998 size = 1;
3999 op = info.resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
4000 } else if (bytes_needed <= 8) {
4001 size = 2;
4002 op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
4003 } else if (bytes_needed <= 16) {
4004 size = 4;
4005 op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
4006 } else if (bytes_needed <= 32) {
4007 size = 8;
4008 op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
4009 } else {
4010 size = 16;
4011 op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
4012 }
4013 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4014 if (info.resource.id()) {
4015 load->operands[0] = Operand(info.resource);
4016 load->operands[1] = Operand(offset);
4017 } else {
4018 load->operands[0] = Operand(offset);
4019 load->operands[1] = Operand::zero();
4020 }
4021 RegClass rc(RegType::sgpr, size);
4022 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
4023 load->definitions[0] = Definition(val);
4024 load->glc = info.glc;
4025 load->dlc = info.glc && bld.program->chip_class >= GFX10;
4026 load->sync = info.sync;
4027 bld.insert(std::move(load));
4028 return val;
4029 }
4030
4031 const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024};
4032
4033 Temp
4034 mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4035 unsigned align_, unsigned const_offset, Temp dst_hint)
4036 {
4037 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4038 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
4039
4040 if (info.soffset.id()) {
4041 if (soffset.isTemp())
4042 vaddr = bld.copy(bld.def(v1), soffset);
4043 soffset = Operand(info.soffset);
4044 }
4045
4046 unsigned bytes_size = 0;
4047 aco_opcode op;
4048 if (bytes_needed == 1 || align_ % 2) {
4049 bytes_size = 1;
4050 op = aco_opcode::buffer_load_ubyte;
4051 } else if (bytes_needed == 2 || align_ % 4) {
4052 bytes_size = 2;
4053 op = aco_opcode::buffer_load_ushort;
4054 } else if (bytes_needed <= 4) {
4055 bytes_size = 4;
4056 op = aco_opcode::buffer_load_dword;
4057 } else if (bytes_needed <= 8) {
4058 bytes_size = 8;
4059 op = aco_opcode::buffer_load_dwordx2;
4060 } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
4061 bytes_size = 12;
4062 op = aco_opcode::buffer_load_dwordx3;
4063 } else {
4064 bytes_size = 16;
4065 op = aco_opcode::buffer_load_dwordx4;
4066 }
4067 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4068 mubuf->operands[0] = Operand(info.resource);
4069 mubuf->operands[1] = vaddr;
4070 mubuf->operands[2] = soffset;
4071 mubuf->offen = (offset.type() == RegType::vgpr);
4072 mubuf->glc = info.glc;
4073 mubuf->dlc = info.glc && bld.program->chip_class >= GFX10;
4074 mubuf->slc = info.slc;
4075 mubuf->sync = info.sync;
4076 mubuf->offset = const_offset;
4077 mubuf->swizzled = info.swizzle_component_size != 0;
4078 RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
4079 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4080 mubuf->definitions[0] = Definition(val);
4081 bld.insert(std::move(mubuf));
4082
4083 return val;
4084 }
4085
4086 const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096};
4087 const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096};
4088
4089 Temp
4090 get_gfx6_global_rsrc(Builder& bld, Temp addr)
4091 {
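/* Build a buffer descriptor so that MUBUF can emulate global access on GFX6: num_records is ~0
 * and the data format is 32-bit. A VGPR address is passed via addr64 (descriptor base 0);
 * an SGPR address becomes the descriptor's 64-bit base. */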
4092 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4093 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4094
4095 if (addr.type() == RegType::vgpr)
4096 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(),
4097 Operand::c32(-1u), Operand::c32(rsrc_conf));
4098 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u),
4099 Operand::c32(rsrc_conf));
4100 }
4101
4102 Temp
4103 global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed,
4104 unsigned align_, unsigned const_offset, Temp dst_hint)
4105 {
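/* GFX6 has no FLAT instructions, so global memory is accessed through MUBUF with addr64;
 * GFX9+ uses GLOBAL instructions, GFX7-8 use plain FLAT. */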
4106 unsigned bytes_size = 0;
4107 bool use_mubuf = bld.program->chip_class == GFX6;
4108 bool global = bld.program->chip_class >= GFX9;
4109 aco_opcode op;
4110 if (bytes_needed == 1) {
4111 bytes_size = 1;
4112 op = use_mubuf ? aco_opcode::buffer_load_ubyte
4113 : global ? aco_opcode::global_load_ubyte
4114 : aco_opcode::flat_load_ubyte;
4115 } else if (bytes_needed == 2) {
4116 bytes_size = 2;
4117 op = use_mubuf ? aco_opcode::buffer_load_ushort
4118 : global ? aco_opcode::global_load_ushort
4119 : aco_opcode::flat_load_ushort;
4120 } else if (bytes_needed <= 4) {
4121 bytes_size = 4;
4122 op = use_mubuf ? aco_opcode::buffer_load_dword
4123 : global ? aco_opcode::global_load_dword
4124 : aco_opcode::flat_load_dword;
4125 } else if (bytes_needed <= 8) {
4126 bytes_size = 8;
4127 op = use_mubuf ? aco_opcode::buffer_load_dwordx2
4128 : global ? aco_opcode::global_load_dwordx2
4129 : aco_opcode::flat_load_dwordx2;
4130 } else if (bytes_needed <= 12 && !use_mubuf) {
4131 bytes_size = 12;
4132 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4133 } else {
4134 bytes_size = 16;
4135 op = use_mubuf ? aco_opcode::buffer_load_dwordx4
4136 : global ? aco_opcode::global_load_dwordx4
4137 : aco_opcode::flat_load_dwordx4;
4138 }
4139 RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
4140 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
4141 if (use_mubuf) {
4142 aco_ptr<MUBUF_instruction> mubuf{
4143 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4144 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
4145 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4146 mubuf->operands[2] = Operand::zero();
4147 mubuf->glc = info.glc;
4148 mubuf->dlc = false;
4149 mubuf->offset = 0;
4150 mubuf->addr64 = offset.type() == RegType::vgpr;
4151 mubuf->disable_wqm = false;
4152 mubuf->sync = info.sync;
4153 mubuf->definitions[0] = Definition(val);
4154 bld.insert(std::move(mubuf));
4155 } else {
4156 offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
4157
4158 aco_ptr<FLAT_instruction> flat{
4159 create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4160 flat->operands[0] = Operand(offset);
4161 flat->operands[1] = Operand(s1);
4162 flat->glc = info.glc;
4163 flat->dlc = info.glc && bld.program->chip_class >= GFX10;
4164 flat->sync = info.sync;
4165 flat->offset = 0u;
4166 flat->definitions[0] = Definition(val);
4167 bld.insert(std::move(flat));
4168 }
4169
4170 return val;
4171 }
4172
4173 const EmitLoadParameters global_load_params{global_load_callback, true, true, 1};
4174
4175 Temp
4176 load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst,
4177 Temp address, unsigned base_offset, unsigned align)
4178 {
4179 assert(util_is_power_of_two_nonzero(align));
4180
4181 Builder bld(ctx->program, ctx->block);
4182
4183 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
4184 info.align_mul = align;
4185 info.align_offset = 0;
4186 info.sync = memory_sync_info(storage_shared);
4187 info.const_offset = base_offset;
4188 emit_load(ctx, bld, info, lds_load_params);
4189
4190 return dst;
4191 }
4192
4193 void
4194 split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes,
4195 Temp src)
4196 {
4197 if (!count)
4198 return;
4199
4200 Builder bld(ctx->program, ctx->block);
4201
4202 /* count == 1 fast path */
4203 if (count == 1) {
4204 if (dst_type == RegType::sgpr)
4205 dst[0] = bld.as_uniform(src);
4206 else
4207 dst[0] = as_vgpr(ctx, src);
4208 return;
4209 }
4210
4211 /* elem_size_bytes is the greatest common divisor which is a power of 2 */
4212 unsigned elem_size_bytes =
4213 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1);
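/* e.g. for bytes = {4, 12}: 8 | 4 | 12 = 0xc, so elem_size_bytes = 4; the initial value 8
 * caps the element size at 8 bytes. */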
4214
4215 ASSERTED bool is_subdword = elem_size_bytes < 4;
4216 assert(!is_subdword || dst_type == RegType::vgpr);
4217
4218 for (unsigned i = 0; i < count; i++)
4219 dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i]));
4220
4221 std::vector<Temp> temps;
4222 /* use allocated_vec if possible */
4223 auto it = ctx->allocated_vec.find(src.id());
4224 if (it != ctx->allocated_vec.end()) {
4225 if (!it->second[0].id())
4226 goto split;
4227 unsigned elem_size = it->second[0].bytes();
4228 assert(src.bytes() % elem_size == 0);
4229
4230 for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
4231 if (!it->second[i].id())
4232 goto split;
4233 }
4234 if (elem_size_bytes % elem_size)
4235 goto split;
4236
4237 temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size);
4238 elem_size_bytes = elem_size;
4239 }
4240
4241 split:
4242 /* split src if necessary */
4243 if (temps.empty()) {
4244 if (is_subdword && src.type() == RegType::sgpr)
4245 src = as_vgpr(ctx, src);
4246 if (dst_type == RegType::sgpr)
4247 src = bld.as_uniform(src);
4248
4249 unsigned num_elems = src.bytes() / elem_size_bytes;
4250 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
4251 aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)};
4252 split->operands[0] = Operand(src);
4253 for (unsigned i = 0; i < num_elems; i++) {
4254 temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes)));
4255 split->definitions[i] = Definition(temps.back());
4256 }
4257 bld.insert(std::move(split));
4258 }
4259
4260 unsigned idx = 0;
4261 for (unsigned i = 0; i < count; i++) {
4262 unsigned op_count = dst[i].bytes() / elem_size_bytes;
4263 if (op_count == 1) {
4264 if (dst_type == RegType::sgpr)
4265 dst[i] = bld.as_uniform(temps[idx++]);
4266 else
4267 dst[i] = as_vgpr(ctx, temps[idx++]);
4268 continue;
4269 }
4270
4271 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4272 Format::PSEUDO, op_count, 1)};
4273 for (unsigned j = 0; j < op_count; j++) {
4274 Temp tmp = temps[idx++];
4275 if (dst_type == RegType::sgpr)
4276 tmp = bld.as_uniform(tmp);
4277 vec->operands[j] = Operand(tmp);
4278 }
4279 vec->definitions[0] = Definition(dst[i]);
4280 bld.insert(std::move(vec));
4281 }
4282 return;
4283 }
4284
4285 bool
4286 scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count)
4287 {
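/* Find the next consecutive range in todo_mask that is either entirely written (according to
 * mask) or entirely skipped. Returns the range in start/count and true if it is written. */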
4288 unsigned start_elem = ffs(todo_mask) - 1;
4289 bool skip = !(mask & (1 << start_elem));
4290 if (skip)
4291 mask = ~mask & todo_mask;
4292
4293 mask &= todo_mask;
4294
4295 u_bit_scan_consecutive_range(&mask, start, count);
4296
4297 return !skip;
4298 }
4299
4300 void
4301 advance_write_mask(uint32_t* todo_mask, int start, int count)
4302 {
4303 *todo_mask &= ~u_bit_consecutive(0, count) << start;
4304 }
4305
4306 void
4307 store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address,
4308 unsigned base_offset, unsigned align)
4309 {
4310 assert(util_is_power_of_two_nonzero(align));
4311 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
4312
4313 Builder bld(ctx->program, ctx->block);
4314 bool large_ds_write = ctx->options->chip_class >= GFX7;
4315 bool usable_write2 = ctx->options->chip_class >= GFX7;
4316
4317 unsigned write_count = 0;
4318 Temp write_datas[32];
4319 unsigned offsets[32];
4320 unsigned bytes[32];
4321 aco_opcode opcodes[32];
4322
4323 wrmask = widen_mask(wrmask, elem_size_bytes);
4324
4325 uint32_t todo = u_bit_consecutive(0, data.bytes());
4326 while (todo) {
4327 int offset, byte;
4328 if (!scan_write_mask(wrmask, todo, &offset, &byte)) {
4329 offsets[write_count] = offset;
4330 bytes[write_count] = byte;
4331 opcodes[write_count] = aco_opcode::num_opcodes;
4332 write_count++;
4333 advance_write_mask(&todo, offset, byte);
4334 continue;
4335 }
4336
4337 bool aligned2 = offset % 2 == 0 && align % 2 == 0;
4338 bool aligned4 = offset % 4 == 0 && align % 4 == 0;
4339 bool aligned8 = offset % 8 == 0 && align % 8 == 0;
4340 bool aligned16 = offset % 16 == 0 && align % 16 == 0;
4341
4342 // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
4343 aco_opcode op = aco_opcode::num_opcodes;
4344 if (byte >= 16 && aligned16 && large_ds_write) {
4345 op = aco_opcode::ds_write_b128;
4346 byte = 16;
4347 } else if (byte >= 12 && aligned16 && large_ds_write) {
4348 op = aco_opcode::ds_write_b96;
4349 byte = 12;
4350 } else if (byte >= 8 && aligned8) {
4351 op = aco_opcode::ds_write_b64;
4352 byte = 8;
4353 } else if (byte >= 4 && aligned4) {
4354 op = aco_opcode::ds_write_b32;
4355 byte = 4;
4356 } else if (byte >= 2 && aligned2) {
4357 op = aco_opcode::ds_write_b16;
4358 byte = 2;
4359 } else if (byte >= 1) {
4360 op = aco_opcode::ds_write_b8;
4361 byte = 1;
4362 } else {
4363 assert(false);
4364 }
4365
4366 offsets[write_count] = offset;
4367 bytes[write_count] = byte;
4368 opcodes[write_count] = op;
4369 write_count++;
4370 advance_write_mask(&todo, offset, byte);
4371 }
4372
4373 Operand m = load_lds_size_m0(bld);
4374
4375 split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data);
4376
4377 for (unsigned i = 0; i < write_count; i++) {
4378 aco_opcode op = opcodes[i];
4379 if (op == aco_opcode::num_opcodes)
4380 continue;
4381
4382 Temp split_data = write_datas[i];
4383
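/* Try to pair this ds_write_b32/b64 with a later store of the same size so both can be emitted
 * as a single ds_write2, which encodes two offsets in units of the element size. */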
4384 unsigned second = write_count;
4385 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
4386 for (second = i + 1; second < write_count; second++) {
4387 if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) {
4388 op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4389 opcodes[second] = aco_opcode::num_opcodes;
4390 break;
4391 }
4392 }
4393 }
4394
4395 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
4396 unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes();
4397
4398 unsigned inline_offset = base_offset + offsets[i];
4399 unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535;
4400 Temp address_offset = address;
4401 if (inline_offset > max_offset) {
4402 address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset);
4403 inline_offset = offsets[i];
4404 }
4405
4406 /* offsets[i] shouldn't be large enough for this to happen */
4407 assert(inline_offset <= max_offset);
4408
4409 Instruction* instr;
4410 if (write2) {
4411 Temp second_data = write_datas[second];
4412 inline_offset /= split_data.bytes();
4413 instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset,
4414 inline_offset + write2_off);
4415 } else {
4416 instr = bld.ds(op, address_offset, split_data, m, inline_offset);
4417 }
4418 instr->ds().sync = memory_sync_info(storage_shared);
4419
4420 if (m.isUndefined())
4421 instr->operands.pop_back();
4422 }
4423 }
4424
4425 aco_opcode
4426 get_buffer_store_op(unsigned bytes)
4427 {
4428 switch (bytes) {
4429 case 1: return aco_opcode::buffer_store_byte;
4430 case 2: return aco_opcode::buffer_store_short;
4431 case 4: return aco_opcode::buffer_store_dword;
4432 case 8: return aco_opcode::buffer_store_dwordx2;
4433 case 12: return aco_opcode::buffer_store_dwordx3;
4434 case 16: return aco_opcode::buffer_store_dwordx4;
4435 }
4436 unreachable("Unexpected store size");
4437 return aco_opcode::num_opcodes;
4438 }
4439
4440 void
4441 split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type,
4442 Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count,
4443 Temp* write_datas, unsigned* offsets)
4444 {
4445 unsigned write_count_with_skips = 0;
4446 bool skips[16];
4447 unsigned bytes[16];
4448
4449 /* determine how to split the data */
4450 unsigned todo = u_bit_consecutive(0, data.bytes());
4451 while (todo) {
4452 int offset, byte;
4453 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte);
4454 offsets[write_count_with_skips] = offset;
4455 if (skips[write_count_with_skips]) {
4456 bytes[write_count_with_skips] = byte;
4457 advance_write_mask(&todo, offset, byte);
4458 write_count_with_skips++;
4459 continue;
4460 }
4461
4462 /* the only supported sizes are 1, 2, 4, 8, 12 and 16 bytes, and they can't be
4463 * larger than swizzle_element_size */
4464 byte = MIN2(byte, swizzle_element_size);
4465 if (byte % 4)
4466 byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2);
4467
4468 /* SMEM and GFX6 VMEM can't emit 12-byte stores */
4469 if ((ctx->program->chip_class == GFX6 || smem) && byte == 12)
4470 byte = 8;
4471
4472 /* dword or larger stores have to be dword-aligned */
4473 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
4474 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
4475 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
4476 if (!dword_aligned)
4477 byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
4478
4479 bytes[write_count_with_skips] = byte;
4480 advance_write_mask(&todo, offset, byte);
4481 write_count_with_skips++;
4482 }
4483
4484 /* actually split data */
4485 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data);
4486
4487 /* remove skips */
4488 for (unsigned i = 0; i < write_count_with_skips; i++) {
4489 if (skips[i])
4490 continue;
4491 write_datas[*write_count] = write_datas[i];
4492 offsets[*write_count] = offsets[i];
4493 (*write_count)++;
4494 }
4495 }
4496
4497 Temp
4498 create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type,
4499 unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp())
4500 {
4501 Builder bld(ctx->program, ctx->block);
4502 unsigned dword_size = elem_size_bytes / 4;
4503
4504 if (!dst.id())
4505 dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
4506
4507 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
4508 aco_ptr<Pseudo_instruction> instr{
4509 create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
4510 instr->definitions[0] = Definition(dst);
4511
4512 for (unsigned i = 0; i < cnt; ++i) {
4513 if (arr[i].id()) {
4514 assert(arr[i].size() == dword_size);
4515 allocated_vec[i] = arr[i];
4516 instr->operands[i] = Operand(arr[i]);
4517 } else {
4518 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)),
4519 Operand::zero(dword_size == 2 ? 8 : 4));
4520 allocated_vec[i] = zero;
4521 instr->operands[i] = Operand(zero);
4522 }
4523 }
4524
4525 bld.insert(std::move(instr));
4526
4527 if (split_cnt)
4528 emit_split_vector(ctx, dst, split_cnt);
4529 else
4530 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* if split_cnt != 0, emit_split_vector already does this */
4531
4532 return dst;
4533 }
4534
4535 inline unsigned
4536 resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset)
4537 {
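/* The MUBUF immediate offset field is only 12 bits (0..4095); add any excess to voffset here. */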
4538 if (const_offset >= 4096) {
4539 unsigned excess_const_offset = const_offset / 4096u * 4096u;
4540 const_offset %= 4096u;
4541
4542 if (!voffset.id())
4543 voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset));
4544 else if (unlikely(voffset.regClass() == s1))
4545 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
4546 Operand::c32(excess_const_offset), Operand(voffset));
4547 else if (likely(voffset.regClass() == v1))
4548 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset));
4549 else
4550 unreachable("Unsupported register class of voffset");
4551 }
4552
4553 return const_offset;
4554 }
4555
4556 void
4557 emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
4558 unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(),
4559 bool slc = false, bool swizzled = false)
4560 {
4561 assert(vdata.id());
4562 assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
4563 assert(vdata.size() >= 1 && vdata.size() <= 4);
4564
4565 Builder bld(ctx->program, ctx->block);
4566 aco_opcode op = get_buffer_store_op(vdata.bytes());
4567 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
4568
4569 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
4570 Operand soffset_op = soffset.id() ? Operand(soffset) : Operand::zero();
4571 Builder::Result r =
4572 bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
4573 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
4574 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
4575 /* dlc*/ false, /* slc */ slc);
4576
4577 r.instr->mubuf().sync = sync;
4578 }
4579
4580 void
4581 store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
4582 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
4583 bool allow_combining = true, memory_sync_info sync = memory_sync_info(),
4584 bool slc = false)
4585 {
4586 Builder bld(ctx->program, ctx->block);
4587 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4588 assert(write_mask);
4589 write_mask = widen_mask(write_mask, elem_size_bytes);
4590
4591 unsigned write_count = 0;
4592 Temp write_datas[32];
4593 unsigned offsets[32];
4594 split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4,
4595 &write_count, write_datas, offsets);
4596
4597 for (unsigned i = 0; i < write_count; i++) {
4598 unsigned const_offset = offsets[i] + base_const_offset;
4599 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync,
4600 slc, !allow_combining);
4601 }
4602 }
4603
4604 void
4605 load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
4606 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
4607 unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true,
4608 bool slc = false)
4609 {
4610 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4611 assert((num_components * elem_size_bytes) == dst.bytes());
4612 assert(!!stride != allow_combining);
4613
4614 Builder bld(ctx->program, ctx->block);
4615
4616 LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
4617 info.component_stride = allow_combining ? 0 : stride;
4618 info.glc = true;
4619 info.slc = slc;
4620 info.swizzle_component_size = allow_combining ? 0 : 4;
4621 info.align_mul = MIN2(elem_size_bytes, 4);
4622 info.align_offset = 0;
4623 info.soffset = soffset;
4624 info.const_offset = base_const_offset;
4625 emit_load(ctx, bld, info, mubuf_load_params);
4626 }
4627
4628 Temp
4629 wave_id_in_threadgroup(isel_context* ctx)
4630 {
4631 Builder bld(ctx->program, ctx->block);
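/* The wave id within the threadgroup lives in bits [27:24] of merged_wave_info; s_bfe's second
 * source encodes the bit offset in its low bits and the width in bits [22:16], hence 24 | (4 << 16). */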
4632 return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
4633 get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16)));
4634 }
4635
4636 Temp
4637 thread_id_in_threadgroup(isel_context* ctx)
4638 {
4639 /* tid_in_tg = wave_id * wave_size + tid_in_wave */
4640
4641 Builder bld(ctx->program, ctx->block);
4642 Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1));
4643
4644 if (ctx->program->workgroup_size <= ctx->program->wave_size)
4645 return tid_in_wave;
4646
4647 Temp wave_id_in_tg = wave_id_in_threadgroup(ctx);
4648 Temp num_pre_threads =
4649 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg,
4650 Operand::c32(ctx->program->wave_size == 64 ? 6u : 5u));
4651 return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave));
4652 }
4653
4654 Temp
4655 get_tess_rel_patch_id(isel_context* ctx)
4656 {
4657 Builder bld(ctx->program, ctx->block);
4658
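/* For TCS the relative patch id is stored in bits [7:0] of tcs_rel_ids; TES receives it as a
 * separate argument. */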
4659 switch (ctx->shader->info.stage) {
4660 case MESA_SHADER_TESS_CTRL:
4661 return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
4662 Operand::zero(), Operand::c32(8u), Operand::zero());
4663 case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id);
4664 default: unreachable("Unsupported stage in get_tess_rel_patch_id");
4665 }
4666 }
4667
4668 bool
4669 store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr)
4670 {
4671 unsigned write_mask = nir_intrinsic_write_mask(instr);
4672 unsigned component = nir_intrinsic_component(instr);
4673 unsigned idx = nir_intrinsic_base(instr) * 4u + component;
4674 nir_src offset = *nir_get_io_offset_src(instr);
4675
4676 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
4677 return false;
4678
4679 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4680
4681 if (instr->src[0].ssa->bit_size == 64)
4682 write_mask = widen_mask(write_mask, 2);
4683
4684 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4685
4686 for (unsigned i = 0; i < 8; ++i) {
4687 if (write_mask & (1 << i)) {
4688 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4689 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4690 }
4691 idx++;
4692 }
4693
4694 return true;
4695 }
4696
4697 bool
4698 load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst)
4699 {
4700 /* Only TCS per-vertex inputs are supported by this function.
4701 * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations
4702 * is the same.
4703 */
4704 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4705 return false;
4706
4707 nir_src* off_src = nir_get_io_offset_src(instr);
4708 nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr);
4709 nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr;
4710 bool can_use_temps =
4711 nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic &&
4712 nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4713
4714 if (!can_use_temps)
4715 return false;
4716
4717 unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) +
4718 4 * nir_src_as_uint(*off_src);
4719 Temp* src = &ctx->inputs.temps[idx];
4720 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4721
4722 return true;
4723 }
4724
4725 static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos);
4726
4727 void
4728 visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr)
4729 {
4730 if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs ||
4731 ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg ||
4732 (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4733 ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4734 bool stored_to_temps = store_output_to_temps(ctx, instr);
4735 if (!stored_to_temps) {
4736 isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
4737 abort();
4738 }
4739 } else {
4740 unreachable("Shader stage not implemented");
4741 }
4742
4743 /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports, so
4744 * we have to emit the exp for it here ourselves. */
4745 if (ctx->stage.hw == HWStage::NGG &&
4746 (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) &&
4747 nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID)
4748 export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL);
4749 }
4750
4751 void
4752 emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst,
4753 Temp prim_mask)
4754 {
4755 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4756 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4757
4758 Builder bld(ctx->program, ctx->block);
4759
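/* Attribute interpolation is done in two steps: v_interp_p1 multiplies the attribute coefficient
 * with the i coordinate, v_interp_p2 accumulates the j term. 16-bit results use the f16 variants,
 * and chips with 16-bank LDS need the legacy path with an extra P0 move. */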
4760 if (dst.regClass() == v2b) {
4761 if (ctx->program->dev.has_16bank_lds) {
4762 assert(ctx->options->chip_class <= GFX8);
4763 Builder::Result interp_p1 =
4764 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */,
4765 bld.m0(prim_mask), idx, component);
4766 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1,
4767 bld.m0(prim_mask), interp_p1, idx, component);
4768 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask),
4769 interp_p1, idx, component);
4770 } else {
4771 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4772
4773 if (ctx->options->chip_class == GFX8)
4774 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4775
4776 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1,
4777 bld.m0(prim_mask), idx, component);
4778 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx,
4779 component);
4780 }
4781 } else {
4782 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4783 bld.m0(prim_mask), idx, component);
4784
4785 if (ctx->program->dev.has_16bank_lds)
4786 interp_p1.instr->operands[0].setLateKill(true);
4787
4788 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1,
4789 idx, component);
4790 }
4791 }
4792
4793 void
4794 emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components)
4795 {
4796 Builder bld(ctx->program, ctx->block);
4797
4798 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4799 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4800 for (unsigned i = 0; i < num_components; i++) {
4801 if (ctx->args->ac.frag_pos[i].used)
4802 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4803 else
4804 vec->operands[i] = Operand(v1);
4805 }
4806 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4807 assert(num_components == 4);
4808 vec->operands[3] =
4809 bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4810 }
4811
4812 if (ctx->options->adjust_frag_coord_z &&
4813 G_0286CC_POS_Z_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4814 /* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */
4815 Operand frag_z = vec->operands[2];
4816 Temp adjusted_frag_z = bld.tmp(v1);
4817 Temp tmp;
4818
4819 /* dFdx fine */
4820 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), frag_z, dpp_quad_perm(0, 0, 2, 2));
4821 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), frag_z, tl, dpp_quad_perm(1, 1, 3, 3));
4822 emit_wqm(bld, tmp, adjusted_frag_z, true);
4823
4824 /* adjusted_frag_z * 0.0625 + frag_z */
4825 adjusted_frag_z = bld.vop3(aco_opcode::v_fma_f32, bld.def(v1), adjusted_frag_z,
4826 Operand::c32(0x3d800000u /* 0.0625 */), frag_z);
4827
4828 /* VRS Rate X = Ancillary[2:3] */
4829 Temp x_rate =
4830 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4831 Operand::c32(2u), Operand::c32(2u));
4832
4833 /* xRate = xRate == 0x1 ? adjusted_frag_z : frag_z. */
4834 Temp cond =
4835 bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4836 vec->operands[2] =
4837 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond);
4838 }
4839
4840 for (Operand& op : vec->operands)
4841 op = op.isUndefined() ? Operand::zero() : op;
4842
4843 vec->definitions[0] = Definition(dst);
4844 ctx->block->instructions.emplace_back(std::move(vec));
4845 emit_split_vector(ctx, dst, num_components);
4846 return;
4847 }
4848
4849 void
4850 emit_load_frag_shading_rate(isel_context* ctx, Temp dst)
4851 {
4852 Builder bld(ctx->program, ctx->block);
4853 Temp cond;
4854
4855 /* VRS Rate X = Ancillary[2:3]
4856 * VRS Rate Y = Ancillary[4:5]
4857 */
4858 Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4859 Operand::c32(2u), Operand::c32(2u));
4860 Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
4861 Operand::c32(4u), Operand::c32(2u));
4862
4863 /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */
4864 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate));
4865 x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4866 bld.copy(bld.def(v1), Operand::c32(4u)), cond);
4867
4868 /* yRate = yRate == 0x1 ? Vertical2Pixels : None. */
4869 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
4870 y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
4871 bld.copy(bld.def(v1), Operand::c32(1u)), cond);
4872
4873 bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
4874 }
4875
4876 void
4877 visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
4878 {
4879 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4880 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4881 unsigned idx = nir_intrinsic_base(instr);
4882 unsigned component = nir_intrinsic_component(instr);
4883 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4884
4885 assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));
4886
4887 if (instr->dest.ssa.num_components == 1) {
4888 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4889 } else {
4890 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
4891 aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4892 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) {
4893 Temp tmp = ctx->program->allocateTmp(instr->dest.ssa.bit_size == 16 ? v2b : v1);
4894 emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
4895 vec->operands[i] = Operand(tmp);
4896 }
4897 vec->definitions[0] = Definition(dst);
4898 ctx->block->instructions.emplace_back(std::move(vec));
4899 }
4900 }
4901
4902 bool
4903 check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4904 unsigned binding_align, unsigned channels)
4905 {
4906 unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4907 if (vtx_info->chan_byte_size != 4 && channels == 3)
4908 return false;
4909
4910 /* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
4911 * alignment issues that trigger memory violations and eventually a GPU
4912 * hang. This can happen if the stride (static or dynamic) is unaligned, and
4913 * also if the VBO offset is only scalar-aligned (e.g. stride is 8 and the VBO
4914 * offset is 2 for R16G16B16A16_SNORM).
4915 */
4916 return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) ||
4917 (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0);
4918 }
4919
4920 uint8_t
4921 get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
4922 unsigned* channels, unsigned max_channels, unsigned binding_align)
4923 {
4924 if (!vtx_info->chan_byte_size) {
4925 *channels = vtx_info->num_channels;
4926 return vtx_info->chan_format;
4927 }
4928
4929 unsigned num_channels = *channels;
4930 if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) {
4931 unsigned new_channels = num_channels + 1;
4932 /* first, assume more loads is worse and try using a larger data format */
4933 while (new_channels <= max_channels &&
4934 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) {
4935 new_channels++;
4936 }
4937
4938 if (new_channels > max_channels) {
4939 /* then try decreasing load size (at the cost of more loads) */
4940 new_channels = *channels;
4941 while (new_channels > 1 &&
4942 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels))
4943 new_channels--;
4944 }
4945
4946 if (new_channels < *channels)
4947 *channels = new_channels;
4948 num_channels = new_channels;
4949 }
4950
4951 switch (vtx_info->chan_format) {
4952 case V_008F0C_BUF_DATA_FORMAT_8:
4953 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4954 V_008F0C_BUF_DATA_FORMAT_INVALID,
4955 V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4956 case V_008F0C_BUF_DATA_FORMAT_16:
4957 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4958 V_008F0C_BUF_DATA_FORMAT_INVALID,
4959 V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4960 case V_008F0C_BUF_DATA_FORMAT_32:
4961 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4962 V_008F0C_BUF_DATA_FORMAT_32_32_32,
4963 V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4964 }
4965 unreachable("shouldn't reach here");
4966 return V_008F0C_BUF_DATA_FORMAT_INVALID;
4967 }
4968
4969 /* For 2_10_10_10 formats the alpha channel is handled as unsigned by pre-Vega HW,
4970 * so we may need to fix it up. */
4971 Temp
4972 adjust_vertex_fetch_alpha(isel_context* ctx, enum radv_vs_input_alpha_adjust adjustment, Temp alpha)
4973 {
4974 Builder bld(ctx->program, ctx->block);
4975
4976 if (adjustment == ALPHA_ADJUST_SSCALED)
4977 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4978
4979 /* For the integer-like cases, do a natural sign extension.
4980 *
4981 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4982 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4983 * exponent.
4984 */
4985 unsigned offset = adjustment == ALPHA_ADJUST_SNORM ? 23u : 0u;
4986 alpha =
4987 bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u));
4988
4989 /* Convert back to the right type. */
4990 if (adjustment == ALPHA_ADJUST_SNORM) {
4991 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4992 alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha);
4993 } else if (adjustment == ALPHA_ADJUST_SSCALED) {
4994 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4995 }
4996
4997 return alpha;
4998 }
4999
5000 void
5001 visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr)
5002 {
5003 Builder bld(ctx->program, ctx->block);
5004 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5005 nir_src offset = *nir_get_io_offset_src(instr);
5006
5007 if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.dynamic_inputs) {
5008 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5009 isel_err(offset.ssa->parent_instr,
5010 "Unimplemented non-zero nir_intrinsic_load_input offset");
5011
5012 unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
5013 unsigned component = nir_intrinsic_component(instr);
5014 unsigned bitsize = instr->dest.ssa.bit_size;
5015 unsigned num_components = instr->dest.ssa.num_components;
5016
5017 Temp input = get_arg(ctx, ctx->args->vs_inputs[location]);
5018
5019 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5020 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5021 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5022 for (unsigned i = 0; i < num_components; i++) {
5023 elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1);
5024 if (bitsize == 16) {
5025 if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float)
5026 elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]);
5027 else
5028 elems[i] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), elems[i],
5029 Operand::c32(0u));
5030 }
5031 vec->operands[i] = Operand(elems[i]);
5032 }
5033 vec->definitions[0] = Definition(dst);
5034 ctx->block->instructions.emplace_back(std::move(vec));
5035 ctx->allocated_vec.emplace(dst.id(), elems);
5036 } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
5037
5038 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5039 isel_err(offset.ssa->parent_instr,
5040 "Unimplemented non-zero nir_intrinsic_load_input offset");
5041
5042 Temp vertex_buffers =
5043 convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers));
5044
5045 unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0;
5046 unsigned component = nir_intrinsic_component(instr);
5047 unsigned bitsize = instr->dest.ssa.bit_size;
5048 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
5049 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
5050 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
5051 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
5052 unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding];
5053 enum radv_vs_input_alpha_adjust alpha_adjust =
5054 ctx->options->key.vs.vertex_alpha_adjust[location];
5055
5056 unsigned dfmt = attrib_format & 0xf;
5057 unsigned nfmt = (attrib_format >> 4) & 0x7;
5058 const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
5059
5060 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
5061 unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
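/* post_shuffle marks attributes whose first and third components are stored swapped (BGRA-style
 * formats, see swizzle_post_shuffle below), so at least the first three channels must be fetched. */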
5062 bool post_shuffle = ctx->options->key.vs.vertex_post_shuffle & (1 << location);
5063 if (post_shuffle)
5064 num_channels = MAX2(num_channels, 3);
5065
5066 unsigned desc_index =
5067 ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding;
5068 desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask &
5069 u_bit_consecutive(0, desc_index));
5070 Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u));
5071 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
5072
5073 Temp index;
5074 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
5075 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
5076 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
5077 if (divisor) {
5078 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
5079 if (divisor != 1) {
5080 Temp divided = bld.tmp(v1);
5081 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
5082 index = bld.vadd32(bld.def(v1), start_instance, divided);
5083 } else {
5084 index = bld.vadd32(bld.def(v1), start_instance, instance_id);
5085 }
5086 } else {
5087 index = bld.copy(bld.def(v1), start_instance);
5088 }
5089 } else {
5090 index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex),
5091 get_arg(ctx, ctx->args->ac.vertex_id));
5092 }
5093
5094 Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp));
5095 unsigned channel_start = 0;
5096 bool direct_fetch = false;
5097
5098 /* skip unused channels at the start */
5099 if (vtx_info->chan_byte_size && !post_shuffle) {
5100 channel_start = ffs(mask) - 1;
5101 for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++)
5102 channels[i] = Temp(0, s1);
5103 } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
5104 num_channels = 3 - (ffs(mask) - 1);
5105 }
5106
5107 /* load channels */
5108 while (channel_start < num_channels) {
5109 unsigned fetch_component = num_channels - channel_start;
5110 unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
5111 bool expanded = false;
5112
5113 /* use MUBUF when possible to avoid potential alignment issues */
5114 /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
5115 bool use_mubuf =
5116 (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
5117 nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
5118 vtx_info->chan_byte_size == 4;
5119 unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
5120 if (!use_mubuf) {
5121 fetch_dfmt =
5122 get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component,
5123 vtx_info->num_channels - channel_start, binding_align);
5124 } else {
5125 if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
5126 /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
5127 fetch_component = 4;
5128 expanded = true;
5129 }
5130 }
5131
5132 unsigned fetch_bytes = fetch_component * bitsize / 8;
5133
5134 Temp fetch_index = index;
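/* If the constant offset exceeds the stride, fold whole strides into the index so the
 * remaining immediate offset stays smaller than the stride. */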
5135 if (attrib_stride != 0 && fetch_offset > attrib_stride) {
5136 fetch_index =
5137 bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index);
5138 fetch_offset = fetch_offset % attrib_stride;
5139 }
5140
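/* MTBUF/MUBUF immediate offsets are 12-bit; move any excess into soffset. */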
5141 Operand soffset = Operand::zero();
5142 if (fetch_offset >= 4096) {
5143 soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096));
5144 fetch_offset %= 4096;
5145 }
5146
5147 aco_opcode opcode;
5148 switch (fetch_bytes) {
5149 case 2:
5150 assert(!use_mubuf && bitsize == 16);
5151 opcode = aco_opcode::tbuffer_load_format_d16_x;
5152 break;
5153 case 4:
5154 if (bitsize == 16) {
5155 assert(!use_mubuf);
5156 opcode = aco_opcode::tbuffer_load_format_d16_xy;
5157 } else {
5158 opcode =
5159 use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
5160 }
5161 break;
5162 case 6:
5163 assert(!use_mubuf && bitsize == 16);
5164 opcode = aco_opcode::tbuffer_load_format_d16_xyz;
5165 break;
5166 case 8:
5167 if (bitsize == 16) {
5168 assert(!use_mubuf);
5169 opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
5170 } else {
5171 opcode =
5172 use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
5173 }
5174 break;
5175 case 12:
5176 assert(ctx->options->chip_class >= GFX7 ||
5177 (!use_mubuf && ctx->options->chip_class == GFX6));
5178 opcode =
5179 use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
5180 break;
5181 case 16:
5182 opcode =
5183 use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
5184 break;
5185 default: unreachable("Unimplemented load_input vector size");
5186 }
5187
5188 Temp fetch_dst;
5189 if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded &&
5190 (alpha_adjust == ALPHA_ADJUST_NONE || num_channels <= 3)) {
5191 direct_fetch = true;
5192 fetch_dst = dst;
5193 } else {
5194 fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
5195 }
5196
5197 if (use_mubuf) {
5198 Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index,
5199 soffset, fetch_offset, false, false, true)
5200 .instr;
5201 mubuf->mubuf().vtx_binding = attrib_binding + 1;
5202 } else {
5203 Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index,
5204 soffset, fetch_dfmt, nfmt, fetch_offset, false, true)
5205 .instr;
5206 mtbuf->mtbuf().vtx_binding = attrib_binding + 1;
5207 }
5208
5209 emit_split_vector(ctx, fetch_dst, fetch_dst.size());
5210
5211 if (fetch_component == 1) {
5212 channels[channel_start] = fetch_dst;
5213 } else {
5214 for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
5215 channels[channel_start + i] =
5216 emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1);
5217 }
5218
5219 channel_start += fetch_component;
5220 }
5221
5222 if (!direct_fetch) {
5223 bool is_float =
5224 nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
5225
5226 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
5227 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
5228 const unsigned* swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
5229 unsigned num_components = instr->dest.ssa.num_components;
5230
5231 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
5232 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5233 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5234 unsigned num_temp = 0;
5235 for (unsigned i = 0; i < num_components; i++) {
5236 unsigned idx = i + component;
5237 if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
5238 Temp channel = channels[swizzle[idx]];
5239 if (idx == 3 && alpha_adjust != ALPHA_ADJUST_NONE)
5240 channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
5241 vec->operands[i] = Operand(channel);
5242
5243 num_temp++;
5244 elems[i] = channel;
5245 } else if (is_float && idx == 3) {
5246 vec->operands[i] = Operand::c32(0x3f800000u);
5247 } else if (!is_float && idx == 3) {
5248 vec->operands[i] = Operand::c32(1u);
5249 } else {
5250 vec->operands[i] = Operand::zero();
5251 }
5252 }
5253 vec->definitions[0] = Definition(dst);
5254 ctx->block->instructions.emplace_back(std::move(vec));
5255 emit_split_vector(ctx, dst, num_components);
5256
5257 if (num_temp == num_components)
5258 ctx->allocated_vec.emplace(dst.id(), elems);
5259 }
5260 } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
5261 if (!nir_src_is_const(offset) || nir_src_as_uint(offset))
5262 isel_err(offset.ssa->parent_instr,
5263 "Unimplemented non-zero nir_intrinsic_load_input offset");
5264
5265 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
5266
5267 unsigned idx = nir_intrinsic_base(instr);
5268 unsigned component = nir_intrinsic_component(instr);
5269 unsigned vertex_id = 2; /* P0 */
5270
5271 if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
5272 nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
5273 switch (src0->u32) {
5274 case 0:
5275 vertex_id = 2; /* P0 */
5276 break;
5277 case 1:
5278 vertex_id = 0; /* P10 */
5279 break;
5280 case 2:
5281 vertex_id = 1; /* P20 */
5282 break;
5283 default: unreachable("invalid vertex index");
5284 }
5285 }
5286
5287 if (instr->dest.ssa.num_components == 1 &&
5288 instr->dest.ssa.bit_size != 64) {
5289 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id),
5290 bld.m0(prim_mask), idx, component);
5291 } else {
5292 unsigned num_components = instr->dest.ssa.num_components;
5293 if (instr->dest.ssa.bit_size == 64)
5294 num_components *= 2;
5295 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5296 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
5297 for (unsigned i = 0; i < num_components; i++) {
5298 unsigned chan_component = (component + i) % 4;
5299 unsigned chan_idx = idx + (component + i) / 4;
5300 vec->operands[i] = bld.vintrp(
5301 aco_opcode::v_interp_mov_f32, bld.def(instr->dest.ssa.bit_size == 16 ? v2b : v1),
5302 Operand::c32(vertex_id), bld.m0(prim_mask), chan_idx, chan_component);
5303 }
5304 vec->definitions[0] = Definition(dst);
5305 bld.insert(std::move(vec));
5306 }
5307 } else {
5308 unreachable("Shader stage not implemented");
5309 }
5310 }
5311
5312 void
5313 visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5314 {
5315 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5316
5317 Builder bld(ctx->program, ctx->block);
5318 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5319
5320 if (load_input_from_temps(ctx, instr, dst))
5321 return;
5322
5323 unreachable("LDS-based TCS input should have been lowered in NIR.");
5324 }
5325
5326 void
5327 visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr)
5328 {
5329 switch (ctx->shader->info.stage) {
5330 case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break;
5331 default: unreachable("Unimplemented shader stage");
5332 }
5333 }
5334
5335 void
5336 visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr)
5337 {
5338 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5339
5340 Builder bld(ctx->program, ctx->block);
5341 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5342
5343 Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u));
5344 Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v));
5345 Operand tes_w = Operand::zero();
5346
5347 if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5348 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5349 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp);
5350 tes_w = Operand(tmp);
5351 }
5352
5353 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5354 emit_split_vector(ctx, tess_coord, 3);
5355 }
5356
5357 Temp
5358 load_desc_ptr(isel_context* ctx, unsigned desc_set)
5359 {
5360 const struct radv_userdata_locations *user_sgprs_locs = &ctx->program->info->user_sgprs_locs;
5361
5362 if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
5363 Builder bld(ctx->program, ctx->block);
5364 Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5365 Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2));
5366 return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);
5367 }
5368
5369 return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5370 }
5371
5372 void
5373 visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr)
5374 {
5375 Builder bld(ctx->program, ctx->block);
5376 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5377 if (!nir_dest_is_divergent(instr->dest))
5378 index = bld.as_uniform(index);
5379 unsigned desc_set = nir_intrinsic_desc_set(instr);
5380 unsigned binding = nir_intrinsic_binding(instr);
5381
5382 Temp desc_ptr;
5383 radv_pipeline_layout* pipeline_layout = ctx->options->layout;
5384 radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout;
5385 unsigned offset = layout->binding[binding].offset;
5386 unsigned stride;
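/* Dynamic UBO/SSBO descriptors are placed right after the push constants, 16 bytes each,
 * so they are addressed relative to the push constant pointer. */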
5387 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5388 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5389 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
5390 layout->binding[binding].dynamic_offset_offset;
5391 desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5392 offset = pipeline_layout->push_constant_size + 16 * idx;
5393 stride = 16;
5394 } else {
5395 desc_ptr = load_desc_ptr(ctx, desc_set);
5396 stride = layout->binding[binding].size;
5397 }
5398
5399 if (nir_src_is_const(instr->src[0])) {
5400 index =
5401 bld.copy(bld.def(s1), Operand::c32((offset + nir_src_as_uint(instr->src[0]) * stride)));
5402 } else if (index.type() == RegType::vgpr) {
5403 if (stride != 1) {
5404 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5405 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5406 }
5407 if (offset)
5408 index = bld.vadd32(bld.def(v1), Operand::c32(offset), index);
5409 } else {
5410 if (stride != 1)
5411 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index);
5412 if (offset)
5413 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5414 Operand::c32(offset), index);
5415 }
5416
5417 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5418 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5419 elems[0] = desc_ptr;
5420 elems[1] = index;
5421 ctx->allocated_vec.emplace(dst.id(), elems);
5422 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand::zero());
5423 }
5424
5425 void
5426 load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst,
5427 Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false,
5428 bool allow_smem = true, memory_sync_info sync = memory_sync_info())
5429 {
5430 Builder bld(ctx->program, ctx->block);
5431
5432 bool use_smem =
5433 dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5434 if (use_smem)
5435 offset = bld.as_uniform(offset);
5436 else {
5437 /* GFX6-7 are affected by a hw bug that prevents address clamping from
5438 * working correctly when the SGPR offset is used.
5439 */
5440 if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
5441 offset = as_vgpr(ctx, offset);
5442 }
5443
5444 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5445 info.glc = glc;
5446 info.sync = sync;
5447 info.align_mul = align_mul;
5448 info.align_offset = align_offset;
5449 if (use_smem)
5450 emit_load(ctx, bld, info, smem_load_params);
5451 else
5452 emit_load(ctx, bld, info, mubuf_load_params);
5453 }
5454
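/* Turns the (set pointer, binding offset) pair produced by
 * visit_load_resource into a full buffer descriptor by loading four dwords
 * from the descriptor set with SMEM.
 */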
5455 Temp
5456 load_buffer_rsrc(isel_context* ctx, Temp rsrc)
5457 {
5458 Builder bld(ctx->program, ctx->block);
5459 Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
5460 Temp binding = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5461 set_ptr = convert_pointer_to_64_bit(ctx, set_ptr);
5462 return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding);
5463 }
5464
5465 bool
5466 is_inline_ubo(isel_context* ctx, nir_src rsrc)
5467 {
5468 nir_binding binding = nir_chase_binding(rsrc);
5469 if (!binding.success)
5470 return false;
5471
5472 radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout;
5473 return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT;
5474 }
5475
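/* For inline uniform blocks there is no buffer descriptor in the set: the
 * data is embedded at set pointer + binding offset, so a raw buffer
 * descriptor is built on the fly (num_records = 0xffffffff, i.e. effectively
 * unbounded). Regular UBOs load their descriptor from the set instead.
 */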
5476 void
5477 visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr)
5478 {
5479 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5480 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5481
5482 Builder bld(ctx->program, ctx->block);
5483
5484 if (is_inline_ubo(ctx, instr->src[0])) {
5485 Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)));
5486 Temp binding_off =
5487 bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)));
5488 rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off);
5489
5490 uint32_t desc_type =
5491 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5492 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5493 if (ctx->options->chip_class >= GFX10) {
5494 desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5495 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5496 } else {
5497 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5498 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5499 }
5500 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), rsrc,
5501 Operand::c32(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5502 Operand::c32(0xFFFFFFFFu), Operand::c32(desc_type));
5503 } else {
5504 rsrc = load_buffer_rsrc(ctx, rsrc);
5505 }
5506 unsigned size = instr->dest.ssa.bit_size / 8;
5507 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5508 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5509 }
5510
5511 void
5512 visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr)
5513 {
5514 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5515 unsigned binding = nir_intrinsic_binding(instr);
5516
5517 Builder bld(ctx->program, ctx->block);
5518 Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors));
5519 Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u));
5520 bld.smem(aco_opcode::s_load_dwordx4, Definition(dst), desc_base, desc_off);
5521 }
5522
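/* Fast path: when the offset is constant, 32-bit and fully covered by the
 * push constants that were inlined into user SGPRs, the result is assembled
 * directly from those SGPRs. Otherwise the value is loaded with SMEM from
 * the push constant pointer; 8/16-bit results that are not dword-aligned get
 * a byte-align fixup, and 3/6-dword results are loaded as 4/8 dwords and
 * trimmed afterwards.
 */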
5523 void
5524 visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5525 {
5526 Builder bld(ctx->program, ctx->block);
5527 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5528 unsigned offset = nir_intrinsic_base(instr);
5529 unsigned count = instr->dest.ssa.num_components;
5530 nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]);
5531
5532 if (index_cv && instr->dest.ssa.bit_size == 32) {
5533 struct radv_userdata_info *loc =
5534 &ctx->args->shader_info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS];
5535 unsigned start = (offset + index_cv->u32) / 4u;
5536 unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? loc->num_sgprs : 0;
5537
5538 start -= ctx->args->shader_info->min_push_constant_used / 4;
5539 if (start + count <= num_inline_push_consts) {
5540 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
5541 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5542 aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5543 for (unsigned i = 0; i < count; ++i) {
5544 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5545 vec->operands[i] = Operand{elems[i]};
5546 }
5547 vec->definitions[0] = Definition(dst);
5548 ctx->block->instructions.emplace_back(std::move(vec));
5549 ctx->allocated_vec.emplace(dst.id(), elems);
5550 return;
5551 }
5552 }
5553
5554 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5555 if (offset != 0) // TODO check if index != 0 as well
5556 index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5557 Operand::c32(offset), index);
5558 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5559 Temp vec = dst;
5560 bool trim = false;
5561 bool aligned = true;
5562
5563 if (instr->dest.ssa.bit_size == 8) {
5564 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5565 bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5566 if (!aligned)
5567 vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5568 } else if (instr->dest.ssa.bit_size == 16) {
5569 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5570 if (!aligned)
5571 vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5572 }
5573
5574 aco_opcode op;
5575
5576 switch (vec.size()) {
5577 case 1: op = aco_opcode::s_load_dword; break;
5578 case 2: op = aco_opcode::s_load_dwordx2; break;
5579 case 3:
5580 vec = bld.tmp(s4);
5581 trim = true;
5582 FALLTHROUGH;
5583 case 4: op = aco_opcode::s_load_dwordx4; break;
5584 case 6:
5585 vec = bld.tmp(s8);
5586 trim = true;
5587 FALLTHROUGH;
5588 case 8: op = aco_opcode::s_load_dwordx8; break;
5589 default: unreachable("unimplemented or forbidden load_push_constant.");
5590 }
5591
5592 bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true;
5593
5594 if (!aligned) {
5595 Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index);
5596 byte_align_scalar(ctx, vec, byte_offset, dst);
5597 return;
5598 }
5599
5600 if (trim) {
5601 emit_split_vector(ctx, vec, 4);
5602 RegClass rc = dst.size() == 3 ? s1 : s2;
5603 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc),
5604 emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc));
5605 }
5606 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5607 }
5608
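/* Loads from the shader's embedded constant data: a buffer descriptor is
 * built around p_constaddr (the base address of the constant data) with
 * num_records clamped to constant_data_size, and the value is read through
 * load_buffer.
 */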
5609 void
5610 visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr)
5611 {
5612 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5613
5614 Builder bld(ctx->program, ctx->block);
5615
5616 uint32_t desc_type =
5617 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5618 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5619 if (ctx->options->chip_class >= GFX10) {
5620 desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
5621 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
5622 } else {
5623 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5624 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5625 }
5626
5627 unsigned base = nir_intrinsic_base(instr);
5628 unsigned range = nir_intrinsic_range(instr);
5629
5630 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5631 if (base && offset.type() == RegType::sgpr)
5632 offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
5633 Operand::c32(base));
5634 else if (base && offset.type() == RegType::vgpr)
5635 offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset);
5636
5637 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5638 bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc),
5639 Operand::c32(ctx->constant_data_offset)),
5640 Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)),
5641 Operand::c32(desc_type));
5642 unsigned size = instr->dest.ssa.bit_size / 8;
5643 // TODO: get alignment information for subdword constants
5644 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5645 }
5646
5647 void
5648 visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr)
5649 {
5650 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5651 ctx->cf_info.exec_potentially_empty_discard = true;
5652
5653 ctx->program->needs_exact = true;
5654
5655 // TODO: optimize uniform conditions
5656 Builder bld(ctx->program, ctx->block);
5657 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5658 assert(src.regClass() == bld.lm);
5659 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5660 bld.pseudo(aco_opcode::p_discard_if, src);
5661 ctx->block->kind |= block_kind_uses_discard_if;
5662 return;
5663 }
5664
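/* Several cases are handled here: a uniform discard that ends a loop
 * iteration behaves like a break; a discard followed by further (unreachable)
 * NIR code becomes a p_discard_if of the currently active lanes; a discard in
 * uniform control flow simply ends the program; and in divergent control flow
 * the block is only marked, with the branch and linear edge added later by
 * visit_if().
 */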
5665 void
5666 visit_discard(isel_context* ctx, nir_intrinsic_instr* instr)
5667 {
5668 Builder bld(ctx->program, ctx->block);
5669
5670 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5671 ctx->cf_info.exec_potentially_empty_discard = true;
5672
5673 bool divergent =
5674 ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue;
5675
5676 if (ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) {
5677 /* we handle discards the same way as jump instructions */
5678 append_logical_end(ctx->block);
5679
5680 /* in loops, discard behaves like break */
5681 Block* linear_target = ctx->cf_info.parent_loop.exit;
5682 ctx->block->kind |= block_kind_discard;
5683
5684 /* uniform discard - loop ends here */
5685 assert(nir_instr_is_last(&instr->instr));
5686 ctx->block->kind |= block_kind_uniform;
5687 ctx->cf_info.has_branch = true;
5688 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
5689 add_linear_edge(ctx->block->index, linear_target);
5690 return;
5691 }
5692
5693 /* it can currently happen that NIR doesn't remove the unreachable code */
5694 if (!nir_instr_is_last(&instr->instr)) {
5695 ctx->program->needs_exact = true;
5696 /* save exec in a temporary so that it doesn't get
5697 * overwritten by outer exec masks before the discard */
5698 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc),
5699 Operand::c32(0xFFFFFFFF), Operand(exec, bld.lm));
5700 bld.pseudo(aco_opcode::p_discard_if, cond);
5701 ctx->block->kind |= block_kind_uses_discard_if;
5702 return;
5703 }
5704
5705 /* This condition is incorrect for uniformly branched discards in a loop
5706 * predicated by a divergent condition, but the above code catches that case
5707 * and the discard would end up turning into a discard_if.
5708 * For example:
5709 * if (divergent) {
5710 * while (...) {
5711 * if (uniform) {
5712 * discard;
5713 * }
5714 * }
5715 * }
5716 */
5717 if (!ctx->cf_info.parent_if.is_divergent) {
5718 /* program just ends here */
5719 ctx->block->kind |= block_kind_uses_discard_if;
5720 bld.pseudo(aco_opcode::p_discard_if, Operand::c32(0xFFFFFFFFu));
5721 // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
5722 } else {
5723 ctx->block->kind |= block_kind_discard;
5724 /* branch and linear edge is added by visit_if() */
5725 }
5726 }
5727
5728 enum aco_descriptor_type {
5729 ACO_DESC_IMAGE,
5730 ACO_DESC_FMASK,
5731 ACO_DESC_SAMPLER,
5732 ACO_DESC_BUFFER,
5733 ACO_DESC_PLANE_0,
5734 ACO_DESC_PLANE_1,
5735 ACO_DESC_PLANE_2,
5736 };
5737
5738 static bool
5739 should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array)
5740 {
5741 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5742 return false;
5743 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5744 return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray ||
5745 dim == ac_image_2darraymsaa;
5746 }
5747
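/* Loads an image/sampler/buffer descriptor for the given deref (or, without a
 * deref, for tex_instr's sampler_index in set 0). The deref chain is walked
 * to accumulate a constant index and an optional dynamic SGPR index into the
 * binding's array; the descriptor is then fetched with SMEM from
 * offset + index * stride within the set. Immutable samplers are emitted as
 * constants, the PLANE_2 descriptor is completed with the second half of the
 * PLANE_1 descriptor, and image/sampler descriptors get the DCC and
 * TRUNC_COORD workarounds applied below.
 */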
5748 Temp
5749 get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr,
5750 enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write)
5751 {
5752 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
5753 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type <<
5754 32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second;
5755 */
5756 Temp index = Temp();
5757 bool index_set = false;
5758 unsigned constant_index = 0;
5759 unsigned descriptor_set;
5760 unsigned base_index;
5761 Builder bld(ctx->program, ctx->block);
5762
5763 if (!deref_instr) {
5764 assert(tex_instr);
5765 descriptor_set = 0;
5766 base_index = tex_instr->sampler_index;
5767 } else {
5768 while (deref_instr->deref_type != nir_deref_type_var) {
5769 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5770 if (!array_size)
5771 array_size = 1;
5772
5773 assert(deref_instr->deref_type == nir_deref_type_array);
5774 nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index);
5775 if (const_value) {
5776 constant_index += array_size * const_value->u32;
5777 } else {
5778 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5779 if (indirect.type() == RegType::vgpr)
5780 indirect = bld.as_uniform(indirect);
5781
5782 if (array_size != 1)
5783 indirect =
5784 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(array_size), indirect);
5785
5786 if (!index_set) {
5787 index = indirect;
5788 index_set = true;
5789 } else {
5790 index =
5791 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5792 }
5793 }
5794
5795 deref_instr = nir_src_as_deref(deref_instr->parent);
5796 }
5797 descriptor_set = deref_instr->var->data.descriptor_set;
5798 base_index = deref_instr->var->data.binding;
5799 }
5800
5801 Temp list = load_desc_ptr(ctx, descriptor_set);
5802 list = convert_pointer_to_64_bit(ctx, list);
5803
5804 struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout;
5805 struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index;
5806 unsigned offset = binding->offset;
5807 unsigned stride = binding->size;
5808 aco_opcode opcode;
5809 RegClass type;
5810
5811 assert(base_index < layout->binding_count);
5812
5813 switch (desc_type) {
5814 case ACO_DESC_IMAGE:
5815 type = s8;
5816 opcode = aco_opcode::s_load_dwordx8;
5817 break;
5818 case ACO_DESC_FMASK:
5819 type = s8;
5820 opcode = aco_opcode::s_load_dwordx8;
5821 offset += 32;
5822 break;
5823 case ACO_DESC_SAMPLER:
5824 type = s4;
5825 opcode = aco_opcode::s_load_dwordx4;
5826 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5827 offset += radv_combined_image_descriptor_sampler_offset(binding);
5828 break;
5829 case ACO_DESC_BUFFER:
5830 type = s4;
5831 opcode = aco_opcode::s_load_dwordx4;
5832 break;
5833 case ACO_DESC_PLANE_0:
5834 case ACO_DESC_PLANE_1:
5835 type = s8;
5836 opcode = aco_opcode::s_load_dwordx8;
5837 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5838 break;
5839 case ACO_DESC_PLANE_2:
5840 type = s4;
5841 opcode = aco_opcode::s_load_dwordx4;
5842 offset += 64;
5843 break;
5844 default: unreachable("invalid desc_type\n");
5845 }
5846
5847 offset += constant_index * stride;
5848
5849 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5850 (!index_set || binding->immutable_samplers_equal)) {
5851 if (binding->immutable_samplers_equal)
5852 constant_index = 0;
5853
5854 const uint32_t* samplers = radv_immutable_samplers(layout, binding);
5855 uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? C_008F30_TRUNC_COORD : 0xffffffffu;
5856 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5857 Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask),
5858 Operand::c32(samplers[constant_index * 4 + 1]),
5859 Operand::c32(samplers[constant_index * 4 + 2]),
5860 Operand::c32(samplers[constant_index * 4 + 3]));
5861 }
5862
5863 Operand off;
5864 if (!index_set) {
5865 off = bld.copy(bld.def(s1), Operand::c32(offset));
5866 } else {
5867 off = Operand(
5868 (Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset),
5869 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index)));
5870 }
5871
5872 Temp res = bld.smem(opcode, bld.def(type), list, off);
5873
5874 if (desc_type == ACO_DESC_PLANE_2) {
5875 Temp components[8];
5876 for (unsigned i = 0; i < 8; i++)
5877 components[i] = bld.tmp(s1);
5878 bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5879 Definition(components[2]), Definition(components[3]), res);
5880
5881 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write);
5882 bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5883 Definition(components[4]), Definition(components[5]), Definition(components[6]),
5884 Definition(components[7]), desc2);
5885
5886 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5887 components[2], components[3], components[4], components[5], components[6],
5888 components[7]);
5889 } else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr &&
5890 !write) {
5891 Temp components[8];
5892 for (unsigned i = 0; i < 8; i++)
5893 components[i] = bld.tmp(s1);
5894
5895 bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5896 Definition(components[2]), Definition(components[3]), Definition(components[4]),
5897 Definition(components[5]), Definition(components[6]), Definition(components[7]),
5898 res);
5899
5900 /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to work around a
5901 * hardware bug.
5902 */
5903 components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6],
5904 bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE)));
5905
5906 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1],
5907 components[2], components[3], components[4], components[5], components[6],
5908 components[7]);
5909 } else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) {
5910 Temp components[4];
5911 for (unsigned i = 0; i < 4; i++)
5912 components[i] = bld.tmp(s1);
5913
5914 bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]),
5915 Definition(components[2]), Definition(components[3]), res);
5916
5917 /* We want to always use the linear filtering truncation behaviour for
5918 * nir_texop_tg4, even if the sampler uses nearest/point filtering.
5919 */
5920 components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0],
5921 Operand::c32(C_008F30_TRUNC_COORD));
5922
5923 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1],
5924 components[2], components[3]);
5925 }
5926
5927 return res;
5928 }
5929
5930 static int
5931 image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5932 {
5933 switch (dim) {
5934 case GLSL_SAMPLER_DIM_BUF: return 1;
5935 case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1;
5936 case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2;
5937 case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3;
5938 case GLSL_SAMPLER_DIM_3D:
5939 case GLSL_SAMPLER_DIM_CUBE: return 3;
5940 case GLSL_SAMPLER_DIM_RECT:
5941 case GLSL_SAMPLER_DIM_SUBPASS: return 2;
5942 case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3;
5943 default: break;
5944 }
5945 return 0;
5946 }
5947
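/* Builds a MIMG instruction with the given resource, sampler and coordinates.
 * On GFX10+ the coordinates can stay in separate VGPRs (NSA encoding) as long
 * as there are not too many of them (5 on GFX10, 13 on GFX10.3+); otherwise
 * they are packed into a single coordinate vector. SGPR coordinates are
 * copied to VGPRs, and wqm_mask selects which coordinates (or the packed
 * vector) must be computed in WQM.
 */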
5948 static MIMG_instruction*
5949 emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp,
5950 std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1))
5951 {
5952 /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */
5953 unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5;
5954 bool use_nsa = bld.program->chip_class >= GFX10 && coords.size() <= max_nsa_size;
5955
5956 if (!use_nsa) {
5957 Temp coord = coords[0];
5958 if (coords.size() > 1) {
5959 coord = bld.tmp(RegType::vgpr, coords.size());
5960
5961 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
5962 aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5963 for (unsigned i = 0; i < coords.size(); i++)
5964 vec->operands[i] = Operand(coords[i]);
5965 vec->definitions[0] = Definition(coord);
5966 bld.insert(std::move(vec));
5967 } else if (coord.type() == RegType::sgpr) {
5968 coord = bld.copy(bld.def(v1), coord);
5969 }
5970
5971 if (wqm_mask) {
5972 /* We don't need the bias, sample index, compare value or offset to be
5973 * computed in WQM, but if the p_create_vector copies the coordinates, then it
5974 * needs to be in WQM. */
5975 coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true);
5976 }
5977
5978 coords[0] = coord;
5979 coords.resize(1);
5980 } else {
5981 for (unsigned i = 0; i < coords.size(); i++) {
5982 if (wqm_mask & (1u << i))
5983 coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true);
5984 }
5985
5986 for (Temp& coord : coords) {
5987 if (coord.type() == RegType::sgpr)
5988 coord = bld.copy(bld.def(v1), coord);
5989 }
5990 }
5991
5992 aco_ptr<MIMG_instruction> mimg{
5993 create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())};
5994 if (dst.isTemp())
5995 mimg->definitions[0] = dst;
5996 mimg->operands[0] = Operand(rsrc);
5997 mimg->operands[1] = samp;
5998 mimg->operands[2] = vdata;
5999 for (unsigned i = 0; i < coords.size(); i++)
6000 mimg->operands[3 + i] = Operand(coords[i]);
6001
6002 MIMG_instruction* res = mimg.get();
6003 bld.insert(std::move(mimg));
6004 return res;
6005 }
6006
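/* image_bvh64_intersect_ray takes its operands as a flat list of VGPRs:
 * node pointer (2x32), ray extent tmax, origin xyz, direction xyz and
 * inverse direction xyz, in that order.
 */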
6007 void
6008 visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr)
6009 {
6010 Builder bld(ctx->program, ctx->block);
6011 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6012 Temp resource = get_ssa_temp(ctx, instr->src[0].ssa);
6013 Temp node = get_ssa_temp(ctx, instr->src[1].ssa);
6014 Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa);
6015 Temp origin = get_ssa_temp(ctx, instr->src[3].ssa);
6016 Temp dir = get_ssa_temp(ctx, instr->src[4].ssa);
6017 Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa);
6018
6019 std::vector<Temp> args;
6020 args.push_back(emit_extract_vector(ctx, node, 0, v1));
6021 args.push_back(emit_extract_vector(ctx, node, 1, v1));
6022 args.push_back(as_vgpr(ctx, tmax));
6023 args.push_back(emit_extract_vector(ctx, origin, 0, v1));
6024 args.push_back(emit_extract_vector(ctx, origin, 1, v1));
6025 args.push_back(emit_extract_vector(ctx, origin, 2, v1));
6026 args.push_back(emit_extract_vector(ctx, dir, 0, v1));
6027 args.push_back(emit_extract_vector(ctx, dir, 1, v1));
6028 args.push_back(emit_extract_vector(ctx, dir, 2, v1));
6029 args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1));
6030 args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1));
6031 args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1));
6032
6033 MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst),
6034 resource, Operand(s4), args);
6035 mimg->dim = ac_image_1d;
6036 mimg->dmask = 0xf;
6037 mimg->unrm = true;
6038 mimg->r128 = true;
6039 }
6040
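/* Gathers the VGPR coordinates for an image intrinsic in hardware order: the
 * sample index goes last for MS images, GFX9 1D images get a dummy
 * y-coordinate of 0, and for (sparse) loads and stores with a non-zero LOD
 * the LOD is appended as the final coordinate.
 */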
6041 static std::vector<Temp>
6042 get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr)
6043 {
6044
6045 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
6046 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6047 bool is_array = nir_intrinsic_image_array(instr);
6048 ASSERTED bool add_frag_pos =
6049 (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6050 assert(!add_frag_pos && "Input attachments should be lowered.");
6051 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
6052 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
6053 int count = image_type_to_components_count(dim, is_array);
6054 std::vector<Temp> coords(count);
6055 Builder bld(ctx->program, ctx->block);
6056
6057 if (is_ms)
6058 coords[--count] = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1);
6059
6060 if (gfx9_1d) {
6061 coords[0] = emit_extract_vector(ctx, src0, 0, v1);
6062 coords.resize(coords.size() + 1);
6063 coords[1] = bld.copy(bld.def(v1), Operand::zero());
6064 if (is_array)
6065 coords[2] = emit_extract_vector(ctx, src0, 1, v1);
6066 } else {
6067 for (int i = 0; i < count; i++)
6068 coords[i] = emit_extract_vector(ctx, src0, i, v1);
6069 }
6070
6071 if (instr->intrinsic == nir_intrinsic_image_deref_load ||
6072 instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
6073 instr->intrinsic == nir_intrinsic_image_deref_store) {
6074 int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3;
6075 bool level_zero =
6076 nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
6077
6078 if (!level_zero)
6079 coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
6080 }
6081
6082 return coords;
6083 }
6084
6085 memory_sync_info
6086 get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics)
6087 {
6088 /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
6089 if (semantics & semantic_atomicrmw)
6090 return memory_sync_info(storage, semantics);
6091
6092 unsigned access = nir_intrinsic_access(instr);
6093
6094 if (access & ACCESS_VOLATILE)
6095 semantics |= semantic_volatile;
6096 if (access & ACCESS_CAN_REORDER)
6097 semantics |= semantic_can_reorder | semantic_private;
6098
6099 return memory_sync_info(storage, semantics);
6100 }
6101
6102 Operand
6103 emit_tfe_init(Builder& bld, Temp dst)
6104 {
6105 Temp tmp = bld.tmp(dst.regClass());
6106
6107 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
6108 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6109 for (unsigned i = 0; i < dst.size(); i++)
6110 vec->operands[i] = Operand::zero();
6111 vec->definitions[0] = Definition(tmp);
6112 /* Since this is fixed to an instruction's definition register, any CSE will
6113 * just create copies. Copying costs about the same as zero-initialization,
6114 * but these copies can break up clauses.
6115 */
6116 vec->definitions[0].setNoCSE(true);
6117 bld.insert(std::move(vec));
6118
6119 return Operand(tmp);
6120 }
6121
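/* The dmask is derived from the components that are actually read. For
 * 64-bit images only R64_UINT/R64_SINT exist, so component x is loaded into
 * xy of the result and w into zw; sparse loads carry one extra residency
 * dword (TFE).
 */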
6122 void
6123 visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr)
6124 {
6125 Builder bld(ctx->program, ctx->block);
6126 const nir_variable* var =
6127 nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6128 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6129 bool is_array = nir_intrinsic_image_array(instr);
6130 bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;
6131 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6132
6133 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6134 unsigned access = var->data.access | nir_intrinsic_access(instr);
6135
6136 unsigned result_size = instr->dest.ssa.num_components - is_sparse;
6137 unsigned expand_mask =
6138 nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size);
6139 expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */
6140 if (dim == GLSL_SAMPLER_DIM_BUF)
6141 expand_mask = (1u << util_last_bit(expand_mask)) - 1u;
6142 unsigned dmask = expand_mask;
6143 if (instr->dest.ssa.bit_size == 64) {
6144 expand_mask &= 0x9;
6145 /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */
6146 dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 0xc : 0);
6147 }
6148 if (is_sparse)
6149 expand_mask |= 1 << result_size;
6150 unsigned num_components = util_bitcount(dmask) + is_sparse;
6151
6152 Temp tmp;
6153 if (num_components == dst.size() && dst.type() == RegType::vgpr)
6154 tmp = dst;
6155 else
6156 tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components));
6157
6158 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6159 dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE,
6160 nullptr, false);
6161
6162 if (dim == GLSL_SAMPLER_DIM_BUF) {
6163 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6164
6165 aco_opcode opcode;
6166 switch (util_bitcount(dmask)) {
6167 case 1: opcode = aco_opcode::buffer_load_format_x; break;
6168 case 2: opcode = aco_opcode::buffer_load_format_xy; break;
6169 case 3: opcode = aco_opcode::buffer_load_format_xyz; break;
6170 case 4: opcode = aco_opcode::buffer_load_format_xyzw; break;
6171 default: unreachable(">4 channel buffer image load");
6172 }
6173 aco_ptr<MUBUF_instruction> load{
6174 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)};
6175 load->operands[0] = Operand(resource);
6176 load->operands[1] = Operand(vindex);
6177 load->operands[2] = Operand::c32(0);
6178 load->definitions[0] = Definition(tmp);
6179 load->idxen = true;
6180 load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6181 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6182 load->sync = sync;
6183 load->tfe = is_sparse;
6184 if (load->tfe)
6185 load->operands[3] = emit_tfe_init(bld, tmp);
6186 ctx->block->instructions.emplace_back(std::move(load));
6187 } else {
6188 std::vector<Temp> coords = get_image_coords(ctx, instr);
6189
6190 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
6191 aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
6192
6193 Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1);
6194 MIMG_instruction* load =
6195 emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata);
6196 load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
6197 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
6198 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6199 load->dmask = dmask;
6200 load->unrm = true;
6201 load->da = should_declare_array(ctx, dim, is_array);
6202 load->sync = sync;
6203 load->tfe = is_sparse;
6204 }
6205
6206 if (is_sparse && instr->dest.ssa.bit_size == 64) {
6207 /* The result components are 64-bit but the sparse residency code is
6208 * 32-bit. So add a zero to the end so expand_vector() works correctly.
6209 */
6210 tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp,
6211 Operand::zero());
6212 }
6213
6214 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask);
6215 }
6216
6217 void
6218 visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
6219 {
6220 const nir_variable* var =
6221 nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6222 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6223 bool is_array = nir_intrinsic_image_array(instr);
6224 Temp data = get_ssa_temp(ctx, instr->src[3].ssa);
6225
6226 /* only R64_UINT and R64_SINT supported */
6227 if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8)
6228 data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2));
6229 data = as_vgpr(ctx, data);
6230
6231 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
6232 unsigned access = var->data.access | nir_intrinsic_access(instr);
6233 bool glc = ctx->options->chip_class == GFX6 ||
6234 access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE)
6235 ? 1
6236 : 0;
6237
6238 if (dim == GLSL_SAMPLER_DIM_BUF) {
6239 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6240 ACO_DESC_BUFFER, nullptr, true);
6241 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6242 aco_opcode opcode;
6243 switch (data.size()) {
6244 case 1: opcode = aco_opcode::buffer_store_format_x; break;
6245 case 2: opcode = aco_opcode::buffer_store_format_xy; break;
6246 case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
6247 case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
6248 default: unreachable(">4 channel buffer image store");
6249 }
6250 aco_ptr<MUBUF_instruction> store{
6251 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
6252 store->operands[0] = Operand(rsrc);
6253 store->operands[1] = Operand(vindex);
6254 store->operands[2] = Operand::c32(0);
6255 store->operands[3] = Operand(data);
6256 store->idxen = true;
6257 store->glc = glc;
6258 store->dlc = false;
6259 store->disable_wqm = true;
6260 store->sync = sync;
6261 ctx->program->needs_exact = true;
6262 ctx->block->instructions.emplace_back(std::move(store));
6263 return;
6264 }
6265
6266 assert(data.type() == RegType::vgpr);
6267 std::vector<Temp> coords = get_image_coords(ctx, instr);
6268 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6269 ACO_DESC_IMAGE, nullptr, true);
6270
6271 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
6272 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
6273
6274 Builder bld(ctx->program, ctx->block);
6275 MIMG_instruction* store =
6276 emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data));
6277 store->glc = glc;
6278 store->dlc = false;
6279 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6280 store->dmask = (1 << data.size()) - 1;
6281 store->unrm = true;
6282 store->da = should_declare_array(ctx, dim, is_array);
6283 store->disable_wqm = true;
6284 store->sync = sync;
6285 ctx->program->needs_exact = true;
6286 return;
6287 }
6288
6289 void
6290 visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6291 {
6292 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6293 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6294 bool is_array = nir_intrinsic_image_array(instr);
6295 Builder bld(ctx->program, ctx->block);
6296
6297 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6298 bool is_64bit = data.bytes() == 8;
6299 assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented.");
6300
6301 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
6302 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2),
6303 get_ssa_temp(ctx, instr->src[4].ssa), data);
6304
6305 aco_opcode buf_op, buf_op64, image_op;
6306 switch (instr->intrinsic) {
6307 case nir_intrinsic_image_deref_atomic_add:
6308 buf_op = aco_opcode::buffer_atomic_add;
6309 buf_op64 = aco_opcode::buffer_atomic_add_x2;
6310 image_op = aco_opcode::image_atomic_add;
6311 break;
6312 case nir_intrinsic_image_deref_atomic_umin:
6313 buf_op = aco_opcode::buffer_atomic_umin;
6314 buf_op64 = aco_opcode::buffer_atomic_umin_x2;
6315 image_op = aco_opcode::image_atomic_umin;
6316 break;
6317 case nir_intrinsic_image_deref_atomic_imin:
6318 buf_op = aco_opcode::buffer_atomic_smin;
6319 buf_op64 = aco_opcode::buffer_atomic_smin_x2;
6320 image_op = aco_opcode::image_atomic_smin;
6321 break;
6322 case nir_intrinsic_image_deref_atomic_umax:
6323 buf_op = aco_opcode::buffer_atomic_umax;
6324 buf_op64 = aco_opcode::buffer_atomic_umax_x2;
6325 image_op = aco_opcode::image_atomic_umax;
6326 break;
6327 case nir_intrinsic_image_deref_atomic_imax:
6328 buf_op = aco_opcode::buffer_atomic_smax;
6329 buf_op64 = aco_opcode::buffer_atomic_smax_x2;
6330 image_op = aco_opcode::image_atomic_smax;
6331 break;
6332 case nir_intrinsic_image_deref_atomic_and:
6333 buf_op = aco_opcode::buffer_atomic_and;
6334 buf_op64 = aco_opcode::buffer_atomic_and_x2;
6335 image_op = aco_opcode::image_atomic_and;
6336 break;
6337 case nir_intrinsic_image_deref_atomic_or:
6338 buf_op = aco_opcode::buffer_atomic_or;
6339 buf_op64 = aco_opcode::buffer_atomic_or_x2;
6340 image_op = aco_opcode::image_atomic_or;
6341 break;
6342 case nir_intrinsic_image_deref_atomic_xor:
6343 buf_op = aco_opcode::buffer_atomic_xor;
6344 buf_op64 = aco_opcode::buffer_atomic_xor_x2;
6345 image_op = aco_opcode::image_atomic_xor;
6346 break;
6347 case nir_intrinsic_image_deref_atomic_exchange:
6348 buf_op = aco_opcode::buffer_atomic_swap;
6349 buf_op64 = aco_opcode::buffer_atomic_swap_x2;
6350 image_op = aco_opcode::image_atomic_swap;
6351 break;
6352 case nir_intrinsic_image_deref_atomic_comp_swap:
6353 buf_op = aco_opcode::buffer_atomic_cmpswap;
6354 buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6355 image_op = aco_opcode::image_atomic_cmpswap;
6356 break;
6357 case nir_intrinsic_image_deref_atomic_fmin:
6358 buf_op = aco_opcode::buffer_atomic_fmin;
6359 buf_op64 = aco_opcode::buffer_atomic_fmin_x2;
6360 image_op = aco_opcode::image_atomic_fmin;
6361 break;
6362 case nir_intrinsic_image_deref_atomic_fmax:
6363 buf_op = aco_opcode::buffer_atomic_fmax;
6364 buf_op64 = aco_opcode::buffer_atomic_fmax_x2;
6365 image_op = aco_opcode::image_atomic_fmax;
6366 break;
6367 default:
6368 unreachable("visit_image_atomic should only be called with "
6369 "nir_intrinsic_image_deref_atomic_* instructions.");
6370 }
6371
6372 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6373 memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
6374
6375 if (dim == GLSL_SAMPLER_DIM_BUF) {
6376 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6377 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6378 ACO_DESC_BUFFER, nullptr, true);
6379 // assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet
6380 // implemented.");
6381 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(
6382 is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6383 mubuf->operands[0] = Operand(resource);
6384 mubuf->operands[1] = Operand(vindex);
6385 mubuf->operands[2] = Operand::c32(0);
6386 mubuf->operands[3] = Operand(data);
6387 if (return_previous)
6388 mubuf->definitions[0] = Definition(dst);
6389 mubuf->offset = 0;
6390 mubuf->idxen = true;
6391 mubuf->glc = return_previous;
6392 mubuf->dlc = false; /* Not needed for atomics */
6393 mubuf->disable_wqm = true;
6394 mubuf->sync = sync;
6395 ctx->program->needs_exact = true;
6396 ctx->block->instructions.emplace_back(std::move(mubuf));
6397 return;
6398 }
6399
6400 std::vector<Temp> coords = get_image_coords(ctx, instr);
6401 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6402 ACO_DESC_IMAGE, nullptr, true);
6403 Definition def = return_previous ? Definition(dst) : Definition();
6404 MIMG_instruction* mimg =
6405 emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data));
6406 mimg->glc = return_previous;
6407 mimg->dlc = false; /* Not needed for atomics */
6408 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6409 mimg->dmask = (1 << data.size()) - 1;
6410 mimg->unrm = true;
6411 mimg->da = should_declare_array(ctx, dim, is_array);
6412 mimg->disable_wqm = true;
6413 mimg->sync = sync;
6414 ctx->program->needs_exact = true;
6415 return;
6416 }
6417
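/* Computes the buffer size in elements (num_records / stride). On GFX8 the
 * stride is 1, 2, 4, 8, 12 or 16, so the division becomes a right shift by
 * the index of the stride's lowest set bit, with stride 12 first divided by
 * 3 as mul_hi(size, 0xaaaaaaab) >> 1. Other chips return dword 2
 * (num_records) of the descriptor directly.
 */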
6418 void
6419 get_buffer_size(isel_context* ctx, Temp desc, Temp dst)
6420 {
6421 if (ctx->options->chip_class == GFX8) {
6422 /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
6423 Builder bld(ctx->program, ctx->block);
6424
6425 Temp size = emit_extract_vector(ctx, desc, 2, s1);
6426
6427 Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1),
6428 bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size);
6429 size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
6430 bld.as_uniform(size_div3), Operand::c32(1u));
6431
6432 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6433 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride,
6434 Operand::c32((5u << 16) | 16u));
6435
6436 Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u));
6437 size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
6438
6439 Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
6440 bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size,
6441 bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
6442 if (dst.type() == RegType::vgpr)
6443 bld.copy(Definition(dst), shr_dst);
6444
6445 /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
6446 } else {
6447 emit_extract_vector(ctx, desc, 2, dst);
6448 }
6449 }
6450
6451 void
6452 visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr)
6453 {
6454 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
6455 bool is_array = nir_intrinsic_image_array(instr);
6456 Builder bld(ctx->program, ctx->block);
6457
6458 if (dim == GLSL_SAMPLER_DIM_BUF) {
6459 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6460 ACO_DESC_BUFFER, NULL, false);
6461 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa));
6462 }
6463
6464 /* LOD */
6465 assert(nir_src_as_uint(instr->src[1]) == 0);
6466 std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())};
6467
6468 /* Resource */
6469 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6470 ACO_DESC_IMAGE, NULL, false);
6471
6472 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6473
6474 MIMG_instruction* mimg =
6475 emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod);
6476 uint8_t& dmask = mimg->dmask;
6477 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6478 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6479 mimg->da = is_array;
6480
6481 if (ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) {
6482 assert(instr->dest.ssa.num_components == 2);
6483 dmask = 0x5;
6484 }
6485
6486 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6487 }
6488
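/* Reads the sample count from dword 3 of the image descriptor: bits [16:20)
 * hold log2(samples) and bits [28:32) hold the resource TYPE. MSAA types
 * (>= 14) report 1 << log2(samples); otherwise the result is 1, or 0 for a
 * null descriptor when robust buffer access is enabled.
 */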
6489 void
6490 get_image_samples(isel_context* ctx, Definition dst, Temp resource)
6491 {
6492 Builder bld(ctx->program, ctx->block);
6493
6494 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6495 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
6496 Operand::c32(16u | 4u << 16));
6497 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u),
6498 samples_log2);
6499 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3,
6500 Operand::c32(28u | 4u << 16 /* offset=28, width=4 */));
6501
6502 Operand default_sample = Operand::c32(1u);
6503 if (ctx->options->robust_buffer_access) {
6504 /* Extract the second dword of the descriptor; if it's
6505 * all zero, then it's a null descriptor.
6506 */
6507 Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
6508 Temp is_non_null_descriptor =
6509 bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero());
6510 default_sample = Operand(is_non_null_descriptor);
6511 }
6512
6513 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u));
6514 bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa));
6515 }
6516
6517 void
6518 visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr)
6519 {
6520 Builder bld(ctx->program, ctx->block);
6521 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6522 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr),
6523 ACO_DESC_IMAGE, NULL, false);
6524 get_image_samples(ctx, Definition(dst), resource);
6525 }
6526
6527 void
6528 visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6529 {
6530 Builder bld(ctx->program, ctx->block);
6531 unsigned num_components = instr->num_components;
6532
6533 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6534 Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6535
6536 unsigned access = nir_intrinsic_access(instr);
6537 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6538 unsigned size = instr->dest.ssa.bit_size / 8;
6539
6540 bool allow_smem = access & ACCESS_CAN_REORDER;
6541
6542 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6543 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
6544 get_memory_sync_info(instr, storage_buffer, 0));
6545 }
6546
6547 void
6548 visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6549 {
6550 Builder bld(ctx->program, ctx->block);
6551 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6552 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6553 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6554 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6555
6556 Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6557
6558 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6559 bool glc =
6560 nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6561
6562 unsigned write_count = 0;
6563 Temp write_datas[32];
6564 unsigned offsets[32];
6565 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6566 write_datas, offsets);
6567
6568 /* GFX6-7 are affected by a hw bug that prevents address clamping from working
6569 * correctly when the SGPR offset is used.
6570 */
6571 if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8)
6572 offset = as_vgpr(ctx, offset);
6573
6574 for (unsigned i = 0; i < write_count; i++) {
6575 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6576
6577 aco_ptr<MUBUF_instruction> store{
6578 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6579 store->operands[0] = Operand(rsrc);
6580 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6581 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6582 store->operands[3] = Operand(write_datas[i]);
6583 store->offset = offsets[i];
6584 store->offen = (offset.type() == RegType::vgpr);
6585 store->glc = glc;
6586 store->dlc = false;
6587 store->disable_wqm = true;
6588 store->sync = sync;
6589 ctx->program->needs_exact = true;
6590 ctx->block->instructions.emplace_back(std::move(store));
6591 }
6592 }
6593
6594 void
6595 visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr)
6596 {
6597 Builder bld(ctx->program, ctx->block);
6598 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6599 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6600
6601 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6602 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6603 get_ssa_temp(ctx, instr->src[3].ssa), data);
6604
6605 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6606 Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6607
6608 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6609
6610 aco_opcode op32, op64;
6611 switch (instr->intrinsic) {
6612 case nir_intrinsic_ssbo_atomic_add:
6613 op32 = aco_opcode::buffer_atomic_add;
6614 op64 = aco_opcode::buffer_atomic_add_x2;
6615 break;
6616 case nir_intrinsic_ssbo_atomic_imin:
6617 op32 = aco_opcode::buffer_atomic_smin;
6618 op64 = aco_opcode::buffer_atomic_smin_x2;
6619 break;
6620 case nir_intrinsic_ssbo_atomic_umin:
6621 op32 = aco_opcode::buffer_atomic_umin;
6622 op64 = aco_opcode::buffer_atomic_umin_x2;
6623 break;
6624 case nir_intrinsic_ssbo_atomic_imax:
6625 op32 = aco_opcode::buffer_atomic_smax;
6626 op64 = aco_opcode::buffer_atomic_smax_x2;
6627 break;
6628 case nir_intrinsic_ssbo_atomic_umax:
6629 op32 = aco_opcode::buffer_atomic_umax;
6630 op64 = aco_opcode::buffer_atomic_umax_x2;
6631 break;
6632 case nir_intrinsic_ssbo_atomic_and:
6633 op32 = aco_opcode::buffer_atomic_and;
6634 op64 = aco_opcode::buffer_atomic_and_x2;
6635 break;
6636 case nir_intrinsic_ssbo_atomic_or:
6637 op32 = aco_opcode::buffer_atomic_or;
6638 op64 = aco_opcode::buffer_atomic_or_x2;
6639 break;
6640 case nir_intrinsic_ssbo_atomic_xor:
6641 op32 = aco_opcode::buffer_atomic_xor;
6642 op64 = aco_opcode::buffer_atomic_xor_x2;
6643 break;
6644 case nir_intrinsic_ssbo_atomic_exchange:
6645 op32 = aco_opcode::buffer_atomic_swap;
6646 op64 = aco_opcode::buffer_atomic_swap_x2;
6647 break;
6648 case nir_intrinsic_ssbo_atomic_comp_swap:
6649 op32 = aco_opcode::buffer_atomic_cmpswap;
6650 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6651 break;
6652 case nir_intrinsic_ssbo_atomic_fmin:
6653 op32 = aco_opcode::buffer_atomic_fmin;
6654 op64 = aco_opcode::buffer_atomic_fmin_x2;
6655 break;
6656 case nir_intrinsic_ssbo_atomic_fmax:
6657 op32 = aco_opcode::buffer_atomic_fmax;
6658 op64 = aco_opcode::buffer_atomic_fmax_x2;
6659 break;
6660 default:
6661 unreachable(
6662 "visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6663 }
6664 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6665 aco_ptr<MUBUF_instruction> mubuf{
6666 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6667 mubuf->operands[0] = Operand(rsrc);
6668 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6669 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0);
6670 mubuf->operands[3] = Operand(data);
6671 if (return_previous)
6672 mubuf->definitions[0] = Definition(dst);
6673 mubuf->offset = 0;
6674 mubuf->offen = (offset.type() == RegType::vgpr);
6675 mubuf->glc = return_previous;
6676 mubuf->dlc = false; /* Not needed for atomics */
6677 mubuf->disable_wqm = true;
6678 mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6679 ctx->program->needs_exact = true;
6680 ctx->block->instructions.emplace_back(std::move(mubuf));
6681 }
6682
6683 void
6684 visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr)
6685 {
6686
6687 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
6688 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6689 bool non_uniform = dst.type() == RegType::vgpr;
6690
6691 Builder bld(ctx->program, ctx->block);
6692 if (non_uniform) {
6693 Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1));
6694 Temp binding = emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1));
6695 Temp index = bld.vadd32(bld.def(v1), set_ptr, binding);
6696 index = convert_pointer_to_64_bit(ctx, index, non_uniform);
6697
6698 LoadEmitInfo info = {Operand(index), dst, 1, 4};
6699 info.align_mul = 4;
6700 info.const_offset = 8;
6701 emit_load(ctx, bld, info, global_load_params);
6702 } else {
6703 emit_extract_vector(ctx, load_buffer_rsrc(ctx, rsrc), 2, dst);
6704 }
6705 }
6706
6707 void
6708 visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr)
6709 {
6710 Builder bld(ctx->program, ctx->block);
6711 unsigned num_components = instr->num_components;
6712 unsigned component_size = instr->dest.ssa.bit_size / 8;
6713
6714 LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
6715 get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size};
6716 info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6717 info.align_mul = nir_intrinsic_align_mul(instr);
6718 info.align_offset = nir_intrinsic_align_offset(instr);
6719 info.sync = get_memory_sync_info(instr, storage_buffer, 0);
6720 /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6721 * it's safe to use SMEM */
6722 bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
6723 if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) ||
6724 !can_use_smem) {
6725 emit_load(ctx, bld, info, global_load_params);
6726 } else {
6727 info.offset = Operand(bld.as_uniform(info.offset));
6728 emit_load(ctx, bld, info, smem_load_params);
6729 }
6730 }
6731
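/* Global stores are split into chunks of at most 16 bytes. On GFX7+ they are
 * emitted as FLAT/GLOBAL stores; since FLAT only gains an immediate offset on
 * GFX9, a non-zero chunk offset is added to the 64-bit address manually on
 * earlier chips. On GFX6 the store goes through MUBUF with addr64 and an
 * ad-hoc resource descriptor.
 */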
6732 void
6733 visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr)
6734 {
6735 Builder bld(ctx->program, ctx->block);
6736 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6737 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6738
6739 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6740 Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6741 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
6742 bool glc =
6743 nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6744
6745 if (ctx->options->chip_class >= GFX7)
6746 addr = as_vgpr(ctx, addr);
6747
6748 unsigned write_count = 0;
6749 Temp write_datas[32];
6750 unsigned offsets[32];
6751 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count,
6752 write_datas, offsets);
6753
6754 for (unsigned i = 0; i < write_count; i++) {
6755 if (ctx->options->chip_class >= GFX7) {
6756 unsigned offset = offsets[i];
6757 Temp store_addr = addr;
6758 if (offset > 0 && ctx->options->chip_class < GFX9) {
6759 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6760 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6761 Temp carry = bld.tmp(bld.lm);
6762 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6763
6764 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0),
6765 bld.hint_vcc(Definition(carry)), Operand::c32(offset), addr0);
6766 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6767 Operand::zero(), addr1, carry)
6768 .def(1)
6769 .setHint(vcc);
6770
6771 store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6772
6773 offset = 0;
6774 }
6775
6776 bool global = ctx->options->chip_class >= GFX9;
6777 aco_opcode op;
6778 switch (write_datas[i].bytes()) {
6779 case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break;
6780 case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break;
6781 case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break;
6782 case 8:
6783 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6784 break;
6785 case 12:
6786 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6787 break;
6788 case 16:
6789 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6790 break;
6791 default: unreachable("store_global not implemented for this size.");
6792 }
6793
6794 aco_ptr<FLAT_instruction> flat{
6795 create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6796 flat->operands[0] = Operand(store_addr);
6797 flat->operands[1] = Operand(s1);
6798 flat->operands[2] = Operand(write_datas[i]);
6799 flat->glc = glc;
6800 flat->dlc = false;
6801 flat->offset = offset;
6802 flat->disable_wqm = true;
6803 flat->sync = sync;
6804 ctx->program->needs_exact = true;
6805 ctx->block->instructions.emplace_back(std::move(flat));
6806 } else {
6807 assert(ctx->options->chip_class == GFX6);
6808
6809 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
6810
6811 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6812
6813 aco_ptr<MUBUF_instruction> mubuf{
6814 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6815 mubuf->operands[0] = Operand(rsrc);
6816 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6817 mubuf->operands[2] = Operand::zero();
6818 mubuf->operands[3] = Operand(write_datas[i]);
6819 mubuf->glc = glc;
6820 mubuf->dlc = false;
6821 mubuf->offset = offsets[i];
6822 mubuf->addr64 = addr.type() == RegType::vgpr;
6823 mubuf->disable_wqm = true;
6824 mubuf->sync = sync;
6825 ctx->program->needs_exact = true;
6826 ctx->block->instructions.emplace_back(std::move(mubuf));
6827 }
6828 }
6829 }
6830
6831 void
6832 visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
6833 {
6834 Builder bld(ctx->program, ctx->block);
6835 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
6836 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6837 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6838
6839 if (ctx->options->chip_class >= GFX7)
6840 addr = as_vgpr(ctx, addr);
6841
6842 if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6843 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6844 get_ssa_temp(ctx, instr->src[2].ssa), data);
6845
6846 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6847
6848 aco_opcode op32, op64;
6849
6850 if (ctx->options->chip_class >= GFX7) {
6851 bool global = ctx->options->chip_class >= GFX9;
6852 switch (instr->intrinsic) {
6853 case nir_intrinsic_global_atomic_add:
6854 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6855 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6856 break;
6857 case nir_intrinsic_global_atomic_imin:
6858 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6859 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6860 break;
6861 case nir_intrinsic_global_atomic_umin:
6862 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6863 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6864 break;
6865 case nir_intrinsic_global_atomic_imax:
6866 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6867 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6868 break;
6869 case nir_intrinsic_global_atomic_umax:
6870 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6871 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6872 break;
6873 case nir_intrinsic_global_atomic_and:
6874 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6875 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6876 break;
6877 case nir_intrinsic_global_atomic_or:
6878 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6879 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6880 break;
6881 case nir_intrinsic_global_atomic_xor:
6882 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6883 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6884 break;
6885 case nir_intrinsic_global_atomic_exchange:
6886 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6887 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6888 break;
6889 case nir_intrinsic_global_atomic_comp_swap:
6890 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6891 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6892 break;
6893 case nir_intrinsic_global_atomic_fmin:
6894 op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin;
6895 op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2;
6896 break;
6897 case nir_intrinsic_global_atomic_fmax:
6898 op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax;
6899 op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2;
6900 break;
6901 default:
6902 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6903 "instructions.");
6904 }
6905
6906 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6907 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(
6908 op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6909 flat->operands[0] = Operand(addr);
6910 flat->operands[1] = Operand(s1);
6911 flat->operands[2] = Operand(data);
6912 if (return_previous)
6913 flat->definitions[0] = Definition(dst);
6914 flat->glc = return_previous;
6915 flat->dlc = false; /* Not needed for atomics */
6916 flat->offset = 0;
6917 flat->disable_wqm = true;
6918 flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6919 ctx->program->needs_exact = true;
6920 ctx->block->instructions.emplace_back(std::move(flat));
6921 } else {
6922 assert(ctx->options->chip_class == GFX6);
6923
6924 switch (instr->intrinsic) {
6925 case nir_intrinsic_global_atomic_add:
6926 op32 = aco_opcode::buffer_atomic_add;
6927 op64 = aco_opcode::buffer_atomic_add_x2;
6928 break;
6929 case nir_intrinsic_global_atomic_imin:
6930 op32 = aco_opcode::buffer_atomic_smin;
6931 op64 = aco_opcode::buffer_atomic_smin_x2;
6932 break;
6933 case nir_intrinsic_global_atomic_umin:
6934 op32 = aco_opcode::buffer_atomic_umin;
6935 op64 = aco_opcode::buffer_atomic_umin_x2;
6936 break;
6937 case nir_intrinsic_global_atomic_imax:
6938 op32 = aco_opcode::buffer_atomic_smax;
6939 op64 = aco_opcode::buffer_atomic_smax_x2;
6940 break;
6941 case nir_intrinsic_global_atomic_umax:
6942 op32 = aco_opcode::buffer_atomic_umax;
6943 op64 = aco_opcode::buffer_atomic_umax_x2;
6944 break;
6945 case nir_intrinsic_global_atomic_and:
6946 op32 = aco_opcode::buffer_atomic_and;
6947 op64 = aco_opcode::buffer_atomic_and_x2;
6948 break;
6949 case nir_intrinsic_global_atomic_or:
6950 op32 = aco_opcode::buffer_atomic_or;
6951 op64 = aco_opcode::buffer_atomic_or_x2;
6952 break;
6953 case nir_intrinsic_global_atomic_xor:
6954 op32 = aco_opcode::buffer_atomic_xor;
6955 op64 = aco_opcode::buffer_atomic_xor_x2;
6956 break;
6957 case nir_intrinsic_global_atomic_exchange:
6958 op32 = aco_opcode::buffer_atomic_swap;
6959 op64 = aco_opcode::buffer_atomic_swap_x2;
6960 break;
6961 case nir_intrinsic_global_atomic_comp_swap:
6962 op32 = aco_opcode::buffer_atomic_cmpswap;
6963 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6964 break;
6965 case nir_intrinsic_global_atomic_fmin:
6966 op32 = aco_opcode::buffer_atomic_fmin;
6967 op64 = aco_opcode::buffer_atomic_fmin_x2;
6968 break;
6969 case nir_intrinsic_global_atomic_fmax:
6970 op32 = aco_opcode::buffer_atomic_fmax;
6971 op64 = aco_opcode::buffer_atomic_fmax_x2;
6972 break;
6973 default:
6974 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* "
6975 "instructions.");
6976 }
6977
6978 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6979
6980 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6981
6982 aco_ptr<MUBUF_instruction> mubuf{
6983 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6984 mubuf->operands[0] = Operand(rsrc);
6985 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6986 mubuf->operands[2] = Operand::zero();
6987 mubuf->operands[3] = Operand(data);
6988 if (return_previous)
6989 mubuf->definitions[0] = Definition(dst);
6990 mubuf->glc = return_previous;
6991 mubuf->dlc = false;
6992 mubuf->offset = 0;
6993 mubuf->addr64 = addr.type() == RegType::vgpr;
6994 mubuf->disable_wqm = true;
6995 mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
6996 ctx->program->needs_exact = true;
6997 ctx->block->instructions.emplace_back(std::move(mubuf));
6998 }
6999 }
7000
7001 void
7002 visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7003 {
7004 Builder bld(ctx->program, ctx->block);
7005
7006 Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa);
7007 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa));
7008 Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa));
7009 Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa));
7010
7011 bool swizzled = nir_intrinsic_is_swizzled(intrin);
7012 bool reorder = nir_intrinsic_can_reorder(intrin);
7013 bool slc = nir_intrinsic_slc_amd(intrin);
7014
7015 unsigned const_offset = nir_intrinsic_base(intrin);
7016 unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u;
7017 unsigned num_components = intrin->dest.ssa.num_components;
7018 unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0;
7019
7020 load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7021 num_components, swizzle_element_size, !swizzled, reorder, slc);
7022 }
7023
7024 void
7025 visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin)
7026 {
7027 Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa);
7028 Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa);
7029 Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa);
7030 Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa);
7031
7032 bool swizzled = nir_intrinsic_is_swizzled(intrin);
7033 bool slc = nir_intrinsic_slc_amd(intrin);
7034
7035 unsigned const_offset = nir_intrinsic_base(intrin);
7036 unsigned write_mask = nir_intrinsic_write_mask(intrin);
7037 unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u;
7038
7039 nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin);
7040 memory_sync_info sync(mem_mode == nir_var_shader_out ? storage_vmem_output : storage_none);
7041
7042 store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes,
7043 write_mask, !swizzled, sync, slc);
7044 }
7045
7046 sync_scope
7047 translate_nir_scope(nir_scope scope)
7048 {
7049 switch (scope) {
7050 case NIR_SCOPE_NONE:
7051 case NIR_SCOPE_INVOCATION: return scope_invocation;
7052 case NIR_SCOPE_SUBGROUP: return scope_subgroup;
7053 case NIR_SCOPE_WORKGROUP: return scope_workgroup;
7054 case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily;
7055 case NIR_SCOPE_DEVICE: return scope_device;
7056 case NIR_SCOPE_SHADER_CALL: return scope_invocation;
7057 }
7058 unreachable("invalid scope");
7059 }
7060
7061 void
7062 emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr)
7063 {
7064 Builder bld(ctx->program, ctx->block);
7065
7066 unsigned semantics = 0;
7067 unsigned storage = 0;
7068 sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
7069 sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
7070
7071 /* We use shared storage for the following:
7072 * - compute shaders expose it in their API
7073 * - when tessellation is used, TCS and VS I/O is lowered to shared memory
7074 * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory
7075 * - additionally, when NGG is used on GFX10+, shared memory is used for certain features
7076 */
7077 bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS ||
7078 ctx->stage.hw == HWStage::HS ||
7079 (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) ||
7080 ctx->stage.hw == HWStage::NGG;
7081
7082 /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half.
7083 * They are allowed in CS, TCS, and in any NGG shader.
7084 */
7085 ASSERTED bool workgroup_scope_allowed =
7086 ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG;
7087
7088 unsigned nir_storage = nir_intrinsic_memory_modes(instr);
7089 if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
7090 storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image
7091 if (shared_storage_used && (nir_storage & nir_var_mem_shared))
7092 storage |= storage_shared;
7093
7094 unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
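/* Both NIR acquire and NIR release are widened to full acquire-release semantics
 * below; this appears to be a deliberate simplification rather than a hardware
 * requirement. */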
7095 if (nir_semantics & NIR_MEMORY_ACQUIRE)
7096 semantics |= semantic_acquire | semantic_release;
7097 if (nir_semantics & NIR_MEMORY_RELEASE)
7098 semantics |= semantic_acquire | semantic_release;
7099
7100 assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
7101 assert(exec_scope != scope_workgroup || workgroup_scope_allowed);
7102
7103 bld.barrier(aco_opcode::p_barrier,
7104 memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
7105 exec_scope);
7106 }
7107
7108 void
7109 visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7110 {
7111 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
7112 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7113 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7114 Builder bld(ctx->program, ctx->block);
7115
7116 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
7117 unsigned num_components = instr->dest.ssa.num_components;
7118 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7119 load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align);
7120 }
7121
7122 void
7123 visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr)
7124 {
7125 unsigned writemask = nir_intrinsic_write_mask(instr);
7126 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
7127 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7128 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7129
7130 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
7131 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
7132 }
7133
7134 void
7135 visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr)
7136 {
7137 unsigned offset = nir_intrinsic_base(instr);
7138 Builder bld(ctx->program, ctx->block);
7139 Operand m = load_lds_size_m0(bld);
7140 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7141 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7142
7143 unsigned num_operands = 3;
7144 aco_opcode op32, op64, op32_rtn, op64_rtn;
7145 switch (instr->intrinsic) {
7146 case nir_intrinsic_shared_atomic_add:
7147 op32 = aco_opcode::ds_add_u32;
7148 op64 = aco_opcode::ds_add_u64;
7149 op32_rtn = aco_opcode::ds_add_rtn_u32;
7150 op64_rtn = aco_opcode::ds_add_rtn_u64;
7151 break;
7152 case nir_intrinsic_shared_atomic_imin:
7153 op32 = aco_opcode::ds_min_i32;
7154 op64 = aco_opcode::ds_min_i64;
7155 op32_rtn = aco_opcode::ds_min_rtn_i32;
7156 op64_rtn = aco_opcode::ds_min_rtn_i64;
7157 break;
7158 case nir_intrinsic_shared_atomic_umin:
7159 op32 = aco_opcode::ds_min_u32;
7160 op64 = aco_opcode::ds_min_u64;
7161 op32_rtn = aco_opcode::ds_min_rtn_u32;
7162 op64_rtn = aco_opcode::ds_min_rtn_u64;
7163 break;
7164 case nir_intrinsic_shared_atomic_imax:
7165 op32 = aco_opcode::ds_max_i32;
7166 op64 = aco_opcode::ds_max_i64;
7167 op32_rtn = aco_opcode::ds_max_rtn_i32;
7168 op64_rtn = aco_opcode::ds_max_rtn_i64;
7169 break;
7170 case nir_intrinsic_shared_atomic_umax:
7171 op32 = aco_opcode::ds_max_u32;
7172 op64 = aco_opcode::ds_max_u64;
7173 op32_rtn = aco_opcode::ds_max_rtn_u32;
7174 op64_rtn = aco_opcode::ds_max_rtn_u64;
7175 break;
7176 case nir_intrinsic_shared_atomic_and:
7177 op32 = aco_opcode::ds_and_b32;
7178 op64 = aco_opcode::ds_and_b64;
7179 op32_rtn = aco_opcode::ds_and_rtn_b32;
7180 op64_rtn = aco_opcode::ds_and_rtn_b64;
7181 break;
7182 case nir_intrinsic_shared_atomic_or:
7183 op32 = aco_opcode::ds_or_b32;
7184 op64 = aco_opcode::ds_or_b64;
7185 op32_rtn = aco_opcode::ds_or_rtn_b32;
7186 op64_rtn = aco_opcode::ds_or_rtn_b64;
7187 break;
7188 case nir_intrinsic_shared_atomic_xor:
7189 op32 = aco_opcode::ds_xor_b32;
7190 op64 = aco_opcode::ds_xor_b64;
7191 op32_rtn = aco_opcode::ds_xor_rtn_b32;
7192 op64_rtn = aco_opcode::ds_xor_rtn_b64;
7193 break;
7194 case nir_intrinsic_shared_atomic_exchange:
7195 op32 = aco_opcode::ds_write_b32;
7196 op64 = aco_opcode::ds_write_b64;
7197 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
7198 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
7199 break;
7200 case nir_intrinsic_shared_atomic_comp_swap:
7201 op32 = aco_opcode::ds_cmpst_b32;
7202 op64 = aco_opcode::ds_cmpst_b64;
7203 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
7204 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
7205 num_operands = 4;
7206 break;
7207 case nir_intrinsic_shared_atomic_fadd:
7208 op32 = aco_opcode::ds_add_f32;
7209 op32_rtn = aco_opcode::ds_add_rtn_f32;
7210 op64 = aco_opcode::num_opcodes;
7211 op64_rtn = aco_opcode::num_opcodes;
7212 break;
7213 case nir_intrinsic_shared_atomic_fmin:
7214 op32 = aco_opcode::ds_min_f32;
7215 op32_rtn = aco_opcode::ds_min_rtn_f32;
7216 op64 = aco_opcode::ds_min_f64;
7217 op64_rtn = aco_opcode::ds_min_rtn_f64;
7218 break;
7219 case nir_intrinsic_shared_atomic_fmax:
7220 op32 = aco_opcode::ds_max_f32;
7221 op32_rtn = aco_opcode::ds_max_rtn_f32;
7222 op64 = aco_opcode::ds_max_f64;
7223 op64_rtn = aco_opcode::ds_max_rtn_f64;
7224 break;
7225 default: unreachable("Unhandled shared atomic intrinsic");
7226 }
7227
7228 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa);
7229
7230 aco_opcode op;
7231 if (data.size() == 1) {
7232 assert(instr->dest.ssa.bit_size == 32);
7233 op = return_previous ? op32_rtn : op32;
7234 } else {
7235 assert(instr->dest.ssa.bit_size == 64);
7236 op = return_previous ? op64_rtn : op64;
7237 }
7238
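/* DS instructions only have a 16-bit constant offset (hence the 65535 check), so
 * larger constant offsets are added into the VGPR address instead. */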
7239 if (offset > 65535) {
7240 address = bld.vadd32(bld.def(v1), Operand::c32(offset), address);
7241 offset = 0;
7242 }
7243
7244 aco_ptr<DS_instruction> ds;
7245 ds.reset(
7246 create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
7247 ds->operands[0] = Operand(address);
7248 ds->operands[1] = Operand(data);
7249 if (num_operands == 4) {
7250 Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
7251 ds->operands[2] = Operand(data2);
7252 }
7253 ds->operands[num_operands - 1] = m;
7254 ds->offset0 = offset;
7255 if (return_previous)
7256 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
7257 ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
7258
7259 if (m.isUndefined())
7260 ds->operands.pop_back();
7261
7262 ctx->block->instructions.emplace_back(std::move(ds));
7263 }
7264
7265 Temp
7266 get_scratch_resource(isel_context* ctx)
7267 {
7268 Builder bld(ctx->program, ctx->block);
7269 Temp scratch_addr = ctx->program->private_segment_buffer;
7270 if (ctx->stage != compute_cs)
7271 scratch_addr =
7272 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
7273
7274 uint32_t rsrc_conf =
7275 S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
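/* ADD_TID_ENABLE together with an index stride equal to the wave size (encoding
 * 3 = 64 lanes, 2 = 32 lanes, assumed from the register spec) makes the buffer
 * unit interleave scratch so that each lane gets its own slot. */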
7276
7277 if (ctx->program->chip_class >= GFX10) {
7278 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) |
7279 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1);
7280 } else if (ctx->program->chip_class <=
7281 GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
7282 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7283 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7284 }
7285
7286 /* Older generations need an element size of 4 bytes; the element size field was removed in GFX9. */
7287 if (ctx->program->chip_class <= GFX8)
7288 rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
7289
7290 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u),
7291 Operand::c32(rsrc_conf));
7292 }
7293
7294 void
7295 visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7296 {
7297 Builder bld(ctx->program, ctx->block);
7298 Temp rsrc = get_scratch_resource(ctx);
7299 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7300 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7301
7302 LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
7303 instr->dest.ssa.bit_size / 8u, rsrc};
7304 info.align_mul = nir_intrinsic_align_mul(instr);
7305 info.align_offset = nir_intrinsic_align_offset(instr);
7306 info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
7307 info.sync = memory_sync_info(storage_scratch, semantic_private);
7308 info.soffset = ctx->program->scratch_offset;
7309 emit_load(ctx, bld, info, scratch_load_params);
7310 }
7311
7312 void
7313 visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
7314 {
7315 Builder bld(ctx->program, ctx->block);
7316 Temp rsrc = get_scratch_resource(ctx);
7317 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7318 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
7319
7320 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
7321 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
7322
7323 unsigned write_count = 0;
7324 Temp write_datas[32];
7325 unsigned offsets[32];
7326 unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
7327 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size,
7328 &write_count, write_datas, offsets);
7329
7330 for (unsigned i = 0; i < write_count; i++) {
7331 aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
7332 Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i],
7333 offsets[i], true, true);
7334 mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private);
7335 }
7336 }
7337
7338 void
7339 visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr)
7340 {
7341 uint8_t log2_ps_iter_samples;
7342 if (ctx->program->info->ps.uses_sample_shading) {
7343 log2_ps_iter_samples = util_logbase2(ctx->options->key.ps.num_samples);
7344 } else {
7345 log2_ps_iter_samples = ctx->options->key.ps.log2_ps_iter_samples;
7346 }
7347
7348 Builder bld(ctx->program, ctx->block);
7349
7350 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7351
7352 if (log2_ps_iter_samples) {
7353 /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). */
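/* The sample ID lives in bits [11:8] of the ancillary VGPR, hence the
 * v_bfe_u32(ancillary, 8, 4) below. */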
7354 Temp sample_id =
7355 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary),
7356 Operand::c32(8u), Operand::c32(4u));
7357 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id,
7358 bld.copy(bld.def(v1), Operand::c32(1u)));
7359 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask,
7360 get_arg(ctx, ctx->args->ac.sample_coverage));
7361 } else {
7362 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage));
7363 }
7364 }
7365
7366 void
7367 visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr)
7368 {
7369 Builder bld(ctx->program, ctx->block);
7370
7371 unsigned stream = nir_intrinsic_stream_id(instr);
7372 Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7373 next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
7374 nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]);
7375
7376 /* get GSVS ring */
7377 Temp gsvs_ring =
7378 bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer,
7379 Operand::c32(RING_GSVS_GS * 16u));
7380
7381 unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream];
7382
7383 unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
7384 unsigned stream_offset = 0;
7385 for (unsigned i = 0; i < stream; i++) {
7386 unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] *
7387 ctx->shader->info.gs.vertices_out;
7388 stream_offset += prev_stride * ctx->program->wave_size;
7389 }
7390
7391 /* Limit on the stride field for <= GFX7. */
7392 assert(stride < (1 << 14));
7393
7394 Temp gsvs_dwords[4];
7395 for (unsigned i = 0; i < 4; i++)
7396 gsvs_dwords[i] = bld.tmp(s1);
7397 bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]),
7398 Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring);
7399
7400 if (stream_offset) {
7401 Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset));
7402
7403 Temp carry = bld.tmp(s1);
7404 gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
7405 gsvs_dwords[0], stream_offset_tmp);
7406 gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
7407 gsvs_dwords[1], Operand::zero(), bld.scc(carry));
7408 }
7409
7410 gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1],
7411 Operand::c32(S_008F04_STRIDE(stride)));
7412 gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size));
7413
7414 gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1],
7415 gsvs_dwords[2], gsvs_dwords[3]);
7416
7417 unsigned offset = 0;
7418 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
7419 if (ctx->program->info->gs.output_streams[i] != stream)
7420 continue;
7421
7422 for (unsigned j = 0; j < 4; j++) {
7423 if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
7424 continue;
7425
7426 if (ctx->outputs.mask[i] & (1 << j)) {
7427 Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
7428 unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
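/* The MTBUF immediate offset is limited to 12 bits (< 4096), so the 4 KiB-aligned
 * part of larger offsets is moved into the VGPR address and only the remainder
 * stays in the immediate. */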
7429 if (const_offset >= 4096u) {
7430 if (vaddr_offset.isUndefined())
7431 vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u));
7432 else
7433 vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u),
7434 vaddr_offset);
7435 const_offset %= 4096u;
7436 }
7437
7438 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(
7439 aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
7440 mtbuf->operands[0] = Operand(gsvs_ring);
7441 mtbuf->operands[1] = vaddr_offset;
7442 mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset));
7443 mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7444 mtbuf->offen = !vaddr_offset.isUndefined();
7445 mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
7446 mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
7447 mtbuf->offset = const_offset;
7448 mtbuf->glc = true;
7449 mtbuf->slc = true;
7450 mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
7451 bld.insert(std::move(mtbuf));
7452 }
7453
7454 offset += ctx->shader->info.gs.vertices_out;
7455 }
7456
7457 /* outputs for the next vertex are undefined and keeping them around can
7458 * create invalid IR with control flow */
7459 ctx->outputs.mask[i] = 0;
7460 }
7461
7462 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7463 }
7464
7465 Temp
7466 emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src)
7467 {
7468 Builder bld(ctx->program, ctx->block);
7469
7470 if (cluster_size == 1) {
7471 return src;
7472 }
7473 if (op == nir_op_iand && cluster_size == 4) {
7474 /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */
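/* s_wqm sets all four bits of a quad if any bit in that quad is set, i.e. it is a
 * cluster-of-4 OR on the mask; the AND variant is obtained via De Morgan
 * (invert, OR, invert). */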
7475 Temp tmp =
7476 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7477 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7478 bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7479 } else if (op == nir_op_ior && cluster_size == 4) {
7480 /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */
7481 return bld.sop1(
7482 Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7483 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7484 } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7485 /* subgroupAnd(val) -> (exec & ~val) == 0 */
7486 Temp tmp =
7487 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
7488 .def(1)
7489 .getTemp();
7490 Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
7491 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7492 } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7493 /* subgroupOr(val) -> (val & exec) != 0 */
7494 Temp tmp =
7495 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))
7496 .def(1)
7497 .getTemp();
7498 return bool_to_vector_condition(ctx, tmp);
7499 } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7500 /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */
7501 Temp tmp =
7502 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7503 tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7504 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u))
7505 .def(1)
7506 .getTemp();
7507 return bool_to_vector_condition(ctx, tmp);
7508 } else {
7509 /* subgroupClustered{And,Or,Xor}(val, n):
7510 * lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
7511 * cluster_offset = ~(n - 1) & lane_id; cluster_mask = ((1 << n) - 1)
7512 * subgroupClusteredAnd():
7513 * return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7514 * subgroupClusteredOr():
7515 * return ((val & exec) >> cluster_offset) & cluster_mask != 0
7516 * subgroupClusteredXor():
7517 * return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7518 */
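/* Worked example (wave64, n = 8): lane_id = 13 -> cluster_offset = ~7 & 13 = 8 and
 * cluster_mask = 0xff, i.e. this lane's cluster occupies bits [15:8] of the ballot. */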
7519 Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7520 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7521 Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7522
7523 Temp tmp;
7524 if (op == nir_op_iand)
7525 tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7526 Operand(exec, bld.lm));
7527 else
7528 tmp =
7529 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7530
7531 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7532
7533 if (ctx->program->chip_class <= GFX7)
7534 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7535 else if (ctx->program->wave_size == 64)
7536 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7537 else
7538 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7539 tmp = emit_extract_vector(ctx, tmp, 0, v1);
7540 if (cluster_mask != 0xffffffff)
7541 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7542
7543 if (op == nir_op_iand) {
7544 return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::c32(cluster_mask),
7545 tmp);
7546 } else if (op == nir_op_ior) {
7547 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7548 } else if (op == nir_op_ixor) {
7549 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7550 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7551 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7552 }
7553 assert(false);
7554 return Temp();
7555 }
7556 }
7557
7558 Temp
7559 emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7560 {
7561 Builder bld(ctx->program, ctx->block);
7562 assert(src.regClass() == bld.lm);
7563
7564 /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7565 * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7566 * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7567 */
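/* mbcnt() returns, per lane, the number of set bits of the mask in strictly lower
 * lanes, so e.g. subgroupExclusiveOr is true exactly when some lower active lane
 * has the value set. */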
7568 Temp tmp;
7569 if (op == nir_op_iand)
7570 tmp =
7571 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7572 else
7573 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7574
7575 Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7576
7577 if (op == nir_op_iand)
7578 return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7579 else if (op == nir_op_ior)
7580 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7581 else if (op == nir_op_ixor)
7582 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(),
7583 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7584
7585 assert(false);
7586 return Temp();
7587 }
7588
7589 Temp
7590 emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7591 {
7592 Builder bld(ctx->program, ctx->block);
7593
7594 /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7595 * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7596 * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7597 */
7598 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7599 if (op == nir_op_iand)
7600 return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7601 else if (op == nir_op_ior)
7602 return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7603 else if (op == nir_op_ixor)
7604 return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7605
7606 assert(false);
7607 return Temp();
7608 }
7609
7610 ReduceOp
7611 get_reduce_op(nir_op op, unsigned bit_size)
7612 {
7613 switch (op) {
7614 #define CASEI(name) \
7615 case nir_op_##name: \
7616 return (bit_size == 32) ? name##32 \
7617 : (bit_size == 16) ? name##16 \
7618 : (bit_size == 8) ? name##8 \
7619 : name##64;
7620 #define CASEF(name) \
7621 case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64;
7622 CASEI(iadd)
7623 CASEI(imul)
7624 CASEI(imin)
7625 CASEI(umin)
7626 CASEI(imax)
7627 CASEI(umax)
7628 CASEI(iand)
7629 CASEI(ior)
7630 CASEI(ixor)
7631 CASEF(fadd)
7632 CASEF(fmul)
7633 CASEF(fmin)
7634 CASEF(fmax)
7635 default: unreachable("unknown reduction op");
7636 #undef CASEI
7637 #undef CASEF
7638 }
7639 }
7640
7641 void
7642 emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src)
7643 {
7644 Builder bld(ctx->program, ctx->block);
7645 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7646 assert(dst.regClass().type() != RegType::vgpr);
7647 if (src.regClass().type() == RegType::vgpr)
7648 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7649 else
7650 bld.copy(dst, src);
7651 }
7652
7653 void
7654 emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count)
7655 {
7656 Builder bld(ctx->program, ctx->block);
7657 Temp src_tmp = get_ssa_temp(ctx, src.ssa);
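/* For a value that is uniform across the subgroup, iadd/fadd reductions and scans
 * collapse to count * value, and ixor collapses to (count & 1) * value, where
 * `count` is the active-lane count or prefix count supplied by the caller. */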
7658
7659 if (op == nir_op_fadd) {
7660 src_tmp = as_vgpr(ctx, src_tmp);
7661 Temp tmp = dst.regClass() == s1 ? bld.tmp(src_tmp.regClass()) : dst.getTemp();
7662
7663 if (src.ssa->bit_size == 16) {
7664 count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count);
7665 bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp);
7666 } else {
7667 assert(src.ssa->bit_size == 32);
7668 count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count);
7669 bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp);
7670 }
7671
7672 if (tmp != dst.getTemp())
7673 bld.pseudo(aco_opcode::p_as_uniform, dst, tmp);
7674
7675 return;
7676 }
7677
7678 if (dst.regClass() == s1)
7679 src_tmp = bld.as_uniform(src_tmp);
7680
7681 if (op == nir_op_ixor && count.type() == RegType::sgpr)
7682 count =
7683 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u));
7684 else if (op == nir_op_ixor)
7685 count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count);
7686
7687 assert(dst.getTemp().type() == count.type());
7688
7689 if (nir_src_is_const(src)) {
7690 if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2)
7691 bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero());
7692 else if (nir_src_as_uint(src) == 1)
7693 bld.copy(dst, count);
7694 else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2)
7695 bld.vop1(aco_opcode::v_mov_b32, dst, Operand::zero()); /* RA will use SDWA if possible */
7696 else if (nir_src_as_uint(src) == 0)
7697 bld.copy(dst, Operand::zero());
7698 else if (count.type() == RegType::vgpr)
7699 bld.v_mul_imm(dst, count, nir_src_as_uint(src));
7700 else
7701 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7702 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) {
7703 bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count);
7704 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) {
7705 bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count);
7706 } else if (dst.getTemp().type() == RegType::vgpr) {
7707 bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count);
7708 } else {
7709 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count);
7710 }
7711 }
7712
7713 bool
7714 emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr)
7715 {
7716 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7717 if (op == nir_op_imul || op == nir_op_fmul)
7718 return false;
7719
7720 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7721 Builder bld(ctx->program, ctx->block);
7722 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7723 unsigned bit_size = instr->src[0].ssa->bit_size;
7724 if (bit_size > 32)
7725 return false;
7726
7727 Temp thread_count =
7728 bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm));
7729
7730 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count);
7731 } else {
7732 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7733 }
7734
7735 return true;
7736 }
7737
7738 bool
7739 emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr)
7740 {
7741 Builder bld(ctx->program, ctx->block);
7742 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7743 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
7744 bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan;
7745
7746 if (op == nir_op_imul || op == nir_op_fmul)
7747 return false;
7748
7749 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) {
7750 if (instr->src[0].ssa->bit_size > 32)
7751 return false;
7752
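/* The packed thread id is the number of active lanes below this one (mbcnt of
 * exec); for the inclusive scan, the addend of 1 also counts the current lane. */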
7753 Temp packed_tid;
7754 if (inc)
7755 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), Operand::c32(1u));
7756 else
7757 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm));
7758
7759 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid);
7760 return true;
7761 }
7762
7763 assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax ||
7764 op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax);
7765
7766 if (inc) {
7767 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa));
7768 return true;
7769 }
7770
7771 /* Copy the source and write the reduction operation identity to the first lane. */
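/* For the idempotent ops handled here (integer/float min/max, and, or), the
 * exclusive prefix of a uniform value is that value for every lane except the
 * first active one, whose prefix is empty and must therefore yield the identity. */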
7772 Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
7773 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7774 ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size);
7775 if (dst.bytes() == 8) {
7776 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7777 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7778 uint32_t identity_lo = get_reduction_identity(reduce_op, 0);
7779 uint32_t identity_hi = get_reduction_identity(reduce_op, 1);
7780
7781 lo =
7782 bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_lo)), lane, lo);
7783 hi =
7784 bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_hi)), lane, hi);
7785 bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi);
7786 } else {
7787 uint32_t identity = get_reduction_identity(reduce_op, 0);
7788 bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand::c32(identity)), lane,
7789 as_vgpr(ctx, src));
7790 }
7791
7792 return true;
7793 }
7794
7795 Temp
7796 emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size,
7797 Definition dst, Temp src)
7798 {
7799 assert(src.bytes() <= 8);
7800 assert(src.type() == RegType::vgpr);
7801
7802 Builder bld(ctx->program, ctx->block);
7803
7804 unsigned num_defs = 0;
7805 Definition defs[5];
7806 defs[num_defs++] = dst;
7807 defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */
7808
7809 /* scalar identity temporary */
7810 bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) &&
7811 aco_op != aco_opcode::p_reduce;
7812 if (aco_op == aco_opcode::p_exclusive_scan) {
7813 need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 ||
7814 op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 ||
7815 op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 ||
7816 op == fmul64);
7817 }
7818 if (need_sitmp)
7819 defs[num_defs++] = bld.def(RegType::sgpr, dst.size());
7820
7821 /* scc clobber */
7822 defs[num_defs++] = bld.def(s1, scc);
7823
7824 /* vcc clobber */
7825 bool clobber_vcc = false;
7826 if ((op == iadd32 || op == imul64) && ctx->program->chip_class < GFX9)
7827 clobber_vcc = true;
7828 if ((op == iadd8 || op == iadd16) && ctx->program->chip_class < GFX8)
7829 clobber_vcc = true;
7830 if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)
7831 clobber_vcc = true;
7832
7833 if (clobber_vcc)
7834 defs[num_defs++] = bld.def(bld.lm, vcc);
7835
7836 Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>(
7837 aco_op, Format::PSEUDO_REDUCTION, 3, num_defs);
7838 reduce->operands[0] = Operand(src);
7839 /* setup_reduce_temp will update these undef operands if needed */
7840 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7841 reduce->operands[2] = Operand(v1.as_linear());
7842 std::copy(defs, defs + num_defs, reduce->definitions.begin());
7843
7844 reduce->reduce_op = op;
7845 reduce->cluster_size = cluster_size;
7846 bld.insert(std::move(reduce));
7847
7848 return dst.getTemp();
7849 }
7850
7851 void
7852 emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2)
7853 {
7854 Builder bld(ctx->program, ctx->block);
7855 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7856 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7857
7858 Temp ddx_1, ddx_2, ddy_1, ddy_2;
7859 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7860 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7861 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7862
7863 /* Build DDX/DDY */
7864 if (ctx->program->chip_class >= GFX8) {
7865 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7866 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7867 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7868 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7869 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7870 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7871 } else {
7872 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7873 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7874 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7875 ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7876 ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7877 Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7878 ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7879 ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7880 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7881 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7882 }
7883
7884 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
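/* pos1/pos2 are the x/y offsets from the pixel center, so this extrapolates the
 * center barycentrics using the per-quad derivatives computed above (a first-order
 * Taylor step). */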
7885 aco_opcode mad =
7886 ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
7887 Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
7888 Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
7889 tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
7890 tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
7891 Temp wqm1 = bld.tmp(v1);
7892 emit_wqm(bld, tmp1, wqm1, true);
7893 Temp wqm2 = bld.tmp(v1);
7894 emit_wqm(bld, tmp2, wqm2, true);
7895 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7896 return;
7897 }
7898
7899 Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i);
7900 void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt);
7901 static void create_vs_exports(isel_context* ctx);
7902
7903 Temp
7904 get_interp_param(isel_context* ctx, nir_intrinsic_op intrin,
7905 enum glsl_interp_mode interp)
7906 {
7907 bool linear = interp == INTERP_MODE_NOPERSPECTIVE;
7908 if (intrin == nir_intrinsic_load_barycentric_pixel ||
7909 intrin == nir_intrinsic_load_barycentric_at_sample ||
7910 intrin == nir_intrinsic_load_barycentric_at_offset) {
7911 return get_arg(ctx, linear ? ctx->args->ac.linear_center : ctx->args->ac.persp_center);
7912 } else if (intrin == nir_intrinsic_load_barycentric_centroid) {
7913 return linear ? ctx->linear_centroid : ctx->persp_centroid;
7914 } else {
7915 assert(intrin == nir_intrinsic_load_barycentric_sample);
7916 return get_arg(ctx, linear ? ctx->args->ac.linear_sample : ctx->args->ac.persp_sample);
7917 }
7918 }
7919
7920 void
7921 visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
7922 {
7923 Builder bld(ctx->program, ctx->block);
7924 switch (instr->intrinsic) {
7925 case nir_intrinsic_load_barycentric_sample:
7926 case nir_intrinsic_load_barycentric_pixel:
7927 case nir_intrinsic_load_barycentric_centroid: {
7928 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7929 Temp bary = get_interp_param(ctx, instr->intrinsic, mode);
7930 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7931 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7932 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7933 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2));
7934 emit_split_vector(ctx, dst, 2);
7935 break;
7936 }
7937 case nir_intrinsic_load_barycentric_model: {
7938 Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7939
7940 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7941 Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7942 Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7943 Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7944 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2),
7945 Operand(p3));
7946 emit_split_vector(ctx, dst, 3);
7947 break;
7948 }
7949 case nir_intrinsic_load_barycentric_at_sample: {
7950 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7951 switch (ctx->options->key.ps.num_samples) {
7952 case 2: sample_pos_offset += 1 << 3; break;
7953 case 4: sample_pos_offset += 3 << 3; break;
7954 case 8: sample_pos_offset += 7 << 3; break;
7955 default: break;
7956 }
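/* The sample-position table stores the 1x, 2x, 4x and 8x sets back to back, 8 bytes
 * per sample, so the 2x/4x/8x sets start 1, 3 and 7 entries in (hence the << 3
 * offsets above). */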
7957 Temp sample_pos;
7958 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7959 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7960 Temp private_segment_buffer = ctx->program->private_segment_buffer;
7961 // TODO: bounds checking?
7962 if (addr.type() == RegType::sgpr) {
7963 Operand offset;
7964 if (const_addr) {
7965 sample_pos_offset += const_addr->u32 << 3;
7966 offset = Operand::c32(sample_pos_offset);
7967 } else if (ctx->options->chip_class >= GFX9) {
7968 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr,
7969 Operand::c32(sample_pos_offset));
7970 } else {
7971 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr,
7972 Operand::c32(3u));
7973 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset,
7974 Operand::c32(sample_pos_offset));
7975 }
7976
7977 Operand off = bld.copy(bld.def(s1), Operand(offset));
7978 sample_pos =
7979 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7980
7981 } else if (ctx->options->chip_class >= GFX9) {
7982 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7983 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr,
7984 private_segment_buffer, sample_pos_offset);
7985 } else if (ctx->options->chip_class >= GFX7) {
7986 /* addr += private_segment_buffer + sample_pos_offset */
7987 Temp tmp0 = bld.tmp(s1);
7988 Temp tmp1 = bld.tmp(s1);
7989 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1),
7990 private_segment_buffer);
7991 Definition scc_tmp = bld.def(s1, scc);
7992 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0,
7993 Operand::c32(sample_pos_offset));
7994 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1,
7995 Operand::zero(), bld.scc(scc_tmp.getTemp()));
7996 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
7997 Temp pck0 = bld.tmp(v1);
7998 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7999 tmp1 = as_vgpr(ctx, tmp1);
8000 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1),
8001 bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand::zero(), carry);
8002 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
8003
8004 /* sample_pos = flat_load_dwordx2 addr */
8005 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
8006 } else {
8007 assert(ctx->options->chip_class == GFX6);
8008
8009 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
8010 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
8011 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
8012 Operand::zero(), Operand::c32(rsrc_conf));
8013
8014 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr);
8015 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero());
8016
8017 sample_pos = bld.tmp(v2);
8018
8019 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(
8020 aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
8021 load->definitions[0] = Definition(sample_pos);
8022 load->operands[0] = Operand(rsrc);
8023 load->operands[1] = Operand(addr);
8024 load->operands[2] = Operand::zero();
8025 load->offset = sample_pos_offset;
8026 load->offen = 0;
8027 load->addr64 = true;
8028 load->glc = false;
8029 load->dlc = false;
8030 load->disable_wqm = false;
8031 ctx->block->instructions.emplace_back(std::move(load));
8032 }
8033
8034 /* sample_pos -= 0.5 */
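/* 0x3f000000 is 0.5f: the positions are stored in [0, 1] and converted to offsets
 * from the pixel center for emit_interp_center(). */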
8035 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
8036 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
8037 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
8038 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u));
8039 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u));
8040
8041 Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8042 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8043 break;
8044 }
8045 case nir_intrinsic_load_barycentric_at_offset: {
8046 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
8047 RegClass rc = RegClass(offset.type(), 1);
8048 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
8049 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
8050 Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr));
8051 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2);
8052 break;
8053 }
8054 case nir_intrinsic_load_front_face: {
8055 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8056 Operand::zero(), get_arg(ctx, ctx->args->ac.front_face))
8057 .def(0)
8058 .setHint(vcc);
8059 break;
8060 }
8061 case nir_intrinsic_load_view_index: {
8062 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8063 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
8064 break;
8065 }
8066 case nir_intrinsic_load_frag_coord: {
8067 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
8068 break;
8069 }
8070 case nir_intrinsic_load_frag_shading_rate:
8071 emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8072 break;
8073 case nir_intrinsic_load_sample_pos: {
8074 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
8075 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
8076 bld.pseudo(
8077 aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8078 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(),
8079 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero());
8080 break;
8081 }
8082 case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break;
8083 case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break;
8084 case nir_intrinsic_store_output: visit_store_output(ctx, instr); break;
8085 case nir_intrinsic_load_input:
8086 case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break;
8087 case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break;
8088 case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break;
8089 case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break;
8090 case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break;
8091 case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break;
8092 case nir_intrinsic_terminate:
8093 case nir_intrinsic_discard: visit_discard(ctx, instr); break;
8094 case nir_intrinsic_terminate_if:
8095 case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break;
8096 case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break;
8097 case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break;
8098 case nir_intrinsic_shared_atomic_add:
8099 case nir_intrinsic_shared_atomic_imin:
8100 case nir_intrinsic_shared_atomic_umin:
8101 case nir_intrinsic_shared_atomic_imax:
8102 case nir_intrinsic_shared_atomic_umax:
8103 case nir_intrinsic_shared_atomic_and:
8104 case nir_intrinsic_shared_atomic_or:
8105 case nir_intrinsic_shared_atomic_xor:
8106 case nir_intrinsic_shared_atomic_exchange:
8107 case nir_intrinsic_shared_atomic_comp_swap:
8108 case nir_intrinsic_shared_atomic_fadd:
8109 case nir_intrinsic_shared_atomic_fmin:
8110 case nir_intrinsic_shared_atomic_fmax: visit_shared_atomic(ctx, instr); break;
8111 case nir_intrinsic_image_deref_load:
8112 case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break;
8113 case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break;
8114 case nir_intrinsic_image_deref_atomic_add:
8115 case nir_intrinsic_image_deref_atomic_umin:
8116 case nir_intrinsic_image_deref_atomic_imin:
8117 case nir_intrinsic_image_deref_atomic_umax:
8118 case nir_intrinsic_image_deref_atomic_imax:
8119 case nir_intrinsic_image_deref_atomic_and:
8120 case nir_intrinsic_image_deref_atomic_or:
8121 case nir_intrinsic_image_deref_atomic_xor:
8122 case nir_intrinsic_image_deref_atomic_exchange:
8123 case nir_intrinsic_image_deref_atomic_comp_swap:
8124 case nir_intrinsic_image_deref_atomic_fmin:
8125 case nir_intrinsic_image_deref_atomic_fmax: visit_image_atomic(ctx, instr); break;
8126 case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break;
8127 case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break;
8128 case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break;
8129 case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break;
8130 case nir_intrinsic_load_global_constant:
8131 case nir_intrinsic_load_global: visit_load_global(ctx, instr); break;
8132 case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break;
8133 case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break;
8134 case nir_intrinsic_store_global: visit_store_global(ctx, instr); break;
8135 case nir_intrinsic_global_atomic_add:
8136 case nir_intrinsic_global_atomic_imin:
8137 case nir_intrinsic_global_atomic_umin:
8138 case nir_intrinsic_global_atomic_imax:
8139 case nir_intrinsic_global_atomic_umax:
8140 case nir_intrinsic_global_atomic_and:
8141 case nir_intrinsic_global_atomic_or:
8142 case nir_intrinsic_global_atomic_xor:
8143 case nir_intrinsic_global_atomic_exchange:
8144 case nir_intrinsic_global_atomic_comp_swap:
8145 case nir_intrinsic_global_atomic_fmin:
8146 case nir_intrinsic_global_atomic_fmax: visit_global_atomic(ctx, instr); break;
8147 case nir_intrinsic_ssbo_atomic_add:
8148 case nir_intrinsic_ssbo_atomic_imin:
8149 case nir_intrinsic_ssbo_atomic_umin:
8150 case nir_intrinsic_ssbo_atomic_imax:
8151 case nir_intrinsic_ssbo_atomic_umax:
8152 case nir_intrinsic_ssbo_atomic_and:
8153 case nir_intrinsic_ssbo_atomic_or:
8154 case nir_intrinsic_ssbo_atomic_xor:
8155 case nir_intrinsic_ssbo_atomic_exchange:
8156 case nir_intrinsic_ssbo_atomic_comp_swap:
8157 case nir_intrinsic_ssbo_atomic_fmin:
8158 case nir_intrinsic_ssbo_atomic_fmax: visit_atomic_ssbo(ctx, instr); break;
8159 case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break;
8160 case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break;
8161 case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break;
8162 case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break;
8163 case nir_intrinsic_load_num_workgroups: {
8164 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8165 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
8166 emit_split_vector(ctx, dst, 3);
8167 break;
8168 }
8169 case nir_intrinsic_load_ray_launch_size: {
8170 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8171 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.ray_launch_size)));
8172 emit_split_vector(ctx, dst, 3);
8173 break;
8174 }
8175 case nir_intrinsic_load_local_invocation_id: {
8176 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8177 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
8178 emit_split_vector(ctx, dst, 3);
8179 break;
8180 }
8181 case nir_intrinsic_load_workgroup_id: {
8182 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8183 const struct ac_arg* args = ctx->args->ac.workgroup_ids;
8184 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
8185 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(),
8186 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(),
8187 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand::zero());
8188 emit_split_vector(ctx, dst, 3);
8189 break;
8190 }
8191 case nir_intrinsic_load_local_invocation_index: {
8192 if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) {
8193 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8194 get_arg(ctx, ctx->args->ac.vs_rel_patch_id));
8195 break;
8196 } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) {
8197 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx));
8198 break;
8199 }
8200
8201 Temp id = emit_mbcnt(ctx, bld.tmp(v1));
8202
8203       /* The tg_size bits [6:11] contain the subgroup id;
8204        * multiply it by the wave size, then OR the thread id into it.
8205        */
8206 if (ctx->program->wave_size == 64) {
8207         /* After the s_and, the bits are already multiplied by 64 (left-shifted by 6), so we can
8208          * feed the result directly to v_or */
8209 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8210 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size));
8211 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num,
8212 id);
8213 } else {
8214 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
8215 Temp tg_num =
8216 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
8217 get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16)));
8218 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8219 tg_num, Operand::c32(0x5u), id);
8220 }
8221 break;
8222 }
8223 case nir_intrinsic_load_subgroup_id: {
8224 if (ctx->stage == compute_cs) {
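      /* tg_size bits [11:6] hold the wave (subgroup) id within the workgroup (s_bfe: offset=6, width=6). */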
8225 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8226 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size),
8227 Operand::c32(0x6u | (0x6u << 16)));
8228 } else if (ctx->stage.hw == HWStage::NGG) {
8229 /* Get the id of the current wave within the threadgroup (workgroup) */
8230 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8231 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8232 Operand::c32(24u | (4u << 16)));
8233 } else {
8234 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero());
8235 }
8236 break;
8237 }
8238 case nir_intrinsic_load_subgroup_invocation: {
8239 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa));
8240 break;
8241 }
8242 case nir_intrinsic_load_num_subgroups: {
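      /* tg_size bits [5:0] contain the number of waves in the workgroup. */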
8243 if (ctx->stage == compute_cs)
8244 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8245 bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size));
8246 else if (ctx->stage.hw == HWStage::NGG)
8247 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8248 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info),
8249 Operand::c32(28u | (4u << 16)));
8250 else
8251 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u));
8252 break;
8253 }
8254 case nir_intrinsic_ballot: {
8255 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8256 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8257
8258 if (instr->src[0].ssa->bit_size == 1) {
8259 assert(src.regClass() == bld.lm);
8260 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
8261 src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8262 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
8263 src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src);
8264 } else {
8265 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8266 }
8267
8268 /* Make sure that all inactive lanes return zero.
8269 * Value-numbering might remove the comparison above */
8270 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8271 if (dst.size() != bld.lm.size()) {
8272 /* Wave32 with ballot size set to 64 */
8273 src =
8274 bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero());
8275 }
8276
8277 emit_wqm(bld, src, dst);
8278 break;
8279 }
8280 case nir_intrinsic_shuffle:
8281 case nir_intrinsic_read_invocation: {
8282 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8283 if (!nir_src_is_divergent(instr->src[0])) {
8284 emit_uniform_subgroup(ctx, instr, src);
8285 } else {
8286 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
8287 if (instr->intrinsic == nir_intrinsic_read_invocation ||
8288 !nir_src_is_divergent(instr->src[1]))
8289 tid = bld.as_uniform(tid);
8290 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8291
8292 if (instr->dest.ssa.bit_size != 1)
8293 src = as_vgpr(ctx, src);
8294
8295 if (src.regClass() == v1b || src.regClass() == v2b) {
8296 Temp tmp = bld.tmp(v1);
8297 tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp);
8298 if (dst.type() == RegType::vgpr)
8299 bld.pseudo(aco_opcode::p_split_vector, Definition(dst),
8300 bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
8301 else
8302 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
8303 } else if (src.regClass() == v1) {
8304 emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst);
8305 } else if (src.regClass() == v2) {
8306 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8307 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8308 lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo));
8309 hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi));
8310 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8311 emit_split_vector(ctx, dst, 2);
8312 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
8313 assert(src.regClass() == bld.lm);
8314 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
8315 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8316 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
8317 assert(src.regClass() == bld.lm);
8318 Temp tmp;
8319 if (ctx->program->chip_class <= GFX7)
8320 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
8321 else if (ctx->program->wave_size == 64)
8322 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
8323 else
8324 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
8325 tmp = emit_extract_vector(ctx, tmp, 0, v1);
8326 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp);
8327 emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp),
8328 dst);
8329 } else {
8330 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8331 }
8332 }
8333 break;
8334 }
8335 case nir_intrinsic_load_sample_id: {
8336 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8337 get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u));
8338 break;
8339 }
8340 case nir_intrinsic_load_sample_mask_in: {
8341 visit_load_sample_mask_in(ctx, instr);
8342 break;
8343 }
8344 case nir_intrinsic_read_first_invocation: {
8345 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8346 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8347 if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
8348 emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst);
8349 } else if (src.regClass() == v2) {
8350 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8351 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8352 lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
8353 hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
8354 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8355 emit_split_vector(ctx, dst, 2);
8356 } else if (instr->dest.ssa.bit_size == 1) {
8357 assert(src.regClass() == bld.lm);
8358 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
8359 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
8360 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8361 } else {
8362 bld.copy(Definition(dst), src);
8363 }
8364 break;
8365 }
8366 case nir_intrinsic_vote_all: {
8367 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8368 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8369 assert(src.regClass() == bld.lm);
8370 assert(dst.regClass() == bld.lm);
8371
8372 Temp tmp =
8373 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src)
8374 .def(1)
8375 .getTemp();
8376 Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp));
8377 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
8378 break;
8379 }
8380 case nir_intrinsic_vote_any: {
8381 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8382 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8383 assert(src.regClass() == bld.lm);
8384 assert(dst.regClass() == bld.lm);
8385
8386 Temp tmp = bool_to_scalar_condition(ctx, src);
8387 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst);
8388 break;
8389 }
8390 case nir_intrinsic_reduce:
8391 case nir_intrinsic_inclusive_scan:
8392 case nir_intrinsic_exclusive_scan: {
8393 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8394 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8395 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr);
8396 unsigned cluster_size =
8397 instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0;
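      /* A cluster size of 0 means the whole subgroup: clamp to the wave size and round up to a power of two. */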
8398 cluster_size = util_next_power_of_two(
8399 MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
8400
8401 if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size &&
8402 instr->dest.ssa.bit_size != 1) {
8403 /* We use divergence analysis to assign the regclass, so check if it's
8404 * working as expected */
8405 ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan;
8406 if (instr->intrinsic == nir_intrinsic_inclusive_scan)
8407 expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor;
8408 assert(nir_dest_is_divergent(instr->dest) == expected_divergent);
8409
8410 if (instr->intrinsic == nir_intrinsic_reduce) {
8411 if (emit_uniform_reduce(ctx, instr))
8412 break;
8413 } else if (emit_uniform_scan(ctx, instr)) {
8414 break;
8415 }
8416 }
8417
8418 if (instr->dest.ssa.bit_size == 1) {
8419 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
8420 op = nir_op_iand;
8421 else if (op == nir_op_iadd)
8422 op = nir_op_ixor;
8423 else if (op == nir_op_umax || op == nir_op_imax)
8424 op = nir_op_ior;
8425 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
8426
8427 switch (instr->intrinsic) {
8428 case nir_intrinsic_reduce:
8429 emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
8430 break;
8431 case nir_intrinsic_exclusive_scan:
8432 emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst);
8433 break;
8434 case nir_intrinsic_inclusive_scan:
8435 emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst);
8436 break;
8437 default: assert(false);
8438 }
8439 } else if (cluster_size == 1) {
8440 bld.copy(Definition(dst), src);
8441 } else {
8442 unsigned bit_size = instr->src[0].ssa->bit_size;
8443
8444 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
8445
8446 ReduceOp reduce_op = get_reduce_op(op, bit_size);
8447
8448 aco_opcode aco_op;
8449 switch (instr->intrinsic) {
8450 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
8451 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
8452 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
8453 default: unreachable("unknown reduce intrinsic");
8454 }
8455
8456 Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size,
8457 bld.def(dst.regClass()), src);
8458 emit_wqm(bld, tmp_dst, dst);
8459 }
8460 break;
8461 }
8462 case nir_intrinsic_quad_broadcast:
8463 case nir_intrinsic_quad_swap_horizontal:
8464 case nir_intrinsic_quad_swap_vertical:
8465 case nir_intrinsic_quad_swap_diagonal:
8466 case nir_intrinsic_quad_swizzle_amd: {
8467 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8468
8469 if (!nir_dest_is_divergent(instr->dest)) {
8470 emit_uniform_subgroup(ctx, instr, src);
8471 break;
8472 }
8473
8474 /* Quad broadcast lane. */
8475 unsigned lane = 0;
8476 /* Use VALU for the bool instructions that don't have a SALU-only special case. */
8477 bool bool_use_valu = instr->dest.ssa.bit_size == 1;
8478
8479 uint16_t dpp_ctrl = 0;
8480
8481 switch (instr->intrinsic) {
8482 case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break;
8483 case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break;
8484 case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break;
8485 case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break;
8486 case nir_intrinsic_quad_broadcast:
8487 lane = nir_src_as_const_value(instr->src[1])->u32;
8488 dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
8489 bool_use_valu = false;
8490 break;
8491 default: break;
8492 }
8493
8494 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8495 Temp tmp(dst);
8496
8497 /* Setup source. */
8498 if (bool_use_valu)
8499 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8500 Operand::c32(-1), src);
8501 else if (instr->dest.ssa.bit_size != 1)
8502 src = as_vgpr(ctx, src);
8503
8504 /* Setup temporary destination. */
8505 if (bool_use_valu)
8506 tmp = bld.tmp(v1);
8507 else if (ctx->program->stage == fragment_fs)
8508 tmp = bld.tmp(dst.regClass());
8509
8510 if (instr->dest.ssa.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) {
8511 /* Special case for quad broadcast using SALU only. */
8512 assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm);
8513
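         /* 0x11111111 << lane selects the requested lane of every quad (replicated into both halves
          * for wave64). s_wqm then broadcasts that lane's value to all four lanes of each quad. */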
8514 uint32_t half_mask = 0x11111111u << lane;
8515 Operand mask_tmp = bld.lm.bytes() == 4
8516 ? Operand::c32(half_mask)
8517 : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm),
8518 Operand::c32(half_mask), Operand::c32(half_mask));
8519
8520 src =
8521 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8522 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src);
8523 bld.sop1(Builder::s_wqm, Definition(tmp), src);
8524 } else if (instr->dest.ssa.bit_size <= 32 || bool_use_valu) {
8525 unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->dest.ssa.bit_size / 8;
8526 Definition def = excess_bytes ? bld.def(v1) : Definition(tmp);
8527
8528 if (ctx->program->chip_class >= GFX8)
8529 bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl);
8530 else
8531 bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl);
8532
8533 if (excess_bytes)
8534 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp),
8535 bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp());
8536 } else if (instr->dest.ssa.bit_size == 64) {
8537 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8538 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8539
8540 if (ctx->program->chip_class >= GFX8) {
8541 lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl);
8542 hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl);
8543 } else {
8544 lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl);
8545 hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl);
8546 }
8547
8548 bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi);
8549 emit_split_vector(ctx, tmp, 2);
8550 } else {
8551 isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size.");
8552 }
8553
8554 if (tmp.id() != dst.id()) {
8555 if (bool_use_valu)
8556 tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp);
8557
8558 /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. */
8559 emit_wqm(bld, tmp, dst, true);
8560 }
8561
8562 break;
8563 }
8564 case nir_intrinsic_masked_swizzle_amd: {
8565 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8566 if (!nir_dest_is_divergent(instr->dest)) {
8567 emit_uniform_subgroup(ctx, instr, src);
8568 break;
8569 }
8570 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8571 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
8572
8573 if (instr->dest.ssa.bit_size != 1)
8574 src = as_vgpr(ctx, src);
8575
8576 if (instr->dest.ssa.bit_size == 1) {
8577 assert(src.regClass() == bld.lm);
8578 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
8579 Operand::c32(-1), src);
8580 src = emit_masked_swizzle(ctx, bld, src, mask);
8581 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src);
8582 emit_wqm(bld, tmp, dst);
8583 } else if (dst.regClass() == v1b) {
8584 Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8585 emit_extract_vector(ctx, tmp, 0, dst);
8586 } else if (dst.regClass() == v2b) {
8587 Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask));
8588 emit_extract_vector(ctx, tmp, 0, dst);
8589 } else if (dst.regClass() == v1) {
8590 emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst);
8591 } else if (dst.regClass() == v2) {
8592 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8593 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8594 lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask));
8595 hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask));
8596 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8597 emit_split_vector(ctx, dst, 2);
8598 } else {
8599 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8600 }
8601 break;
8602 }
8603 case nir_intrinsic_write_invocation_amd: {
8604 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8605 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8606 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8607 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8608 if (dst.regClass() == v1) {
8609 /* src2 is ignored for writelane. RA assigns the same reg for dst */
8610 emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
8611 } else if (dst.regClass() == v2) {
8612 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8613 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8614 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8615 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8616       Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
8617 Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8618 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8619 emit_split_vector(ctx, dst, 2);
8620 } else {
8621 isel_err(&instr->instr, "Unimplemented NIR instr bit size");
8622 }
8623 break;
8624 }
8625 case nir_intrinsic_mbcnt_amd: {
8626 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8627 Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
8628 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8629 /* Fit 64-bit mask for wave32 */
8630 src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
8631 Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
8632 emit_wqm(bld, wqm_tmp, dst);
8633 break;
8634 }
8635 case nir_intrinsic_byte_permute_amd: {
8636 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8637 assert(dst.regClass() == v1);
8638 assert(ctx->program->chip_class >= GFX8);
8639 bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
8640 as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
8641 as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
8642 break;
8643 }
8644 case nir_intrinsic_lane_permute_16_amd: {
8645 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8646 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8647 assert(ctx->program->chip_class >= GFX10);
8648
8649 if (src.regClass() == s1) {
8650 bld.copy(Definition(dst), src);
8651 } else if (dst.regClass() == v1 && src.regClass() == v1) {
8652 bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
8653 bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
8654 bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
8655 } else {
8656 isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
8657 }
8658 break;
8659 }
8660 case nir_intrinsic_load_helper_invocation:
8661 case nir_intrinsic_is_helper_invocation: {
8662       /* load_helper() after demote() gets lowered to is_helper().
8663 * Otherwise, these two behave the same. */
8664 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8665 bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm));
8666 ctx->block->kind |= block_kind_needs_lowering;
8667 ctx->program->needs_exact = true;
8668 break;
8669 }
8670 case nir_intrinsic_demote:
8671 bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u));
8672
8673 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8674 ctx->cf_info.exec_potentially_empty_discard = true;
8675 ctx->block->kind |= block_kind_uses_demote;
8676 ctx->program->needs_exact = true;
8677 break;
8678 case nir_intrinsic_demote_if: {
8679 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8680 assert(src.regClass() == bld.lm);
8681 Temp cond =
8682 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8683 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8684
8685 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8686 ctx->cf_info.exec_potentially_empty_discard = true;
8687 ctx->block->kind |= block_kind_uses_demote;
8688 ctx->program->needs_exact = true;
8689 break;
8690 }
8691 case nir_intrinsic_first_invocation: {
8692 emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8693 get_ssa_temp(ctx, &instr->dest.ssa));
8694 break;
8695 }
8696 case nir_intrinsic_last_invocation: {
8697 Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm));
8698 Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
8699 Operand::c32(ctx->program->wave_size - 1u), flbit);
8700 emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa));
8701 break;
8702 }
8703 case nir_intrinsic_elect: {
8704 /* p_elect is lowered in aco_insert_exec_mask.
8705 * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize
8706 * two p_elect with different exec masks as the same.
8707 */
8708 Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm));
8709 emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa));
8710 ctx->block->kind |= block_kind_needs_lowering;
8711 break;
8712 }
8713 case nir_intrinsic_shader_clock: {
8714 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8715 if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP &&
8716 ctx->options->chip_class >= GFX10_3) {
8717 /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
8718 Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
8719 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero());
8720 } else {
8721 aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE
8722 ? aco_opcode::s_memrealtime
8723 : aco_opcode::s_memtime;
8724 bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
8725 }
8726 emit_split_vector(ctx, dst, 2);
8727 break;
8728 }
8729 case nir_intrinsic_load_vertex_id_zero_base: {
8730 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8731 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8732 break;
8733 }
8734 case nir_intrinsic_load_first_vertex: {
8735 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8736 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8737 break;
8738 }
8739 case nir_intrinsic_load_base_instance: {
8740 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8741 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8742 break;
8743 }
8744 case nir_intrinsic_load_instance_id: {
8745 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8746 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8747 break;
8748 }
8749 case nir_intrinsic_load_draw_id: {
8750 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8751 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8752 break;
8753 }
8754 case nir_intrinsic_load_invocation_id: {
8755 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8756
8757 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8758 if (ctx->options->chip_class >= GFX10)
8759 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u),
8760 get_arg(ctx, ctx->args->ac.gs_invocation_id));
8761 else
8762 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8763 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8764 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
8765 Operand::c32(8u), Operand::c32(5u));
8766 } else {
8767 unreachable("Unsupported stage for load_invocation_id");
8768 }
8769
8770 break;
8771 }
8772 case nir_intrinsic_load_primitive_id: {
8773 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8774
8775 switch (ctx->shader->info.stage) {
8776 case MESA_SHADER_GEOMETRY:
8777 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8778 break;
8779 case MESA_SHADER_TESS_CTRL:
8780 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8781 break;
8782 case MESA_SHADER_TESS_EVAL:
8783 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8784 break;
8785 default:
8786 if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) {
8787 /* In case of NGG, the GS threads always have the primitive ID
8788 * even if there is no SW GS. */
8789 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8790 break;
8791 }
8792 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8793 }
8794
8795 break;
8796 }
8797 case nir_intrinsic_load_patch_vertices_in: {
8798 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8799 ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8800
8801 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8802 bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.tess_input_vertices));
8803 break;
8804 }
8805 case nir_intrinsic_emit_vertex_with_counter: {
8806 assert(ctx->stage.hw == HWStage::GS);
8807 visit_emit_vertex_with_counter(ctx, instr);
8808 break;
8809 }
8810 case nir_intrinsic_end_primitive_with_counter: {
8811 if (ctx->stage.hw != HWStage::NGG) {
8812 unsigned stream = nir_intrinsic_stream_id(instr);
8813 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1,
8814 sendmsg_gs(true, false, stream));
8815 }
8816 break;
8817 }
8818 case nir_intrinsic_set_vertex_and_primitive_count: {
8819 assert(ctx->stage.hw == HWStage::GS);
8820 /* unused in the legacy pipeline, the HW keeps track of this for us */
8821 break;
8822 }
8823 case nir_intrinsic_load_tess_rel_patch_id_amd: {
8824 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_tess_rel_patch_id(ctx));
8825 break;
8826 }
8827 case nir_intrinsic_load_ring_tess_factors_amd: {
8828 bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8829 ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_FACTOR * 16u));
8830 break;
8831 }
8832 case nir_intrinsic_load_ring_tess_factors_offset_amd: {
8833 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8834 get_arg(ctx, ctx->args->ac.tcs_factor_offset));
8835 break;
8836 }
8837 case nir_intrinsic_load_ring_tess_offchip_amd: {
8838 bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8839 ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_OFFCHIP * 16u));
8840 break;
8841 }
8842 case nir_intrinsic_load_ring_tess_offchip_offset_amd: {
8843 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8844 get_arg(ctx, ctx->args->ac.tess_offchip_offset));
8845 break;
8846 }
8847 case nir_intrinsic_load_ring_esgs_amd: {
8848 unsigned ring = ctx->stage.hw == HWStage::ES ? RING_ESGS_VS : RING_ESGS_GS;
8849 bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8850 ctx->program->private_segment_buffer, Operand::c32(ring * 16u));
8851 break;
8852 }
8853 case nir_intrinsic_load_ring_es2gs_offset_amd: {
8854 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8855 get_arg(ctx, ctx->args->ac.es2gs_offset));
8856 break;
8857 }
8858 case nir_intrinsic_load_gs_vertex_offset_amd: {
8859 /* GFX6-8 uses 6 separate args, while GFX9+ packs these into only 3 args. */
8860 unsigned b = nir_intrinsic_base(instr);
8861 assert(b <= (ctx->program->chip_class >= GFX9 ? 2 : 5));
8862 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8863 get_arg(ctx, ctx->args->ac.gs_vtx_offset[b]));
8864 break;
8865 }
8866 case nir_intrinsic_has_input_vertex_amd:
8867 case nir_intrinsic_has_input_primitive_amd: {
8868 assert(ctx->stage.hw == HWStage::NGG);
8869 unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 0 : 1;
8870 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i));
8871 break;
8872 }
8873 case nir_intrinsic_load_workgroup_num_input_vertices_amd:
8874 case nir_intrinsic_load_workgroup_num_input_primitives_amd: {
8875 assert(ctx->stage.hw == HWStage::NGG);
8876 unsigned pos =
8877 instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22;
8878 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8879 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info),
8880 Operand::c32(pos | (9u << 16u)));
8881 break;
8882 }
8883 case nir_intrinsic_load_initial_edgeflags_amd: {
8884 assert(ctx->stage.hw == HWStage::NGG);
8885
8886 Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
8887 /* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */
8888 Temp flags =
8889 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id);
8890 /* Move the bits to their desired position: 8->9, 9->19, 10->29. */
8891 flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags);
8892 /* Remove garbage bits that are a byproduct of the multiplication. */
8893 bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8894 Operand::c32(0x20080200), flags);
8895 break;
8896 }
8897 case nir_intrinsic_load_packed_passthrough_primitive_amd: {
8898 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8899 get_arg(ctx, ctx->args->ac.gs_vtx_offset[0]));
8900 break;
8901 }
8902 case nir_intrinsic_export_vertex_amd: {
8903 ctx->block->kind |= block_kind_export_end;
8904 create_vs_exports(ctx);
8905 break;
8906 }
8907 case nir_intrinsic_export_primitive_amd: {
8908 assert(ctx->stage.hw == HWStage::NGG);
8909 Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa);
8910 bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
8911 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */,
8912 true /* done */, false /* valid mask */);
8913 break;
8914 }
8915 case nir_intrinsic_alloc_vertices_and_primitives_amd: {
8916 assert(ctx->stage.hw == HWStage::NGG);
8917 Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa);
8918 Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa);
8919 ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives);
8920 break;
8921 }
8922 case nir_intrinsic_gds_atomic_add_amd: {
8923 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
8924 Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa);
8925 Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa);
8926 Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val)));
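      /* M0 provides the GDS base/size; the trailing 'true' marks this ds_add_u32 as a GDS (not LDS) access. */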
8927 bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u,
8928 true);
8929 break;
8930 }
8931 case nir_intrinsic_load_shader_query_enabled_amd: {
8932 unsigned cmp_bit = 0;
8933 Temp shader_query_enabled =
8934 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8935 get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(cmp_bit));
8936 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8937 bool_to_vector_condition(ctx, shader_query_enabled));
8938 break;
8939 }
8940 case nir_intrinsic_load_cull_front_face_enabled_amd:
8941 case nir_intrinsic_load_cull_back_face_enabled_amd:
8942 case nir_intrinsic_load_cull_ccw_amd:
8943 case nir_intrinsic_load_cull_small_primitives_enabled_amd: {
8944 unsigned cmp_bit;
8945 if (instr->intrinsic == nir_intrinsic_load_cull_front_face_enabled_amd)
8946 cmp_bit = 0;
8947 else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd)
8948 cmp_bit = 1;
8949 else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd)
8950 cmp_bit = 2;
8951 else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd)
8952 cmp_bit = 3;
8953 else
8954 unreachable("unimplemented culling intrinsic");
8955
8956 Builder::Result enabled =
8957 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc),
8958 get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit));
8959 enabled.instr->definitions[0].setNoCSE(true);
8960 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8961 bool_to_vector_condition(ctx, enabled));
8962 break;
8963 }
8964 case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break;
8965 case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break;
8966 case nir_intrinsic_load_cull_any_enabled_amd: {
8967 Builder::Result cull_any_enabled =
8968 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc),
8969 get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0xbu));
8970 cull_any_enabled.instr->definitions[1].setNoCSE(true);
8971 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8972 bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp()));
8973 break;
8974 }
8975 case nir_intrinsic_load_cull_small_prim_precision_amd: {
8976 /* Exponent is 8-bit signed int, move that into a signed 32-bit int. */
8977 Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc),
8978 get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u));
8979 /* small_prim_precision = 1.0 * 2^X */
8980 bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8981 Operand::c32(0x3f800000u), Operand(exponent));
8982 break;
8983 }
8984 case nir_intrinsic_load_viewport_x_scale: {
8985 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8986 get_arg(ctx, ctx->args->ngg_viewport_scale[0]));
8987 break;
8988 }
8989 case nir_intrinsic_load_viewport_y_scale: {
8990 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8991 get_arg(ctx, ctx->args->ngg_viewport_scale[1]));
8992 break;
8993 }
8994 case nir_intrinsic_load_viewport_x_offset: {
8995 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8996 get_arg(ctx, ctx->args->ngg_viewport_translate[0]));
8997 break;
8998 }
8999 case nir_intrinsic_load_viewport_y_offset: {
9000 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
9001 get_arg(ctx, ctx->args->ngg_viewport_translate[1]));
9002 break;
9003 }
9004 case nir_intrinsic_overwrite_vs_arguments_amd: {
9005 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9006 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9007 break;
9008 }
9009 case nir_intrinsic_overwrite_tes_arguments_amd: {
9010 ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa);
9011 ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa);
9012 ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] =
9013 get_ssa_temp(ctx, instr->src[2].ssa);
9014 ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa);
9015 break;
9016 }
9017 default:
9018 isel_err(&instr->instr, "Unimplemented intrinsic instr");
9019 abort();
9020
9021 break;
9022 }
9023 }
9024
9025 void
9026 tex_fetch_ptrs(isel_context* ctx, nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr,
9027 enum glsl_base_type* stype)
9028 {
9029 nir_deref_instr* texture_deref_instr = NULL;
9030 nir_deref_instr* sampler_deref_instr = NULL;
9031 int plane = -1;
9032
9033 for (unsigned i = 0; i < instr->num_srcs; i++) {
9034 switch (instr->src[i].src_type) {
9035 case nir_tex_src_texture_deref:
9036 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
9037 break;
9038 case nir_tex_src_sampler_deref:
9039 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
9040 break;
9041 case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break;
9042 default: break;
9043 }
9044 }
9045
9046 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
9047
9048 if (!sampler_deref_instr)
9049 sampler_deref_instr = texture_deref_instr;
9050
9051 if (plane >= 0) {
9052 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
9053 *res_ptr = get_sampler_desc(ctx, texture_deref_instr,
9054 (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false);
9055 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9056 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false);
9057 } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9058 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false);
9059 } else {
9060 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false);
9061 }
9062 if (samp_ptr) {
9063 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false);
9064
9065 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
9066 /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
9067 Builder bld(ctx->program, ctx->block);
9068
9069 /* to avoid unnecessary moves, we split and recombine sampler and image */
9070 Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
9071 bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9072 Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
9073 bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
9074 Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]),
9075 Definition(img[6]), Definition(img[7]), *res_ptr);
9076 bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
9077 Definition(samp[2]), Definition(samp[3]), *samp_ptr);
9078
9079 samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
9080 *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2],
9081 img[3], img[4], img[5], img[6], img[7]);
9082 *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2],
9083 samp[3]);
9084 }
9085 }
9086 }
9087
9088 void
9089 build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc,
9090 Temp* out_tc)
9091 {
9092 Builder bld(ctx->program, ctx->block);
9093
9094 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
9095 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
9096 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
9097
9098 Operand neg_one = Operand::c32(0xbf800000u);
9099 Operand one = Operand::c32(0x3f800000u);
9100 Operand two = Operand::c32(0x40000000u);
9101 Operand four = Operand::c32(0x40800000u);
9102
9103 Temp is_ma_positive =
9104 bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), ma);
9105 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
9106 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma);
9107
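   /* The cube face id is a float in 0..5: 0-1 = ±X, 2-3 = ±Y, 4-5 = ±Z. */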
9108 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
9109 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
9110 is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
9111 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)),
9112 bld.def(s1, scc), is_ma_z, is_ma_y);
9113
9114 /* select sc */
9115 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
9116 Temp sgn = bld.vop2_e64(
9117 aco_opcode::v_cndmask_b32, bld.def(v1),
9118 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y);
9119 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9120
9121 /* select tc */
9122 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
9123 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
9124 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
9125
9126 /* select ma */
9127 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9128 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
9129 deriv_z, is_ma_z);
9130 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp);
9131 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
9132 }
9133
9134 void
9135 prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,
9136 bool is_deriv, bool is_array)
9137 {
9138 Builder bld(ctx->program, ctx->block);
9139 Temp ma, tc, sc, id;
9140 aco_opcode madak =
9141 ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
9142 aco_opcode madmk =
9143 ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
9144
9145 if (is_array) {
9146 coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
9147
9148 /* see comment in ac_prepare_cube_coords() */
9149 if (ctx->options->chip_class <= GFX8)
9150 coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]);
9151 }
9152
9153 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9154
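   /* invma = 1 / |ma|: use the VOP3 encoding of v_rcp_f32 so the abs modifier can be applied to the operand. */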
9155 aco_ptr<VOP3_instruction> vop3a{
9156 create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
9157 vop3a->operands[0] = Operand(ma);
9158 vop3a->abs[0] = true;
9159 Temp invma = bld.tmp(v1);
9160 vop3a->definitions[0] = Definition(invma);
9161 ctx->block->instructions.emplace_back(std::move(vop3a));
9162
9163 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9164 if (!is_deriv)
9165 sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9166
9167 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9168 if (!is_deriv)
9169 tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/));
9170
9171 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
9172
9173 if (is_deriv) {
9174 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
9175 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
9176
9177 for (unsigned i = 0; i < 2; i++) {
9178 /* see comment in ac_prepare_cube_coords() */
9179 Temp deriv_ma;
9180 Temp deriv_sc, deriv_tc;
9181 build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc);
9182
9183 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
9184
9185 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9186 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
9187 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
9188 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
9189 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
9190 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
9191 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
9192 }
9193
9194 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc);
9195 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc);
9196 }
9197
9198 if (is_array)
9199 id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/));
9200 coords.resize(3);
9201 coords[0] = sc;
9202 coords[1] = tc;
9203 coords[2] = id;
9204 }
9205
9206 void
9207 get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4])
9208 {
9209 if (vec->parent_instr->type != nir_instr_type_alu)
9210 return;
9211 nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr);
9212 if (vec_instr->op != nir_op_vec(vec->num_components))
9213 return;
9214
9215 for (unsigned i = 0; i < vec->num_components; i++) {
9216 cv[i] =
9217 vec_instr->src[i].swizzle[0] == 0 ? nir_src_as_const_value(vec_instr->src[i].src) : NULL;
9218 }
9219 }
9220
9221 void
9222 visit_tex(isel_context* ctx, nir_tex_instr* instr)
9223 {
9224 assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical);
9225
9226 Builder bld(ctx->program, ctx->block);
9227 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
9228 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false,
9229 has_sample_index = false, has_clamped_lod = false;
9230 Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(),
9231 offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp();
9232 std::vector<Temp> coords;
9233 std::vector<Temp> derivs;
9234 nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL};
9235 enum glsl_base_type stype;
9236 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &stype);
9237
9238 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
9239 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
9240 bool tg4_integer_cube_workaround =
9241 tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
9242
9243 for (unsigned i = 0; i < instr->num_srcs; i++) {
9244 switch (instr->src[i].src_type) {
9245 case nir_tex_src_coord: {
9246 Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
9247 for (unsigned j = 0; j < coord.size(); j++)
9248 coords.emplace_back(emit_extract_vector(ctx, coord, j, v1));
9249 break;
9250 }
9251 case nir_tex_src_bias:
9252 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
9253 has_bias = true;
9254 break;
9255 case nir_tex_src_lod: {
9256 if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) {
9257 level_zero = true;
9258 } else {
9259 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9260 has_lod = true;
9261 }
9262 break;
9263 }
9264 case nir_tex_src_min_lod:
9265 clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
9266 has_clamped_lod = true;
9267 break;
9268 case nir_tex_src_comparator:
9269 if (instr->is_shadow) {
9270 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
9271 has_compare = true;
9272 }
9273 break;
9274 case nir_tex_src_offset:
9275 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
9276 get_const_vec(instr->src[i].src.ssa, const_offset);
9277 has_offset = true;
9278 break;
9279 case nir_tex_src_ddx:
9280 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
9281 has_ddx = true;
9282 break;
9283 case nir_tex_src_ddy:
9284 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
9285 has_ddy = true;
9286 break;
9287 case nir_tex_src_ms_index:
9288 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
9289 has_sample_index = true;
9290 break;
9291 case nir_tex_src_texture_offset:
9292 case nir_tex_src_sampler_offset:
9293 default: break;
9294 }
9295 }
9296
9297 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9298 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa));
9299
9300 if (instr->op == nir_texop_texture_samples) {
9301 get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource);
9302 return;
9303 }
9304
9305 if (has_offset && instr->op != nir_texop_txf) {
9306 aco_ptr<Instruction> tmp_instr;
9307 Temp acc, pack = Temp();
9308
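      /* Pack the constant offset components right away: 6 bits per component, one component per byte. */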
9309 uint32_t pack_const = 0;
9310 for (unsigned i = 0; i < offset.size(); i++) {
9311 if (!const_offset[i])
9312 continue;
9313 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
9314 }
9315
9316 if (offset.type() == RegType::sgpr) {
9317 for (unsigned i = 0; i < offset.size(); i++) {
9318 if (const_offset[i])
9319 continue;
9320
9321 acc = emit_extract_vector(ctx, offset, i, s1);
9322 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
9323 Operand::c32(0x3Fu));
9324
9325 if (i) {
9326 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
9327 Operand::c32(8u * i));
9328 }
9329
9330 if (pack == Temp()) {
9331 pack = acc;
9332 } else {
9333 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
9334 }
9335 }
9336
9337 if (pack_const && pack != Temp())
9338 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
9339 Operand::c32(pack_const), pack);
9340 } else {
9341 for (unsigned i = 0; i < offset.size(); i++) {
9342 if (const_offset[i])
9343 continue;
9344
9345 acc = emit_extract_vector(ctx, offset, i, v1);
9346 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);
9347
9348 if (i) {
9349 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
9350 }
9351
9352 if (pack == Temp()) {
9353 pack = acc;
9354 } else {
9355 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
9356 }
9357 }
9358
9359 if (pack_const && pack != Temp())
9360             pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
9361 }
9362 if (pack_const && pack == Temp())
9363 offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
9364 else if (pack == Temp())
9365 has_offset = false;
9366 else
9367 offset = pack;
9368 }
9369
9370 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
9371 prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,
9372 instr->is_array && instr->op != nir_texop_lod);
9373
9374 /* pack derivatives */
9375 if (has_ddx || has_ddy) {
9376 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
9377 assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
9378 Temp zero = bld.copy(bld.def(v1), Operand::zero());
9379 derivs = {ddx, zero, ddy, zero};
9380 } else {
9381 for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
9382 derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
9383 for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
9384 derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
9385 }
9386 has_derivs = true;
9387 }
9388
9389 if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9390 instr->is_array && instr->op != nir_texop_txf)
9391 coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
9392
9393 if (instr->coord_components > 2 &&
9394 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9395 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
9396 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
9397 instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_fragment_fetch_amd &&
9398 instr->op != nir_texop_fragment_mask_fetch_amd)
9399 coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
9400
9401 if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
9402 instr->op != nir_texop_lod && instr->coord_components) {
9403 assert(coords.size() > 0 && coords.size() < 3);
9404
9405 coords.insert(std::next(coords.begin()),
9406 bld.copy(bld.def(v1), instr->op == nir_texop_txf ? Operand::c32(0)
9407 : Operand::c32(0x3f000000)));
9408 }
9409
9410 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
9411
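   /* For texel fetches (txf), the offset is folded directly into the integer
    * coordinates instead of being passed as a separate address operand. */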
9412 if (has_offset && instr->op == nir_texop_txf) {
9413 for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
9414 Temp off = emit_extract_vector(ctx, offset, i, v1);
9415 coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
9416 }
9417 has_offset = false;
9418 }
9419
9420 /* Build tex instruction */
9421 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf;
9422 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
9423 dmask = u_bit_consecutive(0, util_last_bit(dmask));
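   /* Sparse residency feedback needs at least one data channel plus the
    * residency dword, which is requested via bit 4 of the dmask. */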
9424 if (instr->is_sparse)
9425 dmask = MAX2(dmask, 1) | 0x10;
9426 unsigned dim =
9427 ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
9428 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
9429 : 0;
9430 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9431 Temp tmp_dst = dst;
9432
9433 /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */
9434 if (instr->op == nir_texop_tg4) {
9435 assert(instr->dest.ssa.num_components == (4 + instr->is_sparse));
9436 if (instr->is_shadow)
9437 dmask = 1;
9438 else
9439 dmask = 1 << instr->component;
9440 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
9441 tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4);
9442 } else if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9443 tmp_dst = bld.tmp(v1);
9444 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components ||
9445 dst.type() == RegType::sgpr) {
9446 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
9447 }
9448
9449 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
9450 if (!has_lod)
9451 lod = bld.copy(bld.def(v1), Operand::zero());
9452
9453 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst),
9454 resource, Operand(s4), std::vector<Temp>{lod});
9455 if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs &&
9456 instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) {
9457 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
9458 } else if (instr->op == nir_texop_query_levels) {
9459 tex->dmask = 1 << 3;
9460 } else {
9461 tex->dmask = dmask;
9462 }
9463 tex->da = da;
9464 tex->dim = dim;
9465
9466 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9467 return;
9468 }
9469
9470 Temp tg4_compare_cube_wa64 = Temp();
9471
9472 if (tg4_integer_workarounds) {
9473 Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero());
9474 Temp size = bld.tmp(v2);
9475 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size),
9476 resource, Operand(s4), std::vector<Temp>{tg4_lod});
9477 tex->dim = dim;
9478 tex->dmask = 0x3;
9479 tex->da = da;
9480 emit_split_vector(ctx, size, size.size());
9481
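      /* tg4 on integer formats needs the coordinates nudged by half a texel:
       * compute -0.5 / size for each of the two gathered dimensions. */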
9482 Temp half_texel[2];
9483 for (unsigned i = 0; i < 2; i++) {
9484 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
9485 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
9486 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
9487 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1),
9488 Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]);
9489 }
9490
9491 if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) {
9492       /* In Vulkan, whether the sampler uses unnormalized
9493 * coordinates or not is a dynamic property of the
9494 * sampler. Hence, to figure out whether or not we
9495 * need to divide by the texture size, we need to test
9496 * the sampler at runtime. This tests the bit set by
9497 * radv_init_sampler().
9498 */
9499 unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1;
9500 Temp not_needed =
9501 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx));
9502
9503 not_needed = bool_to_vector_condition(ctx, not_needed);
9504 half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9505 Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed);
9506 half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
9507 Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed);
9508 }
9509
9510 Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
9511 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])};
9512
9513 if (tg4_integer_cube_workaround) {
9514 /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */
9515 Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp));
9516 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(
9517 aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())};
9518 split->operands[0] = Operand(resource);
9519 for (unsigned i = 0; i < resource.size(); i++) {
9520 desc[i] = bld.tmp(s1);
9521 split->definitions[i] = Definition(desc[i]);
9522 }
9523 ctx->block->instructions.emplace_back(std::move(split));
9524
9525 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1],
9526 Operand::c32(20u | (6u << 16)));
9527 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
9528 Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8));
9529
9530 Temp nfmt;
9531 if (stype == GLSL_TYPE_UINT) {
9532 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9533 Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED),
9534 Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa));
9535 } else {
9536 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
9537 Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED),
9538 Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa));
9539 }
9540 tg4_compare_cube_wa64 = bld.tmp(bld.lm);
9541 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
9542
9543 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt,
9544 Operand::c32(26u));
9545
9546 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
9547 Operand::c32(C_008F14_NUM_FORMAT));
9548 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
9549
9550 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(
9551 aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)};
9552 for (unsigned i = 0; i < resource.size(); i++)
9553 vec->operands[i] = Operand(desc[i]);
9554 resource = bld.tmp(resource.regClass());
9555 vec->definitions[0] = Definition(resource);
9556 ctx->block->instructions.emplace_back(std::move(vec));
9557
9558 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0],
9559 tg4_compare_cube_wa64);
9560 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1],
9561 tg4_compare_cube_wa64);
9562 }
9563 coords[0] = new_coords[0];
9564 coords[1] = new_coords[1];
9565 }
9566
9567 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
9568 // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return
9569 // ac_build_buffer_load_format_gfx9_safe()
9570
9571 assert(coords.size() == 1);
9572 aco_opcode op;
9573 switch (util_last_bit(dmask & 0xf)) {
9574 case 1: op = aco_opcode::buffer_load_format_x; break;
9575 case 2: op = aco_opcode::buffer_load_format_xy; break;
9576 case 3: op = aco_opcode::buffer_load_format_xyz; break;
9577 case 4: op = aco_opcode::buffer_load_format_xyzw; break;
9578 default: unreachable("Tex instruction loads more than 4 components.");
9579 }
9580
9581 aco_ptr<MUBUF_instruction> mubuf{
9582 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)};
9583 mubuf->operands[0] = Operand(resource);
9584 mubuf->operands[1] = Operand(coords[0]);
9585 mubuf->operands[2] = Operand::c32(0);
9586 mubuf->definitions[0] = Definition(tmp_dst);
9587 mubuf->idxen = true;
9588 mubuf->tfe = instr->is_sparse;
9589 if (mubuf->tfe)
9590 mubuf->operands[3] = emit_tfe_init(bld, tmp_dst);
9591 ctx->block->instructions.emplace_back(std::move(mubuf));
9592
9593 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9594 return;
9595 }
9596
9597 /* gather MIMG address components */
9598 std::vector<Temp> args;
9599 unsigned wqm_mask = 0;
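   /* wqm_mask marks address operands (offset and coordinates) that feed the implicit
    * derivative computation and therefore must be valid in whole quad mode. */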
9600 if (has_offset) {
9601 wqm_mask |= u_bit_consecutive(args.size(), 1);
9602 args.emplace_back(offset);
9603 }
9604 if (has_bias)
9605 args.emplace_back(bias);
9606 if (has_compare)
9607 args.emplace_back(compare);
9608 if (has_derivs)
9609 args.insert(args.end(), derivs.begin(), derivs.end());
9610
9611 wqm_mask |= u_bit_consecutive(args.size(), coords.size());
9612 args.insert(args.end(), coords.begin(), coords.end());
9613
9614 if (has_sample_index)
9615 args.emplace_back(sample_index);
9616 if (has_lod)
9617 args.emplace_back(lod);
9618 if (has_clamped_lod)
9619 args.emplace_back(clamped_lod);
9620
9621 if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd ||
9622 instr->op == nir_texop_fragment_mask_fetch_amd) {
9623 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
9624 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS
9625 ? aco_opcode::image_load
9626 : aco_opcode::image_load_mip;
9627 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9628 MIMG_instruction* tex =
9629 emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata);
9630 if (instr->op == nir_texop_fragment_mask_fetch_amd)
9631 tex->dim = da ? ac_image_2darray : ac_image_2d;
9632 else
9633 tex->dim = dim;
9634 tex->dmask = dmask & 0xf;
9635 tex->unrm = true;
9636 tex->da = da;
9637 tex->tfe = instr->is_sparse;
9638
9639 if (instr->op == nir_texop_fragment_mask_fetch_amd) {
9640 /* Use 0x76543210 if the image doesn't have FMASK. */
9641 assert(dmask == 1 && dst.bytes() == 4);
9642 assert(dst.id() != tmp_dst.id());
9643
9644 if (dst.regClass() == s1) {
9645 Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(),
9646 emit_extract_vector(ctx, resource, 1, s1));
9647 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst),
9648 bld.as_uniform(tmp_dst), Operand::c32(0x76543210),
9649 bld.scc(is_not_null));
9650 } else {
9651 Temp is_not_null = bld.tmp(bld.lm);
9652 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(),
9653 emit_extract_vector(ctx, resource, 1, s1))
9654 .def(0)
9655 .setHint(vcc);
9656 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst),
9657 bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null);
9658 }
9659 } else {
9660 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
9661 }
9662 return;
9663 }
9664
9665 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
9666 aco_opcode opcode = aco_opcode::image_sample;
9667 if (has_offset) { /* image_sample_*_o */
9668 if (has_clamped_lod) {
9669 if (has_compare) {
9670 opcode = aco_opcode::image_sample_c_cl_o;
9671 if (has_derivs)
9672 opcode = aco_opcode::image_sample_c_d_cl_o;
9673 if (has_bias)
9674 opcode = aco_opcode::image_sample_c_b_cl_o;
9675 } else {
9676 opcode = aco_opcode::image_sample_cl_o;
9677 if (has_derivs)
9678 opcode = aco_opcode::image_sample_d_cl_o;
9679 if (has_bias)
9680 opcode = aco_opcode::image_sample_b_cl_o;
9681 }
9682 } else if (has_compare) {
9683 opcode = aco_opcode::image_sample_c_o;
9684 if (has_derivs)
9685 opcode = aco_opcode::image_sample_c_d_o;
9686 if (has_bias)
9687 opcode = aco_opcode::image_sample_c_b_o;
9688 if (level_zero)
9689 opcode = aco_opcode::image_sample_c_lz_o;
9690 if (has_lod)
9691 opcode = aco_opcode::image_sample_c_l_o;
9692 } else {
9693 opcode = aco_opcode::image_sample_o;
9694 if (has_derivs)
9695 opcode = aco_opcode::image_sample_d_o;
9696 if (has_bias)
9697 opcode = aco_opcode::image_sample_b_o;
9698 if (level_zero)
9699 opcode = aco_opcode::image_sample_lz_o;
9700 if (has_lod)
9701 opcode = aco_opcode::image_sample_l_o;
9702 }
9703 } else if (has_clamped_lod) { /* image_sample_*_cl */
9704 if (has_compare) {
9705 opcode = aco_opcode::image_sample_c_cl;
9706 if (has_derivs)
9707 opcode = aco_opcode::image_sample_c_d_cl;
9708 if (has_bias)
9709 opcode = aco_opcode::image_sample_c_b_cl;
9710 } else {
9711 opcode = aco_opcode::image_sample_cl;
9712 if (has_derivs)
9713 opcode = aco_opcode::image_sample_d_cl;
9714 if (has_bias)
9715 opcode = aco_opcode::image_sample_b_cl;
9716 }
9717 } else { /* no offset */
9718 if (has_compare) {
9719 opcode = aco_opcode::image_sample_c;
9720 if (has_derivs)
9721 opcode = aco_opcode::image_sample_c_d;
9722 if (has_bias)
9723 opcode = aco_opcode::image_sample_c_b;
9724 if (level_zero)
9725 opcode = aco_opcode::image_sample_c_lz;
9726 if (has_lod)
9727 opcode = aco_opcode::image_sample_c_l;
9728 } else {
9729 opcode = aco_opcode::image_sample;
9730 if (has_derivs)
9731 opcode = aco_opcode::image_sample_d;
9732 if (has_bias)
9733 opcode = aco_opcode::image_sample_b;
9734 if (level_zero)
9735 opcode = aco_opcode::image_sample_lz;
9736 if (has_lod)
9737 opcode = aco_opcode::image_sample_l;
9738 }
9739 }
9740
9741 if (instr->op == nir_texop_tg4) {
9742 if (has_offset) { /* image_gather4_*_o */
9743 if (has_compare) {
9744 opcode = aco_opcode::image_gather4_c_lz_o;
9745 if (has_lod)
9746 opcode = aco_opcode::image_gather4_c_l_o;
9747 if (has_bias)
9748 opcode = aco_opcode::image_gather4_c_b_o;
9749 } else {
9750 opcode = aco_opcode::image_gather4_lz_o;
9751 if (has_lod)
9752 opcode = aco_opcode::image_gather4_l_o;
9753 if (has_bias)
9754 opcode = aco_opcode::image_gather4_b_o;
9755 }
9756 } else {
9757 if (has_compare) {
9758 opcode = aco_opcode::image_gather4_c_lz;
9759 if (has_lod)
9760 opcode = aco_opcode::image_gather4_c_l;
9761 if (has_bias)
9762 opcode = aco_opcode::image_gather4_c_b;
9763 } else {
9764 opcode = aco_opcode::image_gather4_lz;
9765 if (has_lod)
9766 opcode = aco_opcode::image_gather4_l;
9767 if (has_bias)
9768 opcode = aco_opcode::image_gather4_b;
9769 }
9770 }
9771 } else if (instr->op == nir_texop_lod) {
9772 opcode = aco_opcode::image_get_lod;
9773 }
9774
9775 bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod &&
9776 !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
9777 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS;
9778
9779 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1);
9780 MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler),
9781 args, implicit_derivs ? wqm_mask : 0, vdata);
9782 tex->dim = dim;
9783 tex->dmask = dmask & 0xf;
9784 tex->da = da;
9785 tex->tfe = instr->is_sparse;
9786
9787 if (tg4_integer_cube_workaround) {
9788 assert(tmp_dst.id() != dst.id());
9789 assert(tmp_dst.size() == dst.size());
9790
9791 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9792 Temp val[4];
9793 for (unsigned i = 0; i < 4; i++) {
9794 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9795 Temp cvt_val;
9796 if (stype == GLSL_TYPE_UINT)
9797 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9798 else
9799 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9800 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val,
9801 tg4_compare_cube_wa64);
9802 }
9803
9804 Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass());
9805 if (instr->is_sparse)
9806 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9807 val[3], emit_extract_vector(ctx, tmp_dst, 4, v1));
9808 else
9809 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2],
9810 val[3]);
9811 }
9812 unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask;
9813 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
9814 }
9815
9816 Operand
9817 get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical)
9818 {
9819 Temp tmp = get_ssa_temp(ctx, ssa);
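   /* Undefs become undefined operands. Constant booleans used by logical phis are
    * expanded to full lane-mask constants so they match the divergent boolean
    * representation (one bit per lane). */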
9820 if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
9821 return Operand(rc);
9822 } else if (logical && ssa->bit_size == 1 &&
9823 ssa->parent_instr->type == nir_instr_type_load_const) {
9824 if (ctx->program->wave_size == 64)
9825 return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX
9826 : 0u);
9827 else
9828 return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX
9829 : 0u);
9830 } else {
9831 return Operand(tmp);
9832 }
9833 }
9834
9835 void
9836 visit_phi(isel_context* ctx, nir_phi_instr* instr)
9837 {
9838 aco_ptr<Pseudo_instruction> phi;
9839 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9840 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9841
9842 bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
9843 logical |= (ctx->block->kind & block_kind_merge) != 0;
9844 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9845
9846 /* we want a sorted list of sources, since the predecessor list is also sorted */
9847 std::map<unsigned, nir_ssa_def*> phi_src;
9848 nir_foreach_phi_src (src, instr)
9849 phi_src[src->pred->index] = src->src.ssa;
9850
9851 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9852 unsigned num_operands = 0;
9853 Operand* const operands = (Operand*)alloca(
9854 (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand));
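   /* The +1 leaves room for the extra linear continue operand that may be appended
    * for loop header phis further down. */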
9855 unsigned num_defined = 0;
9856 unsigned cur_pred_idx = 0;
9857 for (std::pair<unsigned, nir_ssa_def*> src : phi_src) {
9858 if (cur_pred_idx < preds.size()) {
9859 /* handle missing preds (IF merges with discard/break) and extra preds
9860 * (loop exit with discard) */
9861 unsigned block = ctx->cf_info.nir_to_aco[src.first];
9862 unsigned skipped = 0;
9863 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9864 skipped++;
9865 if (cur_pred_idx + skipped < preds.size()) {
9866 for (unsigned i = 0; i < skipped; i++)
9867 operands[num_operands++] = Operand(dst.regClass());
9868 cur_pred_idx += skipped;
9869 } else {
9870 continue;
9871 }
9872 }
9873 /* Handle missing predecessors at the end. This shouldn't happen with loop
9874 * headers and we can't ignore these sources for loop header phis. */
9875 if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9876 continue;
9877 cur_pred_idx++;
9878 Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9879 operands[num_operands++] = op;
9880 num_defined += !op.isUndefined();
9881 }
9882 /* handle block_kind_continue_or_break at loop exit blocks */
9883 while (cur_pred_idx++ < preds.size())
9884 operands[num_operands++] = Operand(dst.regClass());
9885
9886 /* If the loop ends with a break, still add a linear continue edge in case
9887 * that break is divergent or continue_or_break is used. We'll either remove
9888 * this operand later in visit_loop() if it's not necessary or replace the
9889 * undef with something correct. */
9890 if (!logical && ctx->block->kind & block_kind_loop_header) {
9891 nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9892 nir_block* last = nir_loop_last_block(loop);
9893 if (last->successors[0] != instr->instr.block)
9894 operands[num_operands++] = Operand(RegClass());
9895 }
9896
9897 /* we can use a linear phi in some cases if one src is undef */
9898 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9899 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO,
9900 num_operands, 1));
9901
9902 Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9903 Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9904 assert(invert->kind & block_kind_invert);
9905
9906 unsigned then_block = invert->linear_preds[0];
9907
9908 Block* insert_block = NULL;
9909 for (unsigned i = 0; i < num_operands; i++) {
9910 Operand op = operands[i];
9911 if (op.isUndefined())
9912 continue;
9913 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9914 phi->operands[0] = op;
9915 break;
9916 }
9917 assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9918 phi->operands[1] = Operand(dst.regClass());
9919 phi->definitions[0] = Definition(dst);
9920 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9921 return;
9922 }
9923
9924 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9925 for (unsigned i = 0; i < num_operands; i++)
9926 phi->operands[i] = operands[i];
9927 phi->definitions[0] = Definition(dst);
9928 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9929 }
9930
9931 void
9932 visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr)
9933 {
9934 Temp dst = get_ssa_temp(ctx, &instr->def);
9935
9936 assert(dst.type() == RegType::sgpr);
9937
9938 if (dst.size() == 1) {
9939 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero());
9940 } else {
9941 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
9942 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9943 for (unsigned i = 0; i < dst.size(); i++)
9944 vec->operands[i] = Operand::zero();
9945 vec->definitions[0] = Definition(dst);
9946 ctx->block->instructions.emplace_back(std::move(vec));
9947 }
9948 }
9949
9950 void
9951 begin_loop(isel_context* ctx, loop_context* lc)
9952 {
9953 // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9954 append_logical_end(ctx->block);
9955 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9956 Builder bld(ctx->program, ctx->block);
9957 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
9958 unsigned loop_preheader_idx = ctx->block->index;
9959
9960 lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9961
9962 ctx->program->next_loop_depth++;
9963
9964 Block* loop_header = ctx->program->create_and_insert_block();
9965 loop_header->kind |= block_kind_loop_header;
9966 add_edge(loop_preheader_idx, loop_header);
9967 ctx->block = loop_header;
9968
9969 append_logical_start(ctx->block);
9970
9971 lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index);
9972 lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit);
9973 lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false);
9974 lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false);
9975 lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false);
9976 }
9977
9978 void
9979 end_loop(isel_context* ctx, loop_context* lc)
9980 {
9981    // TODO: what if a loop ends with an unconditional or uniformly branched continue
9982 // and this branch is never taken?
9983 if (!ctx->cf_info.has_branch) {
9984 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
9985 Builder bld(ctx->program, ctx->block);
9986 append_logical_end(ctx->block);
9987
9988 if (ctx->cf_info.exec_potentially_empty_discard ||
9989 ctx->cf_info.exec_potentially_empty_break) {
9990 /* Discards can result in code running with an empty exec mask.
9991 * This would result in divergent breaks not ever being taken. As a
9992 * workaround, break the loop when the loop mask is empty instead of
9993 * always continuing. */
9994 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9995 unsigned block_idx = ctx->block->index;
9996
9997 /* create helper blocks to avoid critical edges */
9998 Block* break_block = ctx->program->create_and_insert_block();
9999 break_block->kind = block_kind_uniform;
10000 bld.reset(break_block);
10001 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10002 add_linear_edge(block_idx, break_block);
10003 add_linear_edge(break_block->index, &lc->loop_exit);
10004
10005 Block* continue_block = ctx->program->create_and_insert_block();
10006 continue_block->kind = block_kind_uniform;
10007 bld.reset(continue_block);
10008 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10009 add_linear_edge(block_idx, continue_block);
10010 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
10011
10012 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10013 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
10014 ctx->block = &ctx->program->blocks[block_idx];
10015 } else {
10016 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
10017 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10018 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10019 else
10020 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
10021 }
10022
10023 bld.reset(ctx->block);
10024 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10025 }
10026
10027 ctx->cf_info.has_branch = false;
10028 ctx->program->next_loop_depth--;
10029
10030    // TODO: if the loop does not have a single exit, we must add one
10031 /* emit loop successor block */
10032 ctx->block = ctx->program->insert_block(std::move(lc->loop_exit));
10033 append_logical_start(ctx->block);
10034
10035 #if 0
10036 // TODO: check if it is beneficial to not branch on continues
10037 /* trim linear phis in loop header */
10038 for (auto&& instr : loop_entry->instructions) {
10039 if (instr->opcode == aco_opcode::p_linear_phi) {
10040 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
10041 new_phi->definitions[0] = instr->definitions[0];
10042 for (unsigned i = 0; i < new_phi->operands.size(); i++)
10043 new_phi->operands[i] = instr->operands[i];
10044 /* check that the remaining operands are all the same */
10045 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
10046 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
10047 instr.swap(new_phi);
10048 } else if (instr->opcode == aco_opcode::p_phi) {
10049 continue;
10050 } else {
10051 break;
10052 }
10053 }
10054 #endif
10055
10056 ctx->cf_info.parent_loop.header_idx = lc->header_idx_old;
10057 ctx->cf_info.parent_loop.exit = lc->exit_old;
10058 ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old;
10059 ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old;
10060 ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old;
10061 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
10062 ctx->cf_info.exec_potentially_empty_discard = false;
10063 }
10064
10065 void
10066 emit_loop_jump(isel_context* ctx, bool is_break)
10067 {
10068 Builder bld(ctx->program, ctx->block);
10069 Block* logical_target;
10070 append_logical_end(ctx->block);
10071 unsigned idx = ctx->block->index;
10072
10073 if (is_break) {
10074 logical_target = ctx->cf_info.parent_loop.exit;
10075 add_logical_edge(idx, logical_target);
10076 ctx->block->kind |= block_kind_break;
10077
10078 if (!ctx->cf_info.parent_if.is_divergent &&
10079 !ctx->cf_info.parent_loop.has_divergent_continue) {
10080 /* uniform break - directly jump out of the loop */
10081 ctx->block->kind |= block_kind_uniform;
10082 ctx->cf_info.has_branch = true;
10083 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10084 add_linear_edge(idx, logical_target);
10085 return;
10086 }
10087 ctx->cf_info.parent_loop.has_divergent_branch = true;
10088 } else {
10089 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10090 add_logical_edge(idx, logical_target);
10091 ctx->block->kind |= block_kind_continue;
10092
10093 if (!ctx->cf_info.parent_if.is_divergent) {
10094 /* uniform continue - directly jump to the loop header */
10095 ctx->block->kind |= block_kind_uniform;
10096 ctx->cf_info.has_branch = true;
10097 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10098 add_linear_edge(idx, logical_target);
10099 return;
10100 }
10101
10102 /* for potential uniform breaks after this continue,
10103 we must ensure that they are handled correctly */
10104 ctx->cf_info.parent_loop.has_divergent_continue = true;
10105 ctx->cf_info.parent_loop.has_divergent_branch = true;
10106 }
10107
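   /* A divergent jump can leave the remainder of the enclosing divergent construct
    * running with an empty exec mask; remember the loop depth at which this becomes
    * possible so outer control flow can account for it. */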
10108 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
10109 ctx->cf_info.exec_potentially_empty_break = true;
10110 ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth;
10111 }
10112
10113 /* remove critical edges from linear CFG */
10114 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10115 Block* break_block = ctx->program->create_and_insert_block();
10116 break_block->kind |= block_kind_uniform;
10117 add_linear_edge(idx, break_block);
10118 /* the loop_header pointer might be invalidated by this point */
10119 if (!is_break)
10120 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
10121 add_linear_edge(break_block->index, logical_target);
10122 bld.reset(break_block);
10123 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2)));
10124
10125 Block* continue_block = ctx->program->create_and_insert_block();
10126 add_linear_edge(idx, continue_block);
10127 append_logical_start(continue_block);
10128 ctx->block = continue_block;
10129 }
10130
10131 void
10132 emit_loop_break(isel_context* ctx)
10133 {
10134 emit_loop_jump(ctx, true);
10135 }
10136
10137 void
10138 emit_loop_continue(isel_context* ctx)
10139 {
10140 emit_loop_jump(ctx, false);
10141 }
10142
10143 void
10144 visit_jump(isel_context* ctx, nir_jump_instr* instr)
10145 {
10146    /* visit_block() would usually do this, but divergent jumps update ctx->block */
10147 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
10148
10149 switch (instr->type) {
10150 case nir_jump_break: emit_loop_break(ctx); break;
10151 case nir_jump_continue: emit_loop_continue(ctx); break;
10152 default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
10153 }
10154 }
10155
10156 void
10157 visit_block(isel_context* ctx, nir_block* block)
10158 {
10159 nir_foreach_instr (instr, block) {
10160 switch (instr->type) {
10161 case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
10162 case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
10163 case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
10164 case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
10165 case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
10166 case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;
10167 case nir_instr_type_deref: break;
10168 case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
10169 default: isel_err(instr, "Unknown NIR instr type");
10170 }
10171 }
10172
10173 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10174 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
10175 }
10176
10177 static Operand
10178 create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
10179 aco_ptr<Instruction>& header_phi, Operand* vals)
10180 {
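   /* Walk the loop body linearly and track, for each block, the value the header phi
    * carries on that block's incoming linear edges, inserting linear phis where the
    * predecessors disagree. The returned operand is the value on the continue edge. */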
10181 vals[0] = Operand(header_phi->definitions[0].getTemp());
10182 RegClass rc = vals[0].regClass();
10183
10184 unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
10185
10186 unsigned next_pred = 1;
10187
10188 for (unsigned idx = first + 1; idx <= last; idx++) {
10189 Block& block = ctx->program->blocks[idx];
10190 if (block.loop_nest_depth != loop_nest_depth) {
10191 vals[idx - first] = vals[idx - 1 - first];
10192 continue;
10193 }
10194
10195 if ((block.kind & block_kind_continue) && block.index != last) {
10196 vals[idx - first] = header_phi->operands[next_pred];
10197 next_pred++;
10198 continue;
10199 }
10200
10201 bool all_same = true;
10202 for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
10203 all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
10204
10205 Operand val;
10206 if (all_same) {
10207 val = vals[block.linear_preds[0] - first];
10208 } else {
10209 aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
10210 aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
10211 for (unsigned i = 0; i < block.linear_preds.size(); i++)
10212 phi->operands[i] = vals[block.linear_preds[i] - first];
10213 val = Operand(ctx->program->allocateTmp(rc));
10214 phi->definitions[0] = Definition(val.getTemp());
10215 block.instructions.emplace(block.instructions.begin(), std::move(phi));
10216 }
10217 vals[idx - first] = val;
10218 }
10219
10220 return vals[last - first];
10221 }
10222
10223 static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
10224 static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
10225 static void end_uniform_if(isel_context* ctx, if_context* ic);
10226
10227 static void
10228 visit_loop(isel_context* ctx, nir_loop* loop)
10229 {
10230 loop_context lc;
10231 begin_loop(ctx, &lc);
10232
10233 /* NIR seems to allow this, and even though the loop exit has no predecessors, SSA defs from the
10234 * loop header are live. Handle this without complicating the ACO IR by creating a dummy break.
10235 */
10236 if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) {
10237 Builder bld(ctx->program, ctx->block);
10238 Temp cond = bld.copy(bld.def(s1, scc), Operand::zero());
10239 if_context ic;
10240 begin_uniform_if_then(ctx, &ic, cond);
10241 emit_loop_break(ctx);
10242 begin_uniform_if_else(ctx, &ic);
10243 end_uniform_if(ctx, &ic);
10244 }
10245
10246 bool unreachable = visit_cf_list(ctx, &loop->body);
10247
10248 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
10249
10250 /* Fixup phis in loop header from unreachable blocks.
10251 * has_branch/has_divergent_branch also indicates if the loop ends with a
10252 * break/continue instruction, but we don't emit those if unreachable=true */
10253 if (unreachable) {
10254 assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
10255 bool linear = ctx->cf_info.has_branch;
10256 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
10257 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10258 if ((logical && instr->opcode == aco_opcode::p_phi) ||
10259 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
10260 /* the last operand should be the one that needs to be removed */
10261 instr->operands.pop_back();
10262 } else if (!is_phi(instr)) {
10263 break;
10264 }
10265 }
10266 }
10267
10268    /* Fix up linear phis in the loop header that were created expecting a continue
10269     * edge. This fixup and the previous one shouldn't both happen at once because a
10270     * break in the merge block would get CSE'd */
10271 if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
10272 unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
10273 Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand));
10274 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
10275 if (instr->opcode == aco_opcode::p_linear_phi) {
10276 if (ctx->cf_info.has_branch)
10277 instr->operands.pop_back();
10278 else
10279 instr->operands.back() =
10280 create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
10281 } else if (!is_phi(instr)) {
10282 break;
10283 }
10284 }
10285 }
10286
10287 end_loop(ctx, &lc);
10288 }
10289
10290 static void
10291 begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond)
10292 {
10293 ic->cond = cond;
10294
10295 append_logical_end(ctx->block);
10296 ctx->block->kind |= block_kind_branch;
10297
10298 /* branch to linear then block */
10299 assert(cond.regClass() == ctx->program->lane_mask);
10300 aco_ptr<Pseudo_branch_instruction> branch;
10301 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z,
10302 Format::PSEUDO_BRANCH, 1, 1));
10303 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10304 branch->definitions[0].setHint(vcc);
10305 branch->operands[0] = Operand(cond);
10306 ctx->block->instructions.push_back(std::move(branch));
10307
10308 ic->BB_if_idx = ctx->block->index;
10309 ic->BB_invert = Block();
10310 /* Invert blocks are intentionally not marked as top level because they
10311 * are not part of the logical cfg. */
10312 ic->BB_invert.kind |= block_kind_invert;
10313 ic->BB_endif = Block();
10314 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
10315
10316 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
10317 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
10318 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
10319 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
10320 ctx->cf_info.parent_if.is_divergent = true;
10321
10322 /* divergent branches use cbranch_execz */
10323 ctx->cf_info.exec_potentially_empty_discard = false;
10324 ctx->cf_info.exec_potentially_empty_break = false;
10325 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10326
10327 /** emit logical then block */
10328 ctx->program->next_divergent_if_logical_depth++;
10329 Block* BB_then_logical = ctx->program->create_and_insert_block();
10330 add_edge(ic->BB_if_idx, BB_then_logical);
10331 ctx->block = BB_then_logical;
10332 append_logical_start(BB_then_logical);
10333 }
10334
10335 static void
10336 begin_divergent_if_else(isel_context* ctx, if_context* ic)
10337 {
10338 Block* BB_then_logical = ctx->block;
10339 append_logical_end(BB_then_logical);
10340 /* branch from logical then block to invert block */
10341 aco_ptr<Pseudo_branch_instruction> branch;
10342 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10343 Format::PSEUDO_BRANCH, 0, 1));
10344 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10345 branch->definitions[0].setHint(vcc);
10346 BB_then_logical->instructions.emplace_back(std::move(branch));
10347 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
10348 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10349 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
10350 BB_then_logical->kind |= block_kind_uniform;
10351 assert(!ctx->cf_info.has_branch);
10352 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10353 ctx->cf_info.parent_loop.has_divergent_branch = false;
10354 ctx->program->next_divergent_if_logical_depth--;
10355
10356 /** emit linear then block */
10357 Block* BB_then_linear = ctx->program->create_and_insert_block();
10358 BB_then_linear->kind |= block_kind_uniform;
10359 add_linear_edge(ic->BB_if_idx, BB_then_linear);
10360 /* branch from linear then block to invert block */
10361 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10362 Format::PSEUDO_BRANCH, 0, 1));
10363 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10364 branch->definitions[0].setHint(vcc);
10365 BB_then_linear->instructions.emplace_back(std::move(branch));
10366 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
10367
10368 /** emit invert merge block */
10369 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
10370 ic->invert_idx = ctx->block->index;
10371
10372 /* branch to linear else block (skip else) */
10373 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10374 Format::PSEUDO_BRANCH, 0, 1));
10375 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10376 branch->definitions[0].setHint(vcc);
10377 ctx->block->instructions.push_back(std::move(branch));
10378
10379 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
10380 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
10381 ic->exec_potentially_empty_break_depth_old = std::min(
10382 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10383 /* divergent branches use cbranch_execz */
10384 ctx->cf_info.exec_potentially_empty_discard = false;
10385 ctx->cf_info.exec_potentially_empty_break = false;
10386 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10387
10388 /** emit logical else block */
10389 ctx->program->next_divergent_if_logical_depth++;
10390 Block* BB_else_logical = ctx->program->create_and_insert_block();
10391 add_logical_edge(ic->BB_if_idx, BB_else_logical);
10392 add_linear_edge(ic->invert_idx, BB_else_logical);
10393 ctx->block = BB_else_logical;
10394 append_logical_start(BB_else_logical);
10395 }
10396
10397 static void
10398 end_divergent_if(isel_context* ctx, if_context* ic)
10399 {
10400 Block* BB_else_logical = ctx->block;
10401 append_logical_end(BB_else_logical);
10402
10403 /* branch from logical else block to endif block */
10404 aco_ptr<Pseudo_branch_instruction> branch;
10405 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10406 Format::PSEUDO_BRANCH, 0, 1));
10407 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10408 branch->definitions[0].setHint(vcc);
10409 BB_else_logical->instructions.emplace_back(std::move(branch));
10410 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
10411 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10412 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
10413 BB_else_logical->kind |= block_kind_uniform;
10414 ctx->program->next_divergent_if_logical_depth--;
10415
10416 assert(!ctx->cf_info.has_branch);
10417 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10418
10419 /** emit linear else block */
10420 Block* BB_else_linear = ctx->program->create_and_insert_block();
10421 BB_else_linear->kind |= block_kind_uniform;
10422 add_linear_edge(ic->invert_idx, BB_else_linear);
10423
10424 /* branch from linear else block to endif block */
10425 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10426 Format::PSEUDO_BRANCH, 0, 1));
10427 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10428 branch->definitions[0].setHint(vcc);
10429 BB_else_linear->instructions.emplace_back(std::move(branch));
10430 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
10431
10432 /** emit endif merge block */
10433 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10434 append_logical_start(ctx->block);
10435
10436 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
10437 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
10438 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
10439 ctx->cf_info.exec_potentially_empty_break_depth = std::min(
10440 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
10441 if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
10442 !ctx->cf_info.parent_if.is_divergent) {
10443 ctx->cf_info.exec_potentially_empty_break = false;
10444 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10445 }
10446 /* uniform control flow never has an empty exec-mask */
10447 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
10448 ctx->cf_info.exec_potentially_empty_discard = false;
10449 ctx->cf_info.exec_potentially_empty_break = false;
10450 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
10451 }
10452 }
10453
10454 static void
10455 begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
10456 {
10457 assert(cond.regClass() == s1);
10458
10459 append_logical_end(ctx->block);
10460 ctx->block->kind |= block_kind_uniform;
10461
10462 aco_ptr<Pseudo_branch_instruction> branch;
10463 aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
10464 branch.reset(
10465 create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
10466 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10467 branch->definitions[0].setHint(vcc);
10468 branch->operands[0] = Operand(cond);
10469 branch->operands[0].setFixed(scc);
10470 ctx->block->instructions.emplace_back(std::move(branch));
10471
10472 ic->BB_if_idx = ctx->block->index;
10473 ic->BB_endif = Block();
10474 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
10475
10476 ctx->cf_info.has_branch = false;
10477 ctx->cf_info.parent_loop.has_divergent_branch = false;
10478
10479 /** emit then block */
10480 ctx->program->next_uniform_if_depth++;
10481 Block* BB_then = ctx->program->create_and_insert_block();
10482 add_edge(ic->BB_if_idx, BB_then);
10483 append_logical_start(BB_then);
10484 ctx->block = BB_then;
10485 }
10486
10487 static void
10488 begin_uniform_if_else(isel_context* ctx, if_context* ic)
10489 {
10490 Block* BB_then = ctx->block;
10491
10492 ic->uniform_has_then_branch = ctx->cf_info.has_branch;
10493 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
10494
10495 if (!ic->uniform_has_then_branch) {
10496 append_logical_end(BB_then);
10497 /* branch from then block to endif block */
10498 aco_ptr<Pseudo_branch_instruction> branch;
10499 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10500 Format::PSEUDO_BRANCH, 0, 1));
10501 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10502 branch->definitions[0].setHint(vcc);
10503 BB_then->instructions.emplace_back(std::move(branch));
10504 add_linear_edge(BB_then->index, &ic->BB_endif);
10505 if (!ic->then_branch_divergent)
10506 add_logical_edge(BB_then->index, &ic->BB_endif);
10507 BB_then->kind |= block_kind_uniform;
10508 }
10509
10510 ctx->cf_info.has_branch = false;
10511 ctx->cf_info.parent_loop.has_divergent_branch = false;
10512
10513 /** emit else block */
10514 Block* BB_else = ctx->program->create_and_insert_block();
10515 add_edge(ic->BB_if_idx, BB_else);
10516 append_logical_start(BB_else);
10517 ctx->block = BB_else;
10518 }
10519
10520 static void
10521 end_uniform_if(isel_context* ctx, if_context* ic)
10522 {
10523 Block* BB_else = ctx->block;
10524
10525 if (!ctx->cf_info.has_branch) {
10526 append_logical_end(BB_else);
10527 /* branch from then block to endif block */
10528 aco_ptr<Pseudo_branch_instruction> branch;
10529 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
10530 Format::PSEUDO_BRANCH, 0, 1));
10531 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
10532 branch->definitions[0].setHint(vcc);
10533 BB_else->instructions.emplace_back(std::move(branch));
10534 add_linear_edge(BB_else->index, &ic->BB_endif);
10535 if (!ctx->cf_info.parent_loop.has_divergent_branch)
10536 add_logical_edge(BB_else->index, &ic->BB_endif);
10537 BB_else->kind |= block_kind_uniform;
10538 }
10539
10540 ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
10541 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
10542
10543 /** emit endif merge block */
10544 ctx->program->next_uniform_if_depth--;
10545 if (!ctx->cf_info.has_branch) {
10546 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
10547 append_logical_start(ctx->block);
10548 }
10549 }
10550
10551 static bool
10552 visit_if(isel_context* ctx, nir_if* if_stmt)
10553 {
10554 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
10555 Builder bld(ctx->program, ctx->block);
10556 aco_ptr<Pseudo_branch_instruction> branch;
10557 if_context ic;
10558
10559 if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
10560 /**
10561 * Uniform conditionals are represented in the following way*) :
10562 *
10563 * The linear and logical CFG:
10564 * BB_IF
10565 * / \
10566 * BB_THEN (logical) BB_ELSE (logical)
10567 * \ /
10568 * BB_ENDIF
10569 *
10570        * *) Exceptions may be due to break and continue statements within loops.
10571 * If a break/continue happens within uniform control flow, it branches
10572 * to the loop exit/entry block. Otherwise, it branches to the next
10573 * merge block.
10574 **/
10575
10576 assert(cond.regClass() == ctx->program->lane_mask);
10577 cond = bool_to_scalar_condition(ctx, cond);
10578
10579 begin_uniform_if_then(ctx, &ic, cond);
10580 visit_cf_list(ctx, &if_stmt->then_list);
10581
10582 begin_uniform_if_else(ctx, &ic);
10583 visit_cf_list(ctx, &if_stmt->else_list);
10584
10585 end_uniform_if(ctx, &ic);
10586 } else { /* non-uniform condition */
10587 /**
10588 * To maintain a logical and linear CFG without critical edges,
10589 * non-uniform conditionals are represented in the following way*) :
10590 *
10591 * The linear CFG:
10592 * BB_IF
10593 * / \
10594 * BB_THEN (logical) BB_THEN (linear)
10595 * \ /
10596 * BB_INVERT (linear)
10597 * / \
10598 * BB_ELSE (logical) BB_ELSE (linear)
10599 * \ /
10600 * BB_ENDIF
10601 *
10602 * The logical CFG:
10603 * BB_IF
10604 * / \
10605 * BB_THEN (logical) BB_ELSE (logical)
10606 * \ /
10607 * BB_ENDIF
10608 *
10609 * *) Exceptions may be due to break and continue statements within loops
10610 **/
10611
10612 begin_divergent_if_then(ctx, &ic, cond);
10613 visit_cf_list(ctx, &if_stmt->then_list);
10614
10615 begin_divergent_if_else(ctx, &ic);
10616 visit_cf_list(ctx, &if_stmt->else_list);
10617
10618 end_divergent_if(ctx, &ic);
10619 }
10620
10621 return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
10622 }
10623
10624 static bool
10625 visit_cf_list(isel_context* ctx, struct exec_list* list)
10626 {
10627 foreach_list_typed (nir_cf_node, node, node, list) {
10628 switch (node->type) {
10629 case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break;
10630 case nir_cf_node_if:
10631 if (!visit_if(ctx, nir_cf_node_as_if(node)))
10632 return true;
10633 break;
10634 case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break;
10635 default: unreachable("unimplemented cf list type");
10636 }
10637 }
10638 return false;
10639 }
10640
10641 static void
10642 export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos)
10643 {
10644 assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10645
10646 int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10647 ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
10648 : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
10649 unsigned mask = ctx->outputs.mask[slot];
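   /* Parameter exports that are either not written or not consumed by the next stage
    * are skipped entirely; position exports are always emitted. */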
10650 if (!is_pos && !mask)
10651 return;
10652 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
10653 return;
10654 aco_ptr<Export_instruction> exp{
10655 create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10656 exp->enabled_mask = mask;
10657 for (unsigned i = 0; i < 4; ++i) {
10658 if (mask & (1 << i))
10659 exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10660 else
10661 exp->operands[i] = Operand(v1);
10662 }
10663    /* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
10664 * Setting valid_mask=1 prevents it and has no other effect.
10665 */
10666 exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
10667 exp->done = false;
10668 exp->compressed = false;
10669 if (is_pos)
10670 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10671 else
10672 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
10673 ctx->block->instructions.emplace_back(std::move(exp));
10674 }
10675
10676 static void
10677 export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos)
10678 {
10679 aco_ptr<Export_instruction> exp{
10680 create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
10681 exp->enabled_mask = 0;
10682 for (unsigned i = 0; i < 4; ++i)
10683 exp->operands[i] = Operand(v1);
10684 if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
10685 exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
10686 exp->enabled_mask |= 0x1;
10687 }
10688 if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
10689 exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
10690 exp->enabled_mask |= 0x4;
10691 }
10692 if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
10693 if (ctx->options->chip_class < GFX9) {
10694 exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
10695 exp->enabled_mask |= 0x8;
10696 } else {
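         /* GFX9+ has no separate viewport export channel: the viewport index is
          * shifted into the upper 16 bits and OR'd into the layer channel. */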
10697 Builder bld(ctx->program, ctx->block);
10698
10699 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u),
10700 Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
10701 if (exp->operands[2].isTemp())
10702 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
10703
10704 exp->operands[2] = Operand(out);
10705 exp->enabled_mask |= 0x4;
10706 }
10707 }
10708 if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) {
10709 exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]);
10710 exp->enabled_mask |= 0x2;
10711 } else if (ctx->options->force_vrs_rates) {
10712 /* Bits [2:3] = VRS rate X
10713 * Bits [4:5] = VRS rate Y
10714 *
10715 * The range is [-2, 1]. Values:
10716 * 1: 2x coarser shading rate in that direction.
10717 * 0: normal shading rate
10718 * -1: 2x finer shading rate (sample shading, not directional)
10719 * -2: 4x finer shading rate (sample shading, not directional)
10720 *
10721 * Sample shading can't go above 8 samples, so both numbers can't be -2
10722 * at the same time.
10723 */
10724 Builder bld(ctx->program, ctx->block);
10725 Temp rates = bld.copy(bld.def(v1), Operand::c32((unsigned)ctx->options->force_vrs_rates));
10726
10727 /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. */
10728 Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand::c32(0x3f800000u),
10729 Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3]));
10730 rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10731 bld.copy(bld.def(v1), Operand::zero()), rates, cond);
10732
10733 exp->operands[1] = Operand(rates);
10734 exp->enabled_mask |= 0x2;
10735 }
10736
10737 exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
10738 exp->done = false;
10739 exp->compressed = false;
10740 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
10741 ctx->block->instructions.emplace_back(std::move(exp));
10742 }
10743
10744 static void
10745 create_vs_exports(isel_context* ctx)
10746 {
10747 assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG);
10748
10749 const radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS))
10750 ? &ctx->program->info->tes.outinfo
10751 : &ctx->program->info->vs.outinfo;
10752
10753 ctx->block->kind |= block_kind_export_end;
10754
10755 if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) {
10756 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10757 if (ctx->stage.has(SWStage::TES))
10758 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10759 get_arg(ctx, ctx->args->ac.tes_patch_id);
10760 else
10761 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] =
10762 get_arg(ctx, ctx->args->ac.vs_prim_id);
10763 }
10764
10765 if (ctx->options->key.has_multiview_view_index) {
10766 ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
10767 ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] =
10768 as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
10769 }
10770
10771 /* Hardware requires position data to always be exported, even if the
10772 * application did not write gl_Position.
10773 */
10774 ctx->outputs.mask[VARYING_SLOT_POS] = 0xf;
10775
10776 /* the order these position exports are created is important */
10777 int next_pos = 0;
10778 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
10779
10780 bool writes_primitive_shading_rate =
10781 outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates;
10782 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index ||
10783 writes_primitive_shading_rate) {
10784 export_vs_psiz_layer_viewport_vrs(ctx, &next_pos);
10785 }
10786 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10787 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
10788 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10789 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
10790
10791 if (ctx->export_clip_dists) {
10792 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
10793 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
10794 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
10795 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
10796 }
10797
10798 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
10799 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID &&
10800 i != VARYING_SLOT_VIEWPORT)
10801 continue;
10802
10803 export_vs_varying(ctx, i, false, NULL);
10804 }
10805 }
10806
10807 static bool
10808 export_fs_mrt_z(isel_context* ctx)
10809 {
10810 Builder bld(ctx->program, ctx->block);
10811 unsigned enabled_channels = 0;
10812 bool compr = false;
10813 Operand values[4];
10814
10815 for (unsigned i = 0; i < 4; ++i) {
10816 values[i] = Operand(v1);
10817 }
10818
10819 /* Both stencil and sample mask only need 16-bits. */
10820 if (!ctx->program->info->ps.writes_z &&
10821 (ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) {
10822 compr = true; /* COMPR flag */
10823
10824 if (ctx->program->info->ps.writes_stencil) {
10825 /* Stencil should be in X[23:16]. */
10826 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10827 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]);
10828 enabled_channels |= 0x3;
10829 }
10830
10831 if (ctx->program->info->ps.writes_sample_mask) {
10832 /* SampleMask should be in Y[15:0]. */
10833 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10834 enabled_channels |= 0xc;
10835 }
10836 } else {
10837 if (ctx->program->info->ps.writes_z) {
10838 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10839 enabled_channels |= 0x1;
10840 }
10841
10842 if (ctx->program->info->ps.writes_stencil) {
10843 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10844 enabled_channels |= 0x2;
10845 }
10846
10847 if (ctx->program->info->ps.writes_sample_mask) {
10848 values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10849 enabled_channels |= 0x4;
10850 }
10851 }
10852
10853    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
10854 * writemask component.
10855 */
10856 if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND &&
10857 ctx->options->family != CHIP_HAINAN) {
10858 enabled_channels |= 0x1;
10859 }
10860
10861 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels,
10862 V_008DFC_SQ_EXP_MRTZ, compr);
10863
10864 return true;
10865 }
10866
10867 static bool
10868 export_fs_mrt_color(isel_context* ctx, int slot)
10869 {
10870 Builder bld(ctx->program, ctx->block);
10871 unsigned write_mask = ctx->outputs.mask[slot];
10872 Operand values[4];
10873
10874 for (unsigned i = 0; i < 4; ++i) {
10875 if (write_mask & (1 << i)) {
10876 values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10877 } else {
10878 values[i] = Operand(v1);
10879 }
10880 }
10881
10882 unsigned target, col_format;
10883 unsigned enabled_channels = 0;
10884 aco_opcode compr_op = (aco_opcode)0;
10885 bool compr = false;
10886
10887 slot -= FRAG_RESULT_DATA0;
10888 target = V_008DFC_SQ_EXP_MRT + slot;
10889 col_format = (ctx->options->key.ps.col_format >> (4 * slot)) & 0xf;
10890
10891 bool is_int8 = (ctx->options->key.ps.is_int8 >> slot) & 1;
10892 bool is_int10 = (ctx->options->key.ps.is_int10 >> slot) & 1;
10893 bool is_16bit = values[0].regClass() == v2b;
10894
10895 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10896 if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit &&
10897 (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR ||
10898 col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR ||
10899 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10900 for (int i = 0; i < 4; i++) {
10901 if (!(write_mask & (1 << i)))
10902 continue;
10903
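         /* v_cmp_class_f32 with class mask 3 (signaling NaN | quiet NaN) tests for NaN. */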
10904 Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
10905 values[i], bld.copy(bld.def(v1), Operand::c32(3u)));
10906 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10907 bld.copy(bld.def(v1), Operand::zero()), isnan);
10908 }
10909 }
10910
10911 switch (col_format) {
10912 case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break;
10913
10914 case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break;
10915
10916 case V_028714_SPI_SHADER_32_AR:
10917 if (ctx->options->chip_class >= GFX10) {
10918 /* Special case: on GFX10, the outputs are different for 32_AR */
10919 enabled_channels = 0x3;
10920 values[1] = values[3];
10921 values[3] = Operand(v1);
10922 } else {
10923 enabled_channels = 0x9;
10924 }
10925 break;
10926
10927 case V_028714_SPI_SHADER_FP16_ABGR:
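      /* Pack pairs of f32 channels into one dword of two f16 values;
       * v_cvt_pkrtz_f16_f32 converts with round-toward-zero. */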
10928 for (int i = 0; i < 2; i++) {
10929 bool enabled = (write_mask >> (i * 2)) & 0x3;
10930 if (enabled) {
10931 enabled_channels |= 0x3 << (i * 2);
10932 if (is_16bit) {
10933 values[i] =
10934 bld.pseudo(aco_opcode::p_create_vector, bld.def(v1),
10935 values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2],
10936 values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]);
10937 } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) {
10938 values[i] =
10939 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1),
10940 values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
10941 values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
10942 } else {
10943 values[i] =
10944 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1),
10945 values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2],
10946 values[i * 2 + 1].isUndefined() ? values[i * 2] : values[i * 2 + 1]);
10947 }
10948 } else {
10949 values[i] = Operand(v1);
10950 }
10951 }
10952 values[2] = Operand(v1);
10953 values[3] = Operand(v1);
10954 compr = true;
10955 break;
10956
10957 case V_028714_SPI_SHADER_UNORM16_ABGR:
10958 if (is_16bit && ctx->options->chip_class >= GFX9) {
10959 compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10960 } else {
10961 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10962 }
10963 break;
10964
10965 case V_028714_SPI_SHADER_SNORM16_ABGR:
10966 if (is_16bit && ctx->options->chip_class >= GFX9) {
10967 compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10968 } else {
10969 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10970 }
10971 break;
10972
10973 case V_028714_SPI_SHADER_UINT16_ABGR: {
10974 compr_op = aco_opcode::v_cvt_pk_u16_u32;
10975 if (is_int8 || is_int10) {
10976 /* clamp */
10977 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10978 Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
10979
10980 for (unsigned i = 0; i < 4; i++) {
10981 if ((write_mask >> i) & 1) {
10982 values[i] =
10983 bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
10984 i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]);
10985 }
10986 }
10987 } else if (is_16bit) {
10988 for (unsigned i = 0; i < 4; i++) {
10989 if ((write_mask >> i) & 1) {
10990 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10991 values[i] = Operand(tmp);
10992 }
10993 }
10994 }
10995 break;
10996 }
10997
10998 case V_028714_SPI_SHADER_SINT16_ABGR:
10999 compr_op = aco_opcode::v_cvt_pk_i16_i32;
11000 if (is_int8 || is_int10) {
11001 /* clamp */
11002 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
11003 uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
11004 Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb));
11005 Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb));
11006
11007 for (unsigned i = 0; i < 4; i++) {
11008 if ((write_mask >> i) & 1) {
11009 values[i] =
11010 bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
11011 i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]);
11012 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
11013 i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val),
11014 values[i]);
11015 }
11016 }
11017 } else if (is_16bit) {
11018 for (unsigned i = 0; i < 4; i++) {
11019 if ((write_mask >> i) & 1) {
11020 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
11021 values[i] = Operand(tmp);
11022 }
11023 }
11024 }
11025 break;
11026
11027 case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break;
11028
11029 case V_028714_SPI_SHADER_ZERO:
11030 default: return false;
11031 }
11032
11033 if ((bool)compr_op) {
11034 for (int i = 0; i < 2; i++) {
11035 /* check if at least one of the values to be compressed is enabled */
11036 bool enabled = (write_mask >> (i * 2)) & 0x3;
11037 if (enabled) {
11038 enabled_channels |= 0x3 << (i * 2);
11039 values[i] = bld.vop3(
11040 compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2],
11041 values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]);
11042 } else {
11043 values[i] = Operand(v1);
11044 }
11045 }
11046 values[2] = Operand(v1);
11047 values[3] = Operand(v1);
11048 compr = true;
11049 } else if (!compr) {
11050 for (int i = 0; i < 4; i++)
11051 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
11052 }
11053
11054 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target,
11055 compr);
11056 return true;
11057 }
11058
11059 static void
11060 create_fs_null_export(isel_context* ctx)
11061 {
11062 /* FS must always have exports.
11063 * So when there are none, we need to add a null export.
11064 */
11065
11066 Builder bld(ctx->program, ctx->block);
11067 unsigned dest = V_008DFC_SQ_EXP_NULL;
11068 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
11069 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true);
11070 }
11071
11072 static void
11073 create_fs_exports(isel_context* ctx)
11074 {
11075 bool exported = false;
11076
11077 /* Export depth, stencil and sample mask. */
11078 if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
11079 ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
11080 exported |= export_fs_mrt_z(ctx);
11081
11082 /* Export all color render targets. */
11083 for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
11084 if (ctx->outputs.mask[i])
11085 exported |= export_fs_mrt_color(ctx, i);
11086
11087 if (!exported)
11088 create_fs_null_export(ctx);
11089
11090 ctx->block->kind |= block_kind_export_end;
11091 }
11092
11093 static void
11094 create_workgroup_barrier(Builder& bld)
11095 {
11096 bld.barrier(aco_opcode::p_barrier,
11097 memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup);
11098 }
11099
11100 static void
11101 emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset,
11102 const struct radv_stream_output* output)
11103 {
11104 unsigned num_comps = util_bitcount(output->component_mask);
11105 unsigned writemask = (1 << num_comps) - 1;
11106 unsigned loc = output->location;
11107 unsigned buf = output->buffer;
11108
11109 assert(num_comps && num_comps <= 4);
11110 if (!num_comps || num_comps > 4)
11111 return;
11112
11113 unsigned first_comp = ffs(output->component_mask) - 1;
11114
11115 Temp out[4];
11116 bool all_undef = true;
11117 assert(ctx->stage.hw == HWStage::VS);
11118 for (unsigned i = 0; i < num_comps; i++) {
11119 out[i] = ctx->outputs.temps[loc * 4 + first_comp + i];
11120 all_undef = all_undef && !out[i].id();
11121 }
11122 if (all_undef)
11123 return;
11124
11125 while (writemask) {
11126 int start, count;
11127 u_bit_scan_consecutive_range(&writemask, &start, &count);
11128 if (count == 3 && ctx->options->chip_class == GFX6) {
11129 /* GFX6 doesn't support storing vec3, split it. */
11130 writemask |= 1u << (start + 2);
11131 count = 2;
11132 }
11133
11134 unsigned offset = output->offset + start * 4;
11135
11136 Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count));
11137 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
11138 aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
11139 for (int i = 0; i < count; ++i)
11140 vec->operands[i] =
11141 (ctx->outputs.mask[loc] & 1 << (start + first_comp + i)) ? Operand(out[start + i]) : Operand::zero();
11142 vec->definitions[0] = Definition(write_data);
11143 ctx->block->instructions.emplace_back(std::move(vec));
11144
11145 aco_opcode opcode;
11146 switch (count) {
11147 case 1: opcode = aco_opcode::buffer_store_dword; break;
11148 case 2: opcode = aco_opcode::buffer_store_dwordx2; break;
11149 case 3: opcode = aco_opcode::buffer_store_dwordx3; break;
11150 case 4: opcode = aco_opcode::buffer_store_dwordx4; break;
11151 default: unreachable("Unsupported dword count.");
11152 }
11153
11154 aco_ptr<MUBUF_instruction> store{
11155 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
11156 store->operands[0] = Operand(so_buffers[buf]);
11157 store->operands[1] = Operand(so_write_offset[buf]);
11158 store->operands[2] = Operand::c32(0);
11159 store->operands[3] = Operand(write_data);
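      /* The MUBUF immediate offset field is only 12 bits, so offsets above 4095 are
       * added to the VGPR offset instead. */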
11160 if (offset > 4095) {
11161 /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
11162 Builder bld(ctx->program, ctx->block);
11163          store->operands[1] =
11164 bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf]));
11165 } else {
11166 store->offset = offset;
11167 }
11168 store->offen = true;
11169 store->glc = true;
11170 store->dlc = false;
11171 store->slc = true;
11172 ctx->block->instructions.emplace_back(std::move(store));
11173 }
11174 }
11175
11176 static void
11177 emit_streamout(isel_context* ctx, unsigned stream)
11178 {
11179 Builder bld(ctx->program, ctx->block);
11180
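   /* s_bfe field spec 0x70010: width 7, offset 16, i.e. so_vtx_count = streamout_config[22:16]. */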
11181 Temp so_vtx_count =
11182 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11183 get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u));
11184
11185 Temp tid = emit_mbcnt(ctx, bld.tmp(v1));
11186
11187 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
11188
11189 if_context ic;
11190 begin_divergent_if_then(ctx, &ic, can_emit);
11191
11192 bld.reset(ctx->block);
11193
11194 Temp so_write_index =
11195 bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid);
11196
11197 Temp so_buffers[4];
11198 Temp so_write_offset[4];
11199 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
11200
11201 for (unsigned i = 0; i < 4; i++) {
11202 unsigned stride = ctx->program->info->so.strides[i];
11203 if (!stride)
11204 continue;
11205
11206 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr,
11207 bld.copy(bld.def(s1), Operand::c32(i * 16u)));
11208
11209 if (stride == 1) {
11210 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
11211 get_arg(ctx, ctx->args->ac.streamout_write_index),
11212 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11213 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
11214
11215 so_write_offset[i] =
11216 bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset);
11217 } else {
11218 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
11219 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u),
11220 get_arg(ctx, ctx->args->ac.streamout_offset[i]));
11221 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
11222 }
11223 }
11224
11225 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
11226 const struct radv_stream_output* output = &ctx->program->info->so.outputs[i];
11227 if (stream != output->stream)
11228 continue;
11229
11230 emit_stream_output(ctx, so_buffers, so_write_offset, output);
11231 }
11232
11233 begin_divergent_if_else(ctx, &ic);
11234 end_divergent_if(ctx, &ic);
11235 }
11236
11237 Pseudo_instruction*
11238 add_startpgm(struct isel_context* ctx)
11239 {
11240 aco_ptr<Pseudo_instruction> startpgm{
11241 create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, ctx->args->ac.arg_count)};
11242 for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) {
11243 if (ctx->args->ac.args[i].skip)
11244 continue;
11245
11246 enum ac_arg_regfile file = ctx->args->ac.args[i].file;
11247 unsigned size = ctx->args->ac.args[i].size;
11248 unsigned reg = ctx->args->ac.args[i].offset;
11249 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
11250 Temp dst = ctx->program->allocateTmp(type);
11251 ctx->arg_temps[i] = dst;
11252 startpgm->definitions[arg] = Definition(dst);
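      /* In ACO, physical registers 256..511 denote VGPRs, so VGPR arguments are fixed at reg + 256. */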
11253 startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256});
11254 arg++;
11255 }
11256 Pseudo_instruction* instr = startpgm.get();
11257 ctx->block->instructions.push_back(std::move(startpgm));
11258
11259 /* Stash these in the program so that they can be accessed later when
11260 * handling spilling.
11261 */
11262 ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets);
11263 ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset);
11264
11265 if (ctx->stage.has(SWStage::VS) && ctx->program->info->vs.dynamic_inputs) {
11266 unsigned num_attributes = util_last_bit(ctx->program->info->vs.vb_desc_usage_mask);
11267 for (unsigned i = 0; i < num_attributes; i++) {
11268 Definition def(get_arg(ctx, ctx->args->vs_inputs[i]));
11269
11270 unsigned idx = ctx->args->vs_inputs[i].arg_index;
11271 def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset));
11272
11273 ctx->program->vs_inputs.push_back(def);
11274 }
11275 }
11276
11277 return instr;
11278 }
11279
11280 void
11281 fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm)
11282 {
11283 assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
11284 Builder bld(ctx->program, ctx->block);
11285 constexpr unsigned hs_idx = 1u;
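   /* (8 << 16) | (hs_idx * 8) extracts byte hs_idx of merged_wave_info, i.e. the HS thread count. */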
11286 Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11287 get_arg(ctx, ctx->args->ac.merged_wave_info),
11288 Operand::c32((8u << 16) | (hs_idx * 8u)));
11289 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
11290
11291 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
11292
11293 Temp instance_id =
11294 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id),
11295 get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads);
11296 Temp vs_rel_patch_id =
11297 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids),
11298 get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads);
11299 Temp vertex_id =
11300 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id),
11301 get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads);
11302
11303 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
11304 ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id;
11305 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
11306 }
11307
11308 void
11309 split_arguments(isel_context* ctx, Pseudo_instruction* startpgm)
11310 {
11311 /* Split all arguments except for the first (ring_offsets) and the last
11312 * (exec) so that the dead channels don't stay live throughout the program.
11313 */
11314 for (int i = 1; i < startpgm->definitions.size(); i++) {
11315 if (startpgm->definitions[i].regClass().size() > 1) {
11316 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
11317 startpgm->definitions[i].regClass().size());
11318 }
11319 }
11320 }
11321
11322 void
11323 handle_bc_optimize(isel_context* ctx)
11324 {
11325 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
11326 Builder bld(ctx->program, ctx->block);
11327 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
11328 bool uses_center =
11329 G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
11330 bool uses_persp_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena);
11331 bool uses_linear_centroid = G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
11332
11333 if (uses_persp_centroid)
11334 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
11335 if (uses_linear_centroid)
11336 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
11337
11338 if (uses_center && (uses_persp_centroid || uses_linear_centroid)) {
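      /* The sign bit of prim_mask selects between centroid and center barycentrics below:
       * when it is set, the center values are used instead of the centroid ones. */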
11339 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
11340 get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero());
11341
11342 if (uses_persp_centroid) {
11343 Temp new_coord[2];
11344 for (unsigned i = 0; i < 2; i++) {
11345 Temp persp_centroid =
11346 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
11347 Temp persp_center =
11348 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
11349 new_coord[i] =
11350 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel);
11351 }
11352 ctx->persp_centroid = bld.tmp(v2);
11353 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
11354 Operand(new_coord[0]), Operand(new_coord[1]));
11355 emit_split_vector(ctx, ctx->persp_centroid, 2);
11356 }
11357
11358 if (uses_linear_centroid) {
11359 Temp new_coord[2];
11360 for (unsigned i = 0; i < 2; i++) {
11361 Temp linear_centroid =
11362 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
11363 Temp linear_center =
11364 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
11365 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid,
11366 linear_center, sel);
11367 }
11368 ctx->linear_centroid = bld.tmp(v2);
11369 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
11370 Operand(new_coord[0]), Operand(new_coord[1]));
11371 emit_split_vector(ctx, ctx->linear_centroid, 2);
11372 }
11373 }
11374 }
11375
11376 void
11377 setup_fp_mode(isel_context* ctx, nir_shader* shader)
11378 {
11379 Program* program = ctx->program;
11380
11381 unsigned float_controls = shader->info.float_controls_execution_mode;
11382
11383 program->next_fp_mode.preserve_signed_zero_inf_nan32 =
11384 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
11385 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
11386 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
11387 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
11388
11389 program->next_fp_mode.must_flush_denorms32 =
11390 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
11391 program->next_fp_mode.must_flush_denorms16_64 =
11392 float_controls &
11393 (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
11394
11395 program->next_fp_mode.care_about_round32 =
11396 float_controls &
11397 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
11398
11399 program->next_fp_mode.care_about_round16_64 =
11400 float_controls &
11401 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
11402 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
11403
11404 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
11405 * the precision seems needed for Wolfenstein: Youngblood to render correctly */
11406 if (program->next_fp_mode.must_flush_denorms16_64)
11407 program->next_fp_mode.denorm16_64 = 0;
11408 else
11409 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11410
11411 /* preserving fp32 denorms is expensive, so only do it if asked */
11412 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
11413 program->next_fp_mode.denorm32 = fp_denorm_keep;
11414 else
11415 program->next_fp_mode.denorm32 = 0;
11416
11417 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
11418 program->next_fp_mode.round32 = fp_round_tz;
11419 else
11420 program->next_fp_mode.round32 = fp_round_ne;
11421
11422 if (float_controls &
11423 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
11424 program->next_fp_mode.round16_64 = fp_round_tz;
11425 else
11426 program->next_fp_mode.round16_64 = fp_round_ne;
11427
11428 ctx->block->fp_mode = program->next_fp_mode;
11429 }
11430
11431 void
11432 cleanup_cfg(Program* program)
11433 {
11434 /* create linear_succs/logical_succs */
11435 for (Block& BB : program->blocks) {
11436 for (unsigned idx : BB.linear_preds)
11437 program->blocks[idx].linear_succs.emplace_back(BB.index);
11438 for (unsigned idx : BB.logical_preds)
11439 program->blocks[idx].logical_succs.emplace_back(BB.index);
11440 }
11441 }
11442
11443 Temp
11444 lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true)
11445 {
11446 assert(count.regClass() == s1);
11447
11448 Builder bld(ctx->program, ctx->block);
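   /* s_bfm_b64 creates a mask of `count` consecutive 1s starting at bit 0, e.g. count == 3 -> 0b111. */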
11449 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero());
11450 Temp cond;
11451
11452 if (ctx->program->wave_size == 64) {
11453       /* If we know that not all 64 threads can be active at a time, we just use the mask as-is */
11454 if (!allow64)
11455 return mask;
11456
11457 /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
11458 Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count,
11459 Operand::c32(6u /* log2(64) */));
11460 cond =
11461 bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
11462 } else {
11463       /* We use s_bfm_b64 (not _b32), which also works for a count of 32, but we then
11464        * need to extract the lower half of the register */
11465 cond = emit_extract_vector(ctx, mask, 0, bld.lm);
11466 }
11467
11468 return cond;
11469 }
11470
11471 Temp
11472 merged_wave_info_to_mask(isel_context* ctx, unsigned i)
11473 {
11474 Builder bld(ctx->program, ctx->block);
11475
11476    /* lanecount_to_mask() only cares about s0.u[6:0], so we need neither s_bfe nor s_and here */
11477 Temp count = i == 0
11478 ? get_arg(ctx, ctx->args->ac.merged_wave_info)
11479 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
11480 get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));
11481
11482 return lanecount_to_mask(ctx, count);
11483 }
11484
11485 void
11486 ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
11487 {
11488 assert(vtx_cnt.id() && prm_cnt.id());
11489
11490 Builder bld(ctx->program, ctx->block);
11491 Temp prm_cnt_0;
11492
11493 if (ctx->program->chip_class == GFX10 &&
11494 (ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {
11495 /* Navi 1x workaround: check whether the workgroup has no output.
11496 * If so, change the number of exported vertices and primitives to 1.
11497 */
11498 prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
11499 prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
11500 bld.scc(prm_cnt_0));
11501 vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
11502 bld.scc(prm_cnt_0));
11503 }
11504
11505 /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
11506 Temp tmp =
11507 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
11508 tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
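   /* m0 layout here: vtx_cnt in bits [11:0], prm_cnt starting at bit 12 (shifted above). */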
11509
11510 /* Request the SPI to allocate space for the primitives and vertices
11511 * that will be exported by the threadgroup.
11512 */
11513 bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
11514
11515 if (prm_cnt_0.id()) {
11516 /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
11517 * It can't have all-zero positions because that would render an undesired pixel with
11518 * conservative rasterization.
11519 */
11520 Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
11521 Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
11522 Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
11523 cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
11524 Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));
11525
11526 if_context ic_prim_0;
11527 begin_divergent_if_then(ctx, &ic_prim_0, cond);
11528 bld.reset(ctx->block);
11529 ctx->block->kind |= block_kind_export_end;
11530
11531       /* Use zero: this is a triangle whose every vertex index is 0. */
11532 Temp zero = bld.copy(bld.def(v1), Operand::zero());
11533       /* Use NaN for the coordinates, so that the rasterizer always culls it. */
11534 Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));
11535
11536 bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,
11537 V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,
11538 false /* valid mask */);
11539 bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,
11540 V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,
11541 true /* valid mask */);
11542
11543 begin_divergent_if_else(ctx, &ic_prim_0);
11544 end_divergent_if(ctx, &ic_prim_0);
11545 bld.reset(ctx->block);
11546 }
11547 }
11548
11549 } /* end namespace */
11550
11551 void
11552 select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
11553 ac_shader_config* config, const struct radv_shader_args* args)
11554 {
11555 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
11556 if_context ic_merged_wave_info;
11557 bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);
11558
11559 for (unsigned i = 0; i < shader_count; i++) {
11560 nir_shader* nir = shaders[i];
11561 init_context(&ctx, nir);
11562
11563 setup_fp_mode(&ctx, nir);
11564
11565 if (!i) {
11566 /* needs to be after init_context() for FS */
11567 Pseudo_instruction* startpgm = add_startpgm(&ctx);
11568 append_logical_start(ctx.block);
11569
11570 if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
11571 fix_ls_vgpr_init_bug(&ctx, startpgm);
11572
11573 split_arguments(&ctx, startpgm);
11574
11575 if (!args->shader_info->vs.has_prolog &&
11576 (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
11577 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
11578 }
11579 }
11580
11581 /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
11582 nir_function_impl* func = nir_shader_get_entrypoint(nir);
11583 bool empty_shader =
11584 nir_cf_list_is_empty_block(&func->body) &&
11585 ((nir->info.stage == MESA_SHADER_VERTEX &&
11586 (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
11587 (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));
11588
11589 bool check_merged_wave_info =
11590 ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
11591 bool endif_merged_wave_info =
11592 ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));
11593
11594 if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG &&
11595 program->stage.num_sw_stages() == 1) {
11596 /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
11597 * s_sendmsg(GS_ALLOC_REQ). */
11598 Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
11599 }
11600
11601 if (check_merged_wave_info) {
11602 Temp cond = merged_wave_info_to_mask(&ctx, i);
11603 begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
11604 }
11605
11606 if (i) {
11607 Builder bld(ctx.program, ctx.block);
11608
11609 /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
11610 bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&
11611 ctx.tcs_temp_only_inputs == nir->info.inputs_read;
11612
11613 if (!ngg_gs && !tcs_skip_barrier)
11614 create_workgroup_barrier(bld);
11615
11616 if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
11617 ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),
11618 get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),
11619 Operand::c32(8u), Operand::zero());
11620 }
11621 } else if (ctx.stage == geometry_gs)
11622 ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);
11623
11624 if (ctx.stage == fragment_fs)
11625 handle_bc_optimize(&ctx);
11626
11627 visit_cf_list(&ctx, &func->body);
11628
11629 if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
11630 emit_streamout(&ctx, 0);
11631
11632 if (ctx.stage.hw == HWStage::VS) {
11633 create_vs_exports(&ctx);
11634 } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
11635 Builder bld(ctx.program, ctx.block);
11636 bld.barrier(aco_opcode::p_barrier,
11637 memory_sync_info(storage_vmem_output, semantic_release, scope_device));
11638 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,
11639 sendmsg_gs_done(false, false, 0));
11640 }
11641
11642 if (ctx.stage == fragment_fs) {
11643 create_fs_exports(&ctx);
11644 }
11645
11646 if (endif_merged_wave_info) {
11647 begin_divergent_if_else(&ctx, &ic_merged_wave_info);
11648 end_divergent_if(&ctx, &ic_merged_wave_info);
11649 }
11650
11651 if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11652 /* Outputs of the previous stage are inputs to the next stage */
11653 ctx.inputs = ctx.outputs;
11654 ctx.outputs = shader_io_state();
11655 }
11656
11657 cleanup_context(&ctx);
11658 }
11659
11660 program->config->float_mode = program->blocks[0].fp_mode.val;
11661
11662 append_logical_end(ctx.block);
11663 ctx.block->kind |= block_kind_uniform;
11664 Builder bld(ctx.program, ctx.block);
11665 bld.sopp(aco_opcode::s_endpgm);
11666
11667 cleanup_cfg(program);
11668 }
11669
11670 void
11671 select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
11672 const struct radv_shader_args* args)
11673 {
11674 isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
11675
11676 ctx.block->fp_mode = program->next_fp_mode;
11677
11678 add_startpgm(&ctx);
11679 append_logical_start(ctx.block);
11680
11681 Builder bld(ctx.program, ctx.block);
11682
11683 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4),
11684 program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u));
11685
11686 Operand stream_id = Operand::zero();
11687 if (args->shader_info->so.num_outputs)
11688 stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11689 get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));
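   /* 0x20018: width 2, offset 24, i.e. stream_id = streamout_config[25:24]. */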
11690
11691 Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u),
11692 get_arg(&ctx, ctx.args->ac.vertex_id));
11693
11694 std::stack<if_context, std::vector<if_context>> if_contexts;
11695
11696 for (unsigned stream = 0; stream < 4; stream++) {
11697 if (stream_id.isConstant() && stream != stream_id.constantValue())
11698 continue;
11699
11700 unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
11701 if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
11702 continue;
11703
11704 memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
11705
11706 if (!stream_id.isConstant()) {
11707 Temp cond =
11708 bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream));
11709 if_contexts.emplace();
11710 begin_uniform_if_then(&ctx, &if_contexts.top(), cond);
11711 bld.reset(ctx.block);
11712 }
11713
11714 unsigned offset = 0;
11715 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
11716 if (args->shader_info->gs.output_streams[i] != stream)
11717 continue;
11718
11719 unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
11720 unsigned length = util_last_bit(output_usage_mask);
11721 for (unsigned j = 0; j < length; ++j) {
11722 if (!(output_usage_mask & (1 << j)))
11723 continue;
11724
11725 Temp val = bld.tmp(v1);
11726 unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
11727 load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
11728 true, true);
11729
11730 ctx.outputs.mask[i] |= 1 << j;
11731 ctx.outputs.temps[i * 4u + j] = val;
11732
11733 offset++;
11734 }
11735 }
11736
11737 if (args->shader_info->so.num_outputs) {
11738 emit_streamout(&ctx, stream);
11739 bld.reset(ctx.block);
11740 }
11741
11742 if (stream == 0) {
11743 create_vs_exports(&ctx);
11744 }
11745
11746 if (!stream_id.isConstant()) {
11747 begin_uniform_if_else(&ctx, &if_contexts.top());
11748 bld.reset(ctx.block);
11749 }
11750 }
11751
11752 while (!if_contexts.empty()) {
11753 end_uniform_if(&ctx, &if_contexts.top());
11754 if_contexts.pop();
11755 }
11756
11757 program->config->float_mode = program->blocks[0].fp_mode.val;
11758
11759 append_logical_end(ctx.block);
11760 ctx.block->kind |= block_kind_uniform;
11761 bld.reset(ctx.block);
11762 bld.sopp(aco_opcode::s_endpgm);
11763
11764 cleanup_cfg(program);
11765 }
11766
11767 void
11768 select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
11769 const struct radv_shader_args* args)
11770 {
11771 assert(args->options->chip_class == GFX8);
11772
11773 init_program(program, compute_cs, args->shader_info, args->options->chip_class,
11774 args->options->family, args->options->wgp_mode, config);
11775
11776 isel_context ctx = {};
11777 ctx.program = program;
11778 ctx.args = args;
11779 ctx.options = args->options;
11780 ctx.stage = program->stage;
11781
11782 ctx.block = ctx.program->create_and_insert_block();
11783 ctx.block->kind = block_kind_top_level;
11784
11785 program->workgroup_size = 1; /* XXX */
11786
11787 add_startpgm(&ctx);
11788 append_logical_start(ctx.block);
11789
11790 Builder bld(ctx.program, ctx.block);
11791
11792 /* Load the buffer descriptor from TMA. */
11793 bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
11794 Operand::zero());
11795
11796 /* Store TTMP0-TTMP1. */
11797 bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
11798 Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
11799
11800 uint32_t hw_regs_idx[] = {
11801 2, /* HW_REG_STATUS */
11802 3, /* HW_REG_TRAP_STS */
11803 4, /* HW_REG_HW_ID */
11804 7, /* HW_REG_IB_STS */
11805 };
11806
11807 /* Store some hardware registers. */
11808 for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
11809 /* "((size - 1) << 11) | register" */
11810 bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
11811 ((20 - 1) << 11) | hw_regs_idx[i]);
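      /* With offset 0 and size 20, this reads the low 20 bits of the selected register into ttmp8. */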
11812
11813 bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
11814 Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
11815 }
11816
11817 program->config->float_mode = program->blocks[0].fp_mode.val;
11818
11819 append_logical_end(ctx.block);
11820 ctx.block->kind |= block_kind_uniform;
11821 bld.sopp(aco_opcode::s_endpgm);
11822
11823 cleanup_cfg(program);
11824 }
11825
11826 Operand
11827 get_arg_fixed(const struct radv_shader_args* args, struct ac_arg arg)
11828 {
11829 assert(arg.used);
11830
11831 enum ac_arg_regfile file = args->ac.args[arg.arg_index].file;
11832 unsigned size = args->ac.args[arg.arg_index].size;
11833 unsigned reg = args->ac.args[arg.arg_index].offset;
11834
11835 return Operand(PhysReg(file == AC_ARG_SGPR ? reg : reg + 256),
11836 RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size));
11837 }
11838
11839 unsigned
11840 load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
11841 {
11842 unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);
11843
11844 unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
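   /* e.g. count == 6: one s_load_dwordx16 (4 descs) + one s_load_dwordx8 (2 descs) = 2 loads,
    * grouped into a single clause on GFX10+. */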
11845 if (bld.program->chip_class >= GFX10 && num_loads > 1)
11846 bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);
11847
11848 for (unsigned i = 0; i < count;) {
11849 unsigned size = 1u << util_logbase2(MIN2(count - i, 4));
11850
11851 if (size == 4)
11852 bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
11853 Operand::c32((start + i) * 16u));
11854 else if (size == 2)
11855 bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
11856 Operand::c32((start + i) * 16u));
11857 else
11858 bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
11859 Operand::c32((start + i) * 16u));
11860
11861 dest = dest.advance(size * 16u);
11862 i += size;
11863 }
11864
11865 return count;
11866 }
11867
11868 Operand
11869 calc_nontrivial_instance_id(Builder& bld, const struct radv_shader_args* args, unsigned index,
11870 Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
11871 PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
11872 {
11873 bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
11874 get_arg_fixed(args, args->prolog_inputs), Operand::c32(8u + index * 8u));
11875
11876 wait_imm lgkm_imm;
11877 lgkm_imm.lgkm = 0;
11878 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->chip_class));
11879
11880 Definition fetch_index_def(tmp_vgpr0, v1);
11881 Operand fetch_index(tmp_vgpr0, v1);
11882
11883 Operand div_info(tmp_sgpr, s1);
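   /* Fast division by the non-trivial instance divisor using the constants loaded above.
    * Inferring the layout from how the bytes are used below:
    *   fetch_index = mul_hi((instance_id >> div_info[4:0]) + div_info[15:8], multiplier)
    *                 >> div_info[23:16]
    * where the multiplier is the second dword at tmp_sgpr + 4. */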
11884 if (bld.program->chip_class >= GFX8) {
11885 /* use SDWA */
11886 if (bld.program->chip_class < GFX9) {
11887 bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
11888 div_info = Operand(tmp_vgpr1, v1);
11889 }
11890
11891       bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11892
11893 Instruction* instr;
11894 if (bld.program->chip_class >= GFX9)
11895 instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
11896 else
11897 instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
11898 div_info, fetch_index)
11899 .instr;
11900 instr->sdwa().sel[0] = SubdwordSel::ubyte1;
11901
11902 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
11903 fetch_index);
11904
11905 instr =
11906 bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
11907 instr->sdwa().sel[0] = SubdwordSel::ubyte2;
11908 } else {
11909 Operand tmp_op(tmp_vgpr1, v1);
11910 Definition tmp_def(tmp_vgpr1, v1);
11911
11912 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);
11913
11914 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
11915 bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);
11916
11917 bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
11918 Operand(tmp_sgpr.advance(4), s1));
11919
11920 bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
11921 bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
11922 }
11923
11924 bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);
11925
11926 return fetch_index;
11927 }
11928
11929 void
11930 select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key, ac_shader_config* config,
11931 const struct radv_shader_args* args, unsigned* num_preserved_sgprs)
11932 {
11933 assert(key->num_attributes > 0);
11934
11935 /* This should be enough for any shader/stage. */
11936 unsigned max_user_sgprs = args->options->chip_class >= GFX9 ? 32 : 16;
11937 *num_preserved_sgprs = max_user_sgprs + 14;
11938
11939 init_program(program, compute_cs, args->shader_info, args->options->chip_class,
11940 args->options->family, args->options->wgp_mode, config);
11941
11942 Block* block = program->create_and_insert_block();
11943 block->kind = block_kind_top_level;
11944
11945 program->workgroup_size = 64;
11946 calc_min_waves(program);
11947
11948 Builder bld(program, block);
11949
11950 block->instructions.reserve(16 + key->num_attributes * 4);
11951
11952 bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
11953
11954 uint32_t attrib_mask = BITFIELD_MASK(key->num_attributes);
11955 bool has_nontrivial_divisors = key->state->nontrivial_divisors & attrib_mask;
11956
11957 wait_imm lgkm_imm;
11958 lgkm_imm.lgkm = 0;
11959
11960 /* choose sgprs */
11961 PhysReg vertex_buffers(align(*num_preserved_sgprs, 2));
11962 PhysReg prolog_input = vertex_buffers.advance(8);
11963 PhysReg desc(
11964 align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));
11965
11966 Operand start_instance = get_arg_fixed(args, args->ac.start_instance);
11967 Operand instance_id = get_arg_fixed(args, args->ac.instance_id);
11968
11969 PhysReg attributes_start(256 + args->ac.num_vgprs_used);
11970 /* choose vgprs that won't be used for anything else until the last attribute load */
11971 PhysReg vertex_index(attributes_start.reg() + key->num_attributes * 4 - 1);
11972 PhysReg instance_index(attributes_start.reg() + key->num_attributes * 4 - 2);
11973 PhysReg start_instance_vgpr(attributes_start.reg() + key->num_attributes * 4 - 3);
11974 PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + key->num_attributes * 4 - 4);
11975 PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + key->num_attributes * 4);
11976
11977 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
11978 get_arg_fixed(args, args->ac.vertex_buffers));
11979 bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
11980 Operand::c32((unsigned)args->options->address32_hi));
11981
11982 /* calculate vgpr requirements */
11983 unsigned num_vgprs = attributes_start.reg() - 256;
11984 num_vgprs += key->num_attributes * 4;
11985 if (has_nontrivial_divisors && program->chip_class <= GFX8)
11986 num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
11987 unsigned num_sgprs = 0;
11988
11989 for (unsigned loc = 0; loc < key->num_attributes;) {
11990 unsigned num_descs =
11991 load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc);
11992 num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());
11993
11994 if (loc == 0) {
11995 /* perform setup while we load the descriptors */
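         /* When the VS is merged with another stage (or NGG), limit exec to this wave's lane
          * count from merged_wave_info (same approach as lanecount_to_mask(), but on fixed
          * registers). */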
11996 if (key->is_ngg || key->next_stage != MESA_SHADER_VERTEX) {
11997 Operand count = get_arg_fixed(args, args->ac.merged_wave_info);
11998 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
11999 if (program->wave_size == 64) {
12000 bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
12001 Operand::c32(6u /* log2(64) */));
12002 bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
12003 Operand(exec, s2), Operand(scc, s1));
12004 }
12005 }
12006
12007 bool needs_instance_index = false;
12008 bool needs_start_instance = false;
12009 u_foreach_bit(i, key->state->instance_rate_inputs & attrib_mask)
12010 {
12011 needs_instance_index |= key->state->divisors[i] == 1;
12012 needs_start_instance |= key->state->divisors[i] == 0;
12013 }
12014 bool needs_vertex_index = ~key->state->instance_rate_inputs & attrib_mask;
12015 if (needs_vertex_index)
12016 bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->ac.base_vertex),
12017 get_arg_fixed(args, args->ac.vertex_id), false, Operand(s2), true);
12018 if (needs_instance_index)
12019 bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
12020 Operand(s2), true);
12021 if (needs_start_instance)
12022 bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
12023 }
12024
12025 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
12026
12027 for (unsigned i = 0; i < num_descs; i++, loc++) {
12028 PhysReg dest(attributes_start.reg() + loc * 4u);
12029
12030 /* calculate index */
12031 Operand fetch_index = Operand(vertex_index, v1);
12032 if (key->state->instance_rate_inputs & (1u << loc)) {
12033 uint32_t divisor = key->state->divisors[loc];
12034 if (divisor) {
12035 fetch_index = instance_id;
12036 if (key->state->nontrivial_divisors & (1u << loc)) {
12037 unsigned index =
12038 util_bitcount(key->state->nontrivial_divisors & BITFIELD_MASK(loc));
12039 fetch_index = calc_nontrivial_instance_id(
12040 bld, args, index, instance_id, start_instance, prolog_input,
12041 nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
12042 } else {
12043 fetch_index = Operand(instance_index, v1);
12044 }
12045 } else {
12046 fetch_index = Operand(start_instance_vgpr, v1);
12047 }
12048 }
12049
12050 /* perform load */
12051 PhysReg cur_desc = desc.advance(i * 16);
12052 if ((key->misaligned_mask & (1u << loc))) {
12053 unsigned dfmt = key->state->formats[loc] & 0xf;
12054 unsigned nfmt = key->state->formats[loc] >> 4;
12055 const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
12056 for (unsigned j = 0; j < vtx_info->num_channels; j++) {
12057 bool post_shuffle = key->state->post_shuffle & (1u << loc);
12058 unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);
12059
12060             /* Use MUBUF to work around hangs for byte-aligned dword loads. The Vulkan spec
12061 * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
12062 * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
12063 * care).
12064 */
12065 if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32)
12066 bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
12067 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
12068 false, true);
12069 else
12070 bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
12071 Operand(cur_desc, s4), fetch_index, Operand::c32(0u),
12072 vtx_info->chan_format, nfmt, offset, false, true);
12073 }
12074 uint32_t one =
12075 nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
12076 ? 1u
12077 : 0x3f800000u;
12078 for (unsigned j = vtx_info->num_channels; j < 4; j++) {
12079 bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
12080 Operand::c32(j == 3 ? one : 0u));
12081 }
12082 } else {
12083 bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
12084 Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
12085 }
12086 }
12087 }
12088
12089 if (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi) {
12090 wait_imm vm_imm;
12091 vm_imm.vm = 0;
12092 bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->chip_class));
12093 }
12094
12095    /* For 2_10_10_10 formats, the alpha is handled as unsigned by pre-Vega HW,
12096     * so we may need to fix it up. */
12097 u_foreach_bit(loc, (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi))
12098 {
12099 PhysReg alpha(attributes_start.reg() + loc * 4u + 3);
12100
12101 unsigned alpha_adjust = (key->state->alpha_adjust_lo >> loc) & 0x1;
12102 alpha_adjust |= ((key->state->alpha_adjust_hi >> loc) & 0x1) << 1;
12103
12104 if (alpha_adjust == ALPHA_ADJUST_SSCALED)
12105 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));
12106
12107 /* For the integer-like cases, do a natural sign extension.
12108 *
12109 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
12110 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
12111 * exponent.
12112 */
12113 unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u;
12114 bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
12115 Operand::c32(offset), Operand::c32(2u));
12116
12117 /* Convert back to the right type. */
12118 if (alpha_adjust == ALPHA_ADJUST_SNORM) {
12119 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12120 bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
12121 Operand(alpha, v1));
12122 } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) {
12123 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
12124 }
12125 }
12126
12127 block->kind |= block_kind_uniform;
12128
12129 /* continue on to the main shader */
12130 Operand continue_pc = get_arg_fixed(args, args->prolog_inputs);
12131 if (has_nontrivial_divisors) {
12132 bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
12133 get_arg_fixed(args, args->prolog_inputs), Operand::c32(0u));
12134 bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
12135 continue_pc = Operand(prolog_input, s2);
12136 }
12137
12138 bld.sop1(aco_opcode::s_setpc_b64, continue_pc);
12139
12140 program->config->float_mode = program->blocks[0].fp_mode.val;
12141 /* addition on GFX6-8 requires a carry-out (we use VCC) */
12142 program->needs_vcc = program->chip_class <= GFX8;
12143 program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
12144 program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
12145 }
12146 } // namespace aco
12147