1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9
10 #include "common/sid.h"
11
12 #include <map>
13 #include <vector>
14
15 namespace aco {
16
17 namespace {
18
19 struct lower_context {
20 Program* program;
21 Block* block;
22 std::vector<aco_ptr<Instruction>> instructions;
23 };
24
25 /* Class for determining where s_sendmsg(MSG_ORDERED_PS_DONE) must be sent in a Primitive Ordered
26 * Pixel Shader on GFX9-10.3.
27 *
28 * MSG_ORDERED_PS_DONE must be sent once after the ordered section is done along all execution paths
29 * from the POPS packer ID hardware register setting to s_endpgm. It is, however, also okay to send
30 * it if the packer ID is not going to be set at all by the wave, so some conservativeness is fine.
31 *
32 * For simplicity, the message is sent from top-level blocks, since dominance and post-dominance
33 * checking for any location in the shader is trivial in them. Also for simplicity, it is sent
34 * regardless of whether the POPS packer ID hardware register has already potentially been set up.
35 *
36 * Note that there can be multiple interlock end instructions in the shader.
37 * SPV_EXT_fragment_shader_interlock requires OpEndInvocationInterlockEXT to be executed exactly
38 * once by the invocation. However, there may be, for instance, multiple ordered sections, and which
39 * one will be executed may depend on divergent control flow (some lanes may execute one ordered
40 * section, other lanes may execute another). MSG_ORDERED_PS_DONE, however, is sent via a scalar
41 * instruction, so it must be ensured that the message is sent after the last ordered section in the
42 * entire wave.
43 */
44 class gfx9_pops_done_msg_bounds {
45 public:
46 explicit gfx9_pops_done_msg_bounds() = default;
47
48 explicit gfx9_pops_done_msg_bounds(const Program* const program)
49 {
50 /* Find the top-level location after the last ordered section end pseudo-instruction in the
51 * program.
52 * Consider `p_pops_gfx9_overlapped_wave_wait_done` a boundary too - for safety, make sure the
53 * message isn't sent while any wait hasn't been fully completed yet (in case a begin-end-begin
54 * situation somehow occurs, as the location of `p_pops_gfx9_ordered_section_done` is controlled
55 * by the application). This assumes that waits are the only thing that needs the packer
56 * hardware register to be set at some point during or before them, and that it won't be set
57 * anymore after the last wait.
58 */
59 int last_top_level_block_idx = -1;
60 for (int block_idx = (int)program->blocks.size() - 1; block_idx >= 0; block_idx--) {
61 const Block& block = program->blocks[block_idx];
62 if (block.kind & block_kind_top_level) {
63 last_top_level_block_idx = block_idx;
64 }
65 for (size_t instr_idx = block.instructions.size() - 1; instr_idx + size_t(1) > 0;
66 instr_idx--) {
67 const aco_opcode opcode = block.instructions[instr_idx]->opcode;
68 if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done ||
69 opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done) {
70 end_block_idx_ = last_top_level_block_idx;
71 /* The same block if it's already a top-level block, or the beginning of the next
72 * top-level block.
73 */
74 instr_after_end_idx_ = block_idx == end_block_idx_ ? instr_idx + 1 : 0;
75 break;
76 }
77 }
78 if (end_block_idx_ != -1) {
79 break;
80 }
81 }
82 }
83
84 /* If this is not -1, during the normal execution flow (not early exiting), MSG_ORDERED_PS_DONE
85 * must be sent in this block.
86 */
87 int end_block_idx() const { return end_block_idx_; }
88
89 /* If end_block_idx() is an existing block, during the normal execution flow (not early exiting),
90 * MSG_ORDERED_PS_DONE must be sent before this instruction in the block end_block_idx().
91 * If this is out of the bounds of the instructions in the end block, it must be sent at the
92 * end of that block.
93 */
94 size_t instr_after_end_idx() const { return instr_after_end_idx_; }
95
96 /* Whether an instruction doing early exit (such as discard) needs to send MSG_ORDERED_PS_DONE
97 * before actually ending the program.
98 */
99 bool early_exit_needs_done_msg(const int block_idx, const size_t instr_idx) const
100 {
101 return block_idx <= end_block_idx_ &&
102 (block_idx != end_block_idx_ || instr_idx < instr_after_end_idx_);
103 }
104
105 private:
106 /* Initialize to an empty range for which "is inside" comparisons will fail for any
107 * block.
108 */
109 int end_block_idx_ = -1;
110 size_t instr_after_end_idx_ = 0;
111 };
112
113 void
114 copy_constant_sgpr(Builder& bld, Definition dst, uint64_t constant)
115 {
116 if (dst.regClass() == s1) {
117 uint32_t imm = constant;
118 Operand op = Operand::get_const(bld.program->gfx_level, imm, 4);
119 if (op.isLiteral()) {
120 if (imm >= 0xffff8000 || imm <= 0x7fff) {
121 bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
122 return;
123 }
124
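/* If the bit-reversed value is an inline constant, s_brev_b32 of it avoids the literal. */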
125 Operand rev_op = Operand::get_const(bld.program->gfx_level, util_bitreverse(imm), 4);
126 if (!rev_op.isLiteral()) {
127 bld.sop1(aco_opcode::s_brev_b32, dst, rev_op);
128 return;
129 }
130
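/* A single contiguous run of set bits can be built with s_bfm_b32 (bitfield mask). */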
131 unsigned start = (ffs(imm) - 1) & 0x1f;
132 unsigned size = util_bitcount(imm) & 0x1f;
133 if (BITFIELD_RANGE(start, size) == imm) {
134 bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
135 return;
136 }
137
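/* On GFX9+, if both sign-extended 16-bit halves are inline constants, they can be packed
 * with s_pack_ll_b32_b16. */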
138 if (bld.program->gfx_level >= GFX9) {
139 Operand op_lo = Operand::c32(int32_t(int16_t(imm)));
140 Operand op_hi = Operand::c32(int32_t(int16_t(imm >> 16)));
141 if (!op_lo.isLiteral() && !op_hi.isLiteral()) {
142 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op_lo, op_hi);
143 return;
144 }
145 }
146 }
147
148 bld.sop1(aco_opcode::s_mov_b32, dst, op);
149 return;
150 }
151
152 assert(dst.regClass() == s2);
153
154 bool can_use_mov = Operand::is_constant_representable(constant, 8, true, false);
155 if (can_use_mov && !Operand::c64(constant).isLiteral()) {
156 bld.sop1(aco_opcode::s_mov_b64, dst, Operand::c64(constant));
157 return;
158 }
159
160 unsigned start = (ffsll(constant) - 1) & 0x3f;
161 unsigned size = util_bitcount64(constant) & 0x3f;
162 if (BITFIELD64_RANGE(start, size) == constant) {
163 bld.sop2(aco_opcode::s_bfm_b64, dst, Operand::c32(size), Operand::c32(start));
164 return;
165 }
166
167 uint64_t rev = ((uint64_t)util_bitreverse(constant) << 32) | util_bitreverse(constant >> 32);
168 if (Operand::is_constant_representable(rev, 8, true, false)) {
169 bld.sop1(aco_opcode::s_brev_b64, dst, Operand::c64(rev));
170 return;
171 }
172
173 if (can_use_mov) {
174 bld.sop1(aco_opcode::s_mov_b64, dst, Operand::c64(constant));
175 return;
176 }
177
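/* GFX9+ s_bitreplicate_b64_b32 expands every bit of a 32-bit source into two adjacent result
 * bits; check whether the constant consists of such doubled bit pairs. */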
178 uint32_t derep = 0;
179 bool can_use_rep = bld.program->gfx_level >= GFX9;
180 for (unsigned i = 0; can_use_rep && i < 32; i++) {
181 uint32_t lo = (constant >> (i * 2)) & 0x1;
182 uint32_t hi = (constant >> ((i * 2) + 1)) & 0x1;
183 can_use_rep &= lo == hi;
184 derep |= lo << i;
185 }
186 if (can_use_rep) {
187 bld.sop1(aco_opcode::s_bitreplicate_b64_b32, dst, Operand::c32(derep));
188 return;
189 }
190
191 copy_constant_sgpr(bld, Definition(dst.physReg(), s1), (uint32_t)constant);
192 copy_constant_sgpr(bld, Definition(dst.physReg().advance(4), s1), constant >> 32);
193 }
194
195 /* used by handle_operands() indirectly through Builder::copy */
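/* For an 8-bit value v, int8_mul_table[2 * v] * int8_mul_table[2 * v + 1] equals v in the low
 * byte, and both factors sign-extend to inline constants; copy_constant() uses this to
 * synthesize v via v_mul_u32_u24 without needing a literal. */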
196 uint8_t int8_mul_table[512] = {
197 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9,
198 1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19,
199 1, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29,
200 1, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39,
201 1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49,
202 1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59,
203 1, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23,
204 2, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243,
205 2, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23,
206 2, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33,
207 2, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27,
208 2, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17,
209 2, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43,
210 5, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243,
211 4, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45,
212 3, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53,
213 4, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13,
214 5, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245,
215 3, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63,
216 5, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35,
217 4, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19,
218 5, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43,
219 3, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,
220 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55,
221 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249,
222 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255};
223
224 aco_opcode
225 get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
226 {
227 /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
228 * 32-bit opcodes (VOP2) instead, which allows us to remove the temporary VGPR and to use
229 * DPP with the arithmetic instructions. This requires the inputs to be sign-extended.
230 */
231 switch (op) {
232 case iadd8:
233 case iadd16:
234 if (gfx_level >= GFX10) {
235 return aco_opcode::v_add_u32;
236 } else if (gfx_level >= GFX8) {
237 return aco_opcode::v_add_u16;
238 } else {
239 return aco_opcode::v_add_co_u32;
240 }
241 break;
242 case imul8:
243 case imul16:
244 if (gfx_level >= GFX10) {
245 return aco_opcode::v_mul_lo_u16_e64;
246 } else if (gfx_level >= GFX8) {
247 return aco_opcode::v_mul_lo_u16;
248 } else {
249 return aco_opcode::v_mul_u32_u24;
250 }
251 break;
252 case fadd16: return aco_opcode::v_add_f16;
253 case fmul16: return aco_opcode::v_mul_f16;
254 case imax8:
255 case imax16:
256 if (gfx_level >= GFX10) {
257 return aco_opcode::v_max_i32;
258 } else if (gfx_level >= GFX8) {
259 return aco_opcode::v_max_i16;
260 } else {
261 return aco_opcode::v_max_i32;
262 }
263 break;
264 case imin8:
265 case imin16:
266 if (gfx_level >= GFX10) {
267 return aco_opcode::v_min_i32;
268 } else if (gfx_level >= GFX8) {
269 return aco_opcode::v_min_i16;
270 } else {
271 return aco_opcode::v_min_i32;
272 }
273 break;
274 case umin8:
275 case umin16:
276 if (gfx_level >= GFX10) {
277 return aco_opcode::v_min_u32;
278 } else if (gfx_level >= GFX8) {
279 return aco_opcode::v_min_u16;
280 } else {
281 return aco_opcode::v_min_u32;
282 }
283 break;
284 case umax8:
285 case umax16:
286 if (gfx_level >= GFX10) {
287 return aco_opcode::v_max_u32;
288 } else if (gfx_level >= GFX8) {
289 return aco_opcode::v_max_u16;
290 } else {
291 return aco_opcode::v_max_u32;
292 }
293 break;
294 case fmin16: return aco_opcode::v_min_f16;
295 case fmax16: return aco_opcode::v_max_f16;
296 case iadd32: return gfx_level >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
297 case imul32: return aco_opcode::v_mul_lo_u32;
298 case fadd32: return aco_opcode::v_add_f32;
299 case fmul32: return aco_opcode::v_mul_f32;
300 case imax32: return aco_opcode::v_max_i32;
301 case imin32: return aco_opcode::v_min_i32;
302 case umin32: return aco_opcode::v_min_u32;
303 case umax32: return aco_opcode::v_max_u32;
304 case fmin32: return aco_opcode::v_min_f32;
305 case fmax32: return aco_opcode::v_max_f32;
306 case iand8:
307 case iand16:
308 case iand32: return aco_opcode::v_and_b32;
309 case ixor8:
310 case ixor16:
311 case ixor32: return aco_opcode::v_xor_b32;
312 case ior8:
313 case ior16:
314 case ior32: return aco_opcode::v_or_b32;
315 case iadd64: return aco_opcode::num_opcodes;
316 case imul64: return aco_opcode::num_opcodes;
317 case fadd64: return aco_opcode::v_add_f64_e64;
318 case fmul64: return aco_opcode::v_mul_f64_e64;
319 case imin64: return aco_opcode::num_opcodes;
320 case imax64: return aco_opcode::num_opcodes;
321 case umin64: return aco_opcode::num_opcodes;
322 case umax64: return aco_opcode::num_opcodes;
323 case fmin64: return aco_opcode::v_min_f64_e64;
324 case fmax64: return aco_opcode::v_max_f64_e64;
325 case iand64: return aco_opcode::num_opcodes;
326 case ior64: return aco_opcode::num_opcodes;
327 case ixor64: return aco_opcode::num_opcodes;
328 default: return aco_opcode::num_opcodes;
329 }
330 }
331
332 bool
333 is_vop3_reduce_opcode(aco_opcode opcode)
334 {
335 /* 64-bit reductions are VOP3. */
336 if (opcode == aco_opcode::num_opcodes)
337 return true;
338
339 return instr_info.format[(int)opcode] == Format::VOP3;
340 }
341
342 void
343 emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
344 {
345 Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
346 if (instr->definitions.size() >= 2) {
347 assert(instr->definitions[1].regClass() == bld.lm);
348 instr->definitions[1].setFixed(vcc);
349 }
350 }
351
352 void
353 emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
354 PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
355 unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
356 {
357 Builder bld(ctx->program, &ctx->instructions);
358 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
359 Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
360 Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
361 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
362 Operand src1_64 = Operand(src1_reg, v2);
363 Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
364 Operand vtmp_op64 = Operand(vtmp_reg, v2);
365 if (op == iadd64) {
366 if (ctx->program->gfx_level >= GFX10) {
367 if (identity)
368 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
369 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
370 bound_ctrl);
371 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], Definition(vcc, bld.lm), vtmp_op[0],
372 src1[0]);
373 } else {
374 bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], Definition(vcc, bld.lm), src0[0], src1[0],
375 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
376 }
377 bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], Definition(vcc, bld.lm), src0[1], src1[1],
378 Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
379 } else if (op == iand64) {
380 bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
381 bound_ctrl);
382 bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
383 bound_ctrl);
384 } else if (op == ior64) {
385 bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
386 bound_ctrl);
387 bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
388 bound_ctrl);
389 } else if (op == ixor64) {
390 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
391 bound_ctrl);
392 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
393 bound_ctrl);
394 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
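/* 64-bit min/max: one 64-bit compare sets VCC, then each half is selected with v_cndmask_b32.
 * The comparison is inverted (gt for min, lt for max) because cndmask picks src1 when VCC is
 * set. */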
395 aco_opcode cmp = aco_opcode::num_opcodes;
396 switch (op) {
397 case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
398 case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
399 case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
400 case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
401 default: break;
402 }
403
404 if (identity) {
405 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
406 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
407 }
408 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
409 bound_ctrl);
410 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
411 bound_ctrl);
412
413 bld.vopc(cmp, Definition(vcc, bld.lm), vtmp_op64, src1_64);
414 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
415 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
416 } else if (op == imul64) {
417 /* t4 = dpp(x_hi)
418 * t1 = umul_lo(t4, y_lo)
419 * t3 = dpp(x_lo)
420 * t0 = umul_lo(t3, y_hi)
421 * t2 = iadd(t0, t1)
422 * t5 = umul_hi(t3, y_lo)
423 * res_hi = iadd(t2, t5)
424 * res_lo = umul_lo(t3, y_lo)
425 * Requires that res_hi != src0[0] and res_hi != src1[0]
426 * and that vtmp[0] != res_hi.
427 */
428 if (identity)
429 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
430 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
431 bound_ctrl);
432 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
433 if (identity)
434 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
435 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
436 bound_ctrl);
437 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
438 emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
439 if (identity)
440 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
441 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
442 bound_ctrl);
443 bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
444 emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
445 if (identity)
446 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
447 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
448 bound_ctrl);
449 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
450 }
451 }
452
453 void
454 emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
455 ReduceOp op)
456 {
457 Builder bld(ctx->program, &ctx->instructions);
458 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
459 RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
460 Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
461 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
462 Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
463 Operand src1_64 = Operand(src1_reg, v2);
464
465 if (src0_rc == s1 &&
466 (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
467 assert(vtmp.reg() != 0);
468 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
469 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
470 src0_reg = vtmp;
471 src0[0] = Operand(vtmp, v1);
472 src0[1] = Operand(PhysReg{vtmp + 1}, v1);
473 src0_64 = Operand(vtmp, v2);
474 } else if (src0_rc == s1 && op == iadd64) {
475 assert(vtmp.reg() != 0);
476 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
477 src0[1] = Operand(PhysReg{vtmp + 1}, v1);
478 }
479
480 if (op == iadd64) {
481 if (ctx->program->gfx_level >= GFX10) {
482 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], Definition(vcc, bld.lm), src0[0], src1[0]);
483 } else {
484 bld.vop2(aco_opcode::v_add_co_u32, dst[0], Definition(vcc, bld.lm), src0[0], src1[0]);
485 }
486 bld.vop2(aco_opcode::v_addc_co_u32, dst[1], Definition(vcc, bld.lm), src0[1], src1[1],
487 Operand(vcc, bld.lm));
488 } else if (op == iand64) {
489 bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
490 bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
491 } else if (op == ior64) {
492 bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
493 bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
494 } else if (op == ixor64) {
495 bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
496 bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
497 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
498 aco_opcode cmp = aco_opcode::num_opcodes;
499 switch (op) {
500 case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
501 case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
502 case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
503 case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
504 default: break;
505 }
506
507 bld.vopc(cmp, Definition(vcc, bld.lm), src0_64, src1_64);
508 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
509 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
510 } else if (op == imul64) {
511 if (src1_reg == dst_reg) {
512 /* it's fine if src0==dst but not if src1==dst */
513 std::swap(src0_reg, src1_reg);
514 std::swap(src0[0], src1[0]);
515 std::swap(src0[1], src1[1]);
516 std::swap(src0_64, src1_64);
517 }
518 assert(!(src0_reg == src1_reg));
519 /* t1 = umul_lo(x_hi, y_lo)
520 * t0 = umul_lo(x_lo, y_hi)
521 * t2 = iadd(t0, t1)
522 * t5 = umul_hi(x_lo, y_lo)
523 * res_hi = iadd(t2, t5)
524 * res_lo = umul_lo(x_lo, y_lo)
525 * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
526 */
527 Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
528 Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
529 Operand tmp0_op = src0[1];
530 Operand tmp1_op = src1[1];
531 bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
532 bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
533 emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
534 bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
535 emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
536 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
537 }
538 }
539
540 void
541 emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
542 ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
543 bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
544 {
545 Builder bld(ctx->program, &ctx->instructions);
546 RegClass rc = RegClass(RegType::vgpr, size);
547 Definition dst(dst_reg, rc);
548 Operand src0(src0_reg, rc);
549 Operand src1(src1_reg, rc);
550
551 aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
552 bool vop3 = is_vop3_reduce_opcode(opcode);
553
554 if (!vop3) {
555 if (opcode == aco_opcode::v_add_co_u32)
556 bld.vop2_dpp(opcode, dst, Definition(vcc, bld.lm), src0, src1, dpp_ctrl, row_mask,
557 bank_mask, bound_ctrl);
558 else
559 bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
560 return;
561 }
562
563 if (opcode == aco_opcode::num_opcodes) {
564 emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
565 bound_ctrl, identity);
566 return;
567 }
568
569 if (identity)
570 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
571 if (identity && size >= 2)
572 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);
573
574 for (unsigned i = 0; i < size; i++)
575 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
576 Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
577
578 bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
579 }
580
581 void
582 emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
583 ReduceOp op, unsigned size)
584 {
585 Builder bld(ctx->program, &ctx->instructions);
586 RegClass rc = RegClass(RegType::vgpr, size);
587 Definition dst(dst_reg, rc);
588 Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
589 Operand src1(src1_reg, rc);
590
591 aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
592 bool vop3 = is_vop3_reduce_opcode(opcode);
593
594 if (opcode == aco_opcode::num_opcodes) {
595 emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
596 return;
597 }
598
599 if (vop3) {
600 bld.vop3(opcode, dst, src0, src1);
601 } else if (opcode == aco_opcode::v_add_co_u32) {
602 bld.vop2(opcode, dst, Definition(vcc, bld.lm), src0, src1);
603 } else {
604 bld.vop2(opcode, dst, src0, src1);
605 }
606 }
607
608 void
609 emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
610 unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
611 {
612 Builder bld(ctx->program, &ctx->instructions);
613 for (unsigned i = 0; i < size; i++) {
614 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
615 Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
616 }
617 }
618
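/* Emits one ds_swizzle_b32 per dword. With ds_pattern_bitmode(and, or, xor), each lane i reads
 * lane ((i & and) | or) ^ xor within its group of 32; with (1 << 15) | dpp_quad_perm(...),
 * lanes are permuted within groups of 4 according to the four 2-bit selectors.
 */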
619 void
620 emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
621 {
622 for (unsigned i = 0; i < size; i++) {
623 bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
624 Operand(PhysReg{src + i}, v1), ds_pattern);
625 }
626 }
627
628 void
629 emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
630 PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
631 {
632 assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
633 assert(cluster_size <= ctx->program->wave_size);
634
635 Builder bld(ctx->program, &ctx->instructions);
636
637 Operand identity[2];
638 identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
639 identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
640 Operand vcndmask_identity[2] = {identity[0], identity[1]};
641
642 /* First, copy the source to tmp and set inactive lanes to the identity */
643 bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
644 Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));
645
646 /* On GFX10+ v_writelane_b32/v_cndmask_b32_e64 can take a literal */
647 if (ctx->program->gfx_level < GFX10) {
648 for (unsigned i = 0; i < src.size(); i++) {
649 /* p_exclusive_scan uses identity for v_writelane_b32 */
650 if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
651 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
652 identity[i] = Operand(PhysReg{sitmp + i}, s1);
653
654 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
655 vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
656 } else if (identity[i].isLiteral()) {
657 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
658 vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
659 }
660 }
661 }
662
663 for (unsigned i = 0; i < src.size(); i++) {
664 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
665 vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
666 Operand(stmp, bld.lm));
667 }
668
669 if (reduce_op == iadd8 || reduce_op == imul8 || reduce_op == imax8 || reduce_op == imin8 ||
670 reduce_op == umin8 || reduce_op == umax8 || reduce_op == ixor8 || reduce_op == ior8 ||
671 reduce_op == iand8) {
672 if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
673 aco_ptr<Instruction> sdwa{
674 create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
675 sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
676 sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
677 bool sext = reduce_op == imin8 || reduce_op == imax8;
678 sdwa->sdwa().sel[0] = SubdwordSel(1, 0, sext);
679 sdwa->sdwa().dst_sel = SubdwordSel::dword;
680 bld.insert(std::move(sdwa));
681 } else {
682 aco_opcode opcode;
683
684 if (reduce_op == imin8 || reduce_op == imax8)
685 opcode = aco_opcode::v_bfe_i32;
686 else
687 opcode = aco_opcode::v_bfe_u32;
688
689 bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
690 Operand::c32(8u));
691 }
692 } else if (reduce_op == iadd16 || reduce_op == imul16 || reduce_op == imax16 ||
693 reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16 ||
694 reduce_op == ixor16 || reduce_op == ior16 || reduce_op == iand16 ||
695 reduce_op == fadd16 || reduce_op == fmul16 || reduce_op == fmin16 ||
696 reduce_op == fmax16) {
697 bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
698 reduce_op == umin16 || reduce_op == umax16;
699 if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
700 aco_ptr<Instruction> sdwa{
701 create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
702 sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
703 sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
704 bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
705 sdwa->sdwa().sel[0] = SubdwordSel(2, 0, sext);
706 sdwa->sdwa().dst_sel = SubdwordSel::dword;
707 bld.insert(std::move(sdwa));
708 } else if (ctx->program->gfx_level <= GFX7 ||
709 (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
710 aco_opcode opcode;
711
712 if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
713 opcode = aco_opcode::v_bfe_i32;
714 else
715 opcode = aco_opcode::v_bfe_u32;
716
717 bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
718 Operand::c32(16u));
719 }
720 }
721
722 bool reduction_needs_last_op = false;
723 switch (op) {
724 case aco_opcode::p_reduce:
725 if (cluster_size == 1)
726 break;
727
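/* GFX6-7 have no DPP, so combine lanes with ds_swizzle instead, doubling the handled cluster
 * size at each step. */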
728 if (ctx->program->gfx_level <= GFX7) {
729 reduction_needs_last_op = true;
730 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
731 if (cluster_size == 2)
732 break;
733 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
734 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
735 if (cluster_size == 4)
736 break;
737 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
738 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
739 if (cluster_size == 8)
740 break;
741 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
742 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
743 if (cluster_size == 16)
744 break;
745 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
746 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
747 if (cluster_size == 32)
748 break;
749 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
750 for (unsigned i = 0; i < src.size(); i++)
751 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
752 Operand::zero());
753 // TODO: it would be more efficient to do the last reduction step on SALU
754 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
755 reduction_needs_last_op = false;
756 break;
757 }
758
759 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
760 0xf, false);
761 if (cluster_size == 2)
762 break;
763 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
764 0xf, false);
765 if (cluster_size == 4)
766 break;
767 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
768 false);
769 if (cluster_size == 8)
770 break;
771 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
772 if (cluster_size == 16)
773 break;
774
775 if (ctx->program->gfx_level >= GFX10) {
776 /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
777 for (unsigned i = 0; i < src.size(); i++)
778 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
779 Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());
780
781 if (cluster_size == 32) {
782 reduction_needs_last_op = true;
783 break;
784 }
785
786 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
787 for (unsigned i = 0; i < src.size(); i++)
788 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
789 Operand::zero());
790 // TODO: it would be more efficient to do the last reduction step on SALU
791 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
792 break;
793 }
794
795 if (cluster_size == 32) {
796 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
797 reduction_needs_last_op = true;
798 break;
799 }
800 assert(cluster_size == 64);
801 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
802 false);
803 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
804 false);
805 break;
806 case aco_opcode::p_exclusive_scan:
807 if (ctx->program->gfx_level >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
808 /* shift rows right */
809 emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);
810
811 /* fill in the gaps in rows 1 and 3 */
812 copy_constant_sgpr(bld, Definition(exec, bld.lm), 0x0001'0000'0001'0000ull);
813 for (unsigned i = 0; i < src.size(); i++) {
814 Instruction* perm =
815 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
816 Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
817 Operand::c32(0xffffffffu))
818 .instr;
819 perm->valu().opsel = 1; /* FI (Fetch Inactive) */
820 }
821 copy_constant_sgpr(bld, Definition(exec, bld.lm), UINT64_MAX);
822
823 if (ctx->program->wave_size == 64) {
824 /* fill in the gap in row 2 */
825 for (unsigned i = 0; i < src.size(); i++) {
826 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
827 Operand::c32(31u));
828 bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
829 Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
830 }
831 }
832 std::swap(tmp, vtmp);
833 } else if (ctx->program->gfx_level >= GFX8) {
834 emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
835 } else {
836 // TODO: use LDS on CS with a single write and shifted read
837 /* wavefront shift_right by 1 on SI/CI */
838 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
839 emit_ds_swizzle(bld, tmp, tmp, src.size(),
840 ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
841 copy_constant_sgpr(bld, Definition(exec, s2), 0x1010'1010'1010'1010ull);
842 for (unsigned i = 0; i < src.size(); i++)
843 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
844 Operand(PhysReg{tmp + i}, v1));
845
846 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
847 emit_ds_swizzle(bld, tmp, tmp, src.size(),
848 ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
849 copy_constant_sgpr(bld, Definition(exec, s2), 0x0100'0100'0100'0100ull);
850 for (unsigned i = 0; i < src.size(); i++)
851 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
852 Operand(PhysReg{tmp + i}, v1));
853
854 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
855 emit_ds_swizzle(bld, tmp, tmp, src.size(),
856 ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
857 copy_constant_sgpr(bld, Definition(exec, s2), 0x0001'0000'0001'0000ull);
858 for (unsigned i = 0; i < src.size(); i++)
859 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
860 Operand(PhysReg{tmp + i}, v1));
861
862 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
863 for (unsigned i = 0; i < src.size(); i++) {
864 bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
865 Operand(PhysReg{vtmp + i}, v1));
866 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
867 Operand::zero());
868 bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
869 Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
870 identity[i] = Operand::zero(); /* prevent further uses of identity */
871 }
872 std::swap(tmp, vtmp);
873 }
874
875 for (unsigned i = 0; i < src.size(); i++) {
876 if (!identity[i].isConstant() ||
877 identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
878 if (ctx->program->gfx_level < GFX10)
879 assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
880 identity[i].physReg() == PhysReg{sitmp + i});
881 bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
882 Operand(PhysReg{tmp + i}, v1));
883 }
884 }
885 FALLTHROUGH;
886 case aco_opcode::p_inclusive_scan:
887 assert(cluster_size == ctx->program->wave_size);
888 if (ctx->program->gfx_level <= GFX7) {
889 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
890 copy_constant_sgpr(bld, Definition(exec, s2), 0xaaaa'aaaa'aaaa'aaaaull);
891 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
892
893 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
894 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
895 copy_constant_sgpr(bld, Definition(exec, s2), 0xcccc'cccc'cccc'ccccull);
896 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
897
898 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
899 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
900 copy_constant_sgpr(bld, Definition(exec, s2), 0xf0f0'f0f0'f0f0'f0f0ull);
901 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
902
903 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
904 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
905 copy_constant_sgpr(bld, Definition(exec, s2), 0xff00'ff00'ff00'ff00ull);
906 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
907
908 copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
909 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
910 copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'0000'ffff'0000ull);
911 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
912
913 for (unsigned i = 0; i < src.size(); i++)
914 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
915 Operand::c32(31u));
916 copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
917 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
918 break;
919 }
920
921 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
922 identity);
923 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
924 identity);
925 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
926 identity);
927 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
928 identity);
929 if (ctx->program->gfx_level >= GFX10) {
930 copy_constant_sgpr(bld, Definition(exec, bld.lm), 0xffff'0000'ffff'0000ull);
931 for (unsigned i = 0; i < src.size(); i++) {
932 Instruction* perm =
933 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
934 Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
935 Operand::c32(0xffffffffu))
936 .instr;
937 perm->valu().opsel = 1; /* FI (Fetch Inactive) */
938 }
939 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
940
941 if (ctx->program->wave_size == 64) {
942 copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
943 for (unsigned i = 0; i < src.size(); i++)
944 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
945 Operand::c32(31u));
946 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
947 }
948 } else {
949 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
950 false, identity);
951 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
952 false, identity);
953 }
954 break;
955 default: unreachable("Invalid reduction mode");
956 }
957
958 if (op == aco_opcode::p_reduce) {
959 if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
960 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
961 emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
962 return;
963 }
964
965 if (reduction_needs_last_op)
966 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
967 }
968
969 /* restore exec */
970 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
971
972 if (dst.regClass().type() == RegType::sgpr) {
973 for (unsigned k = 0; k < src.size(); k++) {
974 bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
975 Operand::c32(ctx->program->wave_size - 1));
976 }
977 } else if (dst.physReg() != tmp) {
978 for (unsigned k = 0; k < src.size(); k++) {
979 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
980 Operand(PhysReg{tmp + k}, v1));
981 }
982 }
983 }
984
985 void
986 adjust_bpermute_dst(Builder& bld, Definition dst, Operand input_data)
987 {
988 /* RA assumes that the result is always in the low part of the register, so we have to shift
989 * if it's not there already.
990 */
991 if (input_data.physReg().byte()) {
992 unsigned right_shift = input_data.physReg().byte() * 8;
993 bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
994 Operand(dst.physReg(), dst.regClass()));
995 }
996 }
997
998 void
999 emit_bpermute_permlane(Builder& bld, aco_ptr<Instruction>& instr)
1000 {
1001 /* Emulates proper bpermute on GFX11 in wave64 mode.
1002 *
1003 * Similar to emit_gfx10_wave64_bpermute, but uses the new
1004 * v_permlane64_b32 instruction to swap data between lo and hi halves.
1005 */
1006
1007 assert(bld.program->gfx_level >= GFX11);
1008 assert(bld.program->wave_size == 64);
1009
1010 Definition dst = instr->definitions[0];
1011 Definition tmp_exec = instr->definitions[1];
1012 Definition clobber_scc = instr->definitions[2];
1013 Operand tmp_op = instr->operands[0];
1014 Operand index_x4 = instr->operands[1];
1015 Operand input_data = instr->operands[2];
1016 Operand same_half = instr->operands[3];
1017
1018 assert(dst.regClass() == v1);
1019 assert(tmp_exec.regClass() == bld.lm);
1020 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1021 assert(same_half.regClass() == bld.lm);
1022 assert(tmp_op.regClass() == v1.as_linear());
1023 assert(index_x4.regClass() == v1);
1024 assert(input_data.regClass().type() == RegType::vgpr);
1025 assert(input_data.bytes() <= 4);
1026
1027 Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());
1028
1029 /* Permute the input within the same half-wave. */
1030 bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1031
1032 /* Save EXEC and enable all lanes. */
1033 bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
1034 Operand::c32(-1u), Operand(exec, s2));
1035
1036 /* Copy input data from other half to current half's linear VGPR. */
1037 bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);
1038
1039 /* Permute the input from the other half-wave, write to linear VGPR. */
1040 bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);
1041
1042 /* Restore saved EXEC. */
1043 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1044
1045 /* Select correct permute result. */
1046 bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
1047 same_half);
1048
1049 adjust_bpermute_dst(bld, dst, input_data);
1050 }
1051
1052 void
1053 emit_bpermute_shared_vgpr(Builder& bld, aco_ptr<Instruction>& instr)
1054 {
1055 /* Emulates proper bpermute on GFX10 in wave64 mode.
1056 *
1057 * This is necessary because on GFX10 the bpermute instruction only works
1058 * on half waves (you can think of it as having a cluster size of 32), so we
1059 * manually swap the data between the two halves using two shared VGPRs.
1060 */
1061
1062 assert(bld.program->gfx_level >= GFX10 && bld.program->gfx_level <= GFX10_3);
1063 assert(bld.program->wave_size == 64);
1064
1065 unsigned shared_vgpr_reg_0 = align(bld.program->config->num_vgprs, 4) + 256;
1066 Definition dst = instr->definitions[0];
1067 Definition tmp_exec = instr->definitions[1];
1068 Definition clobber_scc = instr->definitions[2];
1069 Operand index_x4 = instr->operands[0];
1070 Operand input_data = instr->operands[1];
1071 Operand same_half = instr->operands[2];
1072
1073 assert(dst.regClass() == v1);
1074 assert(tmp_exec.regClass() == bld.lm);
1075 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1076 assert(same_half.regClass() == bld.lm);
1077 assert(index_x4.regClass() == v1);
1078 assert(input_data.regClass().type() == RegType::vgpr);
1079 assert(input_data.bytes() <= 4);
1080 assert(dst.physReg() != index_x4.physReg());
1081 assert(dst.physReg() != input_data.physReg());
1082 assert(tmp_exec.physReg() != same_half.physReg());
1083
1084 PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
1085 PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);
1086
1087 /* Permute the input within the same half-wave */
1088 bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1089
1090 /* HI: Copy data from high lanes 32-63 to shared vgpr */
1091 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
1092 dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
1093 /* Save EXEC */
1094 bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
1095 /* Set EXEC to enable LO lanes only */
1096 copy_constant_sgpr(bld, Definition(exec, s2), 0x0000'0000'ffff'ffffull);
1097 /* LO: Copy data from low lanes 0-31 to shared vgpr */
1098 bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
1099 /* LO: bpermute shared vgpr (high lanes' data) */
1100 bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
1101 Operand(shared_vgpr_hi, v1));
1102 /* Set EXEC to enable HI lanes only */
1103 copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
1104 /* HI: bpermute shared vgpr (low lanes' data) */
1105 bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
1106 Operand(shared_vgpr_lo, v1));
1107
1108 /* Only enable lanes which use the other half's data */
1109 bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
1110 Operand(tmp_exec.physReg(), s2), same_half);
1111 /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
1112 bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
1113 0x3, 0xf, false);
1114 /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
1115 bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
1116 0xc, 0xf, false);
1117
1118 /* Restore saved EXEC */
1119 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1120
1121 adjust_bpermute_dst(bld, dst, input_data);
1122 }
1123
1124 void
1125 emit_bpermute_readlane(Builder& bld, aco_ptr<Instruction>& instr)
1126 {
1127 /* Emulates bpermute using readlane instructions */
1128
1129 Operand index = instr->operands[0];
1130 Operand input = instr->operands[1];
1131 Definition dst = instr->definitions[0];
1132 Definition temp_exec = instr->definitions[1];
1133 Definition clobber_vcc = instr->definitions[2];
1134
1135 assert(dst.regClass() == v1);
1136 assert(temp_exec.regClass() == bld.lm);
1137 assert(clobber_vcc.regClass() == bld.lm);
1138 assert(clobber_vcc.physReg() == vcc);
1139 assert(index.regClass() == v1);
1140 assert(index.physReg() != dst.physReg());
1141 assert(input.regClass().type() == RegType::vgpr);
1142 assert(input.bytes() <= 4);
1143 assert(input.physReg() != dst.physReg());
1144
1145 /* Save original EXEC */
1146 bld.sop1(Builder::s_mov, temp_exec, Operand(exec, bld.lm));
1147
1148 /* An "unrolled loop" that is executed for each lane.
1149 * This takes only a few instructions per lane, as opposed to a "real" loop
1150 * with branching, where the branch instruction alone would take 16+ cycles.
1151 */
1152 for (unsigned n = 0; n < bld.program->wave_size; ++n) {
1153 /* Activate the lane which has N for its source index */
1154 if (bld.program->gfx_level >= GFX10)
1155 bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), Operand::c32(n), index);
1156 else
1157 bld.vopc(aco_opcode::v_cmpx_eq_u32, clobber_vcc, Definition(exec, bld.lm), Operand::c32(n),
1158 index);
1159 /* Read the data from lane N */
1160 bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
1161 /* On the active lane, move the data we read from lane N to the destination VGPR */
1162 bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
1163 /* Restore original EXEC */
1164 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(temp_exec.physReg(), bld.lm));
1165 }
1166
1167 adjust_bpermute_dst(bld, dst, input);
1168 }
1169
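/* A pending copy from 'op' to 'def'. uses[i] tracks whether byte i of 'def' is still read by
 * other pending copies; do_copy() skips such bytes until they become free. */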
1170 struct copy_operation {
1171 Operand op;
1172 Definition def;
1173 unsigned bytes;
1174 union {
1175 uint8_t uses[8];
1176 uint64_t is_used = 0;
1177 };
1178 };
1179
1180 void
1181 split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
1182 const copy_operation& src, bool ignore_uses, unsigned max_size)
1183 {
1184 PhysReg def_reg = src.def.physReg();
1185 PhysReg op_reg = src.op.physReg();
1186 def_reg.reg_b += offset;
1187 op_reg.reg_b += offset;
1188
1189 /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10, and on GFX11
1190 * v_lshrrev_b64 doesn't get dual issued. */
1191 if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
1192 src.def.regClass().type() == RegType::vgpr)
1193 max_size = MIN2(max_size, 4);
1194 unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
1195
1196 /* make sure the size is a power of two and reg % bytes == 0 */
1197 unsigned bytes = 1;
1198 for (; bytes <= max_size; bytes *= 2) {
1199 unsigned next = bytes * 2u;
1200 bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
1201 offset + next <= src.bytes && next <= max_size;
1202 if (!src.op.isConstant() && can_increase)
1203 can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
1204 for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
1205 can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
1206 if (!can_increase)
1207 break;
1208 }
1209
1210 *def = Definition(def_reg, src.def.regClass().resize(bytes));
1211 if (src.op.isConstant()) {
1212 assert(bytes >= 1 && bytes <= 8);
1213 uint64_t val = src.op.constantValue64() >> (offset * 8u);
1214 *op = Operand::get_const(ctx->program->gfx_level, val, bytes);
1215 } else {
1216 RegClass op_cls = src.op.regClass().resize(bytes);
1217 *op = Operand(op_reg, op_cls);
1218 op->setTemp(Temp(src.op.tempId(), op_cls));
1219 }
1220 }
1221
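/* Returns a bitmask, relative to a_start, of the positions where [a_start, a_start + a_size)
 * and [b_start, b_start + b_size) overlap. */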
1222 uint32_t
1223 get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
1224 {
1225 int intersection_start = MAX2(b_start - a_start, 0);
1226 int intersection_end = MAX2(b_start + b_size - a_start, 0);
1227 if (intersection_start >= a_size || intersection_end == 0)
1228 return 0;
1229
1230 uint32_t mask = u_bit_consecutive(0, a_size);
1231 return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
1232 }
1233
1234 /* src1 are bytes 0-3. dst/src0 are bytes 4-7. */
1235 void
1236 create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
1237 Operand src0 = Operand(v1))
1238 {
1239 uint32_t swiz_packed =
1240 swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24);
1241
1242 dst = Definition(PhysReg(dst.physReg().reg()), v1);
1243 if (!src1.isConstant())
1244 src1 = Operand(PhysReg(src1.physReg().reg()), v1);
1245 if (src0.isUndefined())
1246 src0 = Operand(dst.physReg(), v1);
1247 else if (!src0.isConstant())
1248 src0 = Operand(PhysReg(src0.physReg().reg()), v1);
1249 bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
1250 }
1251
1252 void
1253 emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
1254 {
1255 /* v_mov_b16 uses 32bit inline constants. */
1256 if (op.isConstant()) {
1257 if (!op.isLiteral() && op.physReg() >= 240) {
1258 /* v_add_f16 is smaller because it can use 16bit fp inline constants. */
1259 Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
1260 instr->valu().opsel[3] = dst.physReg().byte() == 2;
1261 return;
1262 }
1263 op = Operand::c32((int32_t)(int16_t)op.constantValue());
1264 }
1265
1266 Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, dst, op);
1267 instr->valu().opsel[0] = op.physReg().byte() == 2;
1268 instr->valu().opsel[3] = dst.physReg().byte() == 2;
1269
1270 if (op.physReg().reg() < 256 && instr->valu().opsel[0])
1271 instr->format = asVOP3(instr->format);
1272 }
1273
1274 void
1275 copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
1276 {
1277 assert(op.bytes() == dst.bytes());
1278
1279 if (dst.regClass().type() == RegType::sgpr)
1280 return copy_constant_sgpr(bld, dst, op.constantValue64());
1281
1282 bool dual_issue_mov = ctx->program->gfx_level >= GFX11 && ctx->program->wave_size == 64 &&
1283 ctx->program->workgroup_size > 32;
1284 if (dst.bytes() == 4 && op.isLiteral() && !dual_issue_mov) {
1285 uint32_t imm = op.constantValue();
1286 Operand rev_op = Operand::get_const(ctx->program->gfx_level, util_bitreverse(imm), 4);
1287 if (!rev_op.isLiteral()) {
1288 bld.vop1(aco_opcode::v_bfrev_b32, dst, rev_op);
1289 return;
1290 }
1291 }
1292
1293 if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->gfx_level >= GFX8)
1294 op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
1295
1296 if (dst.regClass() == v2) {
1297 if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
1298 bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
1299 } else {
1300 assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
1301 bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
1302 }
1303 } else if (dst.regClass() == v1) {
1304 bld.vop1(aco_opcode::v_mov_b32, dst, op);
1305 } else {
1306 assert(dst.regClass() == v1b || dst.regClass() == v2b);
1307
1308 bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11;
1309 if (dst.regClass() == v1b && use_sdwa) {
1310 uint8_t val = op.constantValue();
1311 Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
1312 if (op32.isLiteral()) {
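/* The sign-extended value would need a literal, so synthesize it instead as a product of two
 * inline constants taken from int8_mul_table (only the low byte is written thanks to SDWA). */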
1313 uint32_t a = (uint32_t)int8_mul_table[val * 2];
1314 uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
1315 bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
1316 Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
1317 Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
1318 } else {
1319 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
1320 }
1321 } else if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX10) {
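/* v_cvt_pk_u8_f32 converts the float source to u8 and inserts it at the byte selected
 * by the second operand while merging the other bytes from the third operand, so one
 * instruction updates just this byte of the register. */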
1322 Operand fop = Operand::c32(fui(float(op.constantValue())));
1323 Operand offset = Operand::c32(dst.physReg().byte());
1324 Operand def_op(PhysReg(dst.physReg().reg()), v1);
1325 bld.vop3(aco_opcode::v_cvt_pk_u8_f32, dst, fop, offset, def_op);
1326 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1327 emit_v_mov_b16(bld, dst, op);
1328 } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
1329 if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
1330 /* use v_mov_b32 to avoid possible issues with denormal flushing or
1331 * NaN. v_add_f16 is still needed for float constants. */
1332 uint32_t val32 = (int32_t)(int16_t)op.constantValue();
1333 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
1334 } else {
1335 bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
1336 }
1337 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1338 op = Operand::c32(op.constantValue());
1339 Instruction* instr = bld.vop3(aco_opcode::v_add_u16_e64, dst, op, Operand::c32(0));
1340 instr->valu().opsel[3] = dst.physReg().byte() == 2;
1341 } else {
1342 uint32_t offset = dst.physReg().byte() * 8u;
1343 uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
1344 uint32_t val = (op.constantValue() << offset) & mask;
1345 dst = Definition(PhysReg(dst.physReg().reg()), v1);
1346 Operand def_op(dst.physReg(), v1);
1347 if (val != mask)
1348 bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
1349 if (val != 0)
1350 bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
1351 }
1352 }
1353 }
1354
1355 bool
1356 do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
1357 PhysReg scratch_sgpr)
1358 {
1359 bool did_copy = false;
1360 for (unsigned offset = 0; offset < copy.bytes;) {
1361 if (copy.uses[offset]) {
1362 offset++;
1363 continue;
1364 }
1365
1366 Definition def;
1367 Operand op;
1368 split_copy(ctx, offset, &def, &op, copy, false, 8);
1369
1370 if (def.physReg() == scc) {
1371 bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
1372 *preserve_scc = true;
1373 } else if (op.isConstant()) {
1374 copy_constant(ctx, bld, def, op);
1375 } else if (def.regClass() == v1) {
1376 bld.vop1(aco_opcode::v_mov_b32, def, op);
1377 } else if (def.regClass() == v2) {
1378 bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1379 } else if (def.regClass() == s1) {
1380 bld.sop1(aco_opcode::s_mov_b32, def, op);
1381 } else if (def.regClass() == s2) {
1382 bld.sop1(aco_opcode::s_mov_b64, def, op);
1383 } else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
1384 uint8_t swiz[] = {4, 5, 6, 7};
1385 swiz[def.physReg().byte()] = op.physReg().byte();
1386 create_bperm(bld, swiz, def, op);
1387 } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1388 emit_v_mov_b16(bld, def, op);
1389 } else if (def.regClass().is_subdword()) {
1390 bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
1391 } else {
1392 unreachable("unsupported copy");
1393 }
1394
1395 did_copy = true;
1396 offset += def.bytes();
1397 }
1398 return did_copy;
1399 }
1400
1401 void
1402 swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
1403 {
1404 if (def.physReg().reg() == op.physReg().reg()) {
1405 assert(def.bytes() != 2); /* handled by caller */
1406 uint8_t swiz[] = {4, 5, 6, 7};
1407 std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
1408 create_bperm(bld, swiz, def, Operand::zero());
1409 return;
1410 }
1411
1412 if (def.bytes() == 2) {
1413 Operand def_as_op = Operand(def.physReg(), def.regClass());
1414 Definition op_as_def = Definition(op.physReg(), op.regClass());
1415 /* v_swap_b16 is not officially supported as VOP3, so it can't be used with v128-255.
1416 * Tests show that VOP3 appears to work correctly, but according to AMD that should
1417 * not be relied on.
1418 */
1419 if (def.physReg() < (256 + 128) && op.physReg() < (256 + 128)) {
1420 Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op);
1421 instr->valu().opsel[0] = op.physReg().byte();
1422 instr->valu().opsel[3] = def.physReg().byte();
1423 } else {
1424 Instruction* instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
1425 instr->valu().opsel[0] = op.physReg().byte();
1426 instr->valu().opsel[1] = def_as_op.physReg().byte();
1427 instr->valu().opsel[3] = def.physReg().byte();
1428 instr = bld.vop3(aco_opcode::v_xor_b16, op_as_def, op, def_as_op);
1429 instr->valu().opsel[0] = op.physReg().byte();
1430 instr->valu().opsel[1] = def_as_op.physReg().byte();
1431 instr->valu().opsel[3] = op_as_def.physReg().byte();
1432 instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
1433 instr->valu().opsel[0] = op.physReg().byte();
1434 instr->valu().opsel[1] = def_as_op.physReg().byte();
1435 instr->valu().opsel[3] = def.physReg().byte();
1436 }
1437 } else {
1438 PhysReg op_half = op.physReg();
1439 op_half.reg_b &= ~1;
1440
1441 PhysReg def_other_half = def.physReg();
1442 def_other_half.reg_b &= ~1;
1443 def_other_half.reg_b ^= 2;
1444
1445 /* We can only swap individual bytes within a single VGPR, so temporarily move both bytes
1446 * into the same VGPR.
1447 */
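/* For illustration: to swap v0.b3 with v1.b1, first swap v0[b0:b1] with v1[b0:b1],
 * then swap v0.b3 with v0.b1 via the same-register path above, and finally swap the
 * halves back so the byte that just came along for the ride is restored. */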
1448 swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1449 swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
1450 swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1451 }
1452 }
1453
1454 void
1455 do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
1456 Pseudo_instruction* pi)
1457 {
1458 unsigned offset = 0;
1459
1460 if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
1461 (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
1462 /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
1463 * swap */
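/* E.g. swapping v0[b0:b2] with v1[b0:b2]: swap the whole dwords v0 and v1, then swap
 * v0.b3 and v1.b3 back to undo the one byte too many. */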
1464 PhysReg op = copy.op.physReg();
1465 PhysReg def = copy.def.physReg();
1466 op.reg_b &= ~0x3;
1467 def.reg_b &= ~0x3;
1468
1469 copy_operation tmp;
1470 tmp.op = Operand(op, v1);
1471 tmp.def = Definition(def, v1);
1472 tmp.bytes = 4;
1473 memset(tmp.uses, 1, 4);
1474 do_swap(ctx, bld, tmp, preserve_scc, pi);
1475
1476 op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1477 def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1478 tmp.op = Operand(op, v1b);
1479 tmp.def = Definition(def, v1b);
1480 tmp.bytes = 1;
1481 tmp.uses[0] = 1;
1482 do_swap(ctx, bld, tmp, preserve_scc, pi);
1483
1484 offset = copy.bytes;
1485 }
1486
1487 for (; offset < copy.bytes;) {
1488 Definition def;
1489 Operand op;
1490 unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
1491 split_copy(ctx, offset, &def, &op, copy, true, max_size);
1492
1493 assert(op.regClass() == def.regClass());
1494 Operand def_as_op = Operand(def.physReg(), def.regClass());
1495 Definition op_as_def = Definition(op.physReg(), op.regClass());
1496 if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
1497 bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1498 } else if (def.regClass() == v1) {
1499 assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
1500 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1501 bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1502 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1503 } else if (op.physReg() == scc || def.physReg() == scc) {
1504 /* we need to swap scc and another sgpr */
1505 assert(!preserve_scc);
1506
1507 PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1508
1509 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1510 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
1511 Operand::zero());
1512 bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1513 } else if (def.regClass() == s1) {
1514 if (preserve_scc) {
1515 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1516 bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1517 bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1518 } else {
1519 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1520 bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1521 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1522 }
1523 } else if (def.regClass() == s2) {
1524 if (preserve_scc)
1525 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1526 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1527 bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1528 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1529 if (preserve_scc)
1530 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
1531 Operand::zero());
1532 } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
1533 bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
1534 Operand::c32(2u));
1535 } else {
1536 assert(def.regClass().is_subdword());
1537 if (ctx->program->gfx_level >= GFX11) {
1538 swap_subdword_gfx11(bld, def, op);
1539 } else {
1540 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1541 bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1542 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1543 }
1544 }
1545
1546 offset += def.bytes();
1547 }
1548
1549 if (ctx->program->gfx_level <= GFX7)
1550 return;
1551
1552 /* fixup in case we swapped bytes we shouldn't have */
1553 copy_operation tmp_copy = copy;
1554 tmp_copy.op.setFixed(copy.def.physReg());
1555 tmp_copy.def.setFixed(copy.op.physReg());
1556 do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1557 }
1558
1559 void
1560 do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1561 {
1562 assert(ctx->program->gfx_level >= GFX8);
1563
1564 if (lo.isConstant() && hi.isConstant()) {
1565 copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
1566 return;
1567 }
1568
1569 /* v_pack_b32_f16 can be used for bit exact copies if:
1570 * - fp16 input denorms are enabled, otherwise they get flushed to zero
1571 * - signalling input NaNs are kept, which is the case with IEEE_MODE=0
1572 * GFX12+ always quiets signalling NaNs, IEEE_MODE was removed
1573 */
1574 bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1575 (ctx->program->gfx_level >= GFX10 ||
1576 (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral())) &&
1577 ctx->program->gfx_level < GFX12;
1578
1579 if (can_use_pack) {
1580 Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1581 /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
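/* For example, lo sitting in byte 2 of its register and hi in byte 0 gives
 * opsel = 0b01: take the high half of src0 and the low half of src1. */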
1582 instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1583 return;
1584 }
1585
1586 /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
1587 if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1588 (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
1589 ctx->program->gfx_level >= GFX10)))) {
1590 if (hi.isConstant())
1591 bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
1592 Operand::c32(2u));
1593 else
1594 bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1595 return;
1596 }
1597
1598 Definition def_lo = Definition(def.physReg(), v2b);
1599 Definition def_hi = Definition(def.physReg().advance(2), v2b);
1600
1601 if (lo.isConstant()) {
1602 /* move hi and zero low bits */
1603 if (hi.physReg().byte() == 0)
1604 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1605 else
1606 bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1607 if (lo.constantValue())
1608 bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1609 Operand(def.physReg(), v1));
1610 return;
1611 }
1612 if (hi.isConstant()) {
1613 /* move lo and zero high bits */
1614 if (lo.physReg().byte() == 2)
1615 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1616 else if (ctx->program->gfx_level >= GFX11)
1617 bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
1618 else
1619 bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1620 if (hi.constantValue())
1621 bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1622 Operand(def.physReg(), v1));
1623 return;
1624 }
1625
1626 if (lo.physReg().reg() == def.physReg().reg()) {
1627 /* lo is in the high bits of def */
1628 assert(lo.physReg().byte() == 2);
1629 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1630 lo.setFixed(def.physReg());
1631 } else if (hi.physReg() == def.physReg()) {
1632 /* hi is in the low bits of def */
1633 assert(hi.physReg().byte() == 0);
1634 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1635 hi.setFixed(def.physReg().advance(2));
1636 } else if (ctx->program->gfx_level >= GFX8) {
1637 /* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
1638 * op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
1639 */
1640 assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1641 Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1642 PhysReg reg = def.physReg().advance(op.physReg().byte());
1643 bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1644 op.setFixed(reg);
1645 }
1646
1647 /* either hi or lo is already placed correctly */
1648 if (ctx->program->gfx_level >= GFX11) {
1649 if (lo.physReg().reg() == def.physReg().reg())
1650 emit_v_mov_b16(bld, def_hi, hi);
1651 else
1652 emit_v_mov_b16(bld, def_lo, lo);
1653 } else {
1654 if (lo.physReg().reg() == def.physReg().reg())
1655 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1656 else
1657 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1658 }
1659 }
1660
1661 void
1662 try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
1663 copy_operation& copy)
1664 {
1665 // TODO try more relaxed alignment for subdword copies
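/* E.g. the adjacent dword copies s4 = s8 and s5 = s9 can be merged into a single
 * s[4:5] = s[8:9] copy, provided alignment and constant representability allow it. */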
1666 unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
1667 unsigned next_op_align = next_def_align;
1668 if (copy.def.regClass().type() == RegType::vgpr)
1669 next_def_align = MIN2(next_def_align, 4);
1670 if (copy.op.regClass().type() == RegType::vgpr)
1671 next_op_align = MIN2(next_op_align, 4);
1672
1673 if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
1674 (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
1675 return;
1676
1677 auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
1678 if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
1679 copy.op.isConstant() != other->second.op.isConstant())
1680 return;
1681
1682 /* don't create 64-bit copies before GFX10 */
1683 if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
1684 ctx->program->gfx_level < GFX10)
1685 return;
1686
1687 unsigned new_size = copy.bytes + other->second.bytes;
1688 if (copy.op.isConstant()) {
1689 uint64_t val =
1690 copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
1691 if (!util_is_power_of_two_or_zero(new_size))
1692 return;
1693 if (!Operand::is_constant_representable(val, new_size, true,
1694 copy.def.regClass().type() == RegType::vgpr))
1695 return;
1696 copy.op = Operand::get_const(ctx->program->gfx_level, val, new_size);
1697 } else {
1698 if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
1699 return;
1700 copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
1701 }
1702
1703 copy.bytes = new_size;
1704 copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
1705 copy_map.erase(other);
1706 }
1707
1708 void
1709 handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
1710 amd_gfx_level gfx_level, Pseudo_instruction* pi)
1711 {
1712 Builder bld(ctx->program, &ctx->instructions);
1713 unsigned num_instructions_before = ctx->instructions.size();
1714 aco_ptr<Instruction> mov;
1715 bool writes_scc = false;
1716
1717 /* count the number of uses for each dst reg */
1718 for (auto it = copy_map.begin(); it != copy_map.end();) {
1719
1720 if (it->second.def.physReg() == scc)
1721 writes_scc = true;
1722
1723 assert(!pi->needs_scratch_reg || it->second.def.physReg() != pi->scratch_sgpr);
1724
1725 /* if src and dst reg are the same, remove operation */
1726 if (it->first == it->second.op.physReg()) {
1727 it = copy_map.erase(it);
1728 continue;
1729 }
1730
1731 /* split large copies */
1732 if (it->second.bytes > 8) {
1733 assert(!it->second.op.isConstant());
1734 assert(!it->second.def.regClass().is_subdword());
1735 RegClass rc = it->second.def.regClass().resize(it->second.def.bytes() - 8);
1736 Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1737 rc = it->second.op.regClass().resize(it->second.op.bytes() - 8);
1738 Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1739 copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1740 copy_map[hi_def.physReg()] = copy;
1741 assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1742 it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().resize(8));
1743 it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().resize(8));
1744 it->second.bytes = 8;
1745 }
1746
1747 try_coalesce_copies(ctx, copy_map, it->second);
1748
1749 /* check if the definition reg is used by another copy operation */
1750 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1751 if (copy.second.op.isConstant())
1752 continue;
1753 for (uint16_t i = 0; i < it->second.bytes; i++) {
1754 /* distance might underflow */
1755 unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1756 if (distance < copy.second.bytes)
1757 it->second.uses[i] += 1;
1758 }
1759 }
1760
1761 ++it;
1762 }
1763
1764 /* first, handle paths in the location transfer graph */
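/* Copies whose destination bytes aren't read by any other pending copy can be emitted
 * immediately; repeating this peels off all acyclic paths of the graph, and whatever
 * remains afterwards consists of cycles that are resolved with swaps further below. */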
1765 bool preserve_scc = pi->needs_scratch_reg && pi->scratch_sgpr != scc && !writes_scc;
1766 bool skip_partial_copies = true;
1767 for (auto it = copy_map.begin();;) {
1768 if (copy_map.empty()) {
1769 ctx->program->statistics[aco_statistic_copies] +=
1770 ctx->instructions.size() - num_instructions_before;
1771 return;
1772 }
1773 if (it == copy_map.end()) {
1774 if (!skip_partial_copies)
1775 break;
1776 skip_partial_copies = false;
1777 it = copy_map.begin();
1778 }
1779
1780 /* check if we can pack one register at once */
1781 if (it->first.byte() == 0 && it->second.bytes == 2) {
1782 PhysReg reg_hi = it->first.advance(2);
1783 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1784 if (other != copy_map.end() && other->second.bytes == 2) {
1785 /* check if the target register is otherwise unused */
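/* is_used == 0x0101 means each of the two bytes is read exactly once; if the only
 * reader is the other half of this very pack, the value is consumed by the pack
 * itself and the register still counts as unused. */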
1786 bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
1787 other->second.op.physReg() == it->first);
1788 bool unused_hi = !other->second.is_used ||
1789 (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
1790 if (unused_lo && unused_hi) {
1791 Operand lo = it->second.op;
1792 Operand hi = other->second.op;
1793 do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1794 copy_map.erase(it);
1795 copy_map.erase(other);
1796
1797 for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
1798 for (uint16_t i = 0; i < other2.second.bytes; i++) {
1799 /* distance might underflow */
1800 unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
1801 unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
1802 if (distance_lo < 2 || distance_hi < 2)
1803 other2.second.uses[i] -= 1;
1804 }
1805 }
1806 it = copy_map.begin();
1807 continue;
1808 }
1809 }
1810 }
1811
1812 /* optimize constant copies to an aligned sgpr pair that's otherwise unused. */
1813 if (it->first <= exec && (it->first % 2) == 0 && it->second.bytes == 4 &&
1814 it->second.op.isConstant() && !it->second.is_used) {
1815 PhysReg reg_hi = it->first.advance(4);
1816 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1817 if (other != copy_map.end() && other->second.bytes == 4 && other->second.op.isConstant() &&
1818 !other->second.is_used) {
1819 uint64_t constant =
1820 it->second.op.constantValue64() | (other->second.op.constantValue64() << 32);
1821 copy_constant_sgpr(bld, Definition(it->first, s2), constant);
1822 copy_map.erase(it);
1823 copy_map.erase(other);
1824 it = copy_map.begin();
1825 continue;
1826 }
1827 }
1828
1829 /* find portions where the target reg is not used as operand for any other copy */
1830 if (it->second.is_used) {
1831 if (it->second.op.isConstant() || skip_partial_copies) {
1832 /* we have to skip constants until is_used=0.
1833 * we also skip partial copies at the beginning to help coalescing */
1834 ++it;
1835 continue;
1836 }
1837
1838 unsigned has_zero_use_bytes = 0;
1839 for (unsigned i = 0; i < it->second.bytes; i++)
1840 has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1841
1842 if (has_zero_use_bytes) {
1843 /* Skipping partial copying and doing a v_swap_b32 and then fixup
1844 * copies is usually beneficial for sub-dword copies, but if doing
1845 * a partial copy allows further copies, it should be done instead. */
1846 bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1847 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1848 if (partial_copy)
1849 break;
1850 for (uint16_t i = 0; i < copy.second.bytes; i++) {
1851 /* distance might underflow */
1852 unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1853 if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1854 !it->second.uses[distance])
1855 partial_copy = true;
1856 }
1857 }
1858
1859 if (!partial_copy) {
1860 ++it;
1861 continue;
1862 }
1863 } else {
1864 /* full target reg is used: register swapping needed */
1865 ++it;
1866 continue;
1867 }
1868 }
1869
1870 bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1871 skip_partial_copies = did_copy;
1872 std::pair<PhysReg, copy_operation> copy = *it;
1873
1874 if (it->second.is_used == 0) {
1875 /* the target reg is not used as operand for any other copy, so we
1876 * copied to all of it */
1877 copy_map.erase(it);
1878 it = copy_map.begin();
1879 } else {
1880 /* we only performed some portions of this copy, so split it to only
1881 * leave the portions that still need to be done */
1882 copy_operation original = it->second; /* the map insertion below can overwrite this */
1883 copy_map.erase(it);
1884 for (unsigned offset = 0; offset < original.bytes;) {
1885 if (original.uses[offset] == 0) {
1886 offset++;
1887 continue;
1888 }
1889 Definition def;
1890 Operand op;
1891 split_copy(ctx, offset, &def, &op, original, false, 8);
1892
1893 copy_operation new_copy = {op, def, def.bytes()};
1894 for (unsigned i = 0; i < new_copy.bytes; i++)
1895 new_copy.uses[i] = original.uses[i + offset];
1896 copy_map[def.physReg()] = new_copy;
1897
1898 offset += def.bytes();
1899 }
1900
1901 it = copy_map.begin();
1902 }
1903
1904 /* Reduce the number of uses of the operand reg by one. Do this after
1905 * splitting the copy or removing it in case the copy writes to its own
1906 * operand (for example, v[7:8] = v[8:9]) */
1907 if (did_copy && !copy.second.op.isConstant()) {
1908 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1909 for (uint16_t i = 0; i < other.second.bytes; i++) {
1910 /* distance might underflow */
1911 unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
1912 if (distance < copy.second.bytes && !copy.second.uses[distance])
1913 other.second.uses[i] -= 1;
1914 }
1915 }
1916 }
1917 }
1918
1919 /* all target regs are needed as an operand somewhere, which means all entries are part of a cycle */
1920 unsigned largest = 0;
1921 for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
1922 largest = MAX2(largest, op.second.bytes);
1923
1924 while (!copy_map.empty()) {
1925
1926 /* Perform larger swaps first, because larger swaps can make other
1927 * swaps unnecessary. */
1928 auto it = copy_map.begin();
1929 for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
1930 if (it2->second.bytes > it->second.bytes) {
1931 it = it2;
1932 if (it->second.bytes == largest)
1933 break;
1934 }
1935 }
1936
1937 /* should already be done */
1938 assert(!it->second.op.isConstant());
1939
1940 assert(it->second.op.isFixed());
1941 assert(it->second.def.regClass() == it->second.op.regClass());
1942
1943 if (it->first == it->second.op.physReg()) {
1944 copy_map.erase(it);
1945 continue;
1946 }
1947
1948 if (it->second.def.getTemp().type() == RegType::sgpr) {
1949 assert(it->second.def.physReg() != pi->scratch_sgpr);
1950 assert(pi->needs_scratch_reg);
1951 assert(!preserve_scc || pi->scratch_sgpr != scc);
1952 }
1953
1954 /* to resolve the cycle, we have to swap the src reg with the dst reg */
1955 copy_operation swap = it->second;
1956
1957 /* if this is self-intersecting, we have to split it because
1958 * self-intersecting swaps don't make sense */
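/* E.g. an 8-byte swap of v[0:1] with v[1:2] overlaps itself; it gets split into a
 * 4-byte swap of v0 and v1 plus a remaining 4-byte entry that is put back into the
 * map. */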
1959 PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
1960 if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
1961 unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
1962
1963 copy_operation remaining;
1964 src.reg_b += offset;
1965 dst.reg_b += offset;
1966 remaining.bytes = swap.bytes - offset;
1967 memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
1968 remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
1969 remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
1970 copy_map[dst] = remaining;
1971
1972 memset(swap.uses + offset, 0, swap.bytes - offset);
1973 swap.bytes = offset;
1974 }
1975
1976 /* GFX6-7 can only swap full registers */
1977 assert(ctx->program->gfx_level > GFX7 || (swap.bytes % 4) == 0);
1978
1979 do_swap(ctx, bld, swap, preserve_scc, pi);
1980
1981 /* remove from map */
1982 copy_map.erase(it);
1983
1984 /* change the operand reg of the target's uses and split uses if needed */
1985 uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
1986 for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
1987 if (target->second.op.physReg() == swap.def.physReg() &&
1988 swap.bytes == target->second.bytes) {
1989 target->second.op.setFixed(swap.op.physReg());
1990 break;
1991 }
1992
1993 uint32_t imask =
1994 get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
1995 target->second.op.physReg().reg_b, target->second.bytes);
1996
1997 if (!imask)
1998 continue;
1999
2000 int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
2001
2002 /* split and update the middle (the portion that reads the swap's
2003 * definition) to read the swap's operand instead */
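/* For illustration: if a 4-byte swap wrote v0 and a pending copy reads v0.b2..v1.b1,
 * the two bytes that read v0.b2-b3 become a "middle" entry redirected to the swap's
 * source, while the bytes reading v1.b0-b1 are kept as a separate "after" entry. */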
2004 int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
2005 int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
2006 int before_bytes = MAX2(-offset, 0);
2007 int after_bytes = MAX2(target_op_end - swap_def_end, 0);
2008 int middle_bytes = target->second.bytes - before_bytes - after_bytes;
2009
2010 if (after_bytes) {
2011 unsigned after_offset = before_bytes + middle_bytes;
2012 assert(after_offset > 0);
2013 copy_operation copy;
2014 copy.bytes = after_bytes;
2015 memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
2016 RegClass rc = target->second.op.regClass().resize(after_bytes);
2017 copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
2018 copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
2019 copy_map[copy.def.physReg()] = copy;
2020 }
2021
2022 if (middle_bytes) {
2023 copy_operation copy;
2024 copy.bytes = middle_bytes;
2025 memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
2026 RegClass rc = target->second.op.regClass().resize(middle_bytes);
2027 copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
2028 copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
2029 copy_map[copy.def.physReg()] = copy;
2030 }
2031
2032 if (before_bytes) {
2033 copy_operation copy;
2034 target->second.bytes = before_bytes;
2035 RegClass rc = target->second.op.regClass().resize(before_bytes);
2036 target->second.op = Operand(target->second.op.physReg(), rc);
2037 target->second.def = Definition(target->second.def.physReg(), rc);
2038 memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
2039 }
2040
2041 /* break early since we know each byte of the swap's definition is used
2042 * at most once */
2043 bytes_left &= ~imask;
2044 if (!bytes_left)
2045 break;
2046 }
2047 }
2048 ctx->program->statistics[aco_statistic_copies] +=
2049 ctx->instructions.size() - num_instructions_before;
2050 }
2051
2052 void
2053 handle_operands_linear_vgpr(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
2054 amd_gfx_level gfx_level, Pseudo_instruction* pi)
2055 {
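/* Linear VGPRs must hold their value in all lanes, so the copies are emitted twice:
 * once for the currently active lanes and once with exec inverted for the inactive
 * ones, saving and restoring SCC around the exec flips when a scratch SGPR is
 * available. */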
2056 Builder bld(ctx->program, &ctx->instructions);
2057
2058 for (auto& copy : copy_map) {
2059 copy.second.op =
2060 Operand(copy.second.op.physReg(), RegClass::get(RegType::vgpr, copy.second.op.bytes()));
2061 copy.second.def = Definition(copy.second.def.physReg(),
2062 RegClass::get(RegType::vgpr, copy.second.def.bytes()));
2063 }
2064
2065 std::map<PhysReg, copy_operation> second_map(copy_map);
2066 handle_operands(second_map, ctx, gfx_level, pi);
2067
2068 assert(pi->needs_scratch_reg);
2069 PhysReg scratch_sgpr = pi->scratch_sgpr;
2070 if (scratch_sgpr != scc) {
2071 bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
2072 pi->scratch_sgpr = scc;
2073 }
2074 bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
2075
2076 handle_operands(copy_map, ctx, gfx_level, pi);
2077
2078 bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
2079 if (scratch_sgpr != scc) {
2080 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
2081 Operand::zero());
2082 pi->scratch_sgpr = scratch_sgpr;
2083 }
2084
2085 ctx->program->statistics[aco_statistic_copies] += scratch_sgpr == scc ? 2 : 4;
2086 }
2087
2088 void
2089 emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
2090 {
2091 if (bld.program->gfx_level >= GFX10) {
2092 if (set_round)
2093 bld.sopp(aco_opcode::s_round_mode, new_mode.round);
2094 if (set_denorm)
2095 bld.sopp(aco_opcode::s_denorm_mode, new_mode.denorm);
2096 } else if (set_round || set_denorm) {
2097 /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
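/* Here that is (7 << 11) | 1: an 8-bit field (size - 1 = 7) holding the round and
 * denorm bits, starting at offset 0 of the MODE register. */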
2098 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(new_mode.val), (7 << 11) | 1);
2099 }
2100 }
2101
2102 void
2103 emit_set_mode_from_block(Builder& bld, Program& program, Block* block)
2104 {
2105 float_mode initial;
2106 initial.val = program.config->float_mode;
2107
2108 bool initial_unknown =
2109 program.info.merged_shader_compiled_separately &&
2110 (program.stage.sw == SWStage::GS || program.stage.sw == SWStage::TCS);
2111 bool is_start = block->index == 0;
2112 bool set_round = is_start && (initial_unknown || block->fp_mode.round != initial.round);
2113 bool set_denorm = is_start && (initial_unknown || block->fp_mode.denorm != initial.denorm);
2114 if (block->kind & block_kind_top_level) {
2115 for (unsigned pred : block->linear_preds) {
2116 if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
2117 set_round = true;
2118 if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
2119 set_denorm = true;
2120 }
2121 }
2122 /* only allow changing modes at top-level blocks so this doesn't break
2123 * the "jump over empty blocks" optimization */
2124 assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
2125 emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
2126 }
2127
2128 void
2129 lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
2130 {
2131 Operand linear_vgpr = instr->operands[3];
2132
2133 unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
2134 unsigned vaddr_size = linear_vgpr.size();
2135 unsigned num_copied_vgprs = instr->operands.size() - 4;
2136 nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
2137 ? nsa_size
2138 : 0;
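/* If NSA can't be used, all address components are first gathered into the contiguous
 * linear VGPR via the parallel-copy below; otherwise the copied operands are passed as
 * separate NSA registers and the rest of the linear VGPR stays in place. */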
2139
2140 Operand vaddr[16];
2141 unsigned num_vaddr = 0;
2142
2143 if (nsa_size) {
2144 assert(num_copied_vgprs <= nsa_size);
2145 for (unsigned i = 0; i < num_copied_vgprs; i++)
2146 vaddr[num_vaddr++] = instr->operands[4 + i];
2147 for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
2148 vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
2149 if (vaddr_size > nsa_size) {
2150 RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
2151 vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
2152 }
2153 } else {
2154 PhysReg reg = linear_vgpr.physReg();
2155 std::map<PhysReg, copy_operation> copy_operations;
2156 for (unsigned i = 4; i < instr->operands.size(); i++) {
2157 Operand arg = instr->operands[i];
2158 Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
2159 copy_operations[def.physReg()] = {arg, def, def.bytes()};
2160 reg = reg.advance(arg.bytes());
2161 }
2162 vaddr[num_vaddr++] = linear_vgpr;
2163
2164 Pseudo_instruction pi = {};
2165 handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
2166 }
2167
2168 instr->mimg().strict_wqm = false;
2169
2170 if ((3 + num_vaddr) > instr->operands.size()) {
2171 Instruction* new_instr =
2172 create_instruction(instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
2173 std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
2174 new_instr->definitions.begin());
2175 new_instr->operands[0] = instr->operands[0];
2176 new_instr->operands[1] = instr->operands[1];
2177 new_instr->operands[2] = instr->operands[2];
2178 memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
2179 sizeof(MIMG_instruction) - sizeof(Instruction));
2180 instr.reset(new_instr);
2181 } else {
2182 while (instr->operands.size() > (3 + num_vaddr))
2183 instr->operands.pop_back();
2184 }
2185 std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
2186 }
2187
2188 } /* end namespace */
2189
2190 void
2191 hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scratch_offset)
2192 {
2193 /* Since we know what the high 16 bits of scratch_hi are, we can set all the high 16
2194 * bits in the same instruction that adds the carry.
2195 */
2196 Operand hi_add = Operand::c32(0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1));
2197 Operand scratch_addr_lo(scratch_addr.physReg(), s1);
2198 Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
2199
2200 if (bld.program->gfx_level >= GFX10) {
2201 PhysReg scratch_lo = def.physReg();
2202 PhysReg scratch_hi = def.physReg().advance(4);
2203
2204 bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo, s1), Definition(scc, s1),
2205 scratch_addr_lo, scratch_offset);
2206 bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi, s1), Definition(scc, s1),
2207 scratch_addr_hi, hi_add, Operand(scc, s1));
2208
2209 /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
2210 * 20/21) */
2211 bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_lo, s1), (31 << 11) | 20);
2212 bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_hi, s1), (31 << 11) | 21);
2213 } else {
2214 bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
2215 scratch_addr_lo, scratch_offset);
2216 bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
2217 scratch_addr_hi, hi_add, Operand(scc, s1));
2218 }
2219 }
2220
2221 void
2222 lower_to_hw_instr(Program* program)
2223 {
2224 gfx9_pops_done_msg_bounds pops_done_msg_bounds;
2225 if (program->has_pops_overlapped_waves_wait && program->gfx_level < GFX11) {
2226 pops_done_msg_bounds = gfx9_pops_done_msg_bounds(program);
2227 }
2228
2229 Block* discard_exit_block = NULL;
2230 Block* discard_pops_done_and_exit_block = NULL;
2231
2232 int end_with_regs_block_index = -1;
2233
2234 bool should_dealloc_vgprs = dealloc_vgprs(program);
2235
2236 for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
2237 Block* block = &program->blocks[block_idx];
2238 lower_context ctx;
2239 ctx.program = program;
2240 ctx.block = block;
2241 ctx.instructions.reserve(block->instructions.size());
2242 Builder bld(program, &ctx.instructions);
2243
2244 emit_set_mode_from_block(bld, *program, block);
2245
2246 for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
2247 aco_ptr<Instruction>& instr = block->instructions[instr_idx];
2248
2249 /* Send the ordered section done message from the middle of the block if needed (if the
2250 * ordered section is ended by an instruction inside this block).
2251 * Also make sure the done message is sent if it's needed in case early exit happens for
2252 * any reason.
2253 */
2254 if ((block_idx == pops_done_msg_bounds.end_block_idx() &&
2255 instr_idx == pops_done_msg_bounds.instr_after_end_idx()) ||
2256 (instr->opcode == aco_opcode::s_endpgm &&
2257 pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx))) {
2258 bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2259 }
2260
2261 aco_ptr<Instruction> mov;
2262 if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test &&
2263 instr->opcode != aco_opcode::p_debug_info) {
2264 Pseudo_instruction* pi = &instr->pseudo();
2265
2266 switch (instr->opcode) {
2267 case aco_opcode::p_extract_vector: {
2268 PhysReg reg = instr->operands[0].physReg();
2269 Definition& def = instr->definitions[0];
2270 reg.reg_b += instr->operands[1].constantValue() * def.bytes();
2271
2272 if (reg == def.physReg())
2273 break;
2274
2275 RegClass op_rc = def.regClass().is_subdword()
2276 ? def.regClass()
2277 : RegClass(instr->operands[0].getTemp().type(), def.size());
2278 std::map<PhysReg, copy_operation> copy_operations;
2279 copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
2280 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2281 break;
2282 }
2283 case aco_opcode::p_create_vector:
2284 case aco_opcode::p_start_linear_vgpr: {
2285 if (instr->operands.empty())
2286 break;
2287
2288 std::map<PhysReg, copy_operation> copy_operations;
2289 PhysReg reg = instr->definitions[0].physReg();
2290
2291 for (const Operand& op : instr->operands) {
2292 RegClass rc = RegClass::get(instr->definitions[0].regClass().type(), op.bytes());
2293 if (op.isConstant()) {
2294 const Definition def = Definition(reg, rc);
2295 copy_operations[reg] = {op, def, op.bytes()};
2296 reg.reg_b += op.bytes();
2297 continue;
2298 }
2299 if (op.isUndefined()) {
2300 // TODO: coalesce subdword copies if dst byte is 0
2301 reg.reg_b += op.bytes();
2302 continue;
2303 }
2304
2305 RegClass rc_def = op.regClass().is_subdword() ? op.regClass() : rc;
2306 const Definition def = Definition(reg, rc_def);
2307 copy_operations[def.physReg()] = {op, def, op.bytes()};
2308 reg.reg_b += op.bytes();
2309 }
2310 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2311 break;
2312 }
2313 case aco_opcode::p_split_vector: {
2314 std::map<PhysReg, copy_operation> copy_operations;
2315 PhysReg reg = instr->operands[0].physReg();
2316
2317 for (const Definition& def : instr->definitions) {
2318 RegClass rc_op = def.regClass().is_subdword()
2319 ? def.regClass()
2320 : instr->operands[0].getTemp().regClass().resize(def.bytes());
2321 const Operand op = Operand(reg, rc_op);
2322 copy_operations[def.physReg()] = {op, def, def.bytes()};
2323 reg.reg_b += def.bytes();
2324 }
2325 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2326 break;
2327 }
2328 case aco_opcode::p_parallelcopy: {
2329 std::map<PhysReg, copy_operation> copy_operations;
2330 bool linear_vgpr = false;
2331 bool non_linear_vgpr = false;
2332 for (unsigned j = 0; j < instr->operands.size(); j++) {
2333 assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
2334 copy_operations[instr->definitions[j].physReg()] = {
2335 instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
2336 linear_vgpr |= instr->definitions[j].regClass().is_linear_vgpr();
2337 non_linear_vgpr |= !instr->definitions[j].regClass().is_linear_vgpr();
2338 }
2339 assert(!linear_vgpr || !non_linear_vgpr);
2340 if (linear_vgpr)
2341 handle_operands_linear_vgpr(copy_operations, &ctx, program->gfx_level, pi);
2342 else
2343 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2344 break;
2345 }
2346 case aco_opcode::p_exit_early_if_not: {
2347 /* don't bother with an early exit near the end of the program */
2348 if ((block->instructions.size() - 1 - instr_idx) <= 5 &&
2349 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
2350 unsigned null_exp_dest =
2351 program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
2352 bool ignore_early_exit = true;
2353
2354 for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
2355 const aco_ptr<Instruction>& instr2 = block->instructions[k];
2356 if (instr2->opcode == aco_opcode::s_endpgm ||
2357 instr2->opcode == aco_opcode::p_logical_end)
2358 continue;
2359 else if (instr2->opcode == aco_opcode::exp &&
2360 instr2->exp().dest == null_exp_dest &&
2361 instr2->exp().enabled_mask == 0)
2362 continue;
2363 else if (instr2->opcode == aco_opcode::p_parallelcopy &&
2364 instr2->definitions[0].isFixed() &&
2365 instr2->definitions[0].physReg() == exec)
2366 continue;
2367 else if (instr2->opcode == aco_opcode::s_sendmsg &&
2368 instr2->salu().imm == sendmsg_dealloc_vgprs)
2369 continue;
2370
2371 ignore_early_exit = false;
2372 }
2373
2374 if (ignore_early_exit)
2375 break;
2376 }
2377
2378 const bool discard_sends_pops_done =
2379 pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx);
2380
2381 Block* discard_block =
2382 discard_sends_pops_done ? discard_pops_done_and_exit_block : discard_exit_block;
2383 if (!discard_block) {
2384 discard_block = program->create_and_insert_block();
2385 discard_block->kind = block_kind_discard_early_exit;
2386 if (discard_sends_pops_done) {
2387 discard_pops_done_and_exit_block = discard_block;
2388 } else {
2389 discard_exit_block = discard_block;
2390 }
2391 block = &program->blocks[block_idx];
2392
2393 /* sendmsg(dealloc_vgprs) releases scratch, so it isn't safe if there is an
2394 * in-progress scratch store. */
2395 wait_imm wait;
2396 if (should_dealloc_vgprs && uses_scratch(program))
2397 wait.vs = 0;
2398
2399 bld.reset(discard_block);
2400 if (program->has_pops_overlapped_waves_wait &&
2401 (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
2402 /* If this discard early exit potentially exits the POPS ordered section, do
2403 * the waitcnt necessary before resuming overlapping waves as the normal
2404 * waitcnt insertion doesn't work in a discard early exit block.
2405 */
2406 if (program->gfx_level >= GFX10)
2407 wait.vs = 0;
2408 wait.vm = 0;
2409 if (program->has_smem_buffer_or_global_loads)
2410 wait.lgkm = 0;
2411 wait.build_waitcnt(bld);
2412 }
2413 if (discard_sends_pops_done)
2414 bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2415
2416 unsigned target = V_008DFC_SQ_EXP_NULL;
2417 if (program->gfx_level >= GFX11)
2418 target =
2419 program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ;
2420 if (program->stage == fragment_fs)
2421 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
2422 target, false, true, true);
2423
2424 wait.build_waitcnt(bld);
2425 if (should_dealloc_vgprs)
2426 bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
2427
2428 bld.sopp(aco_opcode::s_endpgm);
2429
2430 bld.reset(&ctx.instructions);
2431 }
2432
2433 assert(instr->operands[0].physReg() == scc || instr->operands[0].physReg() == exec);
2434 if (instr->operands[0].physReg() == scc)
2435 bld.sopp(aco_opcode::s_cbranch_scc0, discard_block->index);
2436 else
2437 bld.sopp(aco_opcode::s_cbranch_execz, discard_block->index);
2438
2439 discard_block->linear_preds.push_back(block->index);
2440 block->linear_succs.push_back(discard_block->index);
2441 break;
2442 }
2443 case aco_opcode::p_spill: {
2444 assert(instr->operands[0].regClass() == v1.as_linear());
2445 for (unsigned i = 0; i < instr->operands[2].size(); i++) {
2446 Operand src =
2447 instr->operands[2].isConstant()
2448 ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
2449 : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
2450 bld.writelane(Definition(instr->operands[0].physReg(), v1), src,
2451 Operand::c32(instr->operands[1].constantValue() + i),
2452 instr->operands[0]);
2453 }
2454 break;
2455 }
2456 case aco_opcode::p_reload: {
2457 assert(instr->operands[0].regClass() == v1.as_linear());
2458 for (unsigned i = 0; i < instr->definitions[0].size(); i++)
2459 bld.readlane(Definition(PhysReg{instr->definitions[0].physReg() + i}, s1),
2460 instr->operands[0],
2461 Operand::c32(instr->operands[1].constantValue() + i));
2462 break;
2463 }
2464 case aco_opcode::p_as_uniform: {
2465 if (instr->operands[0].isConstant() ||
2466 instr->operands[0].regClass().type() == RegType::sgpr) {
2467 std::map<PhysReg, copy_operation> copy_operations;
2468 copy_operations[instr->definitions[0].physReg()] = {
2469 instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
2470 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2471 } else {
2472 assert(instr->operands[0].regClass().type() == RegType::vgpr);
2473 assert(instr->definitions[0].regClass().type() == RegType::sgpr);
2474 assert(instr->operands[0].size() == instr->definitions[0].size());
2475 for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
2476 bld.vop1(aco_opcode::v_readfirstlane_b32,
2477 Definition(PhysReg{instr->definitions[0].physReg() + i}, s1),
2478 Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
2479 }
2480 }
2481 break;
2482 }
2483 case aco_opcode::p_pops_gfx9_add_exiting_wave_id: {
2484 bld.sop2(aco_opcode::s_add_i32, instr->definitions[0], instr->definitions[1],
2485 Operand(pops_exiting_wave_id, s1), instr->operands[0]);
2486 break;
2487 }
2488 case aco_opcode::p_bpermute_readlane: {
2489 emit_bpermute_readlane(bld, instr);
2490 break;
2491 }
2492 case aco_opcode::p_bpermute_shared_vgpr: {
2493 emit_bpermute_shared_vgpr(bld, instr);
2494 break;
2495 }
2496 case aco_opcode::p_bpermute_permlane: {
2497 emit_bpermute_permlane(bld, instr);
2498 break;
2499 }
2500 case aco_opcode::p_constaddr: {
2501 unsigned id = instr->definitions[0].tempId();
2502 PhysReg reg = instr->definitions[0].physReg();
2503 bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
2504 if (ctx.program->gfx_level >= GFX12)
2505 bld.sop1(aco_opcode::s_sext_i32_i16, Definition(reg.advance(4), s1), Operand(reg.advance(4), s1));
2506 bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), instr->definitions[1],
2507 Operand(reg, s1), instr->operands[0], Operand::c32(id));
2508 /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2509 break;
2510 }
2511 case aco_opcode::p_resume_shader_address: {
2512 /* Find index of resume block. */
2513 unsigned resume_idx = instr->operands[0].constantValue();
2514 unsigned resume_block_idx = 0;
2515 for (Block& resume_block : program->blocks) {
2516 if (resume_block.kind & block_kind_resume) {
2517 if (resume_idx == 0) {
2518 resume_block_idx = resume_block.index;
2519 break;
2520 }
2521 resume_idx--;
2522 }
2523 }
2524 assert(resume_block_idx != 0);
2525 unsigned id = instr->definitions[0].tempId();
2526 PhysReg reg = instr->definitions[0].physReg();
2527 bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
2528 if (ctx.program->gfx_level >= GFX12)
2529 bld.sop1(aco_opcode::s_sext_i32_i16, Definition(reg.advance(4), s1), Operand(reg.advance(4), s1));
2530 bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), instr->definitions[1],
2531 Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
2532 /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2533 break;
2534 }
2535 case aco_opcode::p_extract: {
2536 assert(instr->operands[1].isConstant());
2537 assert(instr->operands[2].isConstant());
2538 assert(instr->operands[3].isConstant());
2539 if (instr->definitions[0].regClass() == s1)
2540 assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2541 Definition dst = instr->definitions[0];
2542 Operand op = instr->operands[0];
2543 unsigned bits = instr->operands[2].constantValue();
2544 unsigned index = instr->operands[1].constantValue();
2545 unsigned offset = index * bits;
2546 bool signext = !instr->operands[3].constantEquals(0);
2547
2548 if (dst.regClass() == s1) {
2549 if (offset == 0 && signext && (bits == 8 || bits == 16)) {
2550 bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
2551 dst, op);
2552 } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2553 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2554 } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16 &&
2555 !signext) {
2556 bld.sop2(aco_opcode::s_pack_hh_b32_b16, dst, op, Operand::zero());
2557 } else if (offset == (32 - bits)) {
2558 bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
2559 instr->definitions[1], op, Operand::c32(offset));
2560 } else {
2561 bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
2562 instr->definitions[1], op, Operand::c32((bits << 16) | offset));
2563 }
2564 } else if (dst.regClass() == v1) {
2565 if (op.physReg().byte()) {
2566 offset += op.physReg().byte() * 8;
2567 op = Operand(PhysReg(op.physReg().reg()), v1);
2568 }
2569 assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
2570 if (offset == (32 - bits) && op.regClass() != s1) {
2571 bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
2572 Operand::c32(offset), op);
2573 } else if (offset == 0 && bits == 16 && ctx.program->gfx_level >= GFX11) {
2574 bld.vop1(signext ? aco_opcode::v_cvt_i32_i16 : aco_opcode::v_cvt_u32_u16, dst,
2575 op);
2576 } else {
2577 bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
2578 Operand::c32(offset), Operand::c32(bits));
2579 }
2580 } else {
2581 assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
2582 op.regClass() == v1b);
2583 if (ctx.program->gfx_level >= GFX11) {
2584 unsigned op_vgpr_byte = op.physReg().byte() + offset / 8;
2585 unsigned sign_byte = op_vgpr_byte + bits / 8 - 1;
2586
2587 uint8_t swiz[4] = {4, 5, 6, 7};
2588 swiz[dst.physReg().byte()] = op_vgpr_byte;
2589 if (bits == 16 && dst.bytes() >= 2)
2590 swiz[dst.physReg().byte() + 1] = op_vgpr_byte + 1;
2591 for (unsigned i = bits / 8; i < dst.bytes(); i++) {
2592 uint8_t ext = bperm_0;
2593 if (signext) {
2594 if (sign_byte == 1)
2595 ext = bperm_b1_sign;
2596 else if (sign_byte == 3)
2597 ext = bperm_b3_sign;
2598 else /* replicate so sign-extension can be done later */
2599 ext = sign_byte;
2600 }
2601 swiz[dst.physReg().byte() + i] = ext;
2602 }
2603 create_bperm(bld, swiz, dst, op);
2604
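/* Only bytes 1 and 3 have dedicated sign-replication selectors; for any other sign
 * byte, the byte was replicated above and a second bperm now broadcasts its sign bit
 * across the upper bytes. */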
2605 if (signext && sign_byte != 3 && sign_byte != 1) {
2606 assert(bits == 8);
2607 assert(dst.regClass() == v2b || dst.regClass() == v1);
2608 uint8_t ext_swiz[4] = {4, 5, 6, 7};
2609 uint8_t ext = dst.physReg().byte() == 2 ? bperm_b7_sign : bperm_b5_sign;
2610 memset(ext_swiz + dst.physReg().byte() + 1, ext, dst.bytes() - 1);
2611 create_bperm(bld, ext_swiz, dst, Operand::zero());
2612 }
2613 } else {
2614 SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa();
2615 sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
2616 }
2617 }
2618 break;
2619 }
2620 case aco_opcode::p_insert: {
2621 assert(instr->operands[1].isConstant());
2622 assert(instr->operands[2].isConstant());
2623 if (instr->definitions[0].regClass() == s1)
2624 assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2625 Definition dst = instr->definitions[0];
2626 Operand op = instr->operands[0];
2627 unsigned bits = instr->operands[2].constantValue();
2628 unsigned index = instr->operands[1].constantValue();
2629 unsigned offset = index * bits;
2630
2631 bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
2632 if (dst.regClass() == s1) {
2633 if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2634 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2635 } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16) {
2636 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, Operand::zero(), op);
2637 } else if (offset == (32 - bits)) {
2638 bld.sop2(aco_opcode::s_lshl_b32, dst, instr->definitions[1], op,
2639 Operand::c32(offset));
2640 } else if (offset == 0) {
2641 bld.sop2(aco_opcode::s_bfe_u32, dst, instr->definitions[1], op,
2642 Operand::c32(bits << 16));
2643 } else {
2644 bld.sop2(aco_opcode::s_bfe_u32, dst, instr->definitions[1], op,
2645 Operand::c32(bits << 16));
2646 bld.sop2(aco_opcode::s_lshl_b32, dst, instr->definitions[1],
2647 Operand(dst.physReg(), s1), Operand::c32(offset));
2648 }
2649 } else if (dst.regClass() == v1 || !has_sdwa) {
2650 if (offset == (dst.bytes() * 8u - bits) && dst.regClass() == v1) {
2651 bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
2652 } else if (offset == 0 && dst.regClass() == v1) {
2653 bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2654 } else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
2655 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
2656 SubdwordSel(bits / 8, offset / 8, false);
2657 } else if (program->gfx_level >= GFX11) {
2658 uint8_t swiz[] = {4, 5, 6, 7};
2659 for (unsigned i = 0; i < dst.bytes(); i++)
2660 swiz[dst.physReg().byte() + i] = bperm_0;
2661 for (unsigned i = 0; i < bits / 8; i++)
2662 swiz[dst.physReg().byte() + i + offset / 8] = op.physReg().byte() + i;
2663 create_bperm(bld, swiz, dst, op);
2664 } else {
2665 bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2666 bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
2667 Operand(dst.physReg(), v1));
2668 }
2669 } else {
2670 assert(dst.regClass() == v2b);
2671 if (!offset) {
2672 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().sel[0] =
2673 SubdwordSel::ubyte;
2674 } else if (program->gfx_level >= GFX9) {
2675 bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
2676 ->sdwa()
2677 .sel[1] = SubdwordSel::ubyte;
2678 } else {
2679 assert(offset == 8);
2680 Definition dst_hi = Definition(dst.physReg().advance(1), v1b);
2681 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst_hi, op)->sdwa().sel[0] =
2682 SubdwordSel::ubyte;
2683 uint32_t c = ~(BITFIELD_MASK(offset) << (dst.physReg().byte() * 8));
2684 bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(c),
2685 Operand(PhysReg(op.physReg().reg()), v1));
2686 }
2687 }
2688 break;
2689 }
2690 case aco_opcode::p_init_scratch: {
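/* Set up the scratch base: load the address from shader symbols if operand 0 is undefined, or
 * with s_load_dwordx2 for non-compute HW stages, then let hw_init_scratch() initialize the
 * hardware scratch state (flat_scratch or the scratch descriptor, depending on the GFX level). */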
2691 assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
2692 if (!program->config->scratch_bytes_per_wave)
2693 break;
2694
2695 Operand scratch_addr = instr->operands[0];
2696 if (scratch_addr.isUndefined()) {
2697 PhysReg reg = instr->definitions[0].physReg();
2698 bld.sop1(aco_opcode::p_load_symbol, Definition(reg, s1),
2699 Operand::c32(aco_symbol_scratch_addr_lo));
2700 bld.sop1(aco_opcode::p_load_symbol, Definition(reg.advance(4), s1),
2701 Operand::c32(aco_symbol_scratch_addr_hi));
2702 scratch_addr.setFixed(reg);
2703 } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
2704 bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
2705 Operand::zero());
2706 scratch_addr.setFixed(instr->definitions[0].physReg());
2707 }
2708
2709 hw_init_scratch(bld, instr->definitions[0], scratch_addr, instr->operands[1]);
2710 break;
2711 }
2712 case aco_opcode::p_jump_to_epilog: {
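/* Tail-call into the epilog via s_setpc_b64. If this jump bypasses the point where
 * MSG_ORDERED_PS_DONE would normally be sent, send it here first. */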
2713 if (pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx)) {
2714 bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2715 }
2716 bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
2717 break;
2718 }
2719 case aco_opcode::p_interp_gfx11: {
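/* GFX11+ attribute interpolation: lds_param_load fetches the attribute data into a linear VGPR;
 * the 5-operand form then broadcasts one lane's value with a DPP mov (e.g. for flat
 * interpolation), while the 7-operand form interpolates with v_interp_p10/p2 and the two
 * barycentric coordinates. */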
2720 assert(instr->definitions[0].regClass() == v1 ||
2721 instr->definitions[0].regClass() == v2b);
2722 assert(instr->operands[0].regClass() == v1.as_linear());
2723 assert(instr->operands[1].isConstant());
2724 assert(instr->operands[2].isConstant());
2725 assert(instr->operands.back().physReg() == m0);
2726 Definition dst = instr->definitions[0];
2727 PhysReg lin_vgpr = instr->operands[0].physReg();
2728 unsigned attribute = instr->operands[1].constantValue();
2729 unsigned component = instr->operands[2].constantValue();
2730 uint16_t dpp_ctrl = 0;
2731 bool high_16bits = false;
2732 Operand coord1, coord2;
2733 if (instr->operands.size() == 7) {
2734 assert(instr->operands[3].isConstant());
2735 high_16bits = instr->operands[3].constantValue();
2736 assert(instr->operands[4].regClass() == v1);
2737 assert(instr->operands[5].regClass() == v1);
2738 coord1 = instr->operands[4];
2739 coord2 = instr->operands[5];
2740 } else {
2741 assert(instr->operands[3].isConstant());
2742 dpp_ctrl = instr->operands[3].constantValue();
2743 }
2744
2745 bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
2746 attribute, component);
2747
2748 Operand p(lin_vgpr, v1);
2749 Operand dst_op(dst.physReg(), v1);
2750 if (instr->operands.size() == 5) {
2751 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
2752 } else if (dst.regClass() == v2b) {
2753 bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
2754 coord1, p, high_16bits ? 0x5 : 0);
2755 bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
2756 coord2, dst_op, high_16bits ? 0x1 : 0);
2757 } else {
2758 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
2759 p);
2760 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
2761 dst_op);
2762 }
2763 break;
2764 }
2765 case aco_opcode::p_dual_src_export_gfx11: {
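/* GFX11 dual-source blending expects the two sources interleaved across even/odd lane pairs
 * (see the table below), so each channel is built by swapping values between neighbouring lanes
 * with v_cndmask_b32 + row_xmask(1) DPP. Exec is saved and switched to WQM around the swaps, and
 * the results are exported to targets 21 and 22. */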
2766 PhysReg dst0 = instr->definitions[0].physReg();
2767 PhysReg dst1 = instr->definitions[1].physReg();
2768 Definition exec_tmp = instr->definitions[2];
2769 Definition not_vcc_tmp = instr->definitions[3];
2770 Definition clobber_vcc = instr->definitions[4];
2771 Definition clobber_scc = instr->definitions[5];
2772
2773 assert(exec_tmp.regClass() == bld.lm);
2774 assert(not_vcc_tmp.regClass() == bld.lm);
2775 assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
2776 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
2777
2778 bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
2779 Operand(exec, bld.lm));
2780 bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
2781 Operand(exec, bld.lm));
2782
2783 uint8_t enabled_channels = 0;
2784 Operand mrt0[4], mrt1[4];
2785
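/* clobber_vcc = mask of the even lanes; not_vcc_tmp will hold its complement (the odd lanes). */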
2786 copy_constant_sgpr(bld, clobber_vcc, 0x5555'5555'5555'5555ull);
2787
2788 Operand src_even = Operand(clobber_vcc.physReg(), bld.lm);
2789
2790 bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even);
2791
2792 Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm);
2793
2794 for (unsigned i = 0; i < 4; i++) {
2795 if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
2796 mrt0[i] = instr->operands[i];
2797 mrt1[i] = instr->operands[i + 4];
2798 continue;
2799 }
2800
2801 Operand src0 = instr->operands[i];
2802 Operand src1 = instr->operands[i + 4];
2803
2804 /* | even lanes | odd lanes
2805 * mrt0 | src0 even | src1 even
2806 * mrt1 | src0 odd | src1 odd
2807 */
2808 bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0,
2809 src_even, dpp_row_xmask(1));
2810 bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1,
2811 src_odd, dpp_row_xmask(1));
2812
2813 mrt0[i] = Operand(dst0, v1);
2814 mrt1[i] = Operand(dst1, v1);
2815
2816 enabled_channels |= 1 << i;
2817
2818 dst0 = dst0.advance(4);
2819 dst1 = dst1.advance(4);
2820 }
2821
2822 bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
2823 Operand(exec_tmp.physReg(), bld.lm));
2824
2825 /* Force export all channels when everything is undefined. */
2826 if (!enabled_channels)
2827 enabled_channels = 0xf;
2828
2829 bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
2830 V_008DFC_SQ_EXP_MRT + 21, false);
2831 bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
2832 V_008DFC_SQ_EXP_MRT + 22, false);
2833 break;
2834 }
2835 case aco_opcode::p_end_with_regs: {
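/* The pseudo-instruction itself is dropped; only remember the block so a branch to a shared exit
 * block can be added after all blocks have been processed (see the end of this function). */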
2836 end_with_regs_block_index = block->index;
2837 break;
2838 }
2839 case aco_opcode::p_shader_cycles_hi_lo_hi: {
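/* Read the 64-bit SHADER_CYCLES counter from hardware registers 29 (low) and 30 (high) in
 * hi-lo-hi order so the caller can detect a carry between the two halves. The s_getreg_b32
 * immediate is ((size - 1) << 11) | (offset << 6) | id; offset 0 and size 32 read the whole
 * register. */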
2840 unsigned shader_cycles_lo = 29;
2841 unsigned shader_cycles_hi = 30;
2842 bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[0],
2843 ((32 - 1) << 11) | shader_cycles_hi);
2844 bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[1],
2845 ((32 - 1) << 11) | shader_cycles_lo);
2846 bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[2],
2847 ((32 - 1) << 11) | shader_cycles_hi);
2848 break;
2849 }
2850 default: break;
2851 }
2852 } else if (instr->isReduction()) {
2853 Pseudo_reduction_instruction& reduce = instr->reduction();
2854 emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
2855 reduce.operands[1].physReg(), // tmp
2856 reduce.definitions[1].physReg(), // stmp
2857 reduce.operands[2].physReg(), // vtmp
2858 reduce.definitions[2].physReg(), // sitmp
2859 reduce.operands[0], reduce.definitions[0]);
2860 } else if (instr->isBarrier()) {
2861 Pseudo_barrier_instruction& barrier = instr->barrier();
2862
2863 /* Anything larger than a workgroup isn't possible. Anything
2864 * smaller requires no instructions; the pseudo-instruction is only
2865 * kept to constrain optimizations. */
2866 bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
2867 program->workgroup_size > program->wave_size;
2868
2869 bld.insert(std::move(instr));
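/* GFX12 replaces the single s_barrier with separate signal and wait instructions. */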
2870 if (emit_s_barrier && ctx.program->gfx_level >= GFX12) {
2871 bld.sop1(aco_opcode::s_barrier_signal, Operand::c32(-1));
2872 bld.sopp(aco_opcode::s_barrier_wait, UINT16_MAX);
2873 } else if (emit_s_barrier) {
2874 bld.sopp(aco_opcode::s_barrier);
2875 }
2876 } else if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtne ||
2877 instr->opcode == aco_opcode::p_s_cvt_f16_f32_rtne) {
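/* These conversions must round to nearest-even regardless of the block's float mode, so
 * temporarily switch the 16/64-bit rounding mode around the real conversion instruction. */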
2878 float_mode new_mode = block->fp_mode;
2879 new_mode.round16_64 = fp_round_ne;
2880 bool set_round = new_mode.round != block->fp_mode.round;
2881
2882 emit_set_mode(bld, new_mode, set_round, false);
2883
2884 if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtne)
2885 instr->opcode = aco_opcode::v_cvt_f16_f32;
2886 else
2887 instr->opcode = aco_opcode::s_cvt_f16_f32;
2888 ctx.instructions.emplace_back(std::move(instr));
2889
2890 emit_set_mode(bld, block->fp_mode, set_round, false);
2891 } else if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) {
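/* v_cvt_pk_u8_f32 converts src0 to an unsigned byte and inserts it into src2 at the byte
 * position src1, so the rest of the destination dword is read back and preserved; the input
 * modifiers are carried over. */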
2892 Definition def = instr->definitions[0];
2893 VALU_instruction& valu =
2894 bld.vop3(aco_opcode::v_cvt_pk_u8_f32, def, instr->operands[0],
2895 Operand::c32(def.physReg().byte()), Operand(def.physReg(), v1))
2896 ->valu();
2897 valu.abs = instr->valu().abs;
2898 valu.neg = instr->valu().neg;
2899 } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
2900 lower_image_sample(&ctx, instr);
2901 ctx.instructions.emplace_back(std::move(instr));
2902 } else {
2903 ctx.instructions.emplace_back(std::move(instr));
2904 }
2905 }
2906
2907 /* Send the ordered section done message from this block if it's needed in this block, but
2908 * instr_after_end_idx() points beyond the end of its instructions. This may commonly happen
2909 * if the common post-dominator of multiple end locations turns out to be an empty block.
2910 */
2911 if (block_idx == pops_done_msg_bounds.end_block_idx() &&
2912 pops_done_msg_bounds.instr_after_end_idx() >= block->instructions.size()) {
2913 bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2914 }
2915
2916 block->instructions = std::move(ctx.instructions);
2917 }
2918
2919 /* If the block containing p_end_with_regs is not the last block (p_exit_early_if_not may have
2920 * appended an exit block after it), create an exit block for it to branch to.
2921 */
2922 int last_block_index = program->blocks.size() - 1;
2923 if (end_with_regs_block_index >= 0 && end_with_regs_block_index != last_block_index) {
2924 Block* exit_block = program->create_and_insert_block();
2925 Block* end_with_regs_block = &program->blocks[end_with_regs_block_index];
2926 exit_block->linear_preds.push_back(end_with_regs_block->index);
2927 end_with_regs_block->linear_succs.push_back(exit_block->index);
2928
2929 Builder bld(program, end_with_regs_block);
2930 bld.sopp(aco_opcode::s_branch, exit_block->index);
2931
2932 /* Move the end_with_regs kind to the exit block so that the waitcnt insertion pass places its
2933 * waitcnt there; otherwise it would be inserted after the s_branch and never execute.
2934 */
2935 end_with_regs_block->kind &= ~block_kind_end_with_regs;
2936 exit_block->kind |= block_kind_end_with_regs;
2937 }
2938
2939 program->progress = CompilationProgress::after_lower_to_hw;
2940 }
2941
2942 } // namespace aco
2943