1 /*
2 * Copyright © 2018 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include "aco_builder.h"
26 #include "aco_ir.h"
27
28 #include "common/sid.h"
29
30 #include <map>
31 #include <vector>
32
33 namespace aco {
34
35 struct lower_context {
36 Program* program;
37 Block* block;
38 std::vector<aco_ptr<Instruction>> instructions;
39 };
40
41 /* Class for determining where s_sendmsg(MSG_ORDERED_PS_DONE) must be sent in a Primitive Ordered
42 * Pixel Shader on GFX9-10.3.
43 *
44 * MSG_ORDERED_PS_DONE must be sent once after the ordered section is done, along all execution
45 * paths from the POPS packer ID hardware register setup to s_endpgm. It is, however, also okay to
46 * send it if the packer ID is never going to be set by the wave, so being conservative is fine.
47 *
48 * For simplicity, the message is sent from top-level blocks, since dominance and post-dominance
49 * checks for any location in the shader are trivial in them. Also for simplicity, it is sent
50 * regardless of whether the POPS packer ID hardware register has already potentially been set up.
51 *
52 * Note that there can be multiple interlock end instructions in the shader.
53 * SPV_EXT_fragment_shader_interlock requires OpEndInvocationInterlockEXT to be executed exactly
54 * once by the invocation. However, there may be, for instance, multiple ordered sections, and which
55 * one will be executed may depend on divergent control flow (some lanes may execute one ordered
56 * section, other lanes may execute another). MSG_ORDERED_PS_DONE, however, is sent via a scalar
57 * instruction, so it must be ensured that the message is sent after the last ordered section in the
58 * entire wave.
59 */
60 class gfx9_pops_done_msg_bounds {
61 public:
62 explicit gfx9_pops_done_msg_bounds() = default;
63
64 explicit gfx9_pops_done_msg_bounds(const Program* const program)
65 {
66 /* Find the top-level location after the last ordered section end pseudo-instruction in the
67 * program.
68 * Consider `p_pops_gfx9_overlapped_wave_wait_done` a boundary too: for safety, make sure the
69 * message isn't sent while any wait may still be incomplete (in case a begin-end-begin situation
70 * somehow occurs, since the location of `p_pops_gfx9_ordered_section_done` is controlled by the
71 * application). This assumes that waits are the only thing that needs the packer hardware
72 * register to be set at some point during or before them, and that it won't be set anymore
73 * after the last wait.
74 */
75 int last_top_level_block_idx = -1;
76 for (int block_idx = (int)program->blocks.size() - 1; block_idx >= 0; block_idx--) {
77 const Block& block = program->blocks[block_idx];
78 if (block.kind & block_kind_top_level) {
79 last_top_level_block_idx = block_idx;
80 }
81 for (size_t instr_idx = block.instructions.size() - 1; instr_idx + size_t(1) > 0;
82 instr_idx--) {
83 const aco_opcode opcode = block.instructions[instr_idx]->opcode;
84 if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done ||
85 opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done) {
86 end_block_idx_ = last_top_level_block_idx;
87 /* The same block if it's already a top-level block, or the beginning of the next
88 * top-level block.
89 */
90 instr_after_end_idx_ = block_idx == end_block_idx_ ? instr_idx + 1 : 0;
91 break;
92 }
93 }
94 if (end_block_idx_ != -1) {
95 break;
96 }
97 }
98 }
99
100 /* If this is not -1, during the normal execution flow (not early exiting), MSG_ORDERED_PS_DONE
101 * must be sent in this block.
102 */
103 int end_block_idx() const { return end_block_idx_; }
104
105 /* If end_block_idx() is an existing block, during the normal execution flow (not early exiting),
106 * MSG_ORDERED_PS_DONE must be sent before this instruction in the block end_block_idx().
107 * If this is beyond the bounds of the instructions in the end block, it must be sent at the end
108 * of that block.
109 */
110 size_t instr_after_end_idx() const { return instr_after_end_idx_; }
111
112 /* Whether an instruction doing early exit (such as discard) needs to send MSG_ORDERED_PS_DONE
113 * before actually ending the program.
114 */
115 bool early_exit_needs_done_msg(const int block_idx, const size_t instr_idx) const
116 {
117 return block_idx <= end_block_idx_ &&
118 (block_idx != end_block_idx_ || instr_idx < instr_after_end_idx_);
119 }
120
121 private:
122 /* Initialize to an empty range for which "is inside" comparisons fail for any
123 * block.
124 */
125 int end_block_idx_ = -1;
126 size_t instr_after_end_idx_ = 0;
127 };
128
129 /* used by handle_operands() indirectly through Builder::copy */
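/* Each pair (int8_mul_table[2 * v], int8_mul_table[2 * v + 1]) holds two factors whose product
 * has v as its low byte once both factors are sign-extended from 8 bits (entries >= 240 therefore
 * act as small negative values). copy_constant() below uses this to materialize an 8-bit constant
 * with v_mul_u32_u24 on two inline constants instead of a 32-bit literal (a descriptive note
 * based on that use).
 */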
130 uint8_t int8_mul_table[512] = {
131 0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9,
132 1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19,
133 1, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29,
134 1, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39,
135 1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49,
136 1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59,
137 1, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23,
138 2, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243,
139 2, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23,
140 2, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33,
141 2, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27,
142 2, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17,
143 2, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43,
144 5, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243,
145 4, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45,
146 3, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53,
147 4, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13,
148 5, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245,
149 3, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63,
150 5, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35,
151 4, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19,
152 5, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43,
153 3, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,
154 2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55,
155 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249,
156 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255};
157
158 aco_opcode
159 get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
160 {
161 /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
162 * 32-bit opcodes (VOP2) instead, which avoids the temporary VGPR and allows using
163 * DPP with the arithmetic instructions. This requires sign-extending the inputs.
164 */
165 switch (op) {
166 case iadd8:
167 case iadd16:
168 if (gfx_level >= GFX10) {
169 return aco_opcode::v_add_u32;
170 } else if (gfx_level >= GFX8) {
171 return aco_opcode::v_add_u16;
172 } else {
173 return aco_opcode::v_add_co_u32;
174 }
175 break;
176 case imul8:
177 case imul16:
178 if (gfx_level >= GFX10) {
179 return aco_opcode::v_mul_lo_u16_e64;
180 } else if (gfx_level >= GFX8) {
181 return aco_opcode::v_mul_lo_u16;
182 } else {
183 return aco_opcode::v_mul_u32_u24;
184 }
185 break;
186 case fadd16: return aco_opcode::v_add_f16;
187 case fmul16: return aco_opcode::v_mul_f16;
188 case imax8:
189 case imax16:
190 if (gfx_level >= GFX10) {
191 return aco_opcode::v_max_i32;
192 } else if (gfx_level >= GFX8) {
193 return aco_opcode::v_max_i16;
194 } else {
195 return aco_opcode::v_max_i32;
196 }
197 break;
198 case imin8:
199 case imin16:
200 if (gfx_level >= GFX10) {
201 return aco_opcode::v_min_i32;
202 } else if (gfx_level >= GFX8) {
203 return aco_opcode::v_min_i16;
204 } else {
205 return aco_opcode::v_min_i32;
206 }
207 break;
208 case umin8:
209 case umin16:
210 if (gfx_level >= GFX10) {
211 return aco_opcode::v_min_u32;
212 } else if (gfx_level >= GFX8) {
213 return aco_opcode::v_min_u16;
214 } else {
215 return aco_opcode::v_min_u32;
216 }
217 break;
218 case umax8:
219 case umax16:
220 if (gfx_level >= GFX10) {
221 return aco_opcode::v_max_u32;
222 } else if (gfx_level >= GFX8) {
223 return aco_opcode::v_max_u16;
224 } else {
225 return aco_opcode::v_max_u32;
226 }
227 break;
228 case fmin16: return aco_opcode::v_min_f16;
229 case fmax16: return aco_opcode::v_max_f16;
230 case iadd32: return gfx_level >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
231 case imul32: return aco_opcode::v_mul_lo_u32;
232 case fadd32: return aco_opcode::v_add_f32;
233 case fmul32: return aco_opcode::v_mul_f32;
234 case imax32: return aco_opcode::v_max_i32;
235 case imin32: return aco_opcode::v_min_i32;
236 case umin32: return aco_opcode::v_min_u32;
237 case umax32: return aco_opcode::v_max_u32;
238 case fmin32: return aco_opcode::v_min_f32;
239 case fmax32: return aco_opcode::v_max_f32;
240 case iand8:
241 case iand16:
242 case iand32: return aco_opcode::v_and_b32;
243 case ixor8:
244 case ixor16:
245 case ixor32: return aco_opcode::v_xor_b32;
246 case ior8:
247 case ior16:
248 case ior32: return aco_opcode::v_or_b32;
249 case iadd64: return aco_opcode::num_opcodes;
250 case imul64: return aco_opcode::num_opcodes;
251 case fadd64: return aco_opcode::v_add_f64;
252 case fmul64: return aco_opcode::v_mul_f64;
253 case imin64: return aco_opcode::num_opcodes;
254 case imax64: return aco_opcode::num_opcodes;
255 case umin64: return aco_opcode::num_opcodes;
256 case umax64: return aco_opcode::num_opcodes;
257 case fmin64: return aco_opcode::v_min_f64;
258 case fmax64: return aco_opcode::v_max_f64;
259 case iand64: return aco_opcode::num_opcodes;
260 case ior64: return aco_opcode::num_opcodes;
261 case ixor64: return aco_opcode::num_opcodes;
262 default: return aco_opcode::num_opcodes;
263 }
264 }
265
266 bool
267 is_vop3_reduce_opcode(aco_opcode opcode)
268 {
269 /* 64-bit reductions are VOP3. */
270 if (opcode == aco_opcode::num_opcodes)
271 return true;
272
273 return instr_info.format[(int)opcode] == Format::VOP3;
274 }
275
276 void
277 emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
278 {
279 Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
280 if (instr->definitions.size() >= 2) {
281 assert(instr->definitions[1].regClass() == bld.lm);
282 instr->definitions[1].setFixed(vcc);
283 }
284 }
285
286 void
287 emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
288 PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
289 unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
290 {
291 Builder bld(ctx->program, &ctx->instructions);
292 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
293 Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
294 Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
295 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
296 Operand src1_64 = Operand(src1_reg, v2);
297 Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
298 Operand vtmp_op64 = Operand(vtmp_reg, v2);
299 if (op == iadd64) {
300 if (ctx->program->gfx_level >= GFX10) {
301 if (identity)
302 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
303 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
304 bound_ctrl);
305 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
306 } else {
307 bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
308 dpp_ctrl, row_mask, bank_mask, bound_ctrl);
309 }
310 bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
311 Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
312 } else if (op == iand64) {
313 bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
314 bound_ctrl);
315 bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
316 bound_ctrl);
317 } else if (op == ior64) {
318 bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
319 bound_ctrl);
320 bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
321 bound_ctrl);
322 } else if (op == ixor64) {
323 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
324 bound_ctrl);
325 bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
326 bound_ctrl);
327 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
328 aco_opcode cmp = aco_opcode::num_opcodes;
329 switch (op) {
330 case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
331 case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
332 case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
333 case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
334 default: break;
335 }
336
337 if (identity) {
338 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
339 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
340 }
341 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
342 bound_ctrl);
343 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
344 bound_ctrl);
345
346 bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
347 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
348 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
349 } else if (op == imul64) {
350 /* t4 = dpp(x_hi)
351 * t1 = umul_lo(t4, y_lo)
352 * t3 = dpp(x_lo)
353 * t0 = umul_lo(t3, y_hi)
354 * t2 = iadd(t0, t1)
355 * t5 = umul_hi(t3, y_lo)
356 * res_hi = iadd(t2, t5)
357 * res_lo = umul_lo(t3, y_lo)
358 * Requires that res_hi != src0[0] and res_hi != src1[0]
359 * and that vtmp[0] != res_hi.
360 */
361 if (identity)
362 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
363 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
364 bound_ctrl);
365 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
366 if (identity)
367 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
368 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
369 bound_ctrl);
370 bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
371 emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
372 if (identity)
373 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
374 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
375 bound_ctrl);
376 bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
377 emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
378 if (identity)
379 bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
380 bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
381 bound_ctrl);
382 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
383 }
384 }
385
386 void
387 emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
388 ReduceOp op)
389 {
390 Builder bld(ctx->program, &ctx->instructions);
391 Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
392 RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
393 Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
394 Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
395 Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
396 Operand src1_64 = Operand(src1_reg, v2);
397
398 if (src0_rc == s1 &&
399 (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
400 assert(vtmp.reg() != 0);
401 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
402 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
403 src0_reg = vtmp;
404 src0[0] = Operand(vtmp, v1);
405 src0[1] = Operand(PhysReg{vtmp + 1}, v1);
406 src0_64 = Operand(vtmp, v2);
407 } else if (src0_rc == s1 && op == iadd64) {
408 assert(vtmp.reg() != 0);
409 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
410 src0[1] = Operand(PhysReg{vtmp + 1}, v1);
411 }
412
413 if (op == iadd64) {
414 if (ctx->program->gfx_level >= GFX10) {
415 bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
416 } else {
417 bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
418 }
419 bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
420 Operand(vcc, bld.lm));
421 } else if (op == iand64) {
422 bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
423 bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
424 } else if (op == ior64) {
425 bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
426 bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
427 } else if (op == ixor64) {
428 bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
429 bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
430 } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
431 aco_opcode cmp = aco_opcode::num_opcodes;
432 switch (op) {
433 case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
434 case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
435 case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
436 case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
437 default: break;
438 }
439
440 bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
441 bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
442 bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
443 } else if (op == imul64) {
444 if (src1_reg == dst_reg) {
445 /* it's fine if src0==dst but not if src1==dst */
446 std::swap(src0_reg, src1_reg);
447 std::swap(src0[0], src1[0]);
448 std::swap(src0[1], src1[1]);
449 std::swap(src0_64, src1_64);
450 }
451 assert(!(src0_reg == src1_reg));
452 /* t1 = umul_lo(x_hi, y_lo)
453 * t0 = umul_lo(x_lo, y_hi)
454 * t2 = iadd(t0, t1)
455 * t5 = umul_hi(x_lo, y_lo)
456 * res_hi = iadd(t2, t5)
457 * res_lo = umul_lo(x_lo, y_lo)
458 * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
459 */
460 Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
461 Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
462 Operand tmp0_op = src0[1];
463 Operand tmp1_op = src1[1];
464 bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
465 bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
466 emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
467 bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
468 emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
469 bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
470 }
471 }
472
473 void
474 emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
475 ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
476 bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
477 {
478 Builder bld(ctx->program, &ctx->instructions);
479 RegClass rc = RegClass(RegType::vgpr, size);
480 Definition dst(dst_reg, rc);
481 Operand src0(src0_reg, rc);
482 Operand src1(src1_reg, rc);
483
484 aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
485 bool vop3 = is_vop3_reduce_opcode(opcode);
486
487 if (!vop3) {
488 if (opcode == aco_opcode::v_add_co_u32)
489 bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,
490 bound_ctrl);
491 else
492 bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
493 return;
494 }
495
496 if (opcode == aco_opcode::num_opcodes) {
497 emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
498 bound_ctrl, identity);
499 return;
500 }
501
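/* The reduction op is VOP3, which isn't combined with DPP here: DPP-move src0 into vtmp
 * (pre-loaded with the identity so that lanes without a valid source contribute a neutral value)
 * and then apply the VOP3 op to vtmp and src1.
 */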
502 if (identity)
503 bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
504 if (identity && size >= 2)
505 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);
506
507 for (unsigned i = 0; i < size; i++)
508 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
509 Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
510
511 bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
512 }
513
514 void
515 emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
516 ReduceOp op, unsigned size)
517 {
518 Builder bld(ctx->program, &ctx->instructions);
519 RegClass rc = RegClass(RegType::vgpr, size);
520 Definition dst(dst_reg, rc);
521 Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
522 Operand src1(src1_reg, rc);
523
524 aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
525 bool vop3 = is_vop3_reduce_opcode(opcode);
526
527 if (opcode == aco_opcode::num_opcodes) {
528 emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
529 return;
530 }
531
532 if (vop3) {
533 bld.vop3(opcode, dst, src0, src1);
534 } else if (opcode == aco_opcode::v_add_co_u32) {
535 bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
536 } else {
537 bld.vop2(opcode, dst, src0, src1);
538 }
539 }
540
541 void
542 emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
543 unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
544 {
545 Builder bld(ctx->program, &ctx->instructions);
546 for (unsigned i = 0; i < size; i++) {
547 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
548 Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
549 }
550 }
551
552 void
553 emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
554 {
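/* ds_swizzle_b32 applies the same lane permutation to every dword of the value. With
 * ds_pattern_bitmode(and_mask, or_mask, xor_mask) the source lane within a 32-lane group is
 * ((lane & and_mask) | or_mask) ^ xor_mask; with bit 15 of the pattern set, the low bits encode
 * a quad-perm swizzle instead (encoding assumed from the ISA description of ds_swizzle_b32).
 */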
555 for (unsigned i = 0; i < size; i++) {
556 bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
557 Operand(PhysReg{src + i}, v1), ds_pattern);
558 }
559 }
560
561 void
562 emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
563 PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
564 {
565 assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
566 assert(cluster_size <= ctx->program->wave_size);
567
568 Builder bld(ctx->program, &ctx->instructions);
569
570 Operand identity[2];
571 identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
572 identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
573 Operand vcndmask_identity[2] = {identity[0], identity[1]};
574
575 /* First, copy the source to tmp and set inactive lanes to the identity */
576 bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
577 Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));
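/* stmp now holds the original exec mask and exec covers all lanes, so the v_cndmask below can
 * write the identity into lanes that were inactive and the source into lanes that were active.
 */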
578
579 /* On GFX10+ v_writelane_b32/v_cndmask_b32_e64 can take a literal */
580 if (ctx->program->gfx_level < GFX10) {
581 for (unsigned i = 0; i < src.size(); i++) {
582 /* p_exclusive_scan uses identity for v_writelane_b32 */
583 if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
584 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
585 identity[i] = Operand(PhysReg{sitmp + i}, s1);
586
587 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
588 vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
589 } else if (identity[i].isLiteral()) {
590 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
591 vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
592 }
593 }
594 }
595
596 for (unsigned i = 0; i < src.size(); i++) {
597 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
598 vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
599 Operand(stmp, bld.lm));
600 }
601
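/* 8-bit and 16-bit sources are sign- or zero-extended to 32 bits here because, depending on the
 * gfx level and operation, get_reduce_opcode() uses 32-bit opcodes for these reductions (see the
 * comment in that function).
 */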
602 if (src.regClass() == v1b) {
603 if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
604 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
605 aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
606 sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
607 sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
608 bool sext = reduce_op == imin8 || reduce_op == imax8;
609 sdwa->sel[0] = SubdwordSel(1, 0, sext);
610 sdwa->dst_sel = SubdwordSel::dword;
611 bld.insert(std::move(sdwa));
612 } else {
613 aco_opcode opcode;
614
615 if (reduce_op == imin8 || reduce_op == imax8)
616 opcode = aco_opcode::v_bfe_i32;
617 else
618 opcode = aco_opcode::v_bfe_u32;
619
620 bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
621 Operand::c32(8u));
622 }
623 } else if (src.regClass() == v2b) {
624 bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
625 reduce_op == umin16 || reduce_op == umax16;
626 if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
627 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
628 aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
629 sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
630 sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
631 bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
632 sdwa->sel[0] = SubdwordSel(2, 0, sext);
633 sdwa->dst_sel = SubdwordSel::dword;
634 bld.insert(std::move(sdwa));
635 } else if (ctx->program->gfx_level <= GFX7 ||
636 (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
637 aco_opcode opcode;
638
639 if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
640 opcode = aco_opcode::v_bfe_i32;
641 else
642 opcode = aco_opcode::v_bfe_u32;
643
644 bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
645 Operand::c32(16u));
646 }
647 }
648
649 bool reduction_needs_last_op = false;
650 switch (op) {
651 case aco_opcode::p_reduce:
652 if (cluster_size == 1)
653 break;
654
655 if (ctx->program->gfx_level <= GFX7) {
656 reduction_needs_last_op = true;
657 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
658 if (cluster_size == 2)
659 break;
660 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
661 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
662 if (cluster_size == 4)
663 break;
664 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
665 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
666 if (cluster_size == 8)
667 break;
668 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
669 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
670 if (cluster_size == 16)
671 break;
672 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
673 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
674 if (cluster_size == 32)
675 break;
676 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
677 for (unsigned i = 0; i < src.size(); i++)
678 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
679 Operand::zero());
680 // TODO: it would be more effective to do the last reduction step on SALU
681 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
682 reduction_needs_last_op = false;
683 break;
684 }
685
686 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
687 0xf, false);
688 if (cluster_size == 2)
689 break;
690 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
691 0xf, false);
692 if (cluster_size == 4)
693 break;
694 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
695 false);
696 if (cluster_size == 8)
697 break;
698 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
699 if (cluster_size == 16)
700 break;
701
702 if (ctx->program->gfx_level >= GFX10) {
703 /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
704 for (unsigned i = 0; i < src.size(); i++)
705 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
706 Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());
707
708 if (cluster_size == 32) {
709 reduction_needs_last_op = true;
710 break;
711 }
712
713 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
714 for (unsigned i = 0; i < src.size(); i++)
715 bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
716 Operand::zero());
717 // TODO: it would be more effective to do the last reduction step on SALU
718 emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
719 break;
720 }
721
722 if (cluster_size == 32) {
723 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
724 reduction_needs_last_op = true;
725 break;
726 }
727 assert(cluster_size == 64);
728 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
729 false);
730 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
731 false);
732 break;
733 case aco_opcode::p_exclusive_scan:
734 if (ctx->program->gfx_level >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
735 /* shift rows right */
736 emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);
737
738 /* fill in the gaps in rows 1 and 3 */
739 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10000u));
740 if (ctx->program->wave_size == 64)
741 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand::c32(0x10000u));
742 for (unsigned i = 0; i < src.size(); i++) {
743 Instruction* perm =
744 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
745 Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
746 Operand::c32(0xffffffffu))
747 .instr;
748 perm->valu().opsel = 1; /* FI (Fetch Inactive) */
749 }
750 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand::c64(UINT64_MAX));
751
752 if (ctx->program->wave_size == 64) {
753 /* fill in the gap in row 2 */
754 for (unsigned i = 0; i < src.size(); i++) {
755 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
756 Operand::c32(31u));
757 bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
758 Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
759 }
760 }
761 std::swap(tmp, vtmp);
762 } else if (ctx->program->gfx_level >= GFX8) {
763 emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
764 } else {
765 // TODO: use LDS on CS with a single write and shifted read
766 /* wavefront shift_right by 1 on SI/CI */
767 emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
768 emit_ds_swizzle(bld, tmp, tmp, src.size(),
769 ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
770 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10101010u));
771 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
772 for (unsigned i = 0; i < src.size(); i++)
773 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
774 Operand(PhysReg{tmp + i}, v1));
775
776 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
777 emit_ds_swizzle(bld, tmp, tmp, src.size(),
778 ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
779 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x01000100u));
780 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
781 for (unsigned i = 0; i < src.size(); i++)
782 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
783 Operand(PhysReg{tmp + i}, v1));
784
785 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
786 emit_ds_swizzle(bld, tmp, tmp, src.size(),
787 ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
788 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(1u),
789 Operand::c32(16u));
790 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(1u),
791 Operand::c32(16u));
792 for (unsigned i = 0; i < src.size(); i++)
793 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
794 Operand(PhysReg{tmp + i}, v1));
795
796 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
797 for (unsigned i = 0; i < src.size(); i++) {
798 bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
799 Operand(PhysReg{vtmp + i}, v1));
800 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
801 Operand::zero());
802 bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
803 Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
804 identity[i] = Operand::zero(); /* prevent further uses of identity */
805 }
806 std::swap(tmp, vtmp);
807 }
808
809 for (unsigned i = 0; i < src.size(); i++) {
810 if (!identity[i].isConstant() ||
811 identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
812 if (ctx->program->gfx_level < GFX10)
813 assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
814 identity[i].physReg() == PhysReg{sitmp + i});
815 bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
816 Operand(PhysReg{tmp + i}, v1));
817 }
818 }
819 FALLTHROUGH;
820 case aco_opcode::p_inclusive_scan:
821 assert(cluster_size == ctx->program->wave_size);
822 if (ctx->program->gfx_level <= GFX7) {
823 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
824 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xAAAAAAAAu));
825 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
826 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
827
828 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
829 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
830 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xCCCCCCCCu));
831 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
832 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
833
834 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
835 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
836 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xF0F0F0F0u));
837 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
838 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
839
840 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
841 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
842 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xFF00FF00u));
843 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
844 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
845
846 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
847 emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
848 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
849 Operand::c32(16u));
850 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),
851 Operand::c32(16u));
852 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
853
854 for (unsigned i = 0; i < src.size(); i++)
855 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
856 Operand::c32(31u));
857 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
858 Operand::c32(32u));
859 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
860 break;
861 }
862
863 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
864 identity);
865 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
866 identity);
867 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
868 identity);
869 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
870 identity);
871 if (ctx->program->gfx_level >= GFX10) {
872 if (ctx->program->wave_size == 64) {
873 bld.sop1(aco_opcode::s_bitreplicate_b64_b32, Definition(exec, s2),
874 Operand::c32(0xff00ff00u));
875 } else {
876 bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
877 Operand::c32(16u));
878 }
879 for (unsigned i = 0; i < src.size(); i++) {
880 Instruction* perm =
881 bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
882 Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
883 Operand::c32(0xffffffffu))
884 .instr;
885 perm->valu().opsel = 1; /* FI (Fetch Inactive) */
886 }
887 emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
888
889 if (ctx->program->wave_size == 64) {
890 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
891 Operand::c32(32u));
892 for (unsigned i = 0; i < src.size(); i++)
893 bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
894 Operand::c32(31u));
895 emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
896 }
897 } else {
898 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
899 false, identity);
900 emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
901 false, identity);
902 }
903 break;
904 default: unreachable("Invalid reduction mode");
905 }
906
907 if (op == aco_opcode::p_reduce) {
908 if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
909 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
910 emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
911 return;
912 }
913
914 if (reduction_needs_last_op)
915 emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
916 }
917
918 /* restore exec */
919 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
920
921 if (dst.regClass().type() == RegType::sgpr) {
922 for (unsigned k = 0; k < src.size(); k++) {
923 bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
924 Operand::c32(ctx->program->wave_size - 1));
925 }
926 } else if (dst.physReg() != tmp) {
927 for (unsigned k = 0; k < src.size(); k++) {
928 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
929 Operand(PhysReg{tmp + k}, v1));
930 }
931 }
932 }
933
934 void
935 adjust_bpermute_dst(Builder& bld, Definition dst, Operand input_data)
936 {
937 /* RA assumes that the result is always in the low part of the register, so we have to shift
938 * if it's not there already.
939 */
940 if (input_data.physReg().byte()) {
941 unsigned right_shift = input_data.physReg().byte() * 8;
942 bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
943 Operand(dst.physReg(), dst.regClass()));
944 }
945 }
946
947 void
948 emit_bpermute_permlane(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
949 {
950 /* Emulates proper bpermute on GFX11 in wave64 mode.
951 *
952 * Similar to emit_bpermute_shared_vgpr (the GFX10 wave64 path), but uses the new
953 * v_permlane64_b32 instruction to swap data between the low and high halves.
954 */
955
956 assert(program->gfx_level >= GFX11);
957 assert(program->wave_size == 64);
958
959 Definition dst = instr->definitions[0];
960 Definition tmp_exec = instr->definitions[1];
961 Definition clobber_scc = instr->definitions[2];
962 Operand tmp_op = instr->operands[0];
963 Operand index_x4 = instr->operands[1];
964 Operand input_data = instr->operands[2];
965 Operand same_half = instr->operands[3];
966
967 assert(dst.regClass() == v1);
968 assert(tmp_exec.regClass() == bld.lm);
969 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
970 assert(same_half.regClass() == bld.lm);
971 assert(tmp_op.regClass() == v1.as_linear());
972 assert(index_x4.regClass() == v1);
973 assert(input_data.regClass().type() == RegType::vgpr);
974 assert(input_data.bytes() <= 4);
975
976 Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());
977
978 /* Permute the input within the same half-wave. */
979 bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
980
981 /* Save EXEC and enable all lanes. */
982 bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
983 Operand::c32(-1u), Operand(exec, s2));
984
985 /* Copy input data from other half to current half's linear VGPR. */
986 bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);
987
988 /* Permute the input from the other half-wave, write to linear VGPR. */
989 bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);
990
991 /* Restore saved EXEC. */
992 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
993
994 /* Select correct permute result. */
995 bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
996 same_half);
997
998 adjust_bpermute_dst(bld, dst, input_data);
999 }
1000
1001 void
1002 emit_bpermute_shared_vgpr(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
1003 {
1004 /* Emulates proper bpermute on GFX10 in wave64 mode.
1005 *
1006 * This is necessary because on GFX10 the bpermute instruction only works
1007 * on half waves (you can think of it as having a cluster size of 32), so we
1008 * manually swap the data between the two halves using two shared VGPRs.
1009 */
1010
1011 assert(program->gfx_level >= GFX10 && program->gfx_level <= GFX10_3);
1012 assert(program->wave_size == 64);
1013
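/* Shared VGPRs are addressed right after the wave's allocated VGPRs (rounded up to a multiple of
 * 4); the +256 converts that VGPR index into ACO's physical register numbering, where VGPRs start
 * at register 256 (see e.g. the reg() >= 256 checks elsewhere in this file).
 */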
1014 unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
1015 Definition dst = instr->definitions[0];
1016 Definition tmp_exec = instr->definitions[1];
1017 Definition clobber_scc = instr->definitions[2];
1018 Operand index_x4 = instr->operands[0];
1019 Operand input_data = instr->operands[1];
1020 Operand same_half = instr->operands[2];
1021
1022 assert(dst.regClass() == v1);
1023 assert(tmp_exec.regClass() == bld.lm);
1024 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1025 assert(same_half.regClass() == bld.lm);
1026 assert(index_x4.regClass() == v1);
1027 assert(input_data.regClass().type() == RegType::vgpr);
1028 assert(input_data.bytes() <= 4);
1029 assert(dst.physReg() != index_x4.physReg());
1030 assert(dst.physReg() != input_data.physReg());
1031 assert(tmp_exec.physReg() != same_half.physReg());
1032
1033 PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
1034 PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);
1035
1036 /* Permute the input within the same half-wave */
1037 bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1038
1039 /* HI: Copy data from high lanes 32-63 to shared vgpr */
1040 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
1041 dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
1042 /* Save EXEC */
1043 bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
1044 /* Set EXEC to enable LO lanes only */
1045 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::zero());
1046 /* LO: Copy data from low lanes 0-31 to shared vgpr */
1047 bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
1048 /* LO: bpermute shared vgpr (high lanes' data) */
1049 bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
1050 Operand(shared_vgpr_hi, v1));
1051 /* Set EXEC to enable HI lanes only */
1052 bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::c32(32u));
1053 /* HI: bpermute shared vgpr (low lanes' data) */
1054 bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
1055 Operand(shared_vgpr_lo, v1));
1056
1057 /* Only enable lanes which use the other half's data */
1058 bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
1059 Operand(tmp_exec.physReg(), s2), same_half);
1060 /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
1061 bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
1062 0x3, 0xf, false);
1063 /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
1064 bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
1065 0xc, 0xf, false);
1066
1067 /* Restore saved EXEC */
1068 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1069
1070 adjust_bpermute_dst(bld, dst, input_data);
1071 }
1072
1073 void
1074 emit_bpermute_readlane(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
1075 {
1076 /* Emulates bpermute using readlane instructions */
1077
1078 Operand index = instr->operands[0];
1079 Operand input = instr->operands[1];
1080 Definition dst = instr->definitions[0];
1081 Definition temp_exec = instr->definitions[1];
1082 Definition clobber_vcc = instr->definitions[2];
1083
1084 assert(dst.regClass() == v1);
1085 assert(temp_exec.regClass() == bld.lm);
1086 assert(clobber_vcc.regClass() == bld.lm);
1087 assert(clobber_vcc.physReg() == vcc);
1088 assert(index.regClass() == v1);
1089 assert(index.physReg() != dst.physReg());
1090 assert(input.regClass().type() == RegType::vgpr);
1091 assert(input.bytes() <= 4);
1092 assert(input.physReg() != dst.physReg());
1093
1094 /* Save original EXEC */
1095 bld.sop1(Builder::s_mov, temp_exec, Operand(exec, bld.lm));
1096
1097 /* An "unrolled loop" that is executed once per lane.
1098 * This takes only a few instructions per lane, as opposed to a "real" loop
1099 * with branching, where the branch instruction alone would take 16+ cycles.
1100 */
1101 for (unsigned n = 0; n < program->wave_size; ++n) {
1102 /* Activate the lane which has N for its source index */
1103 if (program->gfx_level >= GFX10)
1104 bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), Operand::c32(n), index);
1105 else
1106 bld.vopc(aco_opcode::v_cmpx_eq_u32, clobber_vcc, Definition(exec, bld.lm), Operand::c32(n),
1107 index);
1108 /* Read the data from lane N */
1109 bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
1110 /* On the active lane, move the data we read from lane N to the destination VGPR */
1111 bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
1112 /* Restore original EXEC */
1113 bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(temp_exec.physReg(), bld.lm));
1114 }
1115
1116 adjust_bpermute_dst(bld, dst, input);
1117 }
1118
1119 struct copy_operation {
1120 Operand op;
1121 Definition def;
1122 unsigned bytes;
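/* Per-byte counters tracking whether each byte of this copy is still needed; is_used overlays
 * them so that "any byte still used" can be tested with a single 64-bit comparison (a hedged
 * description based on how split_copy() reads uses[]; the counting itself happens outside this
 * excerpt).
 */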
1123 union {
1124 uint8_t uses[8];
1125 uint64_t is_used = 0;
1126 };
1127 };
1128
1129 void
1130 split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
1131 const copy_operation& src, bool ignore_uses, unsigned max_size)
1132 {
1133 PhysReg def_reg = src.def.physReg();
1134 PhysReg op_reg = src.op.physReg();
1135 def_reg.reg_b += offset;
1136 op_reg.reg_b += offset;
1137
1138 /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10, and on GFX11
1139 * v_lshrrev_b64 doesn't get dual issued. */
1140 if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
1141 src.def.regClass().type() == RegType::vgpr)
1142 max_size = MIN2(max_size, 4);
1143 unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
1144
1145 /* make sure the size is a power of two and reg % bytes == 0 */
1146 unsigned bytes = 1;
1147 for (; bytes <= max_size; bytes *= 2) {
1148 unsigned next = bytes * 2u;
1149 bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
1150 offset + next <= src.bytes && next <= max_size;
1151 if (!src.op.isConstant() && can_increase)
1152 can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
1153 for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
1154 can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
1155 if (!can_increase)
1156 break;
1157 }
1158
1159 *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes));
1160 if (src.op.isConstant()) {
1161 assert(bytes >= 1 && bytes <= 8);
1162 uint64_t val = src.op.constantValue64() >> (offset * 8u);
1163 *op = Operand::get_const(ctx->program->gfx_level, val, bytes);
1164 } else {
1165 RegClass op_cls = src.op.regClass().resize(bytes);
1166 *op = Operand(op_reg, op_cls);
1167 op->setTemp(Temp(src.op.tempId(), op_cls));
1168 }
1169 }
1170
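/* Returns a mask with one bit per element of [a_start, a_start + a_size), set for the elements
 * that also lie within [b_start, b_start + b_size). For example, a_start=0, a_size=4, b_start=2,
 * b_size=4 yields 0b1100.
 */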
1171 uint32_t
1172 get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
1173 {
1174 int intersection_start = MAX2(b_start - a_start, 0);
1175 int intersection_end = MAX2(b_start + b_size - a_start, 0);
1176 if (intersection_start >= a_size || intersection_end == 0)
1177 return 0;
1178
1179 uint32_t mask = u_bit_consecutive(0, a_size);
1180 return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
1181 }
1182
1183 /* Selectors 0-3 pick bytes of src1, selectors 4-7 pick bytes of dst/src0. */
1184 void
1185 create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
1186 Operand src0 = Operand(v1))
1187 {
1188 uint32_t swiz_packed =
1189 swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24);
1190
1191 dst = Definition(PhysReg(dst.physReg().reg()), v1);
1192 if (!src1.isConstant())
1193 src1 = Operand(PhysReg(src1.physReg().reg()), v1);
1194 if (src0.isUndefined())
1195 src0 = Operand(dst.physReg(), v1);
1196 else if (!src0.isConstant())
1197 src0 = Operand(PhysReg(src0.physReg().reg()), v1);
1198 bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
1199 }
1200
1201 void
1202 emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
1203 {
1204 /* v_mov_b16 uses 32-bit inline constants. */
1205 if (op.isConstant()) {
1206 if (!op.isLiteral() && op.physReg() >= 240) {
1207 /* v_add_f16 is smaller because it can use 16-bit fp inline constants. */
1208 Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
1209 instr->valu().opsel[3] = dst.physReg().byte() == 2;
1210 return;
1211 }
1212 op = Operand::c32((int32_t)(int16_t)op.constantValue());
1213 }
1214
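/* opsel[0] selects the high 16 bits of the 32-bit source register and opsel[3] the high 16 bits
 * of the destination, matching where the 16-bit value actually lives (byte offset 2).
 */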
1215 Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, dst, op);
1216 instr->valu().opsel[0] = op.physReg().byte() == 2;
1217 instr->valu().opsel[3] = dst.physReg().byte() == 2;
1218 }
1219
1220 void
1221 copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
1222 {
1223 assert(op.bytes() == dst.bytes());
1224
1225 if (dst.bytes() == 4 && op.isLiteral()) {
1226 uint32_t imm = op.constantValue();
1227 if (dst.regClass() == s1 && (imm >= 0xffff8000 || imm <= 0x7fff)) {
1228 bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
1229 return;
1230 } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
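/* The bit-reversed value fits in an inline constant (0..64 or -16..-1), so materialize that
 * inline constant and reverse it instead of emitting a 32-bit literal.
 */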
1231 uint32_t rev = util_bitreverse(imm);
1232 if (dst.regClass() == s1)
1233 bld.sop1(aco_opcode::s_brev_b32, dst, Operand::c32(rev));
1234 else
1235 bld.vop1(aco_opcode::v_bfrev_b32, dst, Operand::c32(rev));
1236 return;
1237 } else if (dst.regClass() == s1) {
1238 unsigned start = (ffs(imm) - 1) & 0x1f;
1239 unsigned size = util_bitcount(imm) & 0x1f;
1240 if (BITFIELD_RANGE(start, size) == imm) {
1241 bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
1242 return;
1243 }
1244 if (ctx->program->gfx_level >= GFX9) {
1245 Operand op_lo = Operand::c32(int32_t(int16_t(imm)));
1246 Operand op_hi = Operand::c32(int32_t(int16_t(imm >> 16)));
1247 if (!op_lo.isLiteral() && !op_hi.isLiteral()) {
1248 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op_lo, op_hi);
1249 return;
1250 }
1251 }
1252 }
1253 }
1254
1255 if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->gfx_level >= GFX8)
1256 op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
1257
1258 if (dst.regClass() == s1) {
1259 bld.sop1(aco_opcode::s_mov_b32, dst, op);
1260 } else if (dst.regClass() == s2) {
1261 /* s_ashr_i64 writes SCC, so we can't use it */
1262 assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
1263 uint64_t imm = op.constantValue64();
1264 if (op.isLiteral()) {
1265 unsigned start = (ffsll(imm) - 1) & 0x3f;
1266 unsigned size = util_bitcount64(imm) & 0x3f;
1267 if (BITFIELD64_RANGE(start, size) == imm) {
1268 bld.sop2(aco_opcode::s_bfm_b64, dst, Operand::c32(size), Operand::c32(start));
1269 return;
1270 }
1271 }
1272 bld.sop1(aco_opcode::s_mov_b64, dst, op);
1273 } else if (dst.regClass() == v2) {
1274 if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
1275 bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
1276 } else {
1277 assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
1278 bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
1279 }
1280 } else if (dst.regClass() == v1) {
1281 bld.vop1(aco_opcode::v_mov_b32, dst, op);
1282 } else {
1283 assert(dst.regClass() == v1b || dst.regClass() == v2b);
1284
1285 bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11;
1286 /* We need the v_perm_b32 (VOP3) to be able to take literals, and that's a GFX10+ feature. */
1287 bool can_use_perm = ctx->program->gfx_level >= GFX10 &&
1288 (op.constantEquals(0) || op.constantEquals(0xff) ||
1289 op.constantEquals(0xffff) || op.constantEquals(0xff00));
1290 if (dst.regClass() == v1b && use_sdwa) {
1291 uint8_t val = op.constantValue();
1292 Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
1293 if (op32.isLiteral()) {
1294 uint32_t a = (uint32_t)int8_mul_table[val * 2];
1295 uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
1296 bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
1297 Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
1298 Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
1299 } else {
1300 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
1301 }
1302 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1303 emit_v_mov_b16(bld, dst, op);
1304 } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
1305 if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
1306 /* use v_mov_b32 to avoid possible issues with denormal flushing or
1307 * NaN. v_add_f16 is still needed for float constants. */
1308 uint32_t val32 = (int32_t)(int16_t)op.constantValue();
1309 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
1310 } else {
1311 bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
1312 }
1313 } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10 &&
1314 (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) {
1315 if (dst.physReg().byte() == 2) {
1316 Operand def_lo(dst.physReg().advance(-2), v2b);
1317 Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op);
1318 instr->valu().opsel = 0;
1319 } else {
1320 assert(dst.physReg().byte() == 0);
1321 Operand def_hi(dst.physReg().advance(2), v2b);
1322 Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi);
1323 instr->valu().opsel = 2;
1324 }
1325 } else if (can_use_perm) {
1326 uint8_t swiz[] = {4, 5, 6, 7};
1327 swiz[dst.physReg().byte()] = op.constantValue() & 0xff ? bperm_255 : bperm_0;
1328 if (dst.bytes() == 2)
1329 swiz[dst.physReg().byte() + 1] = op.constantValue() >> 8 ? bperm_255 : bperm_0;
1330 create_bperm(bld, swiz, dst, Operand::zero());
1331 } else {
1332 uint32_t offset = dst.physReg().byte() * 8u;
1333 uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
1334 uint32_t val = (op.constantValue() << offset) & mask;
1335 dst = Definition(PhysReg(dst.physReg().reg()), v1);
1336 Operand def_op(dst.physReg(), v1);
1337 if (val != mask)
1338 bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
1339 if (val != 0)
1340 bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
1341 }
1342 }
1343 }
1344
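/* Copies a linear VGPR. Linear VGPRs must hold their value in inactive lanes too, so the
 * move is emitted twice with exec inverted in between, covering both the active and the
 * inactive lanes. Because s_not clobbers SCC, SCC can be saved to scratch_sgpr and
 * restored afterwards.
 */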
1345 void
1346 copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
1347 {
1348 if (preserve_scc)
1349 bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
1350
1351 for (unsigned i = 0; i < 2; i++) {
1352 if (def.size() == 2)
1353 bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1354 else
1355 bld.vop1(aco_opcode::v_mov_b32, def, op);
1356
1357 bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
1358 Operand(exec, bld.lm));
1359 }
1360
1361 if (preserve_scc)
1362 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
1363 Operand::zero());
1364 }
1365
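/* Swaps two linear VGPRs using the same exec-inversion approach as copy_linear_vgpr:
 * v_swap_b32 on GFX9+, otherwise a three-instruction xor swap, executed once for the
 * active and once for the inactive lanes.
 */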
1366 void
1367 swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
1368 {
1369 if (preserve_scc)
1370 bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
1371
1372 Operand def_as_op = Operand(def.physReg(), def.regClass());
1373 Definition op_as_def = Definition(op.physReg(), op.regClass());
1374
1375 for (unsigned i = 0; i < 2; i++) {
1376 if (bld.program->gfx_level >= GFX9) {
1377 bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1378 } else {
1379 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1380 bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1381 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1382 }
1383
1384 bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
1385 Operand(exec, bld.lm));
1386 }
1387
1388 if (preserve_scc)
1389 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
1390 Operand::zero());
1391 }
1392
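/* 16-bit add/sub helper for sub-dword swaps on GFX11: opsel bits 0 and 1 select the high
 * half of src0/src1 and bit 3 selects the high half of the destination, so either half of
 * a VGPR can be read or written.
 */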
1393 void
1394 addsub_subdword_gfx11(Builder& bld, Definition dst, Operand src0, Operand src1, bool sub)
1395 {
1396 Instruction* instr =
1397 bld.vop3(sub ? aco_opcode::v_sub_u16_e64 : aco_opcode::v_add_u16_e64, dst, src0, src1).instr;
1398 if (src0.physReg().byte() == 2)
1399 instr->valu().opsel |= 0x1;
1400 if (src1.physReg().byte() == 2)
1401 instr->valu().opsel |= 0x2;
1402 if (dst.physReg().byte() == 2)
1403 instr->valu().opsel |= 0x8;
1404 }
1405
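/* Emits copies for the bytes of "copy" whose destination is no longer needed as a source
 * by any other pending copy (copy.uses[offset] == 0), splitting the region into the
 * largest pieces split_copy() allows. Returns true if at least one copy was emitted.
 */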
1406 bool
1407 do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
1408 PhysReg scratch_sgpr)
1409 {
1410 bool did_copy = false;
1411 for (unsigned offset = 0; offset < copy.bytes;) {
1412 if (copy.uses[offset]) {
1413 offset++;
1414 continue;
1415 }
1416
1417 Definition def;
1418 Operand op;
1419 split_copy(ctx, offset, &def, &op, copy, false, 8);
1420
1421 if (def.physReg() == scc) {
1422 bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
1423 *preserve_scc = true;
1424 } else if (op.isConstant()) {
1425 copy_constant(ctx, bld, def, op);
1426 } else if (def.regClass().is_linear_vgpr()) {
1427 copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr);
1428 } else if (def.regClass() == v1) {
1429 bld.vop1(aco_opcode::v_mov_b32, def, op);
1430 } else if (def.regClass() == v2) {
1431 bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1432 } else if (def.regClass() == s1) {
1433 bld.sop1(aco_opcode::s_mov_b32, def, op);
1434 } else if (def.regClass() == s2) {
1435 bld.sop1(aco_opcode::s_mov_b64, def, op);
1436 } else if (def.regClass().is_subdword() && ctx->program->gfx_level < GFX8) {
1437 if (op.physReg().byte()) {
1438 assert(def.physReg().byte() == 0);
1439 bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);
1440 } else if (def.physReg().byte()) {
1441 assert(op.physReg().byte() == 0);
1442 /* preserve the target's lower half */
1443 uint32_t bits = def.physReg().byte() * 8;
1444 PhysReg lo_reg = PhysReg(def.physReg().reg());
1445 Definition lo_half =
1446 Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
1447 Definition dst =
1448 Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));
1449
1450 if (def.physReg().reg() == op.physReg().reg()) {
1451 bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),
1452 Operand(lo_reg, lo_half.regClass()));
1453 if (def.physReg().byte() == 1) {
1454 bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);
1455 } else if (def.physReg().byte() == 2) {
1456 bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
1457 } else if (def.physReg().byte() == 3) {
1458 bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),
1459 Operand::c32((1 << bits) + 1u));
1460 bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
1461 }
1462 } else {
1463 lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
1464 bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),
1465 Operand(lo_reg, lo_half.regClass()));
1466 bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,
1467 Operand(lo_half.physReg(), lo_half.regClass()),
1468 Operand::c32(4 - def.physReg().byte()));
1469 }
1470 } else {
1471 bld.vop1(aco_opcode::v_mov_b32, def, op);
1472 }
1473 } else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
1474 uint8_t swiz[] = {4, 5, 6, 7};
1475 swiz[def.physReg().byte()] = op.physReg().byte();
1476 create_bperm(bld, swiz, def, op);
1477 } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1478 emit_v_mov_b16(bld, def, op);
1479 } else if (def.regClass().is_subdword()) {
1480 bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
1481 } else {
1482 unreachable("unsupported copy");
1483 }
1484
1485 did_copy = true;
1486 offset += def.bytes();
1487 }
1488 return did_copy;
1489 }
1490
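/* Swaps sub-dword values on GFX11, where SDWA is unavailable. Within a single VGPR a
 * byte permute (v_perm_b32) is enough. A 2-byte swap across VGPRs uses the add/sub trick
 * (a,b -> a+b,b -> a+b,a -> b,a) built from v_add_u16/v_sub_u16. A 1-byte swap across
 * VGPRs temporarily moves both bytes into the same VGPR, swaps them there, and moves
 * them back.
 */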
1491 void
1492 swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
1493 {
1494 if (def.physReg().reg() == op.physReg().reg()) {
1495 assert(def.bytes() != 2); /* handled by caller */
1496 uint8_t swiz[] = {4, 5, 6, 7};
1497 std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
1498 create_bperm(bld, swiz, def, Operand::zero());
1499 return;
1500 }
1501
1502 if (def.bytes() == 2) {
1503 Operand def_as_op = Operand(def.physReg(), def.regClass());
1504 Definition op_as_def = Definition(op.physReg(), op.regClass());
1505 addsub_subdword_gfx11(bld, def, def_as_op, op, false);
1506 addsub_subdword_gfx11(bld, op_as_def, def_as_op, op, true);
1507 addsub_subdword_gfx11(bld, def, def_as_op, op, true);
1508 } else {
1509 PhysReg op_half = op.physReg();
1510 op_half.reg_b &= ~1;
1511
1512 PhysReg def_other_half = def.physReg();
1513 def_other_half.reg_b &= ~1;
1514 def_other_half.reg_b ^= 2;
1515
1516 /* We can only swap individual bytes within a single VGPR, so temporarily move both bytes
1517 * into the same VGPR.
1518 */
1519 swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1520 swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
1521 swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1522 }
1523 }
1524
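/* Swaps the source and destination of a copy in order to break a cycle in the location
 * transfer graph. Some 3-byte swaps are widened to a 4-byte swap plus a 1-byte fixup.
 * Full VGPRs use v_swap_b32 on GFX9+ and an xor swap otherwise; SGPRs use xor swaps or
 * the scratch SGPR, and SCC is swapped through s_cmp_lg_i32. On GFX8+, a final
 * reverse-direction do_copy restores bytes that were swapped but were not part of the
 * original copy.
 */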
1525 void
1526 do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
1527 Pseudo_instruction* pi)
1528 {
1529 unsigned offset = 0;
1530
1531 if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
1532 (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
1533 /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
1534 * swap */
1535 PhysReg op = copy.op.physReg();
1536 PhysReg def = copy.def.physReg();
1537 op.reg_b &= ~0x3;
1538 def.reg_b &= ~0x3;
1539
1540 copy_operation tmp;
1541 tmp.op = Operand(op, v1);
1542 tmp.def = Definition(def, v1);
1543 tmp.bytes = 4;
1544 memset(tmp.uses, 1, 4);
1545 do_swap(ctx, bld, tmp, preserve_scc, pi);
1546
1547 op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1548 def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1549 tmp.op = Operand(op, v1b);
1550 tmp.def = Definition(def, v1b);
1551 tmp.bytes = 1;
1552 tmp.uses[0] = 1;
1553 do_swap(ctx, bld, tmp, preserve_scc, pi);
1554
1555 offset = copy.bytes;
1556 }
1557
1558 for (; offset < copy.bytes;) {
1559 Definition def;
1560 Operand op;
1561 unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
1562 split_copy(ctx, offset, &def, &op, copy, true, max_size);
1563
1564 assert(op.regClass() == def.regClass());
1565 Operand def_as_op = Operand(def.physReg(), def.regClass());
1566 Definition op_as_def = Definition(op.physReg(), op.regClass());
1567 if (def.regClass().is_linear_vgpr()) {
1568 swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr);
1569 } else if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
1570 bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1571 } else if (def.regClass() == v1) {
1572 assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
1573 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1574 bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1575 bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1576 } else if (op.physReg() == scc || def.physReg() == scc) {
1577 /* we need to swap scc and another sgpr */
1578 assert(!preserve_scc);
1579
1580 PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1581
1582 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1583 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
1584 Operand::zero());
1585 bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1586 } else if (def.regClass() == s1) {
1587 if (preserve_scc) {
1588 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1589 bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1590 bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1591 } else {
1592 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1593 bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1594 bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1595 }
1596 } else if (def.regClass() == s2) {
1597 if (preserve_scc)
1598 bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1599 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1600 bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1601 bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1602 if (preserve_scc)
1603 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
1604 Operand::zero());
1605 } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
1606 bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
1607 Operand::c32(2u));
1608 } else {
1609 assert(def.regClass().is_subdword());
1610 if (ctx->program->gfx_level >= GFX11) {
1611 swap_subdword_gfx11(bld, def, op);
1612 } else {
1613 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1614 bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1615 bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1616 }
1617 }
1618
1619 offset += def.bytes();
1620 }
1621
1622 if (ctx->program->gfx_level <= GFX7)
1623 return;
1624
1625 /* fixup in case we swapped bytes we shouldn't have */
1626 copy_operation tmp_copy = copy;
1627 tmp_copy.op.setFixed(copy.def.physReg());
1628 tmp_copy.def.setFixed(copy.op.physReg());
1629 do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1630 }
1631
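/* Packs two 16-bit values (constants or VGPR halves) into one full VGPR. Prefers a single
 * v_pack_b32_f16 when fp16 input denormals are kept, then a single v_alignbyte_b32 when
 * the operands happen to sit in suitable halves, and otherwise falls back to
 * shift/and/or sequences or SDWA/v_mov_b16 moves depending on the GFX level.
 */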
1632 void
1633 do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1634 {
1635 if (lo.isConstant() && hi.isConstant()) {
1636 copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
1637 return;
1638 }
1639
1640 bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1641 (ctx->program->gfx_level >= GFX10 ||
1642 (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));
1643
1644 if (can_use_pack) {
1645 Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1646 /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1647 instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1648 return;
1649 }
1650
1651 /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
1652 if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1653 (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
1654 ctx->program->gfx_level >= GFX10)))) {
1655 if (hi.isConstant())
1656 bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
1657 Operand::c32(2u));
1658 else
1659 bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1660 return;
1661 }
1662
1663 Definition def_lo = Definition(def.physReg(), v2b);
1664 Definition def_hi = Definition(def.physReg().advance(2), v2b);
1665
1666 if (lo.isConstant()) {
1667 /* move hi and zero low bits */
1668 if (hi.physReg().byte() == 0)
1669 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1670 else
1671 bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1672 if (lo.constantValue())
1673 bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1674 Operand(def.physReg(), v1));
1675 return;
1676 }
1677 if (hi.isConstant()) {
1678 /* move lo and zero high bits */
1679 if (lo.physReg().byte() == 2)
1680 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1681 else if (ctx->program->gfx_level >= GFX11)
1682 bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
1683 else
1684 bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1685 if (hi.constantValue())
1686 bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1687 Operand(def.physReg(), v1));
1688 return;
1689 }
1690
1691 if (lo.physReg().reg() == def.physReg().reg()) {
1692 /* lo is in the high bits of def */
1693 assert(lo.physReg().byte() == 2);
1694 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1695 lo.setFixed(def.physReg());
1696 } else if (hi.physReg() == def.physReg()) {
1697 /* hi is in the low bits of def */
1698 assert(hi.physReg().byte() == 0);
1699 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1700 hi.setFixed(def.physReg().advance(2));
1701 } else if (ctx->program->gfx_level >= GFX8) {
1702 /* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
1703 * op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
1704 */
1705 assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1706 Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1707 PhysReg reg = def.physReg().advance(op.physReg().byte());
1708 bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1709 op.setFixed(reg);
1710 }
1711
1712 /* either hi or lo are already placed correctly */
1713 if (ctx->program->gfx_level >= GFX11) {
1714 if (lo.physReg().reg() == def.physReg().reg())
1715 emit_v_mov_b16(bld, def_hi, hi);
1716 else
1717 emit_v_mov_b16(bld, def_lo, lo);
1718 return;
1719 } else if (ctx->program->gfx_level >= GFX8) {
1720 if (lo.physReg().reg() == def.physReg().reg())
1721 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1722 else
1723 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1724 return;
1725 }
1726
1727 /* alignbyte needs the operands in the following way:
1728 * | xx hi | lo xx | >> 2 byte */
1729 if (lo.physReg().byte() != hi.physReg().byte()) {
1730 /* | xx lo | hi xx | => | lo hi | lo hi | */
1731 assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
1732 bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));
1733 lo = Operand(def_hi.physReg(), v2b);
1734 hi = Operand(def_lo.physReg(), v2b);
1735 } else if (lo.physReg().byte() == 0) {
1736 /* | xx hi | xx lo | => | xx hi | lo 00 | */
1737 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);
1738 lo = Operand(def_hi.physReg(), v2b);
1739 } else {
1740 /* | hi xx | lo xx | => | 00 hi | lo xx | */
1741 assert(hi.physReg().byte() == 2);
1742 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);
1743 hi = Operand(def_lo.physReg(), v2b);
1744 }
1745 /* perform the alignbyte */
1746 bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1747 }
1748
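/* Tries to merge "copy" with the copy targeting the register immediately after it into
 * one larger copy, provided the alignment works out, the combined size stays within
 * 8 bytes and, for constants, the combined value is still representable. 64-bit VGPR
 * copies are not created before GFX10.
 */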
1749 void
1750 try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
1751 copy_operation& copy)
1752 {
1753 // TODO try more relaxed alignment for subdword copies
1754 unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
1755 unsigned next_op_align = next_def_align;
1756 if (copy.def.regClass().type() == RegType::vgpr)
1757 next_def_align = MIN2(next_def_align, 4);
1758 if (copy.op.regClass().type() == RegType::vgpr)
1759 next_op_align = MIN2(next_op_align, 4);
1760
1761 if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
1762 (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
1763 return;
1764
1765 auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
1766 if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
1767 copy.op.isConstant() != other->second.op.isConstant())
1768 return;
1769
1770 /* don't create 64-bit copies before GFX10 */
1771 if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
1772 ctx->program->gfx_level < GFX10)
1773 return;
1774
1775 unsigned new_size = copy.bytes + other->second.bytes;
1776 if (copy.op.isConstant()) {
1777 uint64_t val =
1778 copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
1779 if (!util_is_power_of_two_or_zero(new_size))
1780 return;
1781 if (!Operand::is_constant_representable(val, new_size, true,
1782 copy.def.regClass().type() == RegType::vgpr))
1783 return;
1784 copy.op = Operand::get_const(ctx->program->gfx_level, val, new_size);
1785 } else {
1786 if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
1787 return;
1788 copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
1789 }
1790
1791 copy.bytes = new_size;
1792 copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
1793 copy_map.erase(other);
1794 }
1795
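/* Lowers a set of parallel copies. The copies form a location transfer graph: each entry
 * maps a destination register to its source register or constant, and uses[] counts how
 * often each destination byte is still needed as a source. Copies whose destination bytes
 * are no longer read by other entries are emitted first (possibly partially); the entries
 * that remain form cycles, which are resolved by swapping one entry at a time and
 * redirecting the sources of the affected entries. E.g. {v0 <- v1, v1 <- v0} is a single
 * cycle handled by one swap (v_swap_b32 on GFX9+).
 */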
1796 void
1797 handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
1798 amd_gfx_level gfx_level, Pseudo_instruction* pi)
1799 {
1800 Builder bld(ctx->program, &ctx->instructions);
1801 unsigned num_instructions_before = ctx->instructions.size();
1802 aco_ptr<Instruction> mov;
1803 bool writes_scc = false;
1804
1805 /* count the number of uses for each dst reg */
1806 for (auto it = copy_map.begin(); it != copy_map.end();) {
1807
1808 if (it->second.def.physReg() == scc)
1809 writes_scc = true;
1810
1811 assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
1812
1813 /* if src and dst reg are the same, remove operation */
1814 if (it->first == it->second.op.physReg()) {
1815 it = copy_map.erase(it);
1816 continue;
1817 }
1818
1819 /* split large copies */
1820 if (it->second.bytes > 8) {
1821 assert(!it->second.op.isConstant());
1822 assert(!it->second.def.regClass().is_subdword());
1823 RegClass rc = it->second.def.regClass().resize(it->second.def.bytes() - 8);
1824 Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1825 rc = it->second.op.regClass().resize(it->second.op.bytes() - 8);
1826 Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1827 copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1828 copy_map[hi_def.physReg()] = copy;
1829 assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1830 it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().resize(8));
1831 it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().resize(8));
1832 it->second.bytes = 8;
1833 }
1834
1835 try_coalesce_copies(ctx, copy_map, it->second);
1836
1837 /* check if the definition reg is used by another copy operation */
1838 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1839 if (copy.second.op.isConstant())
1840 continue;
1841 for (uint16_t i = 0; i < it->second.bytes; i++) {
1842 /* distance might underflow */
1843 unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1844 if (distance < copy.second.bytes)
1845 it->second.uses[i] += 1;
1846 }
1847 }
1848
1849 ++it;
1850 }
1851
1852 /* first, handle paths in the location transfer graph */
1853 bool preserve_scc = pi->tmp_in_scc && !writes_scc;
1854 bool skip_partial_copies = true;
1855 for (auto it = copy_map.begin();;) {
1856 if (copy_map.empty()) {
1857 ctx->program->statistics[aco_statistic_copies] +=
1858 ctx->instructions.size() - num_instructions_before;
1859 return;
1860 }
1861 if (it == copy_map.end()) {
1862 if (!skip_partial_copies)
1863 break;
1864 skip_partial_copies = false;
1865 it = copy_map.begin();
1866 }
1867
1868 /* check if we can pack one register at once */
1869 if (it->first.byte() == 0 && it->second.bytes == 2) {
1870 PhysReg reg_hi = it->first.advance(2);
1871 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1872 if (other != copy_map.end() && other->second.bytes == 2) {
1873 /* check if the target register is otherwise unused */
1874 bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
1875 other->second.op.physReg() == it->first);
1876 bool unused_hi = !other->second.is_used ||
1877 (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
1878 if (unused_lo && unused_hi) {
1879 Operand lo = it->second.op;
1880 Operand hi = other->second.op;
1881 do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1882 copy_map.erase(it);
1883 copy_map.erase(other);
1884
1885 for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
1886 for (uint16_t i = 0; i < other2.second.bytes; i++) {
1887 /* distance might underflow */
1888 unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
1889 unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
1890 if (distance_lo < 2 || distance_hi < 2)
1891 other2.second.uses[i] -= 1;
1892 }
1893 }
1894 it = copy_map.begin();
1895 continue;
1896 }
1897 }
1898 }
1899
1900 /* on GFX6/7, we need some small workarounds as there is no
1901 * SDWA instruction to do partial register writes */
1902 if (ctx->program->gfx_level < GFX8 && it->second.bytes < 4) {
1903 if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&
1904 pi->opcode == aco_opcode::p_split_vector) {
1905 /* Other operations might overwrite the high bits, so change all users
1906 * of the high bits to the new target where they are still available.
1907 * This mechanism depends on also emitting dead definitions. */
1908 PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
1909 while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
1910 std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
1911 for (other = copy_map.begin(); other != copy_map.end(); other++) {
1912 /* on GFX6/7, if the high bits are used as operand, they cannot be a target */
1913 if (other->second.op.physReg() == reg_hi) {
1914 other->second.op.setFixed(it->first.advance(reg_hi.byte()));
1915 break; /* break because an operand can only be used once */
1916 }
1917 }
1918 reg_hi = reg_hi.advance(it->second.bytes);
1919 }
1920 } else if (it->first.byte()) {
1921 assert(pi->opcode == aco_opcode::p_create_vector);
1922 /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
1923 * move to the target operand's high bits. This is safe to do as it cannot be an operand
1924 */
1925 PhysReg lo = PhysReg(it->first.reg());
1926 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
1927 if (other != copy_map.end()) {
1928 assert(other->second.bytes == it->first.byte());
1929 PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
1930 it->second.def = Definition(new_reg_hi, it->second.def.regClass());
1931 it->second.is_used = 0;
1932 other->second.bytes += it->second.bytes;
1933 other->second.def.setTemp(Temp(other->second.def.tempId(),
1934 RegClass::get(RegType::vgpr, other->second.bytes)));
1935 other->second.op.setTemp(Temp(other->second.op.tempId(),
1936 RegClass::get(RegType::vgpr, other->second.bytes)));
1937 /* if the new target's high bits are also a target, change uses */
1938 std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
1939 if (target != copy_map.end()) {
1940 for (unsigned i = 0; i < it->second.bytes; i++)
1941 target->second.uses[i]++;
1942 }
1943 }
1944 }
1945 }
1946
1947 /* find portions where the target reg is not used as operand for any other copy */
1948 if (it->second.is_used) {
1949 if (it->second.op.isConstant() || skip_partial_copies) {
1950 /* we have to skip constants until is_used=0.
1951 * we also skip partial copies at the beginning to help coalescing */
1952 ++it;
1953 continue;
1954 }
1955
1956 unsigned has_zero_use_bytes = 0;
1957 for (unsigned i = 0; i < it->second.bytes; i++)
1958 has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1959
1960 if (has_zero_use_bytes) {
1961 /* Skipping partial copying and doing a v_swap_b32 and then fixup
1962 * copies is usually beneficial for sub-dword copies, but if doing
1963 * a partial copy allows further copies, it should be done instead. */
1964 bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1965 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1966 /* on GFX6/7, we can only do copies with full registers */
1967 if (partial_copy || ctx->program->gfx_level <= GFX7)
1968 break;
1969 for (uint16_t i = 0; i < copy.second.bytes; i++) {
1970 /* distance might underflow */
1971 unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1972 if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1973 !it->second.uses[distance])
1974 partial_copy = true;
1975 }
1976 }
1977
1978 if (!partial_copy) {
1979 ++it;
1980 continue;
1981 }
1982 } else {
1983 /* full target reg is used: register swapping needed */
1984 ++it;
1985 continue;
1986 }
1987 }
1988
1989 bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1990 skip_partial_copies = did_copy;
1991 std::pair<PhysReg, copy_operation> copy = *it;
1992
1993 if (it->second.is_used == 0) {
1994 /* the target reg is not used as operand for any other copy, so we
1995 * copied to all of it */
1996 copy_map.erase(it);
1997 it = copy_map.begin();
1998 } else {
1999 /* we only performed some portions of this copy, so split it to only
2000 * leave the portions that still need to be done */
2001 copy_operation original = it->second; /* the map insertion below can overwrite this */
2002 copy_map.erase(it);
2003 for (unsigned offset = 0; offset < original.bytes;) {
2004 if (original.uses[offset] == 0) {
2005 offset++;
2006 continue;
2007 }
2008 Definition def;
2009 Operand op;
2010 split_copy(ctx, offset, &def, &op, original, false, 8);
2011
2012 copy_operation new_copy = {op, def, def.bytes()};
2013 for (unsigned i = 0; i < new_copy.bytes; i++)
2014 new_copy.uses[i] = original.uses[i + offset];
2015 copy_map[def.physReg()] = new_copy;
2016
2017 offset += def.bytes();
2018 }
2019
2020 it = copy_map.begin();
2021 }
2022
2023 /* Reduce the number of uses of the operand reg by one. Do this after
2024 * splitting the copy or removing it in case the copy writes to its own
2025 * operand (for example, v[7:8] = v[8:9]) */
2026 if (did_copy && !copy.second.op.isConstant()) {
2027 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
2028 for (uint16_t i = 0; i < other.second.bytes; i++) {
2029 /* distance might underflow */
2030 unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
2031 if (distance < copy.second.bytes && !copy.second.uses[distance])
2032 other.second.uses[i] -= 1;
2033 }
2034 }
2035 }
2036 }
2037
2038 /* all target regs are needed as an operand somewhere, which means all entries are part of a cycle */
2039 unsigned largest = 0;
2040 for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
2041 largest = MAX2(largest, op.second.bytes);
2042
2043 while (!copy_map.empty()) {
2044
2045 /* Perform larger swaps first, because larger swaps can make other
2046 * swaps unnecessary. */
2047 auto it = copy_map.begin();
2048 for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
2049 if (it2->second.bytes > it->second.bytes) {
2050 it = it2;
2051 if (it->second.bytes == largest)
2052 break;
2053 }
2054 }
2055
2056 /* should already be done */
2057 assert(!it->second.op.isConstant());
2058
2059 assert(it->second.op.isFixed());
2060 assert(it->second.def.regClass() == it->second.op.regClass());
2061
2062 if (it->first == it->second.op.physReg()) {
2063 copy_map.erase(it);
2064 continue;
2065 }
2066
2067 if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
2068 assert(!(it->second.def.physReg() == pi->scratch_sgpr));
2069
2070 /* to resolve the cycle, we have to swap the src reg with the dst reg */
2071 copy_operation swap = it->second;
2072
2073 /* if this is self-intersecting, we have to split it because
2074 * self-intersecting swaps don't make sense */
2075 PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
2076 if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
2077 unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
2078
2079 copy_operation remaining;
2080 src.reg_b += offset;
2081 dst.reg_b += offset;
2082 remaining.bytes = swap.bytes - offset;
2083 memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
2084 remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
2085 remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
2086 copy_map[dst] = remaining;
2087
2088 memset(swap.uses + offset, 0, swap.bytes - offset);
2089 swap.bytes = offset;
2090 }
2091
2092 /* GFX6-7 can only swap full registers */
2093 if (ctx->program->gfx_level <= GFX7)
2094 swap.bytes = align(swap.bytes, 4);
2095
2096 do_swap(ctx, bld, swap, preserve_scc, pi);
2097
2098 /* remove from map */
2099 copy_map.erase(it);
2100
2101 /* change the operand reg of the target's uses and split uses if needed */
2102 uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
2103 for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
2104 if (target->second.op.physReg() == swap.def.physReg() &&
2105 swap.bytes == target->second.bytes) {
2106 target->second.op.setFixed(swap.op.physReg());
2107 break;
2108 }
2109
2110 uint32_t imask =
2111 get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
2112 target->second.op.physReg().reg_b, target->second.bytes);
2113
2114 if (!imask)
2115 continue;
2116
2117 int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
2118
2119 /* split and update the middle (the portion that reads the swap's
2120 * definition) to read the swap's operand instead */
2121 int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
2122 int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
2123 int before_bytes = MAX2(-offset, 0);
2124 int after_bytes = MAX2(target_op_end - swap_def_end, 0);
2125 int middle_bytes = target->second.bytes - before_bytes - after_bytes;
2126
2127 if (after_bytes) {
2128 unsigned after_offset = before_bytes + middle_bytes;
2129 assert(after_offset > 0);
2130 copy_operation copy;
2131 copy.bytes = after_bytes;
2132 memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
2133 RegClass rc = target->second.op.regClass().resize(after_bytes);
2134 copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
2135 copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
2136 copy_map[copy.def.physReg()] = copy;
2137 }
2138
2139 if (middle_bytes) {
2140 copy_operation copy;
2141 copy.bytes = middle_bytes;
2142 memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
2143 RegClass rc = target->second.op.regClass().resize(middle_bytes);
2144 copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
2145 copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
2146 copy_map[copy.def.physReg()] = copy;
2147 }
2148
2149 if (before_bytes) {
2150 copy_operation copy;
2151 target->second.bytes = before_bytes;
2152 RegClass rc = target->second.op.regClass().resize(before_bytes);
2153 target->second.op = Operand(target->second.op.physReg(), rc);
2154 target->second.def = Definition(target->second.def.physReg(), rc);
2155 memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
2156 }
2157
2158 /* break early since we know each byte of the swap's definition is used
2159 * at most once */
2160 bytes_left &= ~imask;
2161 if (!bytes_left)
2162 break;
2163 }
2164 }
2165 ctx->program->statistics[aco_statistic_copies] +=
2166 ctx->instructions.size() - num_instructions_before;
2167 }
2168
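/* Programs the float rounding/denormal modes: GFX10+ has dedicated s_round_mode and
 * s_denorm_mode instructions, while older chips update the low bits of the MODE hardware
 * register with a single s_setreg_imm32_b32.
 */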
2169 void
2170 emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
2171 {
2172 if (bld.program->gfx_level >= GFX10) {
2173 if (set_round)
2174 bld.sopp(aco_opcode::s_round_mode, -1, new_mode.round);
2175 if (set_denorm)
2176 bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm);
2177 } else if (set_round || set_denorm) {
2178 /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
2179 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(new_mode.val), (7 << 11) | 1);
2180 }
2181 }
2182
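/* Emits the mode setup for a block. With always_set (used for the first block), the
 * block's fp mode is compared against the config default; for top-level blocks it is
 * also compared against all linear predecessors, so mode changes are only emitted when
 * something actually differs.
 */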
2183 void
2184 emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set)
2185 {
2186 float_mode config_mode;
2187 config_mode.val = program.config->float_mode;
2188
2189 bool set_round = always_set && block->fp_mode.round != config_mode.round;
2190 bool set_denorm = always_set && block->fp_mode.denorm != config_mode.denorm;
2191 if (block->kind & block_kind_top_level) {
2192 for (unsigned pred : block->linear_preds) {
2193 if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
2194 set_round = true;
2195 if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
2196 set_denorm = true;
2197 }
2198 }
2199 /* only allow changing modes at top-level blocks so this doesn't break
2200 * the "jump over empty blocks" optimization */
2201 assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
2202 emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
2203 }
2204
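/* Adds the per-wave scratch offset to the 64-bit scratch base address and writes the
 * result to the hardware flat scratch registers: via s_setreg_b32 (FLAT_SCRATCH_LO/HI)
 * on GFX10+, directly to flat_scr_lo/flat_scr_hi on older chips.
 */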
2205 void
2206 hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scratch_offset)
2207 {
2208 /* Since we know what the high 16 bits of scratch_hi are, we can set all the high 16
2209 * bits in the same instruction in which we add the carry.
2210 */
2211 Operand hi_add = Operand::c32(0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1));
2212 Operand scratch_addr_lo(scratch_addr.physReg(), s1);
2213 Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
2214
2215 if (bld.program->gfx_level >= GFX10) {
2216 PhysReg scratch_lo = def.physReg();
2217 PhysReg scratch_hi = def.physReg().advance(4);
2218
2219 bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo, s1), Definition(scc, s1),
2220 scratch_addr_lo, scratch_offset);
2221 bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi, s1), Definition(scc, s1),
2222 scratch_addr_hi, hi_add, Operand(scc, s1));
2223
2224 /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
2225 * 20/21) */
2226 bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_lo, s1), (31 << 11) | 20);
2227 bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_hi, s1), (31 << 11) | 21);
2228 } else {
2229 bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
2230 scratch_addr_lo, scratch_offset);
2231 bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
2232 scratch_addr_hi, hi_add, Operand(scc, s1));
2233 }
2234 }
2235
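/* Lowers the strict_wqm form of an image instruction: operand 3 is a linear VGPR holding
 * the address and operands 4+ are address components that still have to be copied into
 * it. If the chip supports enough NSA dwords, the components are passed as separate VADDR
 * operands instead; otherwise they are parallel-copied into the linear VGPR, which then
 * serves as a single contiguous address operand.
 */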
2236 void
2237 lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
2238 {
2239 Operand linear_vgpr = instr->operands[3];
2240
2241 unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
2242 unsigned vaddr_size = linear_vgpr.size();
2243 unsigned num_copied_vgprs = instr->operands.size() - 4;
2244 nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
2245 ? nsa_size
2246 : 0;
2247
2248 Operand vaddr[16];
2249 unsigned num_vaddr = 0;
2250
2251 if (nsa_size) {
2252 assert(num_copied_vgprs <= nsa_size);
2253 for (unsigned i = 0; i < num_copied_vgprs; i++)
2254 vaddr[num_vaddr++] = instr->operands[4 + i];
2255 for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
2256 vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
2257 if (vaddr_size > nsa_size) {
2258 RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
2259 vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
2260 }
2261 } else {
2262 PhysReg reg = linear_vgpr.physReg();
2263 std::map<PhysReg, copy_operation> copy_operations;
2264 for (unsigned i = 4; i < instr->operands.size(); i++) {
2265 Operand arg = instr->operands[i];
2266 Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
2267 copy_operations[def.physReg()] = {arg, def, def.bytes()};
2268 reg = reg.advance(arg.bytes());
2269 }
2270 vaddr[num_vaddr++] = linear_vgpr;
2271
2272 Pseudo_instruction pi = {};
2273 handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
2274 }
2275
2276 instr->mimg().strict_wqm = false;
2277
2278 if ((3 + num_vaddr) > instr->operands.size()) {
2279 MIMG_instruction* new_instr = create_instruction<MIMG_instruction>(
2280 instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
2281 std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
2282 new_instr->definitions.begin());
2283 new_instr->operands[0] = instr->operands[0];
2284 new_instr->operands[1] = instr->operands[1];
2285 new_instr->operands[2] = instr->operands[2];
2286 memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
2287 sizeof(MIMG_instruction) - sizeof(Instruction));
2288 instr.reset(new_instr);
2289 } else {
2290 while (instr->operands.size() > (3 + num_vaddr))
2291 instr->operands.pop_back();
2292 }
2293 std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
2294 }
2295
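/* Pass entry point: walks the program's blocks and replaces pseudo instructions (parallel
 * copies, vector create/split/extract, spills/reloads, early exits, POPS helpers, ...)
 * with real hardware instructions, rebuilding each block's instruction list.
 */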
2296 void
2297 lower_to_hw_instr(Program* program)
2298 {
2299 gfx9_pops_done_msg_bounds pops_done_msg_bounds;
2300 if (program->has_pops_overlapped_waves_wait && program->gfx_level < GFX11) {
2301 pops_done_msg_bounds = gfx9_pops_done_msg_bounds(program);
2302 }
2303
2304 Block* discard_exit_block = NULL;
2305 Block* discard_pops_done_and_exit_block = NULL;
2306
2307 int end_with_regs_block_index = -1;
2308
2309 bool should_dealloc_vgprs = dealloc_vgprs(program);
2310
2311 for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
2312 Block* block = &program->blocks[block_idx];
2313 lower_context ctx;
2314 ctx.program = program;
2315 ctx.block = block;
2316 ctx.instructions.reserve(block->instructions.size());
2317 Builder bld(program, &ctx.instructions);
2318
2319 emit_set_mode_from_block(bld, *program, block, (block_idx == 0));
2320
2321 for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
2322 aco_ptr<Instruction>& instr = block->instructions[instr_idx];
2323
2324 /* Send the ordered section done message from the middle of the block if needed (if the
2325 * ordered section is ended by an instruction inside this block).
2326 * Also make sure the done message is sent if it's needed in case early exit happens for
2327 * any reason.
2328 */
2329 if ((block_idx == pops_done_msg_bounds.end_block_idx() &&
2330 instr_idx == pops_done_msg_bounds.instr_after_end_idx()) ||
2331 (instr->opcode == aco_opcode::s_endpgm &&
2332 pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx))) {
2333 bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2334 }
2335
2336 aco_ptr<Instruction> mov;
2337 if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {
2338 Pseudo_instruction* pi = &instr->pseudo();
2339
2340 switch (instr->opcode) {
2341 case aco_opcode::p_extract_vector: {
2342 PhysReg reg = instr->operands[0].physReg();
2343 Definition& def = instr->definitions[0];
2344 reg.reg_b += instr->operands[1].constantValue() * def.bytes();
2345
2346 if (reg == def.physReg())
2347 break;
2348
2349 RegClass op_rc = def.regClass().is_subdword()
2350 ? def.regClass()
2351 : RegClass(instr->operands[0].getTemp().type(), def.size());
2352 std::map<PhysReg, copy_operation> copy_operations;
2353 copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
2354 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2355 break;
2356 }
2357 case aco_opcode::p_create_vector: {
2358 std::map<PhysReg, copy_operation> copy_operations;
2359 PhysReg reg = instr->definitions[0].physReg();
2360
2361 for (const Operand& op : instr->operands) {
2362 if (op.isConstant()) {
2363 const Definition def = Definition(
2364 reg, instr->definitions[0].getTemp().regClass().resize(op.bytes()));
2365 copy_operations[reg] = {op, def, op.bytes()};
2366 reg.reg_b += op.bytes();
2367 continue;
2368 }
2369 if (op.isUndefined()) {
2370 // TODO: coalesce subdword copies if dst byte is 0
2371 reg.reg_b += op.bytes();
2372 continue;
2373 }
2374
2375 RegClass rc_def =
2376 op.regClass().is_subdword()
2377 ? op.regClass()
2378 : instr->definitions[0].getTemp().regClass().resize(op.bytes());
2379 const Definition def = Definition(reg, rc_def);
2380 copy_operations[def.physReg()] = {op, def, op.bytes()};
2381 reg.reg_b += op.bytes();
2382 }
2383 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2384 break;
2385 }
2386 case aco_opcode::p_split_vector: {
2387 std::map<PhysReg, copy_operation> copy_operations;
2388 PhysReg reg = instr->operands[0].physReg();
2389
2390 for (const Definition& def : instr->definitions) {
2391 RegClass rc_op = def.regClass().is_subdword()
2392 ? def.regClass()
2393 : instr->operands[0].getTemp().regClass().resize(def.bytes());
2394 const Operand op = Operand(reg, rc_op);
2395 copy_operations[def.physReg()] = {op, def, def.bytes()};
2396 reg.reg_b += def.bytes();
2397 }
2398 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2399 break;
2400 }
2401 case aco_opcode::p_parallelcopy: {
2402 std::map<PhysReg, copy_operation> copy_operations;
2403 for (unsigned j = 0; j < instr->operands.size(); j++) {
2404 assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
2405 copy_operations[instr->definitions[j].physReg()] = {
2406 instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
2407 }
2408 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2409 break;
2410 }
2411 case aco_opcode::p_start_linear_vgpr: {
2412 if (instr->operands.empty())
2413 break;
2414
2415 Definition def(instr->definitions[0].physReg(),
2416 RegClass::get(RegType::vgpr, instr->definitions[0].bytes()));
2417
2418 std::map<PhysReg, copy_operation> copy_operations;
2419 copy_operations[def.physReg()] = {instr->operands[0], def,
2420 instr->operands[0].bytes()};
2421 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2422 break;
2423 }
2424 case aco_opcode::p_exit_early_if: {
2425 /* don't bother with an early exit near the end of the program */
2426 if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
2427 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
2428 unsigned null_exp_dest =
2429 program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
2430 bool ignore_early_exit = true;
2431
2432 for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
2433 const aco_ptr<Instruction>& instr2 = block->instructions[k];
2434 if (instr2->opcode == aco_opcode::s_endpgm ||
2435 instr2->opcode == aco_opcode::p_logical_end)
2436 continue;
2437 else if (instr2->opcode == aco_opcode::exp &&
2438 instr2->exp().dest == null_exp_dest &&
2439 instr2->exp().enabled_mask == 0)
2440 continue;
2441 else if (instr2->opcode == aco_opcode::p_parallelcopy &&
2442 instr2->definitions[0].isFixed() &&
2443 instr2->definitions[0].physReg() == exec)
2444 continue;
2445
2446 ignore_early_exit = false;
2447 }
2448
2449 if (ignore_early_exit)
2450 break;
2451 }
2452
2453 const bool discard_sends_pops_done =
2454 pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx);
2455
2456 Block* discard_block =
2457 discard_sends_pops_done ? discard_pops_done_and_exit_block : discard_exit_block;
2458 if (!discard_block) {
2459 discard_block = program->create_and_insert_block();
2460 discard_block->kind = block_kind_discard_early_exit;
2461 if (discard_sends_pops_done) {
2462 discard_pops_done_and_exit_block = discard_block;
2463 } else {
2464 discard_exit_block = discard_block;
2465 }
2466 block = &program->blocks[block_idx];
2467
2468 bld.reset(discard_block);
2469 if (program->has_pops_overlapped_waves_wait &&
2470 (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
2471 /* If this discard early exit potentially exits the POPS ordered section, do
2472 * the waitcnt necessary before resuming overlapping waves as the normal
2473 * waitcnt insertion doesn't work in a discard early exit block.
2474 */
2475 if (program->gfx_level >= GFX10)
2476 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
2477 wait_imm pops_exit_wait_imm;
2478 pops_exit_wait_imm.vm = 0;
2479 if (program->has_smem_buffer_or_global_loads)
2480 pops_exit_wait_imm.lgkm = 0;
2481 bld.sopp(aco_opcode::s_waitcnt, -1,
2482 pops_exit_wait_imm.pack(program->gfx_level));
2483 }
2484 if (discard_sends_pops_done)
2485 bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2486 unsigned target = V_008DFC_SQ_EXP_NULL;
2487 if (program->gfx_level >= GFX11)
2488 target =
2489 program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ;
2490 if (program->stage == fragment_fs)
2491 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
2492 target, false, true, true);
2493 if (should_dealloc_vgprs)
2494 bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
2495 bld.sopp(aco_opcode::s_endpgm);
2496
2497 bld.reset(&ctx.instructions);
2498 }
2499
2500 assert(instr->operands[0].physReg() == scc);
2501 bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
2502
2503 discard_block->linear_preds.push_back(block->index);
2504 block->linear_succs.push_back(discard_block->index);
2505 break;
2506 }
2507 case aco_opcode::p_spill: {
2508 assert(instr->operands[0].regClass() == v1.as_linear());
2509 for (unsigned i = 0; i < instr->operands[2].size(); i++) {
2510 Operand src =
2511 instr->operands[2].isConstant()
2512 ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
2513 : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
2514 bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,
2515 Operand::c32(instr->operands[1].constantValue() + i),
2516 instr->operands[0]);
2517 }
2518 break;
2519 }
2520 case aco_opcode::p_reload: {
2521 assert(instr->operands[0].regClass() == v1.as_linear());
2522 for (unsigned i = 0; i < instr->definitions[0].size(); i++)
2523 bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2524 instr->operands[0],
2525 Operand::c32(instr->operands[1].constantValue() + i));
2526 break;
2527 }
2528 case aco_opcode::p_as_uniform: {
2529 if (instr->operands[0].isConstant() ||
2530 instr->operands[0].regClass().type() == RegType::sgpr) {
2531 std::map<PhysReg, copy_operation> copy_operations;
2532 copy_operations[instr->definitions[0].physReg()] = {
2533 instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
2534 handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2535 } else {
2536 assert(instr->operands[0].regClass().type() == RegType::vgpr);
2537 assert(instr->definitions[0].regClass().type() == RegType::sgpr);
2538 assert(instr->operands[0].size() == instr->definitions[0].size());
2539 for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
2540 bld.vop1(aco_opcode::v_readfirstlane_b32,
2541 bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2542 Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
2543 }
2544 }
2545 break;
2546 }
2547 case aco_opcode::p_pops_gfx9_add_exiting_wave_id: {
2548 bld.sop2(aco_opcode::s_add_i32, instr->definitions[0], instr->definitions[1],
2549 Operand(pops_exiting_wave_id, s1), instr->operands[0]);
2550 break;
2551 }
2552 case aco_opcode::p_bpermute_readlane: {
2553 emit_bpermute_readlane(program, instr, bld);
2554 break;
2555 }
2556 case aco_opcode::p_bpermute_shared_vgpr: {
2557 emit_bpermute_shared_vgpr(program, instr, bld);
2558 break;
2559 }
2560 case aco_opcode::p_bpermute_permlane: {
2561 emit_bpermute_permlane(program, instr, bld);
2562 break;
2563 }
2564 case aco_opcode::p_constaddr: {
2565 unsigned id = instr->definitions[0].tempId();
2566 PhysReg reg = instr->definitions[0].physReg();
2567 bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
2568 bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2569 Operand(reg, s1), instr->operands[0], Operand::c32(id));
2570 /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2571 break;
2572 }
2573 case aco_opcode::p_resume_shader_address: {
2574 /* Find index of resume block. */
2575 unsigned resume_idx = instr->operands[0].constantValue();
2576 unsigned resume_block_idx = 0;
2577 for (Block& resume_block : program->blocks) {
2578 if (resume_block.kind & block_kind_resume) {
2579 if (resume_idx == 0) {
2580 resume_block_idx = resume_block.index;
2581 break;
2582 }
2583 resume_idx--;
2584 }
2585 }
2586 assert(resume_block_idx != 0);
2587 unsigned id = instr->definitions[0].tempId();
2588 PhysReg reg = instr->definitions[0].physReg();
2589 bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
2590 bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2591 Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
2592 /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2593 break;
2594 }
2595 case aco_opcode::p_extract: {
2596 assert(instr->operands[1].isConstant());
2597 assert(instr->operands[2].isConstant());
2598 assert(instr->operands[3].isConstant());
2599 if (instr->definitions[0].regClass() == s1)
2600 assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2601 Definition dst = instr->definitions[0];
2602 Operand op = instr->operands[0];
2603 unsigned bits = instr->operands[2].constantValue();
2604 unsigned index = instr->operands[1].constantValue();
2605 unsigned offset = index * bits;
2606 bool signext = !instr->operands[3].constantEquals(0);
2607
2608 if (dst.regClass() == s1) {
2609 if (offset == (32 - bits)) {
2610 bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
2611 bld.def(s1, scc), op, Operand::c32(offset));
2612 } else if (offset == 0 && signext && (bits == 8 || bits == 16)) {
2613 bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
2614 dst, op);
2615 } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2616 bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2617 } else {
2618 bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
2619 bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
2620 }
2621 } else if ((dst.regClass() == v1 && op.physReg().byte() == 0) ||
2622 ctx.program->gfx_level <= GFX7) {
2623 assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
2624 if (offset == (32 - bits) && op.regClass() != s1) {
2625 bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
2626 Operand::c32(offset), op);
2627 } else if (offset == 0 && bits == 16 && ctx.program->gfx_level >= GFX11) {
2628 bld.vop1(signext ? aco_opcode::v_cvt_i32_i16 : aco_opcode::v_cvt_u32_u16, dst,
2629 op);
2630 } else {
2631 bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
2632 Operand::c32(offset), Operand::c32(bits));
2633 }
2634 } else {
2635 assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
2636 op.regClass() == v1b);
2637 if (ctx.program->gfx_level >= GFX11) {
2638 unsigned op_vgpr_byte = op.physReg().byte() + offset / 8;
2639 unsigned sign_byte = op_vgpr_byte + bits / 8 - 1;
2640
2641 uint8_t swiz[4] = {4, 5, 6, 7};
2642 swiz[dst.physReg().byte()] = op_vgpr_byte;
2643 if (bits == 16)
2644 swiz[dst.physReg().byte() + 1] = op_vgpr_byte + 1;
2645 for (unsigned i = bits / 8; i < dst.bytes(); i++) {
2646 uint8_t ext = bperm_0;
2647 if (signext) {
2648 if (sign_byte == 1)
2649 ext = bperm_b1_sign;
2650 else if (sign_byte == 3)
2651 ext = bperm_b3_sign;
2652 else /* replicate so sign-extension can be done later */
2653 ext = sign_byte;
2654 }
2655 swiz[dst.physReg().byte() + i] = ext;
2656 }
2657 create_bperm(bld, swiz, dst, op);
2658
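/* The permute can only replicate the sign of certain (odd) source bytes directly; if the
 * sign byte was merely copied above, a second permute broadcasts its sign bit into the
 * upper destination bytes. */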
2659 if (signext && sign_byte != 3 && sign_byte != 1) {
2660 assert(bits == 8);
2661 assert(dst.regClass() == v2b || dst.regClass() == v1);
2662 uint8_t ext_swiz[4] = {4, 5, 6, 7};
2663 uint8_t ext = dst.physReg().byte() == 2 ? bperm_b7_sign : bperm_b5_sign;
2664 memset(ext_swiz + dst.physReg().byte() + 1, ext, dst.bytes() - 1);
2665 create_bperm(bld, ext_swiz, dst, Operand::zero());
2666 }
2667 } else {
2668 SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa();
2669 sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
2670 }
2671 }
2672 break;
2673 }
2674 case aco_opcode::p_insert: {
2675 assert(instr->operands[1].isConstant());
2676 assert(instr->operands[2].isConstant());
2677 if (instr->definitions[0].regClass() == s1)
2678 assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2679 Definition dst = instr->definitions[0];
2680 Operand op = instr->operands[0];
2681 unsigned bits = instr->operands[2].constantValue();
2682 unsigned index = instr->operands[1].constantValue();
2683 unsigned offset = index * bits;
2684
2685 bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
2686 if (dst.regClass() == s1) {
2687 if (offset == (32 - bits)) {
2688 bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
2689 Operand::c32(offset));
2690 } else if (offset == 0) {
2691 bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2692 Operand::c32(bits << 16));
2693 } else {
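/* General case: mask the value to the low 'bits' bits with s_bfe_u32, then shift it into
 * place. With hypothetical values bits=8 and index=2, offset is 16, i.e. a mask to 8 bits
 * followed by a left shift by 16. */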
2694 bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2695 Operand::c32(bits << 16));
2696 bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),
2697 Operand(dst.physReg(), s1), Operand::c32(offset));
2698 }
2699 } else if (dst.regClass() == v1 || !has_sdwa) {
2700 if (offset == (dst.bytes() * 8u - bits) &&
2701 (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
2702 bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
2703 } else if (offset == 0 && (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
2704 bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2705 } else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
2706 bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
2707 SubdwordSel(bits / 8, offset / 8, false);
2708 } else if (program->gfx_level >= GFX11) {
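/* GFX11: SDWA is gone, so build a byte permute instead: the destination's bytes are zeroed
 * and then filled with the inserted field, while selectors left at their initial 4..7 values
 * are intended to keep the other bytes of the register unchanged. */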
2709 uint8_t swiz[] = {4, 5, 6, 7};
2710 for (unsigned i = 0; i < dst.bytes(); i++)
2711 swiz[dst.physReg().byte() + i] = bperm_0;
2712 for (unsigned i = 0; i < bits / 8; i++)
2713 swiz[dst.physReg().byte() + i + offset / 8] = op.physReg().byte() + i;
2714 create_bperm(bld, swiz, dst, op);
2715 } else {
2716 bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2717 bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
2718 Operand(dst.physReg(), v1));
2719 }
2720 } else {
2721 assert(dst.regClass() == v2b);
2722 bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
2723 ->sdwa()
2724 .sel[1] = SubdwordSel::ubyte;
2725 }
2726 break;
2727 }
2728 case aco_opcode::p_init_scratch: {
2729 assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
2730 if (!program->config->scratch_bytes_per_wave)
2731 break;
2732
2733 Operand scratch_addr = instr->operands[0];
2734 if (scratch_addr.isUndefined()) {
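/* No scratch address operand was provided: load it from the aco scratch-address symbols,
 * which the driver resolves when the binary is uploaded. */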
2735 PhysReg reg = instr->definitions[0].physReg();
2736 bld.sop1(aco_opcode::p_load_symbol, Definition(reg, s1),
2737 Operand::c32(aco_symbol_scratch_addr_lo));
2738 bld.sop1(aco_opcode::p_load_symbol, Definition(reg.advance(4), s1),
2739 Operand::c32(aco_symbol_scratch_addr_hi));
2740 scratch_addr.setFixed(reg);
2741 } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
2742 bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
2743 Operand::zero());
2744 scratch_addr.setFixed(instr->definitions[0].physReg());
2745 }
2746
2747 hw_init_scratch(bld, instr->definitions[0], scratch_addr, instr->operands[1]);
2748 break;
2749 }
2750 case aco_opcode::p_jump_to_epilog: {
2751 if (pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx)) {
2752 bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2753 }
2754 bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
2755 break;
2756 }
2757 case aco_opcode::p_interp_gfx11: {
2758 assert(instr->definitions[0].regClass() == v1 ||
2759 instr->definitions[0].regClass() == v2b);
2760 assert(instr->operands[0].regClass() == v1.as_linear());
2761 assert(instr->operands[1].isConstant());
2762 assert(instr->operands[2].isConstant());
2763 assert(instr->operands.back().physReg() == m0);
2764 Definition dst = instr->definitions[0];
2765 PhysReg lin_vgpr = instr->operands[0].physReg();
2766 unsigned attribute = instr->operands[1].constantValue();
2767 unsigned component = instr->operands[2].constantValue();
2768 uint16_t dpp_ctrl = 0;
2769 Operand coord1, coord2;
2770 if (instr->operands.size() == 6) {
2771 assert(instr->operands[3].regClass() == v1);
2772 assert(instr->operands[4].regClass() == v1);
2773 coord1 = instr->operands[3];
2774 coord2 = instr->operands[4];
2775 } else {
2776 assert(instr->operands[3].isConstant());
2777 dpp_ctrl = instr->operands[3].constantValue();
2778 }
2779
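/* Load the packed parameter data for this attribute/component from LDS into the linear
 * VGPR; below it is either moved with DPP (when a dpp_ctrl constant was given) or
 * interpolated in two steps using the barycentric coordinates. */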
2780 bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
2781 attribute, component);
2782
2783 Operand p(lin_vgpr, v1);
2784 Operand dst_op(dst.physReg(), v1);
2785 if (instr->operands.size() == 5) {
2786 bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
2787 } else if (dst.regClass() == v2b) {
2788 bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
2789 coord1, p);
2790 bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
2791 coord2, dst_op);
2792 } else {
2793 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
2794 p);
2795 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
2796 dst_op);
2797 }
2798 break;
2799 }
2800 case aco_opcode::p_dual_src_export_gfx11: {
2801 PhysReg dst0 = instr->definitions[0].physReg();
2802 PhysReg dst1 = instr->definitions[1].physReg();
2803 Definition exec_tmp = instr->definitions[2];
2804 Definition not_vcc_tmp = instr->definitions[3];
2805 Definition clobber_vcc = instr->definitions[4];
2806 Definition clobber_scc = instr->definitions[5];
2807
2808 assert(exec_tmp.regClass() == bld.lm);
2809 assert(not_vcc_tmp.regClass() == bld.lm);
2810 assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
2811 assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
2812
2813 bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
2814 Operand(exec, bld.lm));
2815 bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
2816 Operand(exec, bld.lm));
2817
2818 uint8_t enabled_channels = 0;
2819 Operand mrt0[4], mrt1[4];
2820
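/* 0x55555555 has every even-numbered bit set, so vcc becomes a lane mask selecting the even
 * lanes (the high half is written too for wave64). */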
2821 bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg(), s1),
2822 Operand::c32(0x55555555));
2823 if (ctx.program->wave_size == 64)
2824 bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1),
2825 Operand::c32(0x55555555));
2826
2827 Operand src_even = Operand(clobber_vcc.physReg(), bld.lm);
2828
2829 bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even);
2830
2831 Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm);
2832
2833 for (unsigned i = 0; i < 4; i++) {
2834 if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
2835 mrt0[i] = instr->operands[i];
2836 mrt1[i] = instr->operands[i + 4];
2837 continue;
2838 }
2839
2840 Operand src0 = instr->operands[i];
2841 Operand src1 = instr->operands[i + 4];
2842
2843 /* | even lanes | odd lanes
2844 * mrt0 | src0 even | src1 even
2845 * mrt1 | src0 odd | src1 odd
2846 */
2847 bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0,
2848 src_even, dpp_row_xmask(1));
2849 bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1,
2850 src_odd, dpp_row_xmask(1));
2851
2852 mrt0[i] = Operand(dst0, v1);
2853 mrt1[i] = Operand(dst1, v1);
2854
2855 enabled_channels |= 1 << i;
2856
2857 dst0 = dst0.advance(4);
2858 dst1 = dst1.advance(4);
2859 }
2860
2861 bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
2862 Operand(exec_tmp.physReg(), bld.lm));
2863
2864 /* Force export all channels when everything is undefined. */
2865 if (!enabled_channels)
2866 enabled_channels = 0xf;
2867
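/* GFX11 dual-source blending uses the two dedicated dual-source MRT export targets
 * (21 and 22 in the export target encoding used here). */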
2868 bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
2869 V_008DFC_SQ_EXP_MRT + 21, false);
2870 bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
2871 V_008DFC_SQ_EXP_MRT + 22, false);
2872 break;
2873 }
2874 case aco_opcode::p_end_with_regs: {
2875 end_with_regs_block_index = block->index;
2876 break;
2877 }
2878 default: break;
2879 }
2880 } else if (instr->isBranch()) {
2881 Pseudo_branch_instruction* branch = &instr->branch();
2882 const uint32_t target = branch->target[0];
2883 const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z ||
2884 branch->opcode == aco_opcode::p_cbranch_nz) &&
2885 branch->operands[0].physReg() == exec);
2886
2887 /* Check if the branch instruction can be removed.
2888 * Removal is beneficial when executing the jumped-over blocks with an empty
2889 * exec mask is faster than the branch instruction itself.
2890 *
2891 * Override this judgement when:
2892 * - The application prefers to remove control flow
2893 * - The compiler stack knows that it's a divergent branch always taken
2894 */
2895 const bool prefer_remove =
2896 branch->selection_control_remove && ctx.program->gfx_level >= GFX10;
2897 bool can_remove = block->index < target;
2898 unsigned num_scalar = 0;
2899 unsigned num_vector = 0;
2900
2901 /* Check the instructions between branch and target */
2902 for (unsigned i = block->index + 1; i < branch->target[0]; i++) {
2903 /* Uniform conditional branches must not be ignored if they
2904 * are about to jump over actual instructions */
2905 if (uniform_branch && !program->blocks[i].instructions.empty())
2906 can_remove = false;
2907
2908 if (!can_remove)
2909 break;
2910
2911 for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
2912 if (inst->isSOPP()) {
2913 /* Discard early exits, as well as loop breaks and continues, should work
2914 * fine with an empty exec mask.
2915 */
2916 bool is_break_continue =
2917 program->blocks[i].kind & (block_kind_break | block_kind_continue);
2918 bool discard_early_exit =
2919 inst->sopp().block != -1 &&
2920 (program->blocks[inst->sopp().block].kind & block_kind_discard_early_exit);
2921 if ((inst->opcode != aco_opcode::s_cbranch_scc0 &&
2922 inst->opcode != aco_opcode::s_cbranch_scc1) ||
2923 (!discard_early_exit && !is_break_continue))
2924 can_remove = false;
2925 } else if (inst->isSALU()) {
2926 num_scalar++;
2927 } else if (inst->isVALU() || inst->isVINTRP()) {
2928 if (inst->opcode == aco_opcode::v_writelane_b32 ||
2929 inst->opcode == aco_opcode::v_writelane_b32_e64) {
2930 /* writelane ignores exec; writing to inactive lanes results in UB. */
2931 can_remove = false;
2932 }
2933 num_vector++;
2934 /* VALU instructions which write SGPRs are always executed on GFX10+ */
2935 if (ctx.program->gfx_level >= GFX10) {
2936 for (Definition& def : inst->definitions) {
2937 if (def.regClass().type() == RegType::sgpr)
2938 num_scalar++;
2939 }
2940 }
2941 } else if (inst->isEXP()) {
2942 /* Export instructions with exec=0 can hang some GFX10+ hardware; behavior on older GPUs is unclear. */
2943 can_remove = false;
2944 } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
2945 inst->isLDSDIR()) {
2946 // TODO: GFX6-9 can use vskip
2947 can_remove = prefer_remove;
2948 } else if (inst->isSMEM()) {
2949 /* SMEM instructions are at least as expensive as branches */
2950 can_remove = prefer_remove;
2951 } else if (inst->isBarrier()) {
2952 can_remove = prefer_remove;
2953 } else {
2954 can_remove = false;
2955 assert(false && "Pseudo instructions should be lowered by this point.");
2956 }
2957
2958 if (!prefer_remove) {
2959 /* When the shader doesn't explicitly prefer flattening, estimate the cycle cost of
2960 * executing the jumped-over code with an empty exec mask and keep the branch if that
2961 * cost is too high. */
2962 unsigned est_cycles;
2963 if (ctx.program->gfx_level >= GFX10)
2964 est_cycles = num_scalar * 2 + num_vector;
2965 else
2966 est_cycles = num_scalar * 4 + num_vector * 4;
2967
2968 if (est_cycles > 16)
2969 can_remove = false;
2970 }
2971
2972 if (!can_remove)
2973 break;
2974 }
2975 }
2976
2977 if (can_remove)
2978 continue;
2979
2980 /* emit branch instruction */
2981 switch (instr->opcode) {
2982 case aco_opcode::p_branch:
2983 assert(block->linear_succs[0] == target);
2984 bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);
2985 break;
2986 case aco_opcode::p_cbranch_nz:
2987 assert(block->linear_succs[1] == target);
2988 if (branch->operands[0].physReg() == exec)
2989 bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);
2990 else if (branch->operands[0].physReg() == vcc)
2991 bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);
2992 else {
2993 assert(branch->operands[0].physReg() == scc);
2994 bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);
2995 }
2996 break;
2997 case aco_opcode::p_cbranch_z:
2998 assert(block->linear_succs[1] == target);
2999 if (branch->operands[0].physReg() == exec)
3000 bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);
3001 else if (branch->operands[0].physReg() == vcc)
3002 bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);
3003 else {
3004 assert(branch->operands[0].physReg() == scc);
3005 bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);
3006 }
3007 break;
3008 default: unreachable("Unknown Pseudo branch instruction!");
3009 }
3010
3011 } else if (instr->isReduction()) {
3012 Pseudo_reduction_instruction& reduce = instr->reduction();
3013 emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
3014 reduce.operands[1].physReg(), // tmp
3015 reduce.definitions[1].physReg(), // stmp
3016 reduce.operands[2].physReg(), // vtmp
3017 reduce.definitions[2].physReg(), // sitmp
3018 reduce.operands[0], reduce.definitions[0]);
3019 } else if (instr->isBarrier()) {
3020 Pseudo_barrier_instruction& barrier = instr->barrier();
3021
3022 /* Anything larger than a workgroup isn't possible. Anything
3023 * smaller requires no instructions and this pseudo instruction
3024 * would only be included to control optimizations. */
3025 bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
3026 program->workgroup_size > program->wave_size;
3027
3028 bld.insert(std::move(instr));
3029 if (emit_s_barrier)
3030 bld.sopp(aco_opcode::s_barrier);
3031 } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
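/* Temporarily force round-to-nearest-even for 16-bit conversions, emit a plain
 * v_cvt_f16_f32, then restore the block's float mode; the mode switch is skipped when the
 * rounding mode already matches. */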
3032 float_mode new_mode = block->fp_mode;
3033 new_mode.round16_64 = fp_round_ne;
3034 bool set_round = new_mode.round != block->fp_mode.round;
3035
3036 emit_set_mode(bld, new_mode, set_round, false);
3037
3038 instr->opcode = aco_opcode::v_cvt_f16_f32;
3039 ctx.instructions.emplace_back(std::move(instr));
3040
3041 emit_set_mode(bld, block->fp_mode, set_round, false);
3042 } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
3043 lower_image_sample(&ctx, instr);
3044 ctx.instructions.emplace_back(std::move(instr));
3045 } else {
3046 ctx.instructions.emplace_back(std::move(instr));
3047 }
3048 }
3049
3050 /* Send the ordered section done message from this block if it's needed in this block, but
3051 * instr_after_end_idx() points beyond the end of its instructions. This may commonly happen
3052 * if the common post-dominator of multiple end locations turns out to be an empty block.
3053 */
3054 if (block_idx == pops_done_msg_bounds.end_block_idx() &&
3055 pops_done_msg_bounds.instr_after_end_idx() >= block->instructions.size()) {
3056 bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
3057 }
3058
3059 block->instructions = std::move(ctx.instructions);
3060 }
3061
3062 /* If the block containing p_end_with_regs is not the last block (p_exit_early_if may have
3063 * appended an exit block after it), create an exit block for it to branch to.
3064 */
3065 int last_block_index = program->blocks.size() - 1;
3066 if (end_with_regs_block_index >= 0 && end_with_regs_block_index != last_block_index) {
3067 Block* exit_block = program->create_and_insert_block();
3068 Block* end_with_regs_block = &program->blocks[end_with_regs_block_index];
3069 exit_block->linear_preds.push_back(end_with_regs_block->index);
3070 end_with_regs_block->linear_succs.push_back(exit_block->index);
3071
3072 Builder bld(program, end_with_regs_block);
3073 bld.sopp(aco_opcode::s_branch, exit_block->index);
3074
3075 /* Let the waitcnt insertion pass place its waitcnt in the exit block; otherwise the waitcnt
3076 * would be added after the s_branch, where it would never be executed.
3077 */
3078 end_with_regs_block->kind &= ~block_kind_end_with_regs;
3079 exit_block->kind |= block_kind_end_with_regs;
3080 }
3081 }
3082
3083 } // namespace aco
3084