/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Daniel Schürmann (daniel.schuermann@campus.tu-berlin.de)
 *
 */

#include <map>

#include "aco_ir.h"
#include "aco_builder.h"
#include "util/u_math.h"
#include "sid.h"


namespace aco {

struct lower_context {
   Program *program;
   Block *block;
   std::vector<aco_ptr<Instruction>> instructions;
};

/* used by handle_operands() indirectly through Builder::copy */
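/* For an 8-bit value v, entries [2*v] and [2*v+1] form a factor pair: copy_constant()
 * below sign-extends both bytes (entries >= 240 stand for -16..-1) and multiplies them
 * with v_mul_u32_u24, so the product's low byte equals v. Every entry stays within the
 * inline-constant range (-16..64), which avoids emitting a 32-bit literal for the copy. */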
uint8_t int8_mul_table[512] = {
   0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9, 1, 10, 1, 11,
   1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19, 1, 20, 1, 21,
   1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29, 1, 30, 1, 31,
   1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39, 1, 40, 1, 41,
   1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49, 1, 50, 1, 51,
   1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59, 1, 60, 1, 61,
   1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23, 2, 35, 11, 53,
   2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243, 2, 40, 3, 27,
   2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23, 2, 45, 7, 13,
   2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33, 2, 50, 7, 51,
   2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27, 2, 55, 3, 37,
   2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17, 2, 60, 9, 241,
   2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43, 5, 26, 9, 43,
   3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243, 4, 35, 3, 47,
   22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45, 3, 50, 7, 241,
   4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53, 4, 40, 7, 23,
   3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13, 5, 34, 3, 57,
   4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245, 3, 60, 5, 241,
   7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63, 5, 38, 5, 243,
   3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35, 4, 242, 5, 245,
   6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19, 5, 42, 3, 241,
   4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43, 3, 244,
   5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,
   2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247,
   9, 55, 1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247,
   1, 248, 1, 249, 1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255
};


aco_opcode get_reduce_opcode(chip_class chip, ReduceOp op) {
   /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
    * 32-bit opcodes (VOP2), which allows us to remove the temporary VGPR and
    * to use DPP with the arithmetic instructions. This requires sign-extension.
    */
   switch (op) {
   case iadd8:
   case iadd16:
      if (chip >= GFX10) {
         return aco_opcode::v_add_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_add_u16;
      } else {
         return aco_opcode::v_add_co_u32;
      }
      break;
   case imul8:
   case imul16:
      if (chip >= GFX10) {
         return aco_opcode::v_mul_lo_u16_e64;
      } else if (chip >= GFX8) {
         return aco_opcode::v_mul_lo_u16;
      } else {
         return aco_opcode::v_mul_u32_u24;
      }
      break;
   case fadd16: return aco_opcode::v_add_f16;
   case fmul16: return aco_opcode::v_mul_f16;
   case imax8:
   case imax16:
      if (chip >= GFX10) {
         return aco_opcode::v_max_i32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_max_i16;
      } else {
         return aco_opcode::v_max_i32;
      }
      break;
   case imin8:
   case imin16:
      if (chip >= GFX10) {
         return aco_opcode::v_min_i32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_min_i16;
      } else {
         return aco_opcode::v_min_i32;
      }
      break;
   case umin8:
   case umin16:
      if (chip >= GFX10) {
         return aco_opcode::v_min_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_min_u16;
      } else {
         return aco_opcode::v_min_u32;
      }
      break;
   case umax8:
   case umax16:
      if (chip >= GFX10) {
         return aco_opcode::v_max_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_max_u16;
      } else {
         return aco_opcode::v_max_u32;
      }
      break;
   case fmin16: return aco_opcode::v_min_f16;
   case fmax16: return aco_opcode::v_max_f16;
   case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
   case imul32: return aco_opcode::v_mul_lo_u32;
   case fadd32: return aco_opcode::v_add_f32;
   case fmul32: return aco_opcode::v_mul_f32;
   case imax32: return aco_opcode::v_max_i32;
   case imin32: return aco_opcode::v_min_i32;
   case umin32: return aco_opcode::v_min_u32;
   case umax32: return aco_opcode::v_max_u32;
   case fmin32: return aco_opcode::v_min_f32;
   case fmax32: return aco_opcode::v_max_f32;
   case iand8:
   case iand16:
   case iand32: return aco_opcode::v_and_b32;
   case ixor8:
   case ixor16:
   case ixor32: return aco_opcode::v_xor_b32;
   case ior8:
   case ior16:
   case ior32: return aco_opcode::v_or_b32;
   case iadd64: return aco_opcode::num_opcodes;
   case imul64: return aco_opcode::num_opcodes;
   case fadd64: return aco_opcode::v_add_f64;
   case fmul64: return aco_opcode::v_mul_f64;
   case imin64: return aco_opcode::num_opcodes;
   case imax64: return aco_opcode::num_opcodes;
   case umin64: return aco_opcode::num_opcodes;
   case umax64: return aco_opcode::num_opcodes;
   case fmin64: return aco_opcode::v_min_f64;
   case fmax64: return aco_opcode::v_max_f64;
   case iand64: return aco_opcode::num_opcodes;
   case ior64: return aco_opcode::num_opcodes;
   case ixor64: return aco_opcode::num_opcodes;
   default: return aco_opcode::num_opcodes;
   }
}

bool is_vop3_reduce_opcode(aco_opcode opcode)
{
   /* 64-bit reductions are VOP3. */
   if (opcode == aco_opcode::num_opcodes)
      return true;

   return instr_info.format[(int)opcode] == Format::VOP3;
}

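/* Helper that emits a 32-bit VALU addition and, when the instruction carries a
 * carry-out definition, pins it to vcc so a following v_addc_co_u32 can read it. */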
void emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
{
   Instruction *instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
   if (instr->definitions.size() >= 2) {
      assert(instr->definitions[1].regClass() == bld.lm);
      instr->definitions[1].setFixed(vcc);
   }
}

void emit_int64_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
                       PhysReg vtmp_reg, ReduceOp op,
                       unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl,
                       Operand *identity=NULL)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)};
   Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg+1}, v1)};
   Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg+1}, v1)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)};
   Operand src1_64 = Operand(src1_reg, v2);
   Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg+1}, v1)};
   Operand vtmp_op64 = Operand(vtmp_reg, v2);
   if (op == iadd64) {
      if (ctx->program->chip_class >= GFX10) {
         if (identity)
            bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
                      dpp_ctrl, row_mask, bank_mask, bound_ctrl);
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
      } else {
         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
                      dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      }
      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm),
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else if (op == iand64) {
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else if (op == ior64) {
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else if (op == ixor64) {
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64:
         cmp = aco_opcode::v_cmp_gt_u64;
         break;
      case umax64:
         cmp = aco_opcode::v_cmp_lt_u64;
         break;
      case imin64:
         cmp = aco_opcode::v_cmp_gt_i64;
         break;
      case imax64:
         cmp = aco_opcode::v_cmp_lt_i64;
         break;
      default:
         break;
      }

      if (identity) {
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
      }
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);

      bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      /* t4 = dpp(x_hi)
       * t1 = umul_lo(t4, y_lo)
       * t3 = dpp(x_lo)
       * t0 = umul_lo(t3, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(t3, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(t3, y_lo)
       * Requires that res_hi != src0[0] and res_hi != src1[0]
       * and that vtmp[0] != res_hi.
       */
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
      emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
      emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0],
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
   }
}

void emit_int64_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp, ReduceOp op)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg+1}, v1)};
   RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
   Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg+1}, src0_rc)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg+1}, v1)};
   Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
   Operand src1_64 = Operand(src1_reg, v2);

   if (src0_rc == s1 &&
       (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]);
      src0_reg = vtmp;
      src0[0] = Operand(vtmp, v1);
      src0[1] = Operand(PhysReg{vtmp+1}, v1);
      src0_64 = Operand(vtmp, v2);
   } else if (src0_rc == s1 && op == iadd64) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), src0[1]);
      src0[1] = Operand(PhysReg{vtmp+1}, v1);
   }

   if (op == iadd64) {
      if (ctx->program->chip_class >= GFX10) {
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      } else {
         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      }
      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == iand64) {
      bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
   } else if (op == ior64) {
      bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
   } else if (op == ixor64) {
      bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64:
         cmp = aco_opcode::v_cmp_gt_u64;
         break;
      case umax64:
         cmp = aco_opcode::v_cmp_lt_u64;
         break;
      case imin64:
         cmp = aco_opcode::v_cmp_gt_i64;
         break;
      case imax64:
         cmp = aco_opcode::v_cmp_lt_i64;
         break;
      default:
         break;
      }

      bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      if (src1_reg == dst_reg) {
         /* it's fine if src0==dst but not if src1==dst */
         std::swap(src0_reg, src1_reg);
         std::swap(src0[0], src1[0]);
         std::swap(src0[1], src1[1]);
         std::swap(src0_64, src1_64);
      }
      assert(!(src0_reg == src1_reg));
      /* t1 = umul_lo(x_hi, y_lo)
       * t0 = umul_lo(x_lo, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(x_lo, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(x_lo, y_lo)
       * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
       */
      Definition tmp0_def(PhysReg{src0_reg+1}, v1);
      Definition tmp1_def(PhysReg{src1_reg+1}, v1);
      Operand tmp0_op = src0[1];
      Operand tmp1_op = src1[1];
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
      emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
      bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
      emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
   }
}

void emit_dpp_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
                 PhysReg vtmp, ReduceOp op, unsigned size,
                 unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl,
                 Operand *identity=NULL) /* for VOP3 with sparse writes */
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, rc);
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (!vop3) {
      if (opcode == aco_opcode::v_add_co_u32)
         bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      else
         bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      return;
   }

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op,
                        dpp_ctrl, row_mask, bank_mask, bound_ctrl, identity);
      return;
   }

   if (identity)
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
   if (identity && size >= 2)
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+1}, v1), identity[1]);

   for (unsigned i = 0; i < size; i++)
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{src0_reg+i}, v1),
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);

   bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
}

void emit_op(lower_context *ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
             PhysReg vtmp, ReduceOp op, unsigned size)
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
      return;
   }

   if (vop3) {
      bld.vop3(opcode, dst, src0, src1);
   } else if (opcode == aco_opcode::v_add_co_u32) {
      bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
   } else {
      bld.vop2(opcode, dst, src0, src1);
   }
}

void emit_dpp_mov(lower_context *ctx, PhysReg dst, PhysReg src0, unsigned size,
                  unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
{
   Builder bld(ctx->program, &ctx->instructions);
   for (unsigned i = 0; i < size; i++) {
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst+i}, v1), Operand(PhysReg{src0+i}, v1),
                   dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
}

void emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
{
   for (unsigned i = 0; i < size; i++) {
      bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst+i}, v1),
             Operand(PhysReg{src+i}, v1), ds_pattern);
   }
}

void emit_reduction(lower_context *ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size, PhysReg tmp,
                    PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
{
   assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
   assert(cluster_size <= ctx->program->wave_size);

   Builder bld(ctx->program, &ctx->instructions);

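   /* get_reduction_identity() returns one dword of the identity value: index 0 is the
    * low dword and index 1 the high dword, which only matters for 64-bit reductions. */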
   Operand identity[2];
   identity[0] = Operand(get_reduction_identity(reduce_op, 0));
   identity[1] = Operand(get_reduction_identity(reduce_op, 1));
   Operand vcndmask_identity[2] = {identity[0], identity[1]};

   /* First, copy the source to tmp and set inactive lanes to the identity */
   bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1), Definition(exec, bld.lm), Operand(UINT64_MAX), Operand(exec, bld.lm));

   for (unsigned i = 0; i < src.size(); i++) {
      /* p_exclusive_scan needs the identity to be an SGPR or inline constant for
       * v_writelane_b32, except on GFX10, where v_writelane_b32 can take a literal. */
      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan && ctx->program->chip_class < GFX10) {
         bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp+i}, s1), identity[i]);
         identity[i] = Operand(PhysReg{sitmp+i}, s1);

         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
      } else if (identity[i].isLiteral()) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp+i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp+i}, v1);
      }
   }

   for (unsigned i = 0; i < src.size(); i++) {
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
                   vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
                   Operand(stmp, bld.lm));
   }

   if (src.regClass() == v1b) {
      if (ctx->program->chip_class >= GFX8) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         if (reduce_op == imin8 || reduce_op == imax8)
            sdwa->sel[0] = sdwa_sbyte;
         else
            sdwa->sel[0] = sdwa_ubyte;
         sdwa->dst_sel = sdwa_udword;
         bld.insert(std::move(sdwa));
      } else {
         aco_opcode opcode;

         if (reduce_op == imin8 || reduce_op == imax8)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1),
                  Operand(PhysReg{tmp}, v1), Operand(0u), Operand(8u));
      }
   } else if (src.regClass() == v2b) {
      if (ctx->program->chip_class >= GFX10 &&
          (reduce_op == iadd16 || reduce_op == imax16 ||
           reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16)) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
            sdwa->sel[0] = sdwa_sword;
         else
            sdwa->sel[0] = sdwa_uword;
         sdwa->dst_sel = sdwa_udword;
         bld.insert(std::move(sdwa));
      } else if (ctx->program->chip_class == GFX6 || ctx->program->chip_class == GFX7) {
         aco_opcode opcode;

         if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1),
                  Operand(PhysReg{tmp}, v1), Operand(0u), Operand(16u));
      }
   }

   bool reduction_needs_last_op = false;
   switch (op) {
   case aco_opcode::p_reduce:
      if (cluster_size == 1) break;

      if (ctx->program->chip_class <= GFX7) {
         reduction_needs_last_op = true;
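         /* ds_swizzle with bit 15 of the offset set selects quad-perm mode: the low
          * 8 bits hold four 2-bit lane selects, the same encoding as DPP quad_perm. */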
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
         if (cluster_size == 2) break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
         if (cluster_size == 4) break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
         if (cluster_size == 8) break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
         if (cluster_size == 16) break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         if (cluster_size == 32) break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1), Operand(0u));
         // TODO: it would be more efficient to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         reduction_needs_last_op = false;
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf, 0xf, false);
      if (cluster_size == 2) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf, 0xf, false);
      if (cluster_size == 4) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf, false);
      if (cluster_size == 8) break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
      if (cluster_size == 16) break;

      if (ctx->program->chip_class >= GFX10) {
         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1), Operand(0u), Operand(0u));

         if (cluster_size == 32) {
            reduction_needs_last_op = true;
            break;
         }

         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u));
         // TODO: it would be more efficient to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         break;
      }

      if (cluster_size == 32) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         reduction_needs_last_op = true;
         break;
      }
      assert(cluster_size == 64);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf, false);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf, false);
      break;
   case aco_opcode::p_exclusive_scan:
      if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
         /* shift rows right */
         emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);

         /* fill in the gaps in rows 1 and 3 */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10000u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(0x10000u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
                                         Definition(PhysReg{vtmp+i}, v1),
                                         Operand(PhysReg{tmp+i}, v1),
                                         Operand(0xffffffffu), Operand(0xffffffffu)).instr;
            static_cast<VOP3A_instruction*>(perm)->opsel = 1; /* FI (Fetch Inactive) */
         }
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(UINT64_MAX));

         if (ctx->program->wave_size == 64) {
            /* fill in the gap in row 2 */
            for (unsigned i = 0; i < src.size(); i++) {
               bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
               bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
            }
         }
         std::swap(tmp, vtmp);
      } else if (ctx->program->chip_class >= GFX8) {
         emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
      } else {
         // TODO: use LDS on CS with a single write and shifted read
         /* wavefront shift_right by 1 on SI/CI */
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
         emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x10101010u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0x01000100u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(), ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(1u), Operand(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(1u), Operand(16u));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{tmp+i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
         for (unsigned i = 0; i < src.size(); i++) {
            bld.writelane(Definition(PhysReg{vtmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{vtmp+i}, v1));
            bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(0u));
            bld.writelane(Definition(PhysReg{vtmp+i}, v1), Operand(PhysReg{sitmp+i}, s1), Operand(32u), Operand(PhysReg{vtmp+i}, v1));
            identity[i] = Operand(0u); /* prevent further uses of identity */
         }
         std::swap(tmp, vtmp);
      }

      for (unsigned i = 0; i < src.size(); i++) {
         if (!identity[i].isConstant() || identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
            if (ctx->program->chip_class < GFX10)
               assert((identity[i].isConstant() && !identity[i].isLiteral()) || identity[i].physReg() == PhysReg{sitmp+i});
            bld.writelane(Definition(PhysReg{tmp+i}, v1), identity[i], Operand(0u), Operand(PhysReg{tmp+i}, v1));
         }
      }
      /* fall through */
   case aco_opcode::p_inclusive_scan:
      assert(cluster_size == ctx->program->wave_size);
      if (ctx->program->chip_class <= GFX7) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xAAAAAAAAu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xCCCCCCCCu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xF0F0F0F0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand(0xFF00FF00u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
         bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
         emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                  dpp_row_sr(1), 0xf, 0xf, false, identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                  dpp_row_sr(2), 0xf, 0xf, false, identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                  dpp_row_sr(4), 0xf, 0xf, false, identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                  dpp_row_sr(8), 0xf, 0xf, false, identity);
      if (ctx->program->chip_class >= GFX10) {
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand(16u), Operand(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand(16u), Operand(16u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction *perm = bld.vop3(aco_opcode::v_permlanex16_b32,
                                         Definition(PhysReg{vtmp+i}, v1),
                                         Operand(PhysReg{tmp+i}, v1),
                                         Operand(0xffffffffu), Operand(0xffffffffu)).instr;
            static_cast<VOP3A_instruction*>(perm)->opsel = 1; /* FI (Fetch Inactive) */
         }
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         if (ctx->program->wave_size == 64) {
            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
            for (unsigned i = 0; i < src.size(); i++)
               bld.readlane(Definition(PhysReg{sitmp+i}, s1), Operand(PhysReg{tmp+i}, v1), Operand(31u));
            emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         }
      } else {
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                     dpp_row_bcast15, 0xa, 0xf, false, identity);
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(),
                     dpp_row_bcast31, 0xc, 0xf, false, identity);
      }
      break;
   default:
      unreachable("Invalid reduction mode");
   }


   if (op == aco_opcode::p_reduce) {
      if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
         emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         return;
      }

      if (reduction_needs_last_op)
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
   }

   /* restore exec */
   bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));

   if (dst.regClass().type() == RegType::sgpr) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1),
                      Operand(PhysReg{tmp + k}, v1), Operand(ctx->program->wave_size - 1));
      }
   } else if (dst.physReg() != tmp) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
                  Operand(PhysReg{tmp + k}, v1));
      }
   }
}

void emit_gfx10_wave64_bpermute(Program *program, aco_ptr<Instruction> &instr, Builder &bld)
{
   /* Emulates proper bpermute on GFX10 in wave64 mode.
    *
    * This is necessary because on GFX10 the bpermute instruction only works
    * on half waves (you can think of it as having a cluster size of 32), so we
    * manually swap the data between the two halves using two shared VGPRs.
    */

   assert(program->chip_class >= GFX10);
   assert(program->wave_size == 64);

   unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
   Definition dst = instr->definitions[0];
   Definition tmp_exec = instr->definitions[1];
   Definition clobber_scc = instr->definitions[2];
   Operand index_x4 = instr->operands[0];
   Operand input_data = instr->operands[1];
   Operand same_half = instr->operands[2];

   assert(dst.regClass() == v1);
   assert(tmp_exec.regClass() == bld.lm);
   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
   assert(same_half.regClass() == bld.lm);
   assert(index_x4.regClass() == v1);
   assert(input_data.regClass().type() == RegType::vgpr);
   assert(input_data.bytes() <= 4);
   assert(dst.physReg() != index_x4.physReg());
   assert(dst.physReg() != input_data.physReg());
   assert(tmp_exec.physReg() != same_half.physReg());

   PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
   PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);

   /* Permute the input within the same half-wave */
   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);

   /* HI: Copy data from high lanes 32-63 to shared vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data, dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
   /* Save EXEC */
   bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
   /* Set EXEC to enable LO lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(0u));
   /* LO: Copy data from low lanes 0-31 to shared vgpr */
   bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
   /* LO: bpermute shared vgpr (high lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4, Operand(shared_vgpr_hi, v1));
   /* Set EXEC to enable HI lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand(32u), Operand(32u));
   /* HI: bpermute shared vgpr (low lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4, Operand(shared_vgpr_lo, v1));

   /* Only enable lanes which use the other half's data */
   bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc, Operand(tmp_exec.physReg(), s2), same_half);
   /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3), 0x3, 0xf, false);
   /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);

   /* Restore saved EXEC */
   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));

   /* RA assumes that the result is always in the low part of the register, so we have to shift if it's not there already */
   if (input_data.physReg().byte()) {
      unsigned right_shift = input_data.physReg().byte() * 8;
      bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand(right_shift), Operand(dst.physReg(), v1));
   }
}

void emit_gfx6_bpermute(Program *program, aco_ptr<Instruction> &instr, Builder &bld)
{
   /* Emulates bpermute using readlane instructions */

   Operand index = instr->operands[0];
   Operand input = instr->operands[1];
   Definition dst = instr->definitions[0];
   Definition temp_exec = instr->definitions[1];
   Definition clobber_vcc = instr->definitions[2];

   assert(dst.regClass() == v1);
   assert(temp_exec.regClass() == bld.lm);
   assert(clobber_vcc.regClass() == bld.lm);
   assert(clobber_vcc.physReg() == vcc);
   assert(index.regClass() == v1);
   assert(index.physReg() != dst.physReg());
   assert(input.regClass().type() == RegType::vgpr);
   assert(input.bytes() <= 4);
   assert(input.physReg() != dst.physReg());

   /* Save original EXEC */
   bld.sop1(aco_opcode::s_mov_b64, temp_exec, Operand(exec, s2));

   /* An "unrolled loop" that is executed once for each lane.
    * This takes only a few instructions per lane, as opposed to a "real" loop
    * with branching, where the branch instruction alone would take 16+ cycles.
    */
   for (unsigned n = 0; n < program->wave_size; ++n) {
      /* Activate the lane which has N for its source index */
      bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), clobber_vcc, Operand(n), index);
      /* Read the data from lane N */
      bld.readlane(Definition(vcc, s1), input, Operand(n));
      /* On the active lane, move the data we read from lane N to the destination VGPR */
      bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
      /* Restore original EXEC */
      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(temp_exec.physReg(), s2));
   }
}

struct copy_operation {
   Operand op;
   Definition def;
   unsigned bytes;
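   /* per-byte use counts for the destination; the union lets is_used test all
    * eight bytes at once */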
   union {
      uint8_t uses[8];
      uint64_t is_used = 0;
   };
};

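/* Splits off the largest aligned power-of-two sized piece of the copy starting at the
 * given byte offset. VGPR pieces are clamped to a dword and SGPR pieces to two dwords,
 * so e.g. a dword-aligned 8-byte VGPR copy is emitted as two v1 copies. */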
void split_copy(unsigned offset, Definition *def, Operand *op, const copy_operation& src, bool ignore_uses, unsigned max_size)
{
   PhysReg def_reg = src.def.physReg();
   PhysReg op_reg = src.op.physReg();
   def_reg.reg_b += offset;
   op_reg.reg_b += offset;

   max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);

   /* make sure the size is a power of two and reg % bytes == 0 */
   unsigned bytes = 1;
   for (; bytes <= max_size; bytes *= 2) {
      unsigned next = bytes * 2u;
      bool can_increase = def_reg.reg_b % next == 0 &&
                          offset + next <= src.bytes && next <= max_size;
      if (!src.op.isConstant() && can_increase)
         can_increase = op_reg.reg_b % next == 0;
      for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
         can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
      if (!can_increase)
         break;
   }

   RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) :
                      RegClass(src.def.regClass().type(), bytes).as_subdword();
   *def = Definition(src.def.tempId(), def_reg, def_cls);
   if (src.op.isConstant()) {
      assert(bytes >= 1 && bytes <= 8);
      if (bytes == 8)
         *op = Operand(src.op.constantValue64() >> (offset * 8u));
      else if (bytes == 4)
         *op = Operand(uint32_t(src.op.constantValue64() >> (offset * 8u)));
      else if (bytes == 2)
         *op = Operand(uint16_t(src.op.constantValue64() >> (offset * 8u)));
      else if (bytes == 1)
         *op = Operand(uint8_t(src.op.constantValue64() >> (offset * 8u)));
   } else {
      RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) :
                        RegClass(src.op.regClass().type(), bytes).as_subdword();
      *op = Operand(op_reg, op_cls);
      op->setTemp(Temp(src.op.tempId(), op_cls));
   }
}

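/* Returns a bitmask of the bytes of region A that are also covered by region B,
 * e.g. a_start=0, a_size=4, b_start=2, b_size=4 yields 0b1100. */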
uint32_t get_intersection_mask(int a_start, int a_size,
                               int b_start, int b_size)
{
   int intersection_start = MAX2(b_start - a_start, 0);
   int intersection_end = MAX2(b_start + b_size - a_start, 0);
   if (intersection_start >= a_size || intersection_end == 0)
      return 0;

   uint32_t mask = u_bit_consecutive(0, a_size);
   return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
}

void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op)
{
   assert(op.bytes() == dst.bytes());

   if (dst.regClass() == s1 && op.isLiteral()) {
      uint32_t imm = op.constantValue();
      if (imm >= 0xffff8000 || imm <= 0x7fff) {
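         /* the value fits into the signed 16-bit immediate that s_movk_i32 sign-extends */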
         bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
         return;
      } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
         uint32_t rev = util_bitreverse(imm);
         bld.sop1(aco_opcode::s_brev_b32, dst, Operand(rev));
         return;
      } else if (imm != 0) {
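         /* a contiguous run of set bits can be produced with s_bfm_b32,
          * e.g. 0x0001ffe0 (12 bits starting at bit 5) becomes s_bfm_b32 dst, 12, 5 */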
         unsigned start = (ffs(imm) - 1) & 0x1f;
         unsigned size = util_bitcount(imm) & 0x1f;
         if ((((1u << size) - 1u) << start) == imm) {
            bld.sop2(aco_opcode::s_bfm_b32, dst, Operand(size), Operand(start));
            return;
         }
      }
   }

   if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->chip_class >= GFX8)
      op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */

   if (dst.regClass() == s1) {
      bld.sop1(aco_opcode::s_mov_b32, dst, op);
   } else if (dst.regClass() == s2) {
      bld.sop1(aco_opcode::s_mov_b64, dst, op);
   } else if (dst.regClass() == v1) {
      bld.vop1(aco_opcode::v_mov_b32, dst, op);
   } else if (dst.regClass() == v1b) {
      assert(ctx->program->chip_class >= GFX8);
      uint8_t val = op.constantValue();
      Operand op32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
      aco_ptr<SDWA_instruction> sdwa;
      if (op32.isLiteral()) {
         uint32_t a = (uint32_t)int8_mul_table[val * 2];
         uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
         bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
                       Operand(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
                       Operand(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
      } else {
         bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
      }
   } else if (dst.regClass() == v2b && op.isConstant() && !op.isLiteral()) {
      assert(ctx->program->chip_class >= GFX8);
      if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
         /* use v_mov_b32 to avoid possible issues with denormal flushing or
          * NaN. v_add_f16 is still needed for float constants. */
         uint32_t val32 = (int32_t)(int16_t)op.constantValue();
         bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand(val32));
      } else {
         bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand(0u));
      }
   } else if (dst.regClass() == v2b && op.isLiteral()) {
      if (ctx->program->chip_class < GFX10 || !(ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) {
         unsigned offset = dst.physReg().byte() * 8u;
         dst = Definition(PhysReg(dst.physReg().reg()), v1);
         Operand def_op(dst.physReg(), v1);
         bld.vop2(aco_opcode::v_and_b32, dst, Operand(~(0xffffu << offset)), def_op);
         bld.vop2(aco_opcode::v_or_b32, dst, Operand(op.constantValue() << offset), def_op);
      } else if (dst.physReg().byte() == 2) {
         Operand def_lo(dst.physReg().advance(-2), v2b);
         Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op);
         static_cast<VOP3A_instruction*>(instr)->opsel = 0;
      } else {
         assert(dst.physReg().byte() == 0);
         Operand def_hi(dst.physReg().advance(2), v2b);
         Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi);
         static_cast<VOP3A_instruction*>(instr)->opsel = 2;
      }
   } else {
      unreachable("unsupported copy");
   }
}

bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr)
{
   bool did_copy = false;
   for (unsigned offset = 0; offset < copy.bytes;) {
      if (copy.uses[offset]) {
         offset++;
         continue;
      }

      Definition def;
      Operand op;
      split_copy(offset, &def, &op, copy, false, 8);

      if (def.physReg() == scc) {
         bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand(0u));
         *preserve_scc = true;
      } else if (op.isConstant()) {
         copy_constant(ctx, bld, def, op);
      } else if (def.regClass() == v1) {
         bld.vop1(aco_opcode::v_mov_b32, def, op);
      } else if (def.regClass() == s1) {
         bld.sop1(aco_opcode::s_mov_b32, def, op);
      } else if (def.regClass() == s2) {
         bld.sop1(aco_opcode::s_mov_b64, def, op);
      } else if (def.regClass().is_subdword() && ctx->program->chip_class < GFX8) {
         if (op.physReg().byte()) {
            assert(def.physReg().byte() == 0);
            bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand(op.physReg().byte() * 8), op);
         } else if (def.physReg().byte()) {
            assert(op.physReg().byte() == 0);
            /* preserve the target's lower half */
            uint32_t bits = def.physReg().byte() * 8;
            PhysReg lo_reg = PhysReg(def.physReg().reg());
            Definition lo_half = Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
            Definition dst = Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));

            if (def.physReg().reg() == op.physReg().reg()) {
               bld.vop2(aco_opcode::v_and_b32, lo_half, Operand((1 << bits) - 1u), Operand(lo_reg, lo_half.regClass()));
               if (def.physReg().byte() == 1) {
                  bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand((1 << bits) + 1u), op);
               } else if (def.physReg().byte() == 2) {
                  bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
               } else if (def.physReg().byte() == 3) {
                  bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u));
                  bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
               }
            } else {
               lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
               bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand(32 - bits), Operand(lo_reg, lo_half.regClass()));
               bld.vop3(aco_opcode::v_alignbyte_b32, dst, op, Operand(lo_half.physReg(), lo_half.regClass()), Operand(4 - def.physReg().byte()));
            }
         } else {
            bld.vop1(aco_opcode::v_mov_b32, def, op);
         }
      } else if (def.regClass().is_subdword()) {
         bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
      } else {
         unreachable("unsupported copy");
      }

      did_copy = true;
      offset += def.bytes();
   }
   return did_copy;
}

void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, Pseudo_instruction *pi)
{
   unsigned offset = 0;

   if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
       (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
      /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte swap */
      PhysReg op = copy.op.physReg();
      PhysReg def = copy.def.physReg();
      op.reg_b &= ~0x3;
      def.reg_b &= ~0x3;

      copy_operation tmp;
      tmp.op = Operand(op, v1);
      tmp.def = Definition(def, v1);
      tmp.bytes = 4;
      memset(tmp.uses, 1, 4);
      do_swap(ctx, bld, tmp, preserve_scc, pi);

      op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
      def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
      tmp.op = Operand(op, v1b);
      tmp.def = Definition(def, v1b);
      tmp.bytes = 1;
      tmp.uses[0] = 1;
      do_swap(ctx, bld, tmp, preserve_scc, pi);

      offset = copy.bytes;
   }

   for (; offset < copy.bytes;) {
      Definition def;
      Operand op;
      split_copy(offset, &def, &op, copy, true, 8);

      assert(op.regClass() == def.regClass());
      Operand def_as_op = Operand(def.physReg(), def.regClass());
      Definition op_as_def = Definition(op.physReg(), op.regClass());
      if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
      } else if (def.regClass() == v1) {
         assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
      } else if (op.physReg() == scc || def.physReg() == scc) {
         /* we need to swap scc and another sgpr */
         assert(!preserve_scc);

         PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();

         bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
         bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
      } else if (def.regClass() == s1) {
         if (preserve_scc) {
            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
            bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
            bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
         } else {
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
         }
      } else if (def.regClass() == s2) {
         if (preserve_scc)
            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
         bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
         bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
         bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
         if (preserve_scc)
            bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), Operand(0u));
      } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
         bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op, Operand(2u));
      } else {
         assert(def.regClass().is_subdword());
         bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
         bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
      }

      offset += def.bytes();
   }

   if (ctx->program->chip_class <= GFX7)
      return;

   /* fixup in case we swapped bytes we shouldn't have */
   copy_operation tmp_copy = copy;
   tmp_copy.op.setFixed(copy.def.physReg());
   tmp_copy.def.setFixed(copy.op.physReg());
   do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
}

void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, Operand hi)
{
   if (lo.isConstant() && hi.isConstant()) {
      copy_constant(ctx, bld, def, Operand(lo.constantValue() | (hi.constantValue() << 16)));
      return;
   }

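   /* v_pack_b32_f16 goes through the fp16 path; it is only used below when 16-bit
    * denormals are kept, since flushing could otherwise alter the copied bits. */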
1228 bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1229 (ctx->program->chip_class >= GFX10 ||
1230 (ctx->program->chip_class >= GFX9 &&
1231 !lo.isLiteral() && !hi.isLiteral()));
1232
1233 if (can_use_pack) {
1234 Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1235 /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1236 static_cast<VOP3A_instruction*>(instr)->opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1237 return;
1238 }
1239
1240 /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
1241 if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1242 (!hi.isConstant() || !Operand(hi.constantValue()).isLiteral() ||
1243 ctx->program->chip_class >= GFX10)) {
1244 bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand(2u));
1245 return;
1246 }
1247
1248 Definition def_lo = Definition(def.physReg(), v2b);
1249 Definition def_hi = Definition(def.physReg().advance(2), v2b);
1250
1251 if (lo.isConstant()) {
1252 /* move hi and zero low bits */
1253 if (hi.physReg().byte() == 0)
1254 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand(16u), hi);
1255 else
1256 bld.vop2(aco_opcode::v_and_b32, def_hi, Operand(~0xFFFFu), hi);
1257 bld.vop2(aco_opcode::v_or_b32, def, Operand(lo.constantValue()), Operand(def.physReg(), v1));
1258 return;
1259 }
1260 if (hi.isConstant()) {
1261 /* move lo and zero high bits */
1262 if (lo.physReg().byte() == 2)
1263 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand(16u), lo);
1264 else
1265 bld.vop2(aco_opcode::v_and_b32, def_lo, Operand(0xFFFFu), lo);
1266 bld.vop2(aco_opcode::v_or_b32, def, Operand(hi.constantValue() << 16u), Operand(def.physReg(), v1));
1267 return;
1268 }
1269
1270 if (lo.physReg().reg() == def.physReg().reg()) {
1271 /* lo is in the high bits of def */
1272 assert(lo.physReg().byte() == 2);
1273 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand(16u), lo);
1274 lo.setFixed(def.physReg());
1275 } else if (hi.physReg() == def.physReg()) {
1276 /* hi is in the low bits of def */
1277 assert(hi.physReg().byte() == 0);
1278 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand(16u), hi);
1279 hi.setFixed(def.physReg().advance(2));
1280 } else if (ctx->program->chip_class >= GFX8) {
1281 /* either lo or hi can be placed with just a v_mov */
1282 assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1283 Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1284 PhysReg reg = def.physReg().advance(op.physReg().byte());
1285 bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1286 op.setFixed(reg);
1287 }
1288
1289 if (ctx->program->chip_class >= GFX8) {
1290 /* either hi or lo are already placed correctly */
1291 if (lo.physReg().reg() == def.physReg().reg())
1292 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1293 else
1294 bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1295 return;
1296 }
1297
1298 /* alignbyte needs the operands in the following way:
1299 * | xx hi | lo xx | >> 2 bytes */
1300 if (lo.physReg().byte() != hi.physReg().byte()) {
1301 /* | xx lo | hi xx | => | lo hi | lo hi | */
1302 assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
1303 bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand(2u));
1304 lo = Operand(def_hi.physReg(), v2b);
1305 hi = Operand(def_lo.physReg(), v2b);
1306 } else if (lo.physReg().byte() == 0) {
1307 /* | xx hi | xx lo | => | xx hi | lo 00 | */
1308 bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand(16u), lo);
1309 lo = Operand(def_hi.physReg(), v2b);
1310 } else {
1311 /* | hi xx | lo xx | => | 00 hi | lo xx | */
1312 assert(hi.physReg().byte() == 2);
1313 bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand(16u), hi);
1314 hi = Operand(def_lo.physReg(), v2b);
1315 }
1316 /* perform the alignbyte */
1317 bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand(2u));
1318 }
1319
1320 void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
1321 {
1322 Builder bld(ctx->program, &ctx->instructions);
1323 unsigned num_instructions_before = ctx->instructions.size();
1324 aco_ptr<Instruction> mov;
1325 std::map<PhysReg, copy_operation>::iterator it = copy_map.begin();
1326 std::map<PhysReg, copy_operation>::iterator target;
1327 bool writes_scc = false;
1328
1329 /* count the number of uses for each dst reg */
1330 while (it != copy_map.end()) {
1331
1332 if (it->second.def.physReg() == scc)
1333 writes_scc = true;
1334
1335 assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
1336
1337 /* if src and dst reg are the same, remove operation */
1338 if (it->first == it->second.op.physReg()) {
1339 it = copy_map.erase(it);
1340 continue;
1341 }
1342
1343 /* split large copies */
1344 if (it->second.bytes > 8) {
1345 assert(!it->second.op.isConstant());
1346 assert(!it->second.def.regClass().is_subdword());
1347 RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 2);
1348 Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1349 rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 2);
1350 Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1351 copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1352 copy_map[hi_def.physReg()] = copy;
1353 assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1354 it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().type() == RegType::sgpr ? s2 : v2);
1355 it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().type() == RegType::sgpr ? s2 : v2);
1356 it->second.bytes = 8;
1357 }
1358
1359 /* try to coalesce copies */
1360 if (it->second.bytes < 8 && !it->second.op.isConstant() &&
1361 it->first.reg_b % util_next_power_of_two(it->second.bytes + 1) == 0 &&
1362 it->second.op.physReg().reg_b % util_next_power_of_two(it->second.bytes + 1) == 0) {
1363 // TODO try more relaxed alignment for subdword copies
1364 PhysReg other_def_reg = it->first;
1365 other_def_reg.reg_b += it->second.bytes;
1366 PhysReg other_op_reg = it->second.op.physReg();
1367 other_op_reg.reg_b += it->second.bytes;
1368 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(other_def_reg);
1369 if (other != copy_map.end() &&
1370 other->second.op.physReg() == other_op_reg &&
1371 it->second.bytes + other->second.bytes <= 8) {
1372 it->second.bytes += other->second.bytes;
1373 it->second.def = Definition(it->first, RegClass::get(it->second.def.regClass().type(), it->second.bytes));
1374 it->second.op = Operand(it->second.op.physReg(), RegClass::get(it->second.op.regClass().type(), it->second.bytes));
1375 copy_map.erase(other);
1376 }
1377 }
1378
1379 /* check if the definition reg is used by another copy operation */
1380 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1381 if (copy.second.op.isConstant())
1382 continue;
1383 for (uint16_t i = 0; i < it->second.bytes; i++) {
1384 /* distance might underflow */
1385 unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1386 if (distance < copy.second.bytes)
1387 it->second.uses[i] += 1;
1388 }
1389 }
1390
1391 ++it;
1392 }
1393
1394 /* first, handle paths in the location transfer graph */
1395 bool preserve_scc = pi->tmp_in_scc && !writes_scc;
1396 bool skip_partial_copies = true;
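/* partial copies are skipped at first so that whole-register copies and 2x16 packs can be coalesced; once nothing else makes progress they are allowed */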
1397 it = copy_map.begin();
1398 while (true) {
1399 if (copy_map.empty()) {
1400 ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before;
1401 return;
1402 }
1403 if (it == copy_map.end()) {
1404 if (!skip_partial_copies)
1405 break;
1406 skip_partial_copies = false;
1407 it = copy_map.begin();
1408 }
1409
1410 /* check if we can pack one register at once */
1411 if (it->first.byte() == 0 && it->second.bytes == 2) {
1412 PhysReg reg_hi = it->first.advance(2);
1413 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1414 if (other != copy_map.end() && other->second.bytes == 2) {
1415 /* check if the target register is otherwise unused */
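/* is_used aliases the uses[] array: 0x0101 means both bytes have exactly one use, which is fine if that single use is the other half of this pack */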
1416 bool unused_lo = !it->second.is_used ||
1417 (it->second.is_used == 0x0101 &&
1418 other->second.op.physReg() == it->first);
1419 bool unused_hi = !other->second.is_used ||
1420 (other->second.is_used == 0x0101 &&
1421 it->second.op.physReg() == reg_hi);
1422 if (unused_lo && unused_hi) {
1423 Operand lo = it->second.op;
1424 Operand hi = other->second.op;
1425 do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1426 copy_map.erase(it);
1427 copy_map.erase(other);
1428
1429 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1430 for (uint16_t i = 0; i < other.second.bytes; i++) {
1431 /* distance might underflow */
1432 unsigned distance_lo = other.first.reg_b + i - lo.physReg().reg_b;
1433 unsigned distance_hi = other.first.reg_b + i - hi.physReg().reg_b;
1434 if (distance_lo < 2 || distance_hi < 2)
1435 other.second.uses[i] -= 1;
1436 }
1437 }
1438 it = copy_map.begin();
1439 continue;
1440 }
1441 }
1442 }
1443
1444 /* on GFX6/7, we need some small workarounds as there is no
1445 * SDWA instruction to do partial register writes */
1446 if (ctx->program->chip_class < GFX8 && it->second.bytes < 4) {
1447 if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 &&
1448 !it->second.is_used && pi->opcode == aco_opcode::p_split_vector) {
1449 /* Other operations might overwrite the high bits, so change all users
1450 * of the high bits to the new target where they are still available.
1451 * This mechanism depends on also emitting dead definitions. */
1452 PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
1453 while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
1454 std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
1455 for (other = copy_map.begin(); other != copy_map.end(); other++) {
1456 /* on GFX6/7, if the high bits are used as operand, they cannot be a target */
1457 if (other->second.op.physReg() == reg_hi) {
1458 other->second.op.setFixed(it->first.advance(reg_hi.byte()));
1459 break; /* break because an operand can only be used once */
1460 }
1461 }
1462 reg_hi = reg_hi.advance(it->second.bytes);
1463 }
1464 } else if (it->first.byte()) {
1465 assert(pi->opcode == aco_opcode::p_create_vector);
1466 /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
1467 * move to the target operand's high bits. This is safe to do as it cannot be an operand */
1468 PhysReg lo = PhysReg(it->first.reg());
1469 std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
1470 if (other != copy_map.end()) {
1471 assert(other->second.bytes == it->first.byte());
1472 PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
1473 it->second.def = Definition(new_reg_hi, it->second.def.regClass());
1474 it->second.is_used = 0;
1475 other->second.bytes += it->second.bytes;
1476 other->second.def.setTemp(Temp(other->second.def.tempId(), RegClass::get(RegType::vgpr, other->second.bytes)));
1477 other->second.op.setTemp(Temp(other->second.op.tempId(), RegClass::get(RegType::vgpr, other->second.bytes)));
1478 /* if the new target's high bits are also a target, change uses */
1479 std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
1480 if (target != copy_map.end()) {
1481 for (unsigned i = 0; i < it->second.bytes; i++)
1482 target->second.uses[i]++;
1483 }
1484 }
1485 }
1486 }
1487
1488 /* find portions where the target reg is not used as operand for any other copy */
1489 if (it->second.is_used) {
1490 if (it->second.op.isConstant() || skip_partial_copies) {
1491 /* we have to skip constants until is_used=0.
1492 * we also skip partial copies at the beginning to help coalescing */
1493 ++it;
1494 continue;
1495 }
1496
1497 unsigned has_zero_use_bytes = 0;
1498 for (unsigned i = 0; i < it->second.bytes; i++)
1499 has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1500
1501 if (has_zero_use_bytes) {
1502 /* Skipping partial copying and doing a v_swap_b32 and then fixup
1503 * copies is usually beneficial for sub-dword copies, but if doing
1504 * a partial copy allows further copies, it should be done instead. */
1505 bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1506 for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1507 /* on GFX6/7, we can only do copies with full registers */
1508 if (partial_copy || ctx->program->chip_class <= GFX7)
1509 break;
1510 for (uint16_t i = 0; i < copy.second.bytes; i++) {
1511 /* distance might underflow */
1512 unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1513 if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1514 !it->second.uses[distance])
1515 partial_copy = true;
1516 }
1517 }
1518
1519 if (!partial_copy) {
1520 ++it;
1521 continue;
1522 }
1523 } else {
1524 /* full target reg is used: register swapping needed */
1525 ++it;
1526 continue;
1527 }
1528 }
1529
1530 bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1531 skip_partial_copies = did_copy;
1532 std::pair<PhysReg, copy_operation> copy = *it;
1533
1534 if (it->second.is_used == 0) {
1535 /* the target reg is not used as operand for any other copy, so we
1536 * copied to all of it */
1537 copy_map.erase(it);
1538 it = copy_map.begin();
1539 } else {
1540 /* we only performed some portions of this copy, so split it to only
1541 * leave the portions that still need to be done */
1542 copy_operation original = it->second; /* the map insertion below can overwrite this */
1543 copy_map.erase(it);
1544 for (unsigned offset = 0; offset < original.bytes;) {
1545 if (original.uses[offset] == 0) {
1546 offset++;
1547 continue;
1548 }
1549 Definition def;
1550 Operand op;
1551 split_copy(offset, &def, &op, original, false, 8);
1552
1553 copy_operation copy = {op, def, def.bytes()};
1554 for (unsigned i = 0; i < copy.bytes; i++)
1555 copy.uses[i] = original.uses[i + offset];
1556 copy_map[def.physReg()] = copy;
1557
1558 offset += def.bytes();
1559 }
1560
1561 it = copy_map.begin();
1562 }
1563
1564 /* Reduce the number of uses of the operand reg by one. Do this after
1565 * splitting the copy or removing it in case the copy writes to its own
1566 * operand (for example, v[7:8] = v[8:9]) */
1567 if (did_copy && !copy.second.op.isConstant()) {
1568 for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1569 for (uint16_t i = 0; i < other.second.bytes; i++) {
1570 /* distance might underflow */
1571 unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
1572 if (distance < copy.second.bytes && !copy.second.uses[distance])
1573 other.second.uses[i] -= 1;
1574 }
1575 }
1576 }
1577 }
1578
1579 /* all target regs are needed as an operand somewhere, which means all entries are part of a cycle */
1580 unsigned largest = 0;
1581 for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
1582 largest = MAX2(largest, op.second.bytes);
1583
1584 while (!copy_map.empty()) {
1585
1586 /* Perform larger swaps first, because larger swaps can make other
1587 * swaps unnecessary. */
1588 auto it = copy_map.begin();
1589 for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
1590 if (it2->second.bytes > it->second.bytes) {
1591 it = it2;
1592 if (it->second.bytes == largest)
1593 break;
1594 }
1595 }
1596
1597 /* should already be done */
1598 assert(!it->second.op.isConstant());
1599
1600 assert(it->second.op.isFixed());
1601 assert(it->second.def.regClass() == it->second.op.regClass());
1602
1603 if (it->first == it->second.op.physReg()) {
1604 copy_map.erase(it);
1605 continue;
1606 }
1607
1608 if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
1609 assert(!(it->second.def.physReg() == pi->scratch_sgpr));
1610
1611 /* to resolve the cycle, we have to swap the src reg with the dst reg */
1612 copy_operation swap = it->second;
1613
1614 /* if this is self-intersecting, we have to split it because
1615 * self-intersecting swaps don't make sense */
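/* only the non-overlapping leading bytes are swapped here; the rest is split into 'middle' and 'end' pieces that are put back into the map */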
1616 PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
1617 if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
1618 unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
1619 RegType type = swap.def.regClass().type();
1620
1621 copy_operation middle;
1622 src.reg_b += offset;
1623 dst.reg_b += offset;
1624 middle.bytes = swap.bytes - offset * 2;
1625 memcpy(middle.uses, swap.uses + offset, middle.bytes);
1626 middle.op = Operand(src, RegClass::get(type, middle.bytes));
1627 middle.def = Definition(dst, RegClass::get(type, middle.bytes));
1628 copy_map[dst] = middle;
1629
1630 copy_operation end;
1631 src.reg_b += middle.bytes;
1632 dst.reg_b += middle.bytes;
1633 end.bytes = swap.bytes - (offset + middle.bytes);
1634 memcpy(end.uses, swap.uses + offset + middle.bytes, end.bytes);
1635 end.op = Operand(src, RegClass::get(type, end.bytes));
1636 end.def = Definition(dst, RegClass::get(type, end.bytes));
1637 copy_map[dst] = end;
1638
1639 memset(swap.uses + offset, 0, swap.bytes - offset);
1640 swap.bytes = offset;
1641 }
1642
1643 /* GFX6-7 can only swap full registers */
1644 if (ctx->program->chip_class <= GFX7)
1645 swap.bytes = align(swap.bytes, 4);
1646
1647 do_swap(ctx, bld, swap, preserve_scc, pi);
1648
1649 /* remove from map */
1650 copy_map.erase(it);
1651
1652 /* change the operand reg of the target's uses and split uses if needed */
1653 target = copy_map.begin();
1654 uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
1655 for (; target != copy_map.end(); ++target) {
1656 if (target->second.op.physReg() == swap.def.physReg() && swap.bytes == target->second.bytes) {
1657 target->second.op.setFixed(swap.op.physReg());
1658 break;
1659 }
1660
1661 uint32_t imask = get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
1662 target->second.op.physReg().reg_b, target->second.bytes);
1663
1664 if (!imask)
1665 continue;
1666
1667 int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
1668
1669 /* split and update the middle (the portion that reads the swap's
1670 * definition) to read the swap's operand instead */
1671 int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
1672 int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
1673 int before_bytes = MAX2(-offset, 0);
1674 int after_bytes = MAX2(target_op_end - swap_def_end, 0);
1675 int middle_bytes = target->second.bytes - before_bytes - after_bytes;
1676
1677 if (after_bytes) {
1678 unsigned after_offset = before_bytes + middle_bytes;
1679 assert(after_offset > 0);
1680 copy_operation copy;
1681 copy.bytes = after_bytes;
1682 memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
1683 RegClass rc = RegClass::get(target->second.op.regClass().type(), after_bytes);
1684 copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
1685 copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
1686 copy_map[copy.def.physReg()] = copy;
1687 }
1688
1689 if (middle_bytes) {
1690 copy_operation copy;
1691 copy.bytes = middle_bytes;
1692 memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
1693 RegClass rc = RegClass::get(target->second.op.regClass().type(), middle_bytes);
1694 copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
1695 copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
1696 copy_map[copy.def.physReg()] = copy;
1697 }
1698
1699 if (before_bytes) {
1700 copy_operation copy;
1701 target->second.bytes = before_bytes;
1702 RegClass rc = RegClass::get(target->second.op.regClass().type(), before_bytes);
1703 target->second.op = Operand(target->second.op.physReg(), rc);
1704 target->second.def = Definition(target->second.def.physReg(), rc);
1705 memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
1706 }
1707
1708 /* break early since we know each byte of the swap's definition is used
1709 * at most once */
1710 bytes_left &= ~imask;
1711 if (!bytes_left)
1712 break;
1713 }
1714 }
1715 ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before;
1716 }
1717
1718 void emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
1719 {
1720 if (bld.program->chip_class >= GFX10) {
1721 if (set_round)
1722 bld.sopp(aco_opcode::s_round_mode, -1, new_mode.round);
1723 if (set_denorm)
1724 bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm);
1725 } else if (set_round || set_denorm) {
1726 /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
1727 Instruction *instr = bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand(new_mode.val), (7 << 11) | 1).instr;
1728 /* has to be a literal */
1729 instr->operands[0].setFixed(PhysReg{255});
1730 }
1731 }
1732
1733 void lower_to_hw_instr(Program* program)
1734 {
1735 Block *discard_block = NULL;
1736
1737 for (size_t i = 0; i < program->blocks.size(); i++)
1738 {
1739 Block *block = &program->blocks[i];
1740 lower_context ctx;
1741 ctx.program = program;
1742 ctx.block = block;
1743 Builder bld(program, &ctx.instructions);
1744
1745 float_mode config_mode;
1746 config_mode.val = program->config->float_mode;
1747
1748 bool set_round = i == 0 && block->fp_mode.round != config_mode.round;
1749 bool set_denorm = i == 0 && block->fp_mode.denorm != config_mode.denorm;
1750 if (block->kind & block_kind_top_level) {
1751 for (unsigned pred : block->linear_preds) {
1752 if (program->blocks[pred].fp_mode.round != block->fp_mode.round)
1753 set_round = true;
1754 if (program->blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
1755 set_denorm = true;
1756 }
1757 }
1758 /* only allow changing modes at top-level blocks so this doesn't break
1759 * the "jump over empty blocks" optimization */
1760 assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
1761 emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
1762
1763 for (size_t j = 0; j < block->instructions.size(); j++) {
1764 aco_ptr<Instruction>& instr = block->instructions[j];
1765 aco_ptr<Instruction> mov;
1766 if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_unit_test) {
1767 Pseudo_instruction *pi = (Pseudo_instruction*)instr.get();
1768
1769 switch (instr->opcode)
1770 {
1771 case aco_opcode::p_extract_vector:
1772 {
1773 PhysReg reg = instr->operands[0].physReg();
1774 Definition& def = instr->definitions[0];
1775 reg.reg_b += instr->operands[1].constantValue() * def.bytes();
1776
1777 if (reg == def.physReg())
1778 break;
1779
1780 RegClass op_rc = def.regClass().is_subdword() ? def.regClass() :
1781 RegClass(instr->operands[0].getTemp().type(), def.size());
1782 std::map<PhysReg, copy_operation> copy_operations;
1783 copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
1784 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1785 break;
1786 }
1787 case aco_opcode::p_create_vector:
1788 {
1789 std::map<PhysReg, copy_operation> copy_operations;
1790 PhysReg reg = instr->definitions[0].physReg();
1791
1792 for (const Operand& op : instr->operands) {
1793 if (op.isConstant()) {
1794 const Definition def = Definition(reg, RegClass(instr->definitions[0].getTemp().type(), op.size()));
1795 copy_operations[reg] = {op, def, op.bytes()};
1796 reg.reg_b += op.bytes();
1797 continue;
1798 }
1799 if (op.isUndefined()) {
1800 // TODO: coalesce subdword copies if dst byte is 0
1801 reg.reg_b += op.bytes();
1802 continue;
1803 }
1804
1805 RegClass rc_def = op.regClass().is_subdword() ? op.regClass() :
1806 RegClass(instr->definitions[0].getTemp().type(), op.size());
1807 const Definition def = Definition(reg, rc_def);
1808 copy_operations[def.physReg()] = {op, def, op.bytes()};
1809 reg.reg_b += op.bytes();
1810 }
1811 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1812 break;
1813 }
1814 case aco_opcode::p_split_vector:
1815 {
1816 std::map<PhysReg, copy_operation> copy_operations;
1817 PhysReg reg = instr->operands[0].physReg();
1818
1819 for (const Definition& def : instr->definitions) {
1820 RegClass rc_op = def.regClass().is_subdword() ? def.regClass() :
1821 RegClass(instr->operands[0].getTemp().type(), def.size());
1822 const Operand op = Operand(reg, rc_op);
1823 copy_operations[def.physReg()] = {op, def, def.bytes()};
1824 reg.reg_b += def.bytes();
1825 }
1826 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1827 break;
1828 }
1829 case aco_opcode::p_parallelcopy:
1830 case aco_opcode::p_wqm:
1831 {
1832 std::map<PhysReg, copy_operation> copy_operations;
1833 for (unsigned i = 0; i < instr->operands.size(); i++) {
1834 assert(instr->definitions[i].bytes() == instr->operands[i].bytes());
1835 copy_operations[instr->definitions[i].physReg()] = {instr->operands[i], instr->definitions[i], instr->operands[i].bytes()};
1836 }
1837 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1838 break;
1839 }
1840 case aco_opcode::p_exit_early_if:
1841 {
1842 /* don't bother with an early exit near the end of the program */
1843 if ((block->instructions.size() - 1 - j) <= 4 &&
1844 block->instructions.back()->opcode == aco_opcode::s_endpgm) {
1845 unsigned null_exp_dest = (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
1846 bool ignore_early_exit = true;
1847
1848 for (unsigned k = j + 1; k < block->instructions.size(); ++k) {
1849 const aco_ptr<Instruction> &instr = block->instructions[k];
1850 if (instr->opcode == aco_opcode::s_endpgm ||
1851 instr->opcode == aco_opcode::p_logical_end)
1852 continue;
1853 else if (instr->opcode == aco_opcode::exp &&
1854 static_cast<Export_instruction *>(instr.get())->dest == null_exp_dest)
1855 continue;
1856 else if (instr->opcode == aco_opcode::p_parallelcopy &&
1857 instr->definitions[0].isFixed() &&
1858 instr->definitions[0].physReg() == exec)
1859 continue;
1860
1861 ignore_early_exit = false;
1862 }
1863
1864 if (ignore_early_exit)
1865 break;
1866 }
1867
1868 if (!discard_block) {
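/* lazily create a single shared discard block at the end of the program: a null export marked done, followed by s_endpgm */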
1869 discard_block = program->create_and_insert_block();
1870 block = &program->blocks[i];
1871
1872 bld.reset(discard_block);
1873 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
1874 0, V_008DFC_SQ_EXP_NULL, false, true, true);
1875 if (program->wb_smem_l1_on_end)
1876 bld.smem(aco_opcode::s_dcache_wb);
1877 bld.sopp(aco_opcode::s_endpgm);
1878
1879 bld.reset(&ctx.instructions);
1880 }
1881
1882 //TODO: exec can be zero here with block_kind_discard
1883
1884 assert(instr->operands[0].physReg() == scc);
1885 bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0], discard_block->index);
1886
1887 discard_block->linear_preds.push_back(block->index);
1888 block->linear_succs.push_back(discard_block->index);
1889 break;
1890 }
1891 case aco_opcode::p_spill:
1892 {
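/* spill SGPRs into lanes of a linear VGPR: operands[2] holds the SGPRs, operands[1] the first lane index */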
1893 assert(instr->operands[0].regClass() == v1.as_linear());
1894 for (unsigned i = 0; i < instr->operands[2].size(); i++)
1895 bld.writelane(bld.def(v1, instr->operands[0].physReg()),
1896 Operand(PhysReg{instr->operands[2].physReg() + i}, s1),
1897 Operand(instr->operands[1].constantValue() + i),
1898 instr->operands[0]);
1899 break;
1900 }
1901 case aco_opcode::p_reload:
1902 {
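/* reload the SGPRs from the lanes of the spill VGPR, one v_readlane per dword */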
1903 assert(instr->operands[0].regClass() == v1.as_linear());
1904 for (unsigned i = 0; i < instr->definitions[0].size(); i++)
1905 bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
1906 instr->operands[0],
1907 Operand(instr->operands[1].constantValue() + i));
1908 break;
1909 }
1910 case aco_opcode::p_as_uniform:
1911 {
1912 if (instr->operands[0].isConstant() || instr->operands[0].regClass().type() == RegType::sgpr) {
1913 std::map<PhysReg, copy_operation> copy_operations;
1914 copy_operations[instr->definitions[0].physReg()] = {instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
1915 handle_operands(copy_operations, &ctx, program->chip_class, pi);
1916 } else {
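/* VGPR -> SGPR: read the value of the first active lane, one dword at a time */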
1917 assert(instr->operands[0].regClass().type() == RegType::vgpr);
1918 assert(instr->definitions[0].regClass().type() == RegType::sgpr);
1919 assert(instr->operands[0].size() == instr->definitions[0].size());
1920 for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
1921 bld.vop1(aco_opcode::v_readfirstlane_b32,
1922 bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
1923 Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
1924 }
1925 }
1926 break;
1927 }
1928 case aco_opcode::p_bpermute:
1929 {
1930 if (ctx.program->chip_class <= GFX7)
1931 emit_gfx6_bpermute(program, instr, bld);
1932 else if (ctx.program->chip_class >= GFX10 && ctx.program->wave_size == 64)
1933 emit_gfx10_wave64_bpermute(program, instr, bld);
1934 else
1935 unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
1936 break;
1937 }
1938 default:
1939 break;
1940 }
1941 } else if (instr->format == Format::PSEUDO_BRANCH) {
1942 Pseudo_branch_instruction* branch = static_cast<Pseudo_branch_instruction*>(instr.get());
1943 /* check if all blocks from current to target are empty */
1944 bool can_remove = block->index < branch->target[0];
1945 for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) {
1946 if (program->blocks[i].instructions.size())
1947 can_remove = false;
1948 }
1949 if (can_remove)
1950 continue;
1951
1952 switch (instr->opcode) {
1953 case aco_opcode::p_branch:
1954 assert(block->linear_succs[0] == branch->target[0]);
1955 bld.sopp(aco_opcode::s_branch, branch->definitions[0], branch->target[0]);
1956 break;
1957 case aco_opcode::p_cbranch_nz:
1958 assert(block->linear_succs[1] == branch->target[0]);
1959 if (branch->operands[0].physReg() == exec)
1960 bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], branch->target[0]);
1961 else if (branch->operands[0].physReg() == vcc)
1962 bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], branch->target[0]);
1963 else {
1964 assert(branch->operands[0].physReg() == scc);
1965 bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], branch->target[0]);
1966 }
1967 break;
1968 case aco_opcode::p_cbranch_z:
1969 assert(block->linear_succs[1] == branch->target[0]);
1970 if (branch->operands[0].physReg() == exec)
1971 bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], branch->target[0]);
1972 else if (branch->operands[0].physReg() == vcc)
1973 bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], branch->target[0]);
1974 else {
1975 assert(branch->operands[0].physReg() == scc);
1976 bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], branch->target[0]);
1977 }
1978 break;
1979 default:
1980 unreachable("Unknown Pseudo branch instruction!");
1981 }
1982
1983 } else if (instr->format == Format::PSEUDO_REDUCTION) {
1984 Pseudo_reduction_instruction* reduce = static_cast<Pseudo_reduction_instruction*>(instr.get());
1985 emit_reduction(&ctx, reduce->opcode, reduce->reduce_op, reduce->cluster_size,
1986 reduce->operands[1].physReg(), // tmp
1987 reduce->definitions[1].physReg(), // stmp
1988 reduce->operands[2].physReg(), // vtmp
1989 reduce->definitions[2].physReg(), // sitmp
1990 reduce->operands[0], reduce->definitions[0]);
1991 } else if (instr->format == Format::PSEUDO_BARRIER) {
1992 Pseudo_barrier_instruction* barrier = static_cast<Pseudo_barrier_instruction*>(instr.get());
1993
1994 /* Anything larger than a workgroup isn't possible. Anything
1995 * smaller requires no instructions and this pseudo instruction
1996 * would only be included to control optimizations. */
1997 bool emit_s_barrier = barrier->exec_scope == scope_workgroup &&
1998 program->workgroup_size > program->wave_size;
1999
2000 bld.insert(std::move(instr));
2001 if (emit_s_barrier)
2002 bld.sopp(aco_opcode::s_barrier);
2003 } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
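/* temporarily force round-to-nearest-even for this f32->f16 conversion, then restore the block's rounding mode */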
2004 float_mode new_mode = block->fp_mode;
2005 new_mode.round16_64 = fp_round_ne;
2006 bool set_round = new_mode.round != block->fp_mode.round;
2007
2008 emit_set_mode(bld, new_mode, set_round, false);
2009
2010 instr->opcode = aco_opcode::v_cvt_f16_f32;
2011 ctx.instructions.emplace_back(std::move(instr));
2012
2013 emit_set_mode(bld, block->fp_mode, set_round, false);
2014 } else {
2015 ctx.instructions.emplace_back(std::move(instr));
2016 }
2017
2018 }
2019 block->instructions.swap(ctx.instructions);
2020 }
2021 }
2022
2023 }
2024