/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/sid.h"

#include <map>
#include <vector>

namespace aco {

struct lower_context {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> instructions;
};

/* Computes where s_sendmsg(MSG_ORDERED_PS_DONE) must be sent in a Primitive Ordered Pixel Shader
 * on GFX9-10.3.
 *
 * MSG_ORDERED_PS_DONE must be sent once after the ordered section is done along all execution
 * paths from the POPS packer ID hardware register setting to s_endpgm. It is, however, also okay
 * to send it if the packer ID is not going to be set at all by the wave, so some conservativeness
 * is fine.
 *
 * For simplicity, the message is sent from top-level blocks, since dominance and post-dominance
 * checks for any location in the shader are trivial in them. Also for simplicity, it is sent
 * regardless of whether the POPS packer ID hardware register has already potentially been set up.
 *
 * Note that there can be multiple interlock end instructions in the shader.
 * SPV_EXT_fragment_shader_interlock requires OpEndInvocationInterlockEXT to be executed exactly
 * once by the invocation. However, there may be, for instance, multiple ordered sections, and
 * which one will be executed may depend on divergent control flow (some lanes may execute one
 * ordered section, other lanes may execute another). MSG_ORDERED_PS_DONE, however, is sent via a
 * scalar instruction, so it must be ensured that the message is sent after the last ordered
 * section in the entire wave.
 */
class gfx9_pops_done_msg_bounds {
public:
   explicit gfx9_pops_done_msg_bounds() = default;

   explicit gfx9_pops_done_msg_bounds(const Program* const program)
   {
      /* Find the top-level location after the last ordered section end pseudo-instruction in the
       * program.
       * Consider `p_pops_gfx9_overlapped_wave_wait_done` a boundary too - make sure the message
       * isn't sent if any wait hasn't been fully completed yet (if a begin-end-begin situation
       * occurs somehow, as the location of `p_pops_gfx9_ordered_section_done` is controlled by
       * the application). This is done for safety, assuming that waits are the only thing that
       * needs the packer hardware register to be set at some point during or before them, and
       * that it won't be set anymore after the last wait.
       */
      int last_top_level_block_idx = -1;
      for (int block_idx = (int)program->blocks.size() - 1; block_idx >= 0; block_idx--) {
         const Block& block = program->blocks[block_idx];
         if (block.kind & block_kind_top_level) {
            last_top_level_block_idx = block_idx;
         }
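         /* Iterate the instructions in reverse; `instr_idx + 1 > 0` is the unsigned-safe way of
          * counting down to index 0 inclusive. */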
         for (size_t instr_idx = block.instructions.size() - 1; instr_idx + size_t(1) > 0;
              instr_idx--) {
            const aco_opcode opcode = block.instructions[instr_idx]->opcode;
            if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done ||
                opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done) {
               end_block_idx_ = last_top_level_block_idx;
               /* The same block if it's already a top-level block, or the beginning of the next
                * top-level block.
                */
               instr_after_end_idx_ = block_idx == end_block_idx_ ? instr_idx + 1 : 0;
               break;
            }
         }
         if (end_block_idx_ != -1) {
            break;
         }
      }
   }

   /* If this is not -1, during the normal execution flow (not early exiting), MSG_ORDERED_PS_DONE
    * must be sent in this block.
    */
   int end_block_idx() const { return end_block_idx_; }

   /* If end_block_idx() is an existing block, during the normal execution flow (not early
    * exiting), MSG_ORDERED_PS_DONE must be sent before this instruction in the block
    * end_block_idx(). If this is out of the bounds of the instructions in the end block, it must
    * be sent at the end of that block.
    */
   size_t instr_after_end_idx() const { return instr_after_end_idx_; }

   /* Whether an instruction doing early exit (such as discard) needs to send MSG_ORDERED_PS_DONE
    * before actually ending the program.
    */
   bool early_exit_needs_done_msg(const int block_idx, const size_t instr_idx) const
   {
      return block_idx <= end_block_idx_ &&
             (block_idx != end_block_idx_ || instr_idx < instr_after_end_idx_);
   }

private:
   /* Initialize to an empty range for which "is inside" comparisons will fail for any block. */
   int end_block_idx_ = -1;
   size_t instr_after_end_idx_ = 0;
};

/* used by handle_operands() indirectly through Builder::copy */
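/* Each pair (int8_mul_table[2 * i], int8_mul_table[2 * i + 1]) multiplies to i modulo 256, using
 * only values that fit in inline constants (0..64, or 240..255 standing in for small negative
 * values), so an 8-bit literal can presumably be materialized with a single v_mul_u32_u24 instead
 * of a literal mov. */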
uint8_t int8_mul_table[512] = {
   0, 20,  1,  1,   1,  2,   1,  3,   1,  4,   1, 5,   1,  6,   1,  7,   1,  8,   1,  9,
   1, 10,  1,  11,  1,  12,  1,  13,  1,  14,  1, 15,  1,  16,  1,  17,  1,  18,  1,  19,
   1, 20,  1,  21,  1,  22,  1,  23,  1,  24,  1, 25,  1,  26,  1,  27,  1,  28,  1,  29,
   1, 30,  1,  31,  1,  32,  1,  33,  1,  34,  1, 35,  1,  36,  1,  37,  1,  38,  1,  39,
   1, 40,  1,  41,  1,  42,  1,  43,  1,  44,  1, 45,  1,  46,  1,  47,  1,  48,  1,  49,
   1, 50,  1,  51,  1,  52,  1,  53,  1,  54,  1, 55,  1,  56,  1,  57,  1,  58,  1,  59,
   1, 60,  1,  61,  1,  62,  1,  63,  1,  64,  5, 13,  2,  33,  17, 19,  2,  34,  3,  23,
   2, 35,  11, 53,  2,  36,  7,  47,  2,  37,  3, 25,  2,  38,  7,  11,  2,  39,  53, 243,
   2, 40,  3,  27,  2,  41,  17, 35,  2,  42,  5, 17,  2,  43,  3,  29,  2,  44,  15, 23,
   2, 45,  7,  13,  2,  46,  3,  31,  2,  47,  5, 19,  2,  48,  19, 59,  2,  49,  3,  33,
   2, 50,  7,  51,  2,  51,  15, 41,  2,  52,  3, 35,  2,  53,  11, 33,  2,  54,  23, 27,
   2, 55,  3,  37,  2,  56,  9,  41,  2,  57,  5, 23,  2,  58,  3,  39,  2,  59,  7,  17,
   2, 60,  9,  241, 2,  61,  3,  41,  2,  62,  5, 25,  2,  63,  35, 245, 2,  64,  3,  43,
   5, 26,  9,  43,  3,  44,  7,  19,  10, 39,  3, 45,  4,  34,  11, 59,  3,  46,  9,  243,
   4, 35,  3,  47,  22, 53,  7,  57,  3,  48,  5, 29,  10, 245, 3,  49,  4,  37,  9,  45,
   3, 50,  7,  241, 4,  38,  3,  51,  7,  22,  5, 31,  3,  52,  7,  59,  7,  242, 3,  53,
   4, 40,  7,  23,  3,  54,  15, 45,  4,  41,  3, 55,  6,  241, 9,  47,  3,  56,  13, 13,
   5, 34,  3,  57,  4,  43,  11, 39,  3,  58,  5, 35,  4,  44,  3,  59,  6,  243, 7,  245,
   3, 60,  5,  241, 7,  26,  3,  61,  4,  46,  5, 37,  3,  62,  11, 17,  4,  47,  3,  63,
   5, 38,  5,  243, 3,  64,  7,  247, 9,  50,  5, 39,  4,  241, 33, 37,  6,  33,  13, 35,
   4, 242, 5,  245, 6,  247, 7,  29,  4,  51,  5, 41,  5,  246, 7,  249, 3,  240, 11, 19,
   5, 42,  3,  241, 4,  245, 25, 29,  3,  242, 5, 43,  4,  246, 3,  243, 17, 58,  17, 43,
   3, 244, 5,  249, 6,  37,  3,  245, 2,  240, 5, 45,  2,  241, 21, 23,  2,  242, 3,  247,
   2, 243, 5,  251, 2,  244, 29, 61,  2,  245, 3, 249, 2,  246, 17, 29,  2,  247, 9,  55,
   1, 240, 1,  241, 1,  242, 1,  243, 1,  244, 1, 245, 1,  246, 1,  247, 1,  248, 1,  249,
   1, 250, 1,  251, 1,  252, 1,  253, 1,  254, 1, 255};

aco_opcode
get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
{
   /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
    * 32-bit opcodes (VOP2), which allows removing the temporary VGPR and using
    * DPP with the arithmetic instructions. This requires sign-extending the inputs.
    */
   switch (op) {
   case iadd8:
   case iadd16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_add_u32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_add_u16;
      } else {
         return aco_opcode::v_add_co_u32;
      }
      break;
   case imul8:
   case imul16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_mul_lo_u16_e64;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_mul_lo_u16;
      } else {
         return aco_opcode::v_mul_u32_u24;
      }
      break;
   case fadd16: return aco_opcode::v_add_f16;
   case fmul16: return aco_opcode::v_mul_f16;
   case imax8:
   case imax16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_max_i32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_max_i16;
      } else {
         return aco_opcode::v_max_i32;
      }
      break;
   case imin8:
   case imin16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_min_i32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_min_i16;
      } else {
         return aco_opcode::v_min_i32;
      }
      break;
   case umin8:
   case umin16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_min_u32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_min_u16;
      } else {
         return aco_opcode::v_min_u32;
      }
      break;
   case umax8:
   case umax16:
      if (gfx_level >= GFX10) {
         return aco_opcode::v_max_u32;
      } else if (gfx_level >= GFX8) {
         return aco_opcode::v_max_u16;
      } else {
         return aco_opcode::v_max_u32;
      }
      break;
   case fmin16: return aco_opcode::v_min_f16;
   case fmax16: return aco_opcode::v_max_f16;
   case iadd32: return gfx_level >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
   case imul32: return aco_opcode::v_mul_lo_u32;
   case fadd32: return aco_opcode::v_add_f32;
   case fmul32: return aco_opcode::v_mul_f32;
   case imax32: return aco_opcode::v_max_i32;
   case imin32: return aco_opcode::v_min_i32;
   case umin32: return aco_opcode::v_min_u32;
   case umax32: return aco_opcode::v_max_u32;
   case fmin32: return aco_opcode::v_min_f32;
   case fmax32: return aco_opcode::v_max_f32;
   case iand8:
   case iand16:
   case iand32: return aco_opcode::v_and_b32;
   case ixor8:
   case ixor16:
   case ixor32: return aco_opcode::v_xor_b32;
   case ior8:
   case ior16:
   case ior32: return aco_opcode::v_or_b32;
   case iadd64: return aco_opcode::num_opcodes;
   case imul64: return aco_opcode::num_opcodes;
   case fadd64: return aco_opcode::v_add_f64;
   case fmul64: return aco_opcode::v_mul_f64;
   case imin64: return aco_opcode::num_opcodes;
   case imax64: return aco_opcode::num_opcodes;
   case umin64: return aco_opcode::num_opcodes;
   case umax64: return aco_opcode::num_opcodes;
   case fmin64: return aco_opcode::v_min_f64;
   case fmax64: return aco_opcode::v_max_f64;
   case iand64: return aco_opcode::num_opcodes;
   case ior64: return aco_opcode::num_opcodes;
   case ixor64: return aco_opcode::num_opcodes;
   default: return aco_opcode::num_opcodes;
   }
}

bool
is_vop3_reduce_opcode(aco_opcode opcode)
{
   /* 64-bit reductions are VOP3. */
   if (opcode == aco_opcode::num_opcodes)
      return true;

   return instr_info.format[(int)opcode] == Format::VOP3;
}

void
emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
{
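   /* The add may be lowered to a carry-writing opcode (there is no carry-less VALU add before
    * GFX9); in that case the carry definition has to be fixed to vcc, since the VOP2 encoding
    * implicitly writes vcc. */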
   Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
   if (instr->definitions.size() >= 2) {
      assert(instr->definitions[1].regClass() == bld.lm);
      instr->definitions[1].setFixed(vcc);
   }
}

void
emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
                  PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
                  unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
   Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
   Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
   Operand src1_64 = Operand(src1_reg, v2);
   Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
   Operand vtmp_op64 = Operand(vtmp_reg, v2);
   if (op == iadd64) {
      if (ctx->program->gfx_level >= GFX10) {
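         /* On GFX10+ the carry-writing add is only available as VOP3 (v_add_co_u32_e64), which
          * can't be combined with DPP here, so apply the lane swizzle with a separate
          * v_mov_b32_dpp into vtmp first. */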
         if (identity)
            bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                      bound_ctrl);
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
      } else {
         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
                      dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      }
      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
                   Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else if (op == iand64) {
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == ior64) {
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == ixor64) {
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
      case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
      case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
      case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
      default: break;
      }

      if (identity) {
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
      }
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);

      bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      /* t4 = dpp(x_hi)
       * t1 = umul_lo(t4, y_lo)
       * t3 = dpp(x_lo)
       * t0 = umul_lo(t3, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(t3, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(t3, y_lo)
       * Requires that res_hi != src0[0] and res_hi != src1[0]
       * and that vtmp[0] != res_hi.
       */
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
      emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
      emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
   }
}

void
emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
              ReduceOp op)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
   RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
   Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
   Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
   Operand src1_64 = Operand(src1_reg, v2);

   if (src0_rc == s1 &&
       (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
      src0_reg = vtmp;
      src0[0] = Operand(vtmp, v1);
      src0[1] = Operand(PhysReg{vtmp + 1}, v1);
      src0_64 = Operand(vtmp, v2);
   } else if (src0_rc == s1 && op == iadd64) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
      src0[1] = Operand(PhysReg{vtmp + 1}, v1);
   }

   if (op == iadd64) {
      if (ctx->program->gfx_level >= GFX10) {
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      } else {
         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      }
      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
               Operand(vcc, bld.lm));
   } else if (op == iand64) {
      bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
   } else if (op == ior64) {
      bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
   } else if (op == ixor64) {
      bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
      case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
      case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
      case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
      default: break;
      }

      bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      if (src1_reg == dst_reg) {
         /* it's fine if src0==dst but not if src1==dst */
         std::swap(src0_reg, src1_reg);
         std::swap(src0[0], src1[0]);
         std::swap(src0[1], src1[1]);
         std::swap(src0_64, src1_64);
      }
      assert(!(src0_reg == src1_reg));
      /* t1 = umul_lo(x_hi, y_lo)
       * t0 = umul_lo(x_lo, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(x_lo, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(x_lo, y_lo)
       * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
       */
      Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
      Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
      Operand tmp0_op = src0[1];
      Operand tmp1_op = src1[1];
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
      emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
      bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
      emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
   }
}

void
emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
            ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
            bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, rc);
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (!vop3) {
      if (opcode == aco_opcode::v_add_co_u32)
         bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,
                      bound_ctrl);
      else
         bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      return;
   }

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
                        bound_ctrl, identity);
      return;
   }

   if (identity)
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
   if (identity && size >= 2)
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);

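   /* At this point the opcode is VOP3, which can't be combined with DPP here, so apply the lane
    * swizzle with plain DPP moves into vtmp first and then run the VOP3 opcode on vtmp. */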
   for (unsigned i = 0; i < size; i++)
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                   Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);

   bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
}

void
emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
        ReduceOp op, unsigned size)
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
      return;
   }

   if (vop3) {
      bld.vop3(opcode, dst, src0, src1);
   } else if (opcode == aco_opcode::v_add_co_u32) {
      bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
   } else {
      bld.vop2(opcode, dst, src0, src1);
   }
}

void
emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
             unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
{
   Builder bld(ctx->program, &ctx->instructions);
   for (unsigned i = 0; i < size; i++) {
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
                   Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
}

void
emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
{
   for (unsigned i = 0; i < size; i++) {
      bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
             Operand(PhysReg{src + i}, v1), ds_pattern);
   }
}

void
emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
               PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
{
   assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
   assert(cluster_size <= ctx->program->wave_size);

   Builder bld(ctx->program, &ctx->instructions);

   Operand identity[2];
   identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
   identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
   Operand vcndmask_identity[2] = {identity[0], identity[1]};

   /* First, copy the source to tmp and set inactive lanes to the identity */
   bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
            Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));

   /* On GFX10+ v_writelane_b32/v_cndmask_b32_e64 can take a literal */
   if (ctx->program->gfx_level < GFX10) {
      for (unsigned i = 0; i < src.size(); i++) {
         /* p_exclusive_scan uses identity for v_writelane_b32 */
         if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
            bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
            identity[i] = Operand(PhysReg{sitmp + i}, s1);

            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
            vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
         } else if (identity[i].isLiteral()) {
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
            vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
         }
      }
   }

   for (unsigned i = 0; i < src.size(); i++) {
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
                   vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
                   Operand(stmp, bld.lm));
   }

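   /* 8- and 16-bit sources are mostly reduced with 32-bit opcodes (see get_reduce_opcode()), so
    * sign- or zero-extend them to a full dword first where needed. */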
   if (src.regClass() == v1b) {
      if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin8 || reduce_op == imax8;
         sdwa->sel[0] = SubdwordSel(1, 0, sext);
         sdwa->dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else {
         aco_opcode opcode;

         if (reduce_op == imin8 || reduce_op == imax8)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
                  Operand::c32(8u));
      }
   } else if (src.regClass() == v2b) {
      bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
                        reduce_op == umin16 || reduce_op == umax16;
      if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
         sdwa->sel[0] = SubdwordSel(2, 0, sext);
         sdwa->dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else if (ctx->program->gfx_level <= GFX7 ||
                 (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
         aco_opcode opcode;

         if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
                  Operand::c32(16u));
      }
   }

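   /* Some of the paths below leave one final combine to be emitted after the switch (e.g. the
    * GFX7 swizzle ladder and the GFX10 cluster_size == 32 case); they signal this via
    * reduction_needs_last_op. */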
   bool reduction_needs_last_op = false;
   switch (op) {
   case aco_opcode::p_reduce:
      if (cluster_size == 1)
         break;

      if (ctx->program->gfx_level <= GFX7) {
         reduction_needs_last_op = true;
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
         if (cluster_size == 2)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
         if (cluster_size == 4)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
         if (cluster_size == 8)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
         if (cluster_size == 16)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         if (cluster_size == 32)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
         // TODO: it would be more effective to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         reduction_needs_last_op = false;
         break;
      }

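      /* GFX8+: tree reduction via DPP lane swizzles; each step combines lanes 1, 2, 4 and 8
       * apart, doubling the reduced cluster size up to 16 lanes per row. */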
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
                  0xf, false);
      if (cluster_size == 2)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
                  0xf, false);
      if (cluster_size == 4)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
                  false);
      if (cluster_size == 8)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
      if (cluster_size == 16)
         break;

      if (ctx->program->gfx_level >= GFX10) {
         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());

         if (cluster_size == 32) {
            reduction_needs_last_op = true;
            break;
         }

         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
         // TODO: it would be more effective to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         break;
      }

      if (cluster_size == 32) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         reduction_needs_last_op = true;
         break;
      }
      assert(cluster_size == 64);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
                  false);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
                  false);
      break;
   case aco_opcode::p_exclusive_scan:
      if (ctx->program->gfx_level >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
         /* shift rows right */
         emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);

         /* fill in the gaps in rows 1 and 3 */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10000u));
         if (ctx->program->wave_size == 64)
            bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand::c32(0x10000u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction* perm =
               bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                        Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
                        Operand::c32(0xffffffffu))
                  .instr;
            perm->valu().opsel = 1; /* FI (Fetch Inactive) */
         }
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand::c64(UINT64_MAX));

         if (ctx->program->wave_size == 64) {
            /* fill in the gap in row 2 */
            for (unsigned i = 0; i < src.size(); i++) {
               bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                            Operand::c32(31u));
               bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
                             Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
            }
         }
         std::swap(tmp, vtmp);
      } else if (ctx->program->gfx_level >= GFX8) {
         emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
      } else {
         // TODO: use LDS on CS with a single write and shifted read
         /* wavefront shift_right by 1 on SI/CI */
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10101010u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x01000100u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(1u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(1u),
                  Operand::c32(16u));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         for (unsigned i = 0; i < src.size(); i++) {
            bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
                          Operand(PhysReg{vtmp + i}, v1));
            bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
            bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
                          Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
            identity[i] = Operand::zero(); /* prevent further uses of identity */
         }
         std::swap(tmp, vtmp);
      }

      for (unsigned i = 0; i < src.size(); i++) {
         if (!identity[i].isConstant() ||
             identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
            if (ctx->program->gfx_level < GFX10)
               assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
                      identity[i].physReg() == PhysReg{sitmp + i});
            bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
                          Operand(PhysReg{tmp + i}, v1));
         }
      }
      FALLTHROUGH;
   case aco_opcode::p_inclusive_scan:
      assert(cluster_size == ctx->program->wave_size);
      if (ctx->program->gfx_level <= GFX7) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xAAAAAAAAu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xCCCCCCCCu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xF0F0F0F0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xFF00FF00u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),
                  Operand::c32(16u));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::c32(31u));
         bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
                  Operand::c32(32u));
         emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
                  identity);
      if (ctx->program->gfx_level >= GFX10) {
         if (ctx->program->wave_size == 64) {
            bld.sop1(aco_opcode::s_bitreplicate_b64_b32, Definition(exec, s2),
                     Operand::c32(0xff00ff00u));
         } else {
            bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
                     Operand::c32(16u));
         }
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction* perm =
               bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                        Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
                        Operand::c32(0xffffffffu))
                  .instr;
            perm->valu().opsel = 1; /* FI (Fetch Inactive) */
         }
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         if (ctx->program->wave_size == 64) {
            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
                     Operand::c32(32u));
            for (unsigned i = 0; i < src.size(); i++)
               bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                            Operand::c32(31u));
            emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         }
      } else {
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
                     false, identity);
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
                     false, identity);
      }
      break;
   default: unreachable("Invalid reduction mode");
   }

   if (op == aco_opcode::p_reduce) {
      if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
         emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         return;
      }

      if (reduction_needs_last_op)
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
   }

   /* restore exec */
   bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));

   if (dst.regClass().type() == RegType::sgpr) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
                      Operand::c32(ctx->program->wave_size - 1));
      }
   } else if (dst.physReg() != tmp) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
                  Operand(PhysReg{tmp + k}, v1));
      }
   }
}

void
adjust_bpermute_dst(Builder& bld, Definition dst, Operand input_data)
{
   /* RA assumes that the result is always in the low part of the register, so we have to shift,
    * if it's not there already.
    */
   if (input_data.physReg().byte()) {
      unsigned right_shift = input_data.physReg().byte() * 8;
      bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
               Operand(dst.physReg(), dst.regClass()));
   }
}

void
emit_bpermute_permlane(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates proper bpermute on GFX11 in wave64 mode.
    *
    * Similar to emit_gfx10_wave64_bpermute, but uses the new
    * v_permlane64_b32 instruction to swap data between lo and hi halves.
    */

   assert(program->gfx_level >= GFX11);
   assert(program->wave_size == 64);

   Definition dst = instr->definitions[0];
   Definition tmp_exec = instr->definitions[1];
   Definition clobber_scc = instr->definitions[2];
   Operand tmp_op = instr->operands[0];
   Operand index_x4 = instr->operands[1];
   Operand input_data = instr->operands[2];
   Operand same_half = instr->operands[3];

   assert(dst.regClass() == v1);
   assert(tmp_exec.regClass() == bld.lm);
   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
   assert(same_half.regClass() == bld.lm);
   assert(tmp_op.regClass() == v1.as_linear());
   assert(index_x4.regClass() == v1);
   assert(input_data.regClass().type() == RegType::vgpr);
   assert(input_data.bytes() <= 4);

   Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());

   /* Permute the input within the same half-wave. */
   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);

   /* Save EXEC and enable all lanes. */
   bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
            Operand::c32(-1u), Operand(exec, s2));

   /* Copy input data from other half to current half's linear VGPR. */
   bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);

   /* Permute the input from the other half-wave, write to linear VGPR. */
   bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);

   /* Restore saved EXEC. */
   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));

   /* Select correct permute result. */
   bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
                same_half);

   adjust_bpermute_dst(bld, dst, input_data);
}

void
emit_bpermute_shared_vgpr(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates proper bpermute on GFX10 in wave64 mode.
    *
    * This is necessary because on GFX10 the bpermute instruction only works
    * on half waves (you can think of it as having a cluster size of 32), so we
    * manually swap the data between the two halves using two shared VGPRs.
    */

   assert(program->gfx_level >= GFX10 && program->gfx_level <= GFX10_3);
   assert(program->wave_size == 64);

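   /* Shared VGPRs are addressed right after the program's regular VGPRs (PhysReg 256 is v0),
    * with the regular VGPR count rounded up to the allocation granularity. */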
   unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
   Definition dst = instr->definitions[0];
   Definition tmp_exec = instr->definitions[1];
   Definition clobber_scc = instr->definitions[2];
   Operand index_x4 = instr->operands[0];
   Operand input_data = instr->operands[1];
   Operand same_half = instr->operands[2];

   assert(dst.regClass() == v1);
   assert(tmp_exec.regClass() == bld.lm);
   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
   assert(same_half.regClass() == bld.lm);
   assert(index_x4.regClass() == v1);
   assert(input_data.regClass().type() == RegType::vgpr);
   assert(input_data.bytes() <= 4);
   assert(dst.physReg() != index_x4.physReg());
   assert(dst.physReg() != input_data.physReg());
   assert(tmp_exec.physReg() != same_half.physReg());

   PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
   PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);

   /* Permute the input within the same half-wave */
   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);

   /* HI: Copy data from high lanes 32-63 to shared vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
                dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
   /* Save EXEC */
   bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
   /* Set EXEC to enable LO lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::zero());
   /* LO: Copy data from low lanes 0-31 to shared vgpr */
   bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
   /* LO: bpermute shared vgpr (high lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
          Operand(shared_vgpr_hi, v1));
   /* Set EXEC to enable HI lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::c32(32u));
   /* HI: bpermute shared vgpr (low lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
          Operand(shared_vgpr_lo, v1));

   /* Only enable lanes which use the other half's data */
   bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
            Operand(tmp_exec.physReg(), s2), same_half);
   /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
                0x3, 0xf, false);
   /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
                0xc, 0xf, false);

   /* Restore saved EXEC */
   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));

   adjust_bpermute_dst(bld, dst, input_data);
}

void
emit_bpermute_readlane(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates bpermute using readlane instructions */

   Operand index = instr->operands[0];
   Operand input = instr->operands[1];
   Definition dst = instr->definitions[0];
   Definition temp_exec = instr->definitions[1];
   Definition clobber_vcc = instr->definitions[2];

   assert(dst.regClass() == v1);
   assert(temp_exec.regClass() == bld.lm);
   assert(clobber_vcc.regClass() == bld.lm);
   assert(clobber_vcc.physReg() == vcc);
   assert(index.regClass() == v1);
   assert(index.physReg() != dst.physReg());
   assert(input.regClass().type() == RegType::vgpr);
   assert(input.bytes() <= 4);
   assert(input.physReg() != dst.physReg());

   /* Save original EXEC */
   bld.sop1(Builder::s_mov, temp_exec, Operand(exec, bld.lm));

   /* An "unrolled loop" that is executed once for each lane.
    * This takes only a few instructions per lane, as opposed to a "real" loop
    * with branching, where the branch instruction alone would take 16+ cycles.
    */
1101    for (unsigned n = 0; n < program->wave_size; ++n) {
1102       /* Activate the lane which has N for its source index */
1103       if (program->gfx_level >= GFX10)
1104          bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), Operand::c32(n), index);
1105       else
1106          bld.vopc(aco_opcode::v_cmpx_eq_u32, clobber_vcc, Definition(exec, bld.lm), Operand::c32(n),
1107                   index);
1108       /* Read the data from lane N */
1109       bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
1110       /* On the active lane, move the data we read from lane N to the destination VGPR */
1111       bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
1112       /* Restore original EXEC */
1113       bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(temp_exec.physReg(), bld.lm));
1114    }
1115 
1116    adjust_bpermute_dst(bld, dst, input);
1117 }
1118 
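/* One pending copy for the parallel-copy lowering below. uses[i] counts how
 * many other pending copies still read byte i of this copy's destination;
 * is_used aliases the whole array, so is_used == 0 means the destination
 * bytes can be overwritten freely.
 */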
1119 struct copy_operation {
1120    Operand op;
1121    Definition def;
1122    unsigned bytes;
1123    union {
1124       uint8_t uses[8];
1125       uint64_t is_used = 0;
1126    };
1127 };
1128 
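/* Carves the largest naturally aligned power-of-two chunk (at most max_size
 * bytes) out of `src` starting at byte `offset` and returns it via *def/*op.
 * Unless ignore_uses is set, the chunk also ends where the per-byte use state
 * flips, so used and unused destination bytes never end up in the same copy.
 */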
1129 void
1130 split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
1131            const copy_operation& src, bool ignore_uses, unsigned max_size)
1132 {
1133    PhysReg def_reg = src.def.physReg();
1134    PhysReg op_reg = src.op.physReg();
1135    def_reg.reg_b += offset;
1136    op_reg.reg_b += offset;
1137 
1138    /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10, and on GFX11
1139     * v_lshrrev_b64 doesn't get dual issued. */
1140    if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
1141        src.def.regClass().type() == RegType::vgpr)
1142       max_size = MIN2(max_size, 4);
1143    unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
1144 
1145    /* make sure the size is a power of two and reg % bytes == 0 */
1146    unsigned bytes = 1;
1147    for (; bytes <= max_size; bytes *= 2) {
1148       unsigned next = bytes * 2u;
1149       bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
1150                           offset + next <= src.bytes && next <= max_size;
1151       if (!src.op.isConstant() && can_increase)
1152          can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
1153       for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
1154          can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
1155       if (!can_increase)
1156          break;
1157    }
1158 
1159    *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes));
1160    if (src.op.isConstant()) {
1161       assert(bytes >= 1 && bytes <= 8);
1162       uint64_t val = src.op.constantValue64() >> (offset * 8u);
1163       *op = Operand::get_const(ctx->program->gfx_level, val, bytes);
1164    } else {
1165       RegClass op_cls = src.op.regClass().resize(bytes);
1166       *op = Operand(op_reg, op_cls);
1167       op->setTemp(Temp(src.op.tempId(), op_cls));
1168    }
1169 }
1170 
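/* Returns a bitmask, relative to region a, of the bytes of a that also fall
 * inside region b. For example, a_start=4, a_size=4, b_start=6, b_size=4
 * yields 0b1100: bytes 2 and 3 of region a overlap region b.
 */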
1171 uint32_t
1172 get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
1173 {
1174    int intersection_start = MAX2(b_start - a_start, 0);
1175    int intersection_end = MAX2(b_start + b_size - a_start, 0);
1176    if (intersection_start >= a_size || intersection_end == 0)
1177       return 0;
1178 
1179    uint32_t mask = u_bit_consecutive(0, a_size);
1180    return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
1181 }
1182 
1183 /* src1 provides bytes 0-3; dst/src0 provides bytes 4-7. */
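/* For illustration: swiz = {0, 1, 6, 7} produces a result whose low half comes
 * from src1 and whose high half comes from dst/src0, since each selector byte
 * of v_perm_b32 picks one byte of the 64-bit {dst/src0, src1} pair.
 */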
1184 void
1185 create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
1186              Operand src0 = Operand(v1))
1187 {
1188    uint32_t swiz_packed =
1189       swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24);
1190 
1191    dst = Definition(PhysReg(dst.physReg().reg()), v1);
1192    if (!src1.isConstant())
1193       src1 = Operand(PhysReg(src1.physReg().reg()), v1);
1194    if (src0.isUndefined())
1195       src0 = Operand(dst.physReg(), v1);
1196    else if (!src0.isConstant())
1197       src0 = Operand(PhysReg(src0.physReg().reg()), v1);
1198    bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
1199 }
1200 
1201 void
1202 emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
1203 {
1204    /* v_mov_b16 uses 32bit inline constants. */
1205    if (op.isConstant()) {
1206       if (!op.isLiteral() && op.physReg() >= 240) {
1207          /* v_add_f16 is smaller because it can use 16bit fp inline constants. */
1208          Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
1209          instr->valu().opsel[3] = dst.physReg().byte() == 2;
1210          return;
1211       }
1212       op = Operand::c32((int32_t)(int16_t)op.constantValue());
1213    }
1214 
1215    Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, dst, op);
1216    instr->valu().opsel[0] = op.physReg().byte() == 2;
1217    instr->valu().opsel[3] = dst.physReg().byte() == 2;
1218 }
1219 
1220 void
1221 copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
1222 {
1223    assert(op.bytes() == dst.bytes());
1224 
1225    if (dst.bytes() == 4 && op.isLiteral()) {
1226       uint32_t imm = op.constantValue();
1227       if (dst.regClass() == s1 && (imm >= 0xffff8000 || imm <= 0x7fff)) {
1228          bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
1229          return;
1230       } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
1231          uint32_t rev = util_bitreverse(imm);
1232          if (dst.regClass() == s1)
1233             bld.sop1(aco_opcode::s_brev_b32, dst, Operand::c32(rev));
1234          else
1235             bld.vop1(aco_opcode::v_bfrev_b32, dst, Operand::c32(rev));
1236          return;
1237       } else if (dst.regClass() == s1) {
1238          unsigned start = (ffs(imm) - 1) & 0x1f;
1239          unsigned size = util_bitcount(imm) & 0x1f;
1240          if (BITFIELD_RANGE(start, size) == imm) {
1241             bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
1242             return;
1243          }
1244          if (ctx->program->gfx_level >= GFX9) {
1245             Operand op_lo = Operand::c32(int32_t(int16_t(imm)));
1246             Operand op_hi = Operand::c32(int32_t(int16_t(imm >> 16)));
1247             if (!op_lo.isLiteral() && !op_hi.isLiteral()) {
1248                bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op_lo, op_hi);
1249                return;
1250             }
1251          }
1252       }
1253    }
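   /* Illustration of the literal-avoidance above (values chosen for the
    * example): 0x80000000 is a literal, but its bit-reverse is 1, an inline
    * constant, so a single s_brev_b32/v_bfrev_b32 recreates it; for an SGPR
    * destination, 0x0001ffe0 == BITFIELD_RANGE(5, 12) and can be emitted as
    * s_bfm_b32 dst, 12, 5 instead of a literal mov.
    */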
1254 
1255    if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->gfx_level >= GFX8)
1256       op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
1257 
1258    if (dst.regClass() == s1) {
1259       bld.sop1(aco_opcode::s_mov_b32, dst, op);
1260    } else if (dst.regClass() == s2) {
1261       /* s_ashr_i64 writes SCC, so we can't use it */
1262       assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
1263       uint64_t imm = op.constantValue64();
1264       if (op.isLiteral()) {
1265          unsigned start = (ffsll(imm) - 1) & 0x3f;
1266          unsigned size = util_bitcount64(imm) & 0x3f;
1267          if (BITFIELD64_RANGE(start, size) == imm) {
1268             bld.sop2(aco_opcode::s_bfm_b64, dst, Operand::c32(size), Operand::c32(start));
1269             return;
1270          }
1271       }
1272       bld.sop1(aco_opcode::s_mov_b64, dst, op);
1273    } else if (dst.regClass() == v2) {
1274       if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
1275          bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
1276       } else {
1277          assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
1278          bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
1279       }
1280    } else if (dst.regClass() == v1) {
1281       bld.vop1(aco_opcode::v_mov_b32, dst, op);
1282    } else {
1283       assert(dst.regClass() == v1b || dst.regClass() == v2b);
1284 
1285       bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11;
1286       /* We need the v_perm_b32 (VOP3) to be able to take literals, and that's a GFX10+ feature. */
1287       bool can_use_perm = ctx->program->gfx_level >= GFX10 &&
1288                           (op.constantEquals(0) || op.constantEquals(0xff) ||
1289                            op.constantEquals(0xffff) || op.constantEquals(0xff00));
1290       if (dst.regClass() == v1b && use_sdwa) {
1291          uint8_t val = op.constantValue();
1292          Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
1293          if (op32.isLiteral()) {
1294             uint32_t a = (uint32_t)int8_mul_table[val * 2];
1295             uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
1296             bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
1297                           Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
1298                           Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
1299          } else {
1300             bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
1301          }
1302       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1303          emit_v_mov_b16(bld, dst, op);
1304       } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
1305          if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
1306             /* use v_mov_b32 to avoid possible issues with denormal flushing or
1307              * NaN. v_add_f16 is still needed for float constants. */
1308             uint32_t val32 = (int32_t)(int16_t)op.constantValue();
1309             bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
1310          } else {
1311             bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
1312          }
1313       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10 &&
1314                  (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) {
1315          if (dst.physReg().byte() == 2) {
1316             Operand def_lo(dst.physReg().advance(-2), v2b);
1317             Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op);
1318             instr->valu().opsel = 0;
1319          } else {
1320             assert(dst.physReg().byte() == 0);
1321             Operand def_hi(dst.physReg().advance(2), v2b);
1322             Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi);
1323             instr->valu().opsel = 2;
1324          }
1325       } else if (can_use_perm) {
1326          uint8_t swiz[] = {4, 5, 6, 7};
1327          swiz[dst.physReg().byte()] = op.constantValue() & 0xff ? bperm_255 : bperm_0;
1328          if (dst.bytes() == 2)
1329             swiz[dst.physReg().byte() + 1] = op.constantValue() >> 8 ? bperm_255 : bperm_0;
1330          create_bperm(bld, swiz, dst, Operand::zero());
1331       } else {
1332          uint32_t offset = dst.physReg().byte() * 8u;
1333          uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
1334          uint32_t val = (op.constantValue() << offset) & mask;
1335          dst = Definition(PhysReg(dst.physReg().reg()), v1);
1336          Operand def_op(dst.physReg(), v1);
1337          if (val != mask)
1338             bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
1339          if (val != 0)
1340             bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
1341       }
1342    }
1343 }
1344 
1345 void
1346 copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
1347 {
1348    if (preserve_scc)
1349       bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
1350 
1351    for (unsigned i = 0; i < 2; i++) {
1352       if (def.size() == 2)
1353          bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1354       else
1355          bld.vop1(aco_opcode::v_mov_b32, def, op);
1356 
1357       bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
1358                Operand(exec, bld.lm));
1359    }
1360 
1361    if (preserve_scc)
1362       bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
1363                Operand::zero());
1364 }
1365 
1366 void
1367 swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
1368 {
1369    if (preserve_scc)
1370       bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
1371 
1372    Operand def_as_op = Operand(def.physReg(), def.regClass());
1373    Definition op_as_def = Definition(op.physReg(), op.regClass());
1374 
1375    for (unsigned i = 0; i < 2; i++) {
1376       if (bld.program->gfx_level >= GFX9) {
1377          bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1378       } else {
1379          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1380          bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1381          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1382       }
1383 
1384       bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
1385                Operand(exec, bld.lm));
1386    }
1387 
1388    if (preserve_scc)
1389       bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
1390                Operand::zero());
1391 }
1392 
1393 void
1394 addsub_subdword_gfx11(Builder& bld, Definition dst, Operand src0, Operand src1, bool sub)
1395 {
1396    Instruction* instr =
1397       bld.vop3(sub ? aco_opcode::v_sub_u16_e64 : aco_opcode::v_add_u16_e64, dst, src0, src1).instr;
1398    if (src0.physReg().byte() == 2)
1399       instr->valu().opsel |= 0x1;
1400    if (src1.physReg().byte() == 2)
1401       instr->valu().opsel |= 0x2;
1402    if (dst.physReg().byte() == 2)
1403       instr->valu().opsel |= 0x8;
1404 }
1405 
1406 bool
1407 do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
1408         PhysReg scratch_sgpr)
1409 {
1410    bool did_copy = false;
1411    for (unsigned offset = 0; offset < copy.bytes;) {
1412       if (copy.uses[offset]) {
1413          offset++;
1414          continue;
1415       }
1416 
1417       Definition def;
1418       Operand op;
1419       split_copy(ctx, offset, &def, &op, copy, false, 8);
1420 
1421       if (def.physReg() == scc) {
1422          bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
1423          *preserve_scc = true;
1424       } else if (op.isConstant()) {
1425          copy_constant(ctx, bld, def, op);
1426       } else if (def.regClass().is_linear_vgpr()) {
1427          copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr);
1428       } else if (def.regClass() == v1) {
1429          bld.vop1(aco_opcode::v_mov_b32, def, op);
1430       } else if (def.regClass() == v2) {
1431          bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1432       } else if (def.regClass() == s1) {
1433          bld.sop1(aco_opcode::s_mov_b32, def, op);
1434       } else if (def.regClass() == s2) {
1435          bld.sop1(aco_opcode::s_mov_b64, def, op);
1436       } else if (def.regClass().is_subdword() && ctx->program->gfx_level < GFX8) {
1437          if (op.physReg().byte()) {
1438             assert(def.physReg().byte() == 0);
1439             bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);
1440          } else if (def.physReg().byte()) {
1441             assert(op.physReg().byte() == 0);
1442             /* preserve the target's lower half */
1443             uint32_t bits = def.physReg().byte() * 8;
1444             PhysReg lo_reg = PhysReg(def.physReg().reg());
1445             Definition lo_half =
1446                Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
1447             Definition dst =
1448                Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));
1449 
1450             if (def.physReg().reg() == op.physReg().reg()) {
1451                bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),
1452                         Operand(lo_reg, lo_half.regClass()));
1453                if (def.physReg().byte() == 1) {
1454                   bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);
1455                } else if (def.physReg().byte() == 2) {
1456                   bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
1457                } else if (def.physReg().byte() == 3) {
1458                   bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),
1459                            Operand::c32((1 << bits) + 1u));
1460                   bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
1461                }
1462             } else {
1463                lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
1464                bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),
1465                         Operand(lo_reg, lo_half.regClass()));
1466                bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,
1467                         Operand(lo_half.physReg(), lo_half.regClass()),
1468                         Operand::c32(4 - def.physReg().byte()));
1469             }
1470          } else {
1471             bld.vop1(aco_opcode::v_mov_b32, def, op);
1472          }
1473       } else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
1474          uint8_t swiz[] = {4, 5, 6, 7};
1475          swiz[def.physReg().byte()] = op.physReg().byte();
1476          create_bperm(bld, swiz, def, op);
1477       } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1478          emit_v_mov_b16(bld, def, op);
1479       } else if (def.regClass().is_subdword()) {
1480          bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
1481       } else {
1482          unreachable("unsupported copy");
1483       }
1484 
1485       did_copy = true;
1486       offset += def.bytes();
1487    }
1488    return did_copy;
1489 }
1490 
1491 void
1492 swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
1493 {
1494    if (def.physReg().reg() == op.physReg().reg()) {
1495       assert(def.bytes() != 2); /* handled by caller */
1496       uint8_t swiz[] = {4, 5, 6, 7};
1497       std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
1498       create_bperm(bld, swiz, def, Operand::zero());
1499       return;
1500    }
1501 
1502    if (def.bytes() == 2) {
1503       Operand def_as_op = Operand(def.physReg(), def.regClass());
1504       Definition op_as_def = Definition(op.physReg(), op.regClass());
1505       addsub_subdword_gfx11(bld, def, def_as_op, op, false);
1506       addsub_subdword_gfx11(bld, op_as_def, def_as_op, op, true);
1507       addsub_subdword_gfx11(bld, def, def_as_op, op, true);
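      /* The three add/sub ops above swap without a temporary, analogous to the
       * xor-swap used elsewhere. With d0/o0 the original 16-bit values:
       *    def = d0 + o0
       *    op  = (d0 + o0) - o0 = d0
       *    def = (d0 + o0) - d0 = o0
       * Unsigned 16-bit wraparound keeps this exact.
       */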
1508    } else {
1509       PhysReg op_half = op.physReg();
1510       op_half.reg_b &= ~1;
1511 
1512       PhysReg def_other_half = def.physReg();
1513       def_other_half.reg_b &= ~1;
1514       def_other_half.reg_b ^= 2;
1515 
1516       /* We can only swap individual bytes within a single VGPR, so temporarily move both bytes
1517        * into the same VGPR.
1518        */
1519       swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1520       swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
1521       swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1522    }
1523 }
1524 
1525 void
1526 do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
1527         Pseudo_instruction* pi)
1528 {
1529    unsigned offset = 0;
1530 
1531    if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
1532        (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
1533       /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
1534        * swap */
1535       PhysReg op = copy.op.physReg();
1536       PhysReg def = copy.def.physReg();
1537       op.reg_b &= ~0x3;
1538       def.reg_b &= ~0x3;
1539 
1540       copy_operation tmp;
1541       tmp.op = Operand(op, v1);
1542       tmp.def = Definition(def, v1);
1543       tmp.bytes = 4;
1544       memset(tmp.uses, 1, 4);
1545       do_swap(ctx, bld, tmp, preserve_scc, pi);
1546 
1547       op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1548       def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1549       tmp.op = Operand(op, v1b);
1550       tmp.def = Definition(def, v1b);
1551       tmp.bytes = 1;
1552       tmp.uses[0] = 1;
1553       do_swap(ctx, bld, tmp, preserve_scc, pi);
1554 
1555       offset = copy.bytes;
1556    }
1557 
1558    for (; offset < copy.bytes;) {
1559       Definition def;
1560       Operand op;
1561       unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
1562       split_copy(ctx, offset, &def, &op, copy, true, max_size);
1563 
1564       assert(op.regClass() == def.regClass());
1565       Operand def_as_op = Operand(def.physReg(), def.regClass());
1566       Definition op_as_def = Definition(op.physReg(), op.regClass());
1567       if (def.regClass().is_linear_vgpr()) {
1568          swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr);
1569       } else if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
1570          bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1571       } else if (def.regClass() == v1) {
1572          assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
1573          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1574          bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1575          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1576       } else if (op.physReg() == scc || def.physReg() == scc) {
1577          /* we need to swap scc and another sgpr */
1578          assert(!preserve_scc);
1579 
1580          PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1581 
1582          bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1583          bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
1584                   Operand::zero());
1585          bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1586       } else if (def.regClass() == s1) {
1587          if (preserve_scc) {
1588             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1589             bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1590             bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1591          } else {
1592             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1593             bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1594             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1595          }
1596       } else if (def.regClass() == s2) {
1597          if (preserve_scc)
1598             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1599          bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1600          bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1601          bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1602          if (preserve_scc)
1603             bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
1604                      Operand::zero());
1605       } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
1606          bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
1607                   Operand::c32(2u));
1608       } else {
1609          assert(def.regClass().is_subdword());
1610          if (ctx->program->gfx_level >= GFX11) {
1611             swap_subdword_gfx11(bld, def, op);
1612          } else {
1613             bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1614             bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1615             bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1616          }
1617       }
1618 
1619       offset += def.bytes();
1620    }
1621 
1622    if (ctx->program->gfx_level <= GFX7)
1623       return;
1624 
1625    /* fixup in case we swapped bytes we shouldn't have */
1626    copy_operation tmp_copy = copy;
1627    tmp_copy.op.setFixed(copy.def.physReg());
1628    tmp_copy.def.setFixed(copy.op.physReg());
1629    do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1630 }
1631 
1632 void
1633 do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1634 {
1635    if (lo.isConstant() && hi.isConstant()) {
1636       copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
1637       return;
1638    }
1639 
1640    bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1641                        (ctx->program->gfx_level >= GFX10 ||
1642                         (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));
1643 
1644    if (can_use_pack) {
1645       Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1646       /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1647       instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1648       return;
1649    }
1650 
1651    /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
1652    if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1653        (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
1654                                                     ctx->program->gfx_level >= GFX10)))) {
1655       if (hi.isConstant())
1656          bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
1657                   Operand::c32(2u));
1658       else
1659          bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1660       return;
1661    }
1662 
1663    Definition def_lo = Definition(def.physReg(), v2b);
1664    Definition def_hi = Definition(def.physReg().advance(2), v2b);
1665 
1666    if (lo.isConstant()) {
1667       /* move hi and zero low bits */
1668       if (hi.physReg().byte() == 0)
1669          bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1670       else
1671          bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1672       if (lo.constantValue())
1673          bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1674                   Operand(def.physReg(), v1));
1675       return;
1676    }
1677    if (hi.isConstant()) {
1678       /* move lo and zero high bits */
1679       if (lo.physReg().byte() == 2)
1680          bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1681       else if (ctx->program->gfx_level >= GFX11)
1682          bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
1683       else
1684          bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1685       if (hi.constantValue())
1686          bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1687                   Operand(def.physReg(), v1));
1688       return;
1689    }
1690 
1691    if (lo.physReg().reg() == def.physReg().reg()) {
1692       /* lo is in the high bits of def */
1693       assert(lo.physReg().byte() == 2);
1694       bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1695       lo.setFixed(def.physReg());
1696    } else if (hi.physReg() == def.physReg()) {
1697       /* hi is in the low bits of def */
1698       assert(hi.physReg().byte() == 0);
1699       bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1700       hi.setFixed(def.physReg().advance(2));
1701    } else if (ctx->program->gfx_level >= GFX8) {
1702       /* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
1703        * op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
1704        */
1705       assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1706       Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1707       PhysReg reg = def.physReg().advance(op.physReg().byte());
1708       bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1709       op.setFixed(reg);
1710    }
1711 
1712    /* either hi or lo are already placed correctly */
1713    if (ctx->program->gfx_level >= GFX11) {
1714       if (lo.physReg().reg() == def.physReg().reg())
1715          emit_v_mov_b16(bld, def_hi, hi);
1716       else
1717          emit_v_mov_b16(bld, def_lo, lo);
1718       return;
1719    } else if (ctx->program->gfx_level >= GFX8) {
1720       if (lo.physReg().reg() == def.physReg().reg())
1721          bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1722       else
1723          bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1724       return;
1725    }
1726 
1727    /* alignbyte needs the operands in the following way:
1728     * | xx hi | lo xx | >> 2 byte */
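   /* (v_alignbyte_b32 dst, s0, s1, 2 yields ({s0,s1} >> 16)[31:0], so the high
    * half of s1 becomes the low half of dst and the low half of s0 becomes the
    * high half of dst.) */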
1729    if (lo.physReg().byte() != hi.physReg().byte()) {
1730       /* | xx lo | hi xx | => | lo hi | lo hi | */
1731       assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
1732       bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));
1733       lo = Operand(def_hi.physReg(), v2b);
1734       hi = Operand(def_lo.physReg(), v2b);
1735    } else if (lo.physReg().byte() == 0) {
1736       /* | xx hi | xx lo | => | xx hi | lo 00 | */
1737       bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);
1738       lo = Operand(def_hi.physReg(), v2b);
1739    } else {
1740       /* | hi xx | lo xx | => | 00 hi | lo xx | */
1741       assert(hi.physReg().byte() == 2);
1742       bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);
1743       hi = Operand(def_lo.physReg(), v2b);
1744    }
1745    /* perform the alignbyte */
1746    bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1747 }
1748 
1749 void
1750 try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
1751                     copy_operation& copy)
1752 {
1753    // TODO try more relaxed alignment for subdword copies
1754    unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
1755    unsigned next_op_align = next_def_align;
1756    if (copy.def.regClass().type() == RegType::vgpr)
1757       next_def_align = MIN2(next_def_align, 4);
1758    if (copy.op.regClass().type() == RegType::vgpr)
1759       next_op_align = MIN2(next_op_align, 4);
1760 
1761    if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
1762        (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
1763       return;
1764 
1765    auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
1766    if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
1767        copy.op.isConstant() != other->second.op.isConstant())
1768       return;
1769 
1770    /* don't create 64-bit copies before GFX10 */
1771    if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
1772        ctx->program->gfx_level < GFX10)
1773       return;
1774 
1775    unsigned new_size = copy.bytes + other->second.bytes;
1776    if (copy.op.isConstant()) {
1777       uint64_t val =
1778          copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
1779       if (!util_is_power_of_two_or_zero(new_size))
1780          return;
1781       if (!Operand::is_constant_representable(val, new_size, true,
1782                                               copy.def.regClass().type() == RegType::vgpr))
1783          return;
1784       copy.op = Operand::get_const(ctx->program->gfx_level, val, new_size);
1785    } else {
1786       if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
1787          return;
1788       copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
1789    }
1790 
1791    copy.bytes = new_size;
1792    copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
1793    copy_map.erase(other);
1794 }
1795 
1796 void
1797 handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
1798                 amd_gfx_level gfx_level, Pseudo_instruction* pi)
1799 {
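   /* High-level flow of this parallel-copy lowering: first count, per byte of
    * every destination, how many pending copies still read that byte; then
    * repeatedly emit the copies whose destination bytes are no longer needed
    * as a source (the acyclic part of the location transfer graph); whatever
    * remains forms cycles, which are resolved with swaps at the end. */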
1800    Builder bld(ctx->program, &ctx->instructions);
1801    unsigned num_instructions_before = ctx->instructions.size();
1802    aco_ptr<Instruction> mov;
1803    bool writes_scc = false;
1804 
1805    /* count the number of uses for each dst reg */
1806    for (auto it = copy_map.begin(); it != copy_map.end();) {
1807 
1808       if (it->second.def.physReg() == scc)
1809          writes_scc = true;
1810 
1811       assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));
1812 
1813       /* if src and dst reg are the same, remove operation */
1814       if (it->first == it->second.op.physReg()) {
1815          it = copy_map.erase(it);
1816          continue;
1817       }
1818 
1819       /* split large copies */
1820       if (it->second.bytes > 8) {
1821          assert(!it->second.op.isConstant());
1822          assert(!it->second.def.regClass().is_subdword());
1823          RegClass rc = it->second.def.regClass().resize(it->second.def.bytes() - 8);
1824          Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1825          rc = it->second.op.regClass().resize(it->second.op.bytes() - 8);
1826          Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1827          copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1828          copy_map[hi_def.physReg()] = copy;
1829          assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1830          it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().resize(8));
1831          it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().resize(8));
1832          it->second.bytes = 8;
1833       }
1834 
1835       try_coalesce_copies(ctx, copy_map, it->second);
1836 
1837       /* check if the definition reg is used by another copy operation */
1838       for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1839          if (copy.second.op.isConstant())
1840             continue;
1841          for (uint16_t i = 0; i < it->second.bytes; i++) {
1842             /* distance might underflow */
1843             unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1844             if (distance < copy.second.bytes)
1845                it->second.uses[i] += 1;
1846          }
1847       }
1848 
1849       ++it;
1850    }
1851 
1852    /* first, handle paths in the location transfer graph */
1853    bool preserve_scc = pi->tmp_in_scc && !writes_scc;
1854    bool skip_partial_copies = true;
1855    for (auto it = copy_map.begin();;) {
1856       if (copy_map.empty()) {
1857          ctx->program->statistics[aco_statistic_copies] +=
1858             ctx->instructions.size() - num_instructions_before;
1859          return;
1860       }
1861       if (it == copy_map.end()) {
1862          if (!skip_partial_copies)
1863             break;
1864          skip_partial_copies = false;
1865          it = copy_map.begin();
1866       }
1867 
1868       /* check if we can pack one register at once */
1869       if (it->first.byte() == 0 && it->second.bytes == 2) {
1870          PhysReg reg_hi = it->first.advance(2);
1871          std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1872          if (other != copy_map.end() && other->second.bytes == 2) {
1873             /* check if the target register is otherwise unused */
1874             bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
1875                                                      other->second.op.physReg() == it->first);
1876             bool unused_hi = !other->second.is_used ||
1877                              (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
1878             if (unused_lo && unused_hi) {
1879                Operand lo = it->second.op;
1880                Operand hi = other->second.op;
1881                do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1882                copy_map.erase(it);
1883                copy_map.erase(other);
1884 
1885                for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
1886                   for (uint16_t i = 0; i < other2.second.bytes; i++) {
1887                      /* distance might underflow */
1888                      unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
1889                      unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
1890                      if (distance_lo < 2 || distance_hi < 2)
1891                         other2.second.uses[i] -= 1;
1892                   }
1893                }
1894                it = copy_map.begin();
1895                continue;
1896             }
1897          }
1898       }
1899 
1900       /* on GFX6/7, we need some small workarounds as there is no
1901        * SDWA instruction to do partial register writes */
1902       if (ctx->program->gfx_level < GFX8 && it->second.bytes < 4) {
1903          if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&
1904              pi->opcode == aco_opcode::p_split_vector) {
1905             /* Other operations might overwrite the high bits, so change all users
1906              * of the high bits to the new target where they are still available.
1907              * This mechanism depends on also emitting dead definitions. */
1908             PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
1909             while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
1910                std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
1911                for (other = copy_map.begin(); other != copy_map.end(); other++) {
1912                   /* on GFX6/7, if the high bits are used as operand, they cannot be a target */
1913                   if (other->second.op.physReg() == reg_hi) {
1914                      other->second.op.setFixed(it->first.advance(reg_hi.byte()));
1915                      break; /* break because an operand can only be used once */
1916                   }
1917                }
1918                reg_hi = reg_hi.advance(it->second.bytes);
1919             }
1920          } else if (it->first.byte()) {
1921             assert(pi->opcode == aco_opcode::p_create_vector);
1922             /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
1923              * move to the target operand's high bits. This is safe to do as it cannot be an operand
1924              */
1925             PhysReg lo = PhysReg(it->first.reg());
1926             std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
1927             if (other != copy_map.end()) {
1928                assert(other->second.bytes == it->first.byte());
1929                PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
1930                it->second.def = Definition(new_reg_hi, it->second.def.regClass());
1931                it->second.is_used = 0;
1932                other->second.bytes += it->second.bytes;
1933                other->second.def.setTemp(Temp(other->second.def.tempId(),
1934                                               RegClass::get(RegType::vgpr, other->second.bytes)));
1935                other->second.op.setTemp(Temp(other->second.op.tempId(),
1936                                              RegClass::get(RegType::vgpr, other->second.bytes)));
1937                /* if the new target's high bits are also a target, change uses */
1938                std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
1939                if (target != copy_map.end()) {
1940                   for (unsigned i = 0; i < it->second.bytes; i++)
1941                      target->second.uses[i]++;
1942                }
1943             }
1944          }
1945       }
1946 
1947       /* find portions where the target reg is not used as operand for any other copy */
1948       if (it->second.is_used) {
1949          if (it->second.op.isConstant() || skip_partial_copies) {
1950             /* we have to skip constants until is_used=0.
1951              * we also skip partial copies at the beginning to help coalescing */
1952             ++it;
1953             continue;
1954          }
1955 
1956          unsigned has_zero_use_bytes = 0;
1957          for (unsigned i = 0; i < it->second.bytes; i++)
1958             has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1959 
1960          if (has_zero_use_bytes) {
1961             /* Skipping partial copying and doing a v_swap_b32 and then fixup
1962              * copies is usually beneficial for sub-dword copies, but if doing
1963              * a partial copy allows further copies, it should be done instead. */
1964             bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1965             for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1966                /* on GFX6/7, we can only do copies with full registers */
1967                if (partial_copy || ctx->program->gfx_level <= GFX7)
1968                   break;
1969                for (uint16_t i = 0; i < copy.second.bytes; i++) {
1970                   /* distance might underflow */
1971                   unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1972                   if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1973                       !it->second.uses[distance])
1974                      partial_copy = true;
1975                }
1976             }
1977 
1978             if (!partial_copy) {
1979                ++it;
1980                continue;
1981             }
1982          } else {
1983             /* full target reg is used: register swapping needed */
1984             ++it;
1985             continue;
1986          }
1987       }
1988 
1989       bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1990       skip_partial_copies = did_copy;
1991       std::pair<PhysReg, copy_operation> copy = *it;
1992 
1993       if (it->second.is_used == 0) {
1994          /* the target reg is not used as operand for any other copy, so we
1995           * copied to all of it */
1996          copy_map.erase(it);
1997          it = copy_map.begin();
1998       } else {
1999          /* we only performed some portions of this copy, so split it to only
2000           * leave the portions that still need to be done */
2001          copy_operation original = it->second; /* the map insertion below can overwrite this */
2002          copy_map.erase(it);
2003          for (unsigned offset = 0; offset < original.bytes;) {
2004             if (original.uses[offset] == 0) {
2005                offset++;
2006                continue;
2007             }
2008             Definition def;
2009             Operand op;
2010             split_copy(ctx, offset, &def, &op, original, false, 8);
2011 
2012             copy_operation new_copy = {op, def, def.bytes()};
2013             for (unsigned i = 0; i < new_copy.bytes; i++)
2014                new_copy.uses[i] = original.uses[i + offset];
2015             copy_map[def.physReg()] = new_copy;
2016 
2017             offset += def.bytes();
2018          }
2019 
2020          it = copy_map.begin();
2021       }
2022 
2023       /* Reduce the number of uses of the operand reg by one. Do this after
2024        * splitting the copy or removing it in case the copy writes to its own
2025        * operand (for example, v[7:8] = v[8:9]) */
2026       if (did_copy && !copy.second.op.isConstant()) {
2027          for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
2028             for (uint16_t i = 0; i < other.second.bytes; i++) {
2029                /* distance might underflow */
2030                unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
2031                if (distance < copy.second.bytes && !copy.second.uses[distance])
2032                   other.second.uses[i] -= 1;
2033             }
2034          }
2035       }
2036    }
2037 
2038    /* all target regs are needed as an operand somewhere, which means all entries are part of a cycle */
2039    unsigned largest = 0;
2040    for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
2041       largest = MAX2(largest, op.second.bytes);
2042 
2043    while (!copy_map.empty()) {
2044 
2045       /* Perform larger swaps first, because larger swaps can make other
2046        * swaps unnecessary. */
2047       auto it = copy_map.begin();
2048       for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
2049          if (it2->second.bytes > it->second.bytes) {
2050             it = it2;
2051             if (it->second.bytes == largest)
2052                break;
2053          }
2054       }
2055 
2056       /* should already be done */
2057       assert(!it->second.op.isConstant());
2058 
2059       assert(it->second.op.isFixed());
2060       assert(it->second.def.regClass() == it->second.op.regClass());
2061 
2062       if (it->first == it->second.op.physReg()) {
2063          copy_map.erase(it);
2064          continue;
2065       }
2066 
2067       if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
2068          assert(!(it->second.def.physReg() == pi->scratch_sgpr));
2069 
2070       /* to resolve the cycle, we have to swap the src reg with the dst reg */
2071       copy_operation swap = it->second;
2072 
2073       /* if this is self-intersecting, we have to split it because
2074        * self-intersecting swaps don't make sense */
2075       PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
2076       if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
2077          unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
2078 
2079          copy_operation remaining;
2080          src.reg_b += offset;
2081          dst.reg_b += offset;
2082          remaining.bytes = swap.bytes - offset;
2083          memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
2084          remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
2085          remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
2086          copy_map[dst] = remaining;
2087 
2088          memset(swap.uses + offset, 0, swap.bytes - offset);
2089          swap.bytes = offset;
2090       }
2091 
2092       /* GFX6-7 can only swap full registers */
2093       if (ctx->program->gfx_level <= GFX7)
2094          swap.bytes = align(swap.bytes, 4);
2095 
2096       do_swap(ctx, bld, swap, preserve_scc, pi);
2097 
2098       /* remove from map */
2099       copy_map.erase(it);
2100 
2101       /* change the operand reg of the target's uses and split uses if needed */
2102       uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
2103       for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
2104          if (target->second.op.physReg() == swap.def.physReg() &&
2105              swap.bytes == target->second.bytes) {
2106             target->second.op.setFixed(swap.op.physReg());
2107             break;
2108          }
2109 
2110          uint32_t imask =
2111             get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
2112                                   target->second.op.physReg().reg_b, target->second.bytes);
2113 
2114          if (!imask)
2115             continue;
2116 
2117          int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
2118 
2119          /* split and update the middle (the portion that reads the swap's
2120           * definition) to read the swap's operand instead */
2121          int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
2122          int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
2123          int before_bytes = MAX2(-offset, 0);
2124          int after_bytes = MAX2(target_op_end - swap_def_end, 0);
2125          int middle_bytes = target->second.bytes - before_bytes - after_bytes;
2126 
2127          if (after_bytes) {
2128             unsigned after_offset = before_bytes + middle_bytes;
2129             assert(after_offset > 0);
2130             copy_operation copy;
2131             copy.bytes = after_bytes;
2132             memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
2133             RegClass rc = target->second.op.regClass().resize(after_bytes);
2134             copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
2135             copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
2136             copy_map[copy.def.physReg()] = copy;
2137          }
2138 
2139          if (middle_bytes) {
2140             copy_operation copy;
2141             copy.bytes = middle_bytes;
2142             memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
2143             RegClass rc = target->second.op.regClass().resize(middle_bytes);
2144             copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
2145             copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
2146             copy_map[copy.def.physReg()] = copy;
2147          }
2148 
2149          if (before_bytes) {
2150             copy_operation copy;
2151             target->second.bytes = before_bytes;
2152             RegClass rc = target->second.op.regClass().resize(before_bytes);
2153             target->second.op = Operand(target->second.op.physReg(), rc);
2154             target->second.def = Definition(target->second.def.physReg(), rc);
2155             memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
2156          }
2157 
2158          /* break early since we know each byte of the swap's definition is used
2159           * at most once */
2160          bytes_left &= ~imask;
2161          if (!bytes_left)
2162             break;
2163       }
2164    }
2165    ctx->program->statistics[aco_statistic_copies] +=
2166       ctx->instructions.size() - num_instructions_before;
2167 }
2168 
2169 void
2170 emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
2171 {
2172    if (bld.program->gfx_level >= GFX10) {
2173       if (set_round)
2174          bld.sopp(aco_opcode::s_round_mode, -1, new_mode.round);
2175       if (set_denorm)
2176          bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm);
2177    } else if (set_round || set_denorm) {
2178       /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
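      /* Here (7 << 11) | 1 selects hwreg MODE (id 1), offset 0, size 8 bits:
       * the byte holding the four round-mode and four denorm-mode bits. */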
2179       bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(new_mode.val), (7 << 11) | 1);
2180    }
2181 }
2182 
2183 void
2184 emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set)
2185 {
2186    float_mode config_mode;
2187    config_mode.val = program.config->float_mode;
2188 
2189    bool set_round = always_set && block->fp_mode.round != config_mode.round;
2190    bool set_denorm = always_set && block->fp_mode.denorm != config_mode.denorm;
2191    if (block->kind & block_kind_top_level) {
2192       for (unsigned pred : block->linear_preds) {
2193          if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
2194             set_round = true;
2195          if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
2196             set_denorm = true;
2197       }
2198    }
2199    /* only allow changing modes at top-level blocks so this doesn't break
2200     * the "jump over empty blocks" optimization */
2201    assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
2202    emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
2203 }
2204 
2205 void
2206 hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scratch_offset)
2207 {
2208    /* Since we know what the high 16 bits of scratch_hi are, we can set all of the high 16
2209     * bits in the same instruction that adds the carry.
2210     */
2211    Operand hi_add = Operand::c32(0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1));
2212    Operand scratch_addr_lo(scratch_addr.physReg(), s1);
2213    Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
2214 
2215    if (bld.program->gfx_level >= GFX10) {
2216       PhysReg scratch_lo = def.physReg();
2217       PhysReg scratch_hi = def.physReg().advance(4);
2218 
2219       bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo, s1), Definition(scc, s1),
2220                scratch_addr_lo, scratch_offset);
2221       bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi, s1), Definition(scc, s1),
2222                scratch_addr_hi, hi_add, Operand(scc, s1));
2223 
2224       /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
2225        * 20/21) */
2226       bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_lo, s1), (31 << 11) | 20);
2227       bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_hi, s1), (31 << 11) | 21);
2228    } else {
2229       bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
2230                scratch_addr_lo, scratch_offset);
2231       bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
2232                scratch_addr_hi, hi_add, Operand(scc, s1));
2233    }
2234 }
2235 
2236 void
2237 lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
2238 {
2239    Operand linear_vgpr = instr->operands[3];
2240 
2241    unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
2242    unsigned vaddr_size = linear_vgpr.size();
2243    unsigned num_copied_vgprs = instr->operands.size() - 4;
2244    nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
2245                  ? nsa_size
2246                  : 0;
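   /* nsa_size > 0 means the leading addresses may stay in scattered VGPRs
    * (non-sequential addressing); nsa_size == 0 forces all addresses to be
    * copied into the contiguous linear VGPR handled below. */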
2247 
2248    Operand vaddr[16];
2249    unsigned num_vaddr = 0;
2250 
2251    if (nsa_size) {
2252       assert(num_copied_vgprs <= nsa_size);
2253       for (unsigned i = 0; i < num_copied_vgprs; i++)
2254          vaddr[num_vaddr++] = instr->operands[4 + i];
2255       for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
2256          vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
2257       if (vaddr_size > nsa_size) {
2258          RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
2259          vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
2260       }
2261    } else {
2262       PhysReg reg = linear_vgpr.physReg();
2263       std::map<PhysReg, copy_operation> copy_operations;
2264       for (unsigned i = 4; i < instr->operands.size(); i++) {
2265          Operand arg = instr->operands[i];
2266          Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
2267          copy_operations[def.physReg()] = {arg, def, def.bytes()};
2268          reg = reg.advance(arg.bytes());
2269       }
2270       vaddr[num_vaddr++] = linear_vgpr;
2271 
2272       Pseudo_instruction pi = {};
2273       handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
2274    }
2275 
2276    instr->mimg().strict_wqm = false;
2277 
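        /* Keep the first three (non-address) operands and rewrite the address operands starting at
         * index 3: allocate a larger MIMG instruction if more operand slots are needed, otherwise
         * shrink the operand list in place.
         */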
2278    if ((3 + num_vaddr) > instr->operands.size()) {
2279       MIMG_instruction* new_instr = create_instruction<MIMG_instruction>(
2280          instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
2281       std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
2282                 new_instr->definitions.begin());
2283       new_instr->operands[0] = instr->operands[0];
2284       new_instr->operands[1] = instr->operands[1];
2285       new_instr->operands[2] = instr->operands[2];
2286       memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
2287              sizeof(MIMG_instruction) - sizeof(Instruction));
2288       instr.reset(new_instr);
2289    } else {
2290       while (instr->operands.size() > (3 + num_vaddr))
2291          instr->operands.pop_back();
2292    }
2293    std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
2294 }
2295 
2296 void
2297 lower_to_hw_instr(Program* program)
2298 {
2299    gfx9_pops_done_msg_bounds pops_done_msg_bounds;
2300    if (program->has_pops_overlapped_waves_wait && program->gfx_level < GFX11) {
2301       pops_done_msg_bounds = gfx9_pops_done_msg_bounds(program);
2302    }
2303 
2304    Block* discard_exit_block = NULL;
2305    Block* discard_pops_done_and_exit_block = NULL;
2306 
2307    int end_with_regs_block_index = -1;
2308 
2309    bool should_dealloc_vgprs = dealloc_vgprs(program);
2310 
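        /* Lower the blocks in reverse order so that, when deciding whether a branch can be removed,
         * the jumped-over (later) blocks have already been lowered to hardware instructions.
         */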
2311    for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
2312       Block* block = &program->blocks[block_idx];
2313       lower_context ctx;
2314       ctx.program = program;
2315       ctx.block = block;
2316       ctx.instructions.reserve(block->instructions.size());
2317       Builder bld(program, &ctx.instructions);
2318 
2319       emit_set_mode_from_block(bld, *program, block, (block_idx == 0));
2320 
2321       for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
2322          aco_ptr<Instruction>& instr = block->instructions[instr_idx];
2323 
2324          /* Send the ordered section done message from the middle of the block if needed (if the
2325           * ordered section is ended by an instruction inside this block).
2326           * Also make sure the done message is sent if it's needed in case early exit happens for
2327           * any reason.
2328           */
2329          if ((block_idx == pops_done_msg_bounds.end_block_idx() &&
2330               instr_idx == pops_done_msg_bounds.instr_after_end_idx()) ||
2331              (instr->opcode == aco_opcode::s_endpgm &&
2332               pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx))) {
2333             bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2334          }
2335 
2336          aco_ptr<Instruction> mov;
2337          if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {
2338             Pseudo_instruction* pi = &instr->pseudo();
2339 
2340             switch (instr->opcode) {
2341             case aco_opcode::p_extract_vector: {
2342                PhysReg reg = instr->operands[0].physReg();
2343                Definition& def = instr->definitions[0];
2344                reg.reg_b += instr->operands[1].constantValue() * def.bytes();
2345 
2346                if (reg == def.physReg())
2347                   break;
2348 
2349                RegClass op_rc = def.regClass().is_subdword()
2350                                    ? def.regClass()
2351                                    : RegClass(instr->operands[0].getTemp().type(), def.size());
2352                std::map<PhysReg, copy_operation> copy_operations;
2353                copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
2354                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2355                break;
2356             }
2357             case aco_opcode::p_create_vector: {
2358                std::map<PhysReg, copy_operation> copy_operations;
2359                PhysReg reg = instr->definitions[0].physReg();
2360 
2361                for (const Operand& op : instr->operands) {
2362                   if (op.isConstant()) {
2363                      const Definition def = Definition(
2364                         reg, instr->definitions[0].getTemp().regClass().resize(op.bytes()));
2365                      copy_operations[reg] = {op, def, op.bytes()};
2366                      reg.reg_b += op.bytes();
2367                      continue;
2368                   }
2369                   if (op.isUndefined()) {
2370                      // TODO: coalesce subdword copies if dst byte is 0
2371                      reg.reg_b += op.bytes();
2372                      continue;
2373                   }
2374 
2375                   RegClass rc_def =
2376                      op.regClass().is_subdword()
2377                         ? op.regClass()
2378                         : instr->definitions[0].getTemp().regClass().resize(op.bytes());
2379                   const Definition def = Definition(reg, rc_def);
2380                   copy_operations[def.physReg()] = {op, def, op.bytes()};
2381                   reg.reg_b += op.bytes();
2382                }
2383                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2384                break;
2385             }
2386             case aco_opcode::p_split_vector: {
2387                std::map<PhysReg, copy_operation> copy_operations;
2388                PhysReg reg = instr->operands[0].physReg();
2389 
2390                for (const Definition& def : instr->definitions) {
2391                   RegClass rc_op = def.regClass().is_subdword()
2392                                       ? def.regClass()
2393                                       : instr->operands[0].getTemp().regClass().resize(def.bytes());
2394                   const Operand op = Operand(reg, rc_op);
2395                   copy_operations[def.physReg()] = {op, def, def.bytes()};
2396                   reg.reg_b += def.bytes();
2397                }
2398                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2399                break;
2400             }
2401             case aco_opcode::p_parallelcopy: {
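                    /* Gather all copies keyed by destination register and let handle_operands()
                     * emit them in a safe order (it resolves overlapping and swapped registers).
                     */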
2402                std::map<PhysReg, copy_operation> copy_operations;
2403                for (unsigned j = 0; j < instr->operands.size(); j++) {
2404                   assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
2405                   copy_operations[instr->definitions[j].physReg()] = {
2406                      instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
2407                }
2408                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2409                break;
2410             }
2411             case aco_opcode::p_start_linear_vgpr: {
2412                if (instr->operands.empty())
2413                   break;
2414 
2415                Definition def(instr->definitions[0].physReg(),
2416                               RegClass::get(RegType::vgpr, instr->definitions[0].bytes()));
2417 
2418                std::map<PhysReg, copy_operation> copy_operations;
2419                copy_operations[def.physReg()] = {instr->operands[0], def,
2420                                                  instr->operands[0].bytes()};
2421                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2422                break;
2423             }
2424             case aco_opcode::p_exit_early_if: {
2425                /* don't bother with an early exit near the end of the program */
2426                if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
2427                    block->instructions.back()->opcode == aco_opcode::s_endpgm) {
2428                   unsigned null_exp_dest =
2429                      program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
2430                   bool ignore_early_exit = true;
2431 
2432                   for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
2433                      const aco_ptr<Instruction>& instr2 = block->instructions[k];
2434                      if (instr2->opcode == aco_opcode::s_endpgm ||
2435                          instr2->opcode == aco_opcode::p_logical_end)
2436                         continue;
2437                      else if (instr2->opcode == aco_opcode::exp &&
2438                               instr2->exp().dest == null_exp_dest &&
2439                               instr2->exp().enabled_mask == 0)
2440                         continue;
2441                      else if (instr2->opcode == aco_opcode::p_parallelcopy &&
2442                               instr2->definitions[0].isFixed() &&
2443                               instr2->definitions[0].physReg() == exec)
2444                         continue;
2445 
2446                      ignore_early_exit = false;
2447                   }
2448 
2449                   if (ignore_early_exit)
2450                      break;
2451                }
2452 
2453                const bool discard_sends_pops_done =
2454                   pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx);
2455 
2456                Block* discard_block =
2457                   discard_sends_pops_done ? discard_pops_done_and_exit_block : discard_exit_block;
2458                if (!discard_block) {
2459                   discard_block = program->create_and_insert_block();
2460                   discard_block->kind = block_kind_discard_early_exit;
2461                   if (discard_sends_pops_done) {
2462                      discard_pops_done_and_exit_block = discard_block;
2463                   } else {
2464                      discard_exit_block = discard_block;
2465                   }
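                       /* create_and_insert_block() may reallocate the block vector and invalidate
                        * the old pointer, so reacquire it. */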
2466                   block = &program->blocks[block_idx];
2467 
2468                   bld.reset(discard_block);
2469                   if (program->has_pops_overlapped_waves_wait &&
2470                       (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
2471                      /* If this discard early exit potentially exits the POPS ordered section, do
2472                       * the waitcnt necessary before resuming overlapping waves as the normal
2473                       * waitcnt insertion doesn't work in a discard early exit block.
2474                       */
2475                      if (program->gfx_level >= GFX10)
2476                         bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
2477                      wait_imm pops_exit_wait_imm;
2478                      pops_exit_wait_imm.vm = 0;
2479                      if (program->has_smem_buffer_or_global_loads)
2480                         pops_exit_wait_imm.lgkm = 0;
2481                      bld.sopp(aco_opcode::s_waitcnt, -1,
2482                               pops_exit_wait_imm.pack(program->gfx_level));
2483                   }
2484                   if (discard_sends_pops_done)
2485                      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2486                   unsigned target = V_008DFC_SQ_EXP_NULL;
2487                   if (program->gfx_level >= GFX11)
2488                      target =
2489                         program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ;
2490                   if (program->stage == fragment_fs)
2491                      bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
2492                              target, false, true, true);
2493                   if (should_dealloc_vgprs)
2494                      bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_dealloc_vgprs);
2495                   bld.sopp(aco_opcode::s_endpgm);
2496 
2497                   bld.reset(&ctx.instructions);
2498                }
2499 
2500                assert(instr->operands[0].physReg() == scc);
2501                bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
2502 
2503                discard_block->linear_preds.push_back(block->index);
2504                block->linear_succs.push_back(discard_block->index);
2505                break;
2506             }
2507             case aco_opcode::p_spill: {
2508                assert(instr->operands[0].regClass() == v1.as_linear());
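                    /* Write each dword of the spilled SGPR (or constant) into the next lane of the
                     * linear spill VGPR. */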
2509                for (unsigned i = 0; i < instr->operands[2].size(); i++) {
2510                   Operand src =
2511                      instr->operands[2].isConstant()
2512                         ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
2513                         : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
2514                   bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,
2515                                 Operand::c32(instr->operands[1].constantValue() + i),
2516                                 instr->operands[0]);
2517                }
2518                break;
2519             }
2520             case aco_opcode::p_reload: {
2521                assert(instr->operands[0].regClass() == v1.as_linear());
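                    /* Read the spilled dwords back from the lanes of the linear spill VGPR into
                     * consecutive SGPRs. */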
2522                for (unsigned i = 0; i < instr->definitions[0].size(); i++)
2523                   bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2524                                instr->operands[0],
2525                                Operand::c32(instr->operands[1].constantValue() + i));
2526                break;
2527             }
2528             case aco_opcode::p_as_uniform: {
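                    /* A constant or SGPR source is a plain copy; a VGPR source is made uniform by
                     * reading the first active lane of each dword. */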
2529                if (instr->operands[0].isConstant() ||
2530                    instr->operands[0].regClass().type() == RegType::sgpr) {
2531                   std::map<PhysReg, copy_operation> copy_operations;
2532                   copy_operations[instr->definitions[0].physReg()] = {
2533                      instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
2534                   handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2535                } else {
2536                   assert(instr->operands[0].regClass().type() == RegType::vgpr);
2537                   assert(instr->definitions[0].regClass().type() == RegType::sgpr);
2538                   assert(instr->operands[0].size() == instr->definitions[0].size());
2539                   for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
2540                      bld.vop1(aco_opcode::v_readfirstlane_b32,
2541                               bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
2542                               Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
2543                   }
2544                }
2545                break;
2546             }
2547             case aco_opcode::p_pops_gfx9_add_exiting_wave_id: {
2548                bld.sop2(aco_opcode::s_add_i32, instr->definitions[0], instr->definitions[1],
2549                         Operand(pops_exiting_wave_id, s1), instr->operands[0]);
2550                break;
2551             }
2552             case aco_opcode::p_bpermute_readlane: {
2553                emit_bpermute_readlane(program, instr, bld);
2554                break;
2555             }
2556             case aco_opcode::p_bpermute_shared_vgpr: {
2557                emit_bpermute_shared_vgpr(program, instr, bld);
2558                break;
2559             }
2560             case aco_opcode::p_bpermute_permlane: {
2561                emit_bpermute_permlane(program, instr, bld);
2562                break;
2563             }
2564             case aco_opcode::p_constaddr: {
2565                unsigned id = instr->definitions[0].tempId();
2566                PhysReg reg = instr->definitions[0].physReg();
2567                bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
2568                bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2569                         Operand(reg, s1), instr->operands[0], Operand::c32(id));
2570                /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2571                break;
2572             }
2573             case aco_opcode::p_resume_shader_address: {
2574                /* Find index of resume block. */
2575                unsigned resume_idx = instr->operands[0].constantValue();
2576                unsigned resume_block_idx = 0;
2577                for (Block& resume_block : program->blocks) {
2578                   if (resume_block.kind & block_kind_resume) {
2579                      if (resume_idx == 0) {
2580                         resume_block_idx = resume_block.index;
2581                         break;
2582                      }
2583                      resume_idx--;
2584                   }
2585                }
2586                assert(resume_block_idx != 0);
2587                unsigned id = instr->definitions[0].tempId();
2588                PhysReg reg = instr->definitions[0].physReg();
2589                bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
2590                bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
2591                         Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
2592                /* s_addc_u32 not needed because the program is in a 32-bit VA range */
2593                break;
2594             }
2595             case aco_opcode::p_extract: {
2596                assert(instr->operands[1].isConstant());
2597                assert(instr->operands[2].isConstant());
2598                assert(instr->operands[3].isConstant());
2599                if (instr->definitions[0].regClass() == s1)
2600                   assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2601                Definition dst = instr->definitions[0];
2602                Operand op = instr->operands[0];
2603                unsigned bits = instr->operands[2].constantValue();
2604                unsigned index = instr->operands[1].constantValue();
2605                unsigned offset = index * bits;
2606                bool signext = !instr->operands[3].constantEquals(0);
2607 
2608                if (dst.regClass() == s1) {
2609                   if (offset == (32 - bits)) {
2610                      bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
2611                               bld.def(s1, scc), op, Operand::c32(offset));
2612                   } else if (offset == 0 && signext && (bits == 8 || bits == 16)) {
2613                      bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
2614                               dst, op);
2615                   } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
2616                      bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
2617                   } else {
2618                      bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
2619                               bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
2620                   }
2621                } else if ((dst.regClass() == v1 && op.physReg().byte() == 0) ||
2622                           ctx.program->gfx_level <= GFX7) {
2623                   assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
2624                   if (offset == (32 - bits) && op.regClass() != s1) {
2625                      bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
2626                               Operand::c32(offset), op);
2627                   } else if (offset == 0 && bits == 16 && ctx.program->gfx_level >= GFX11) {
2628                      bld.vop1(signext ? aco_opcode::v_cvt_i32_i16 : aco_opcode::v_cvt_u32_u16, dst,
2629                               op);
2630                   } else {
2631                      bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
2632                               Operand::c32(offset), Operand::c32(bits));
2633                   }
2634                } else {
2635                   assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
2636                          op.regClass() == v1b);
2637                   if (ctx.program->gfx_level >= GFX11) {
2638                      unsigned op_vgpr_byte = op.physReg().byte() + offset / 8;
2639                      unsigned sign_byte = op_vgpr_byte + bits / 8 - 1;
2640 
2641                      uint8_t swiz[4] = {4, 5, 6, 7};
2642                      swiz[dst.physReg().byte()] = op_vgpr_byte;
2643                      if (bits == 16)
2644                         swiz[dst.physReg().byte() + 1] = op_vgpr_byte + 1;
2645                      for (unsigned i = bits / 8; i < dst.bytes(); i++) {
2646                         uint8_t ext = bperm_0;
2647                         if (signext) {
2648                            if (sign_byte == 1)
2649                               ext = bperm_b1_sign;
2650                            else if (sign_byte == 3)
2651                               ext = bperm_b3_sign;
2652                            else /* replicate so sign-extension can be done later */
2653                               ext = sign_byte;
2654                         }
2655                         swiz[dst.physReg().byte() + i] = ext;
2656                      }
2657                      create_bperm(bld, swiz, dst, op);
2658 
2659                      if (signext && sign_byte != 3 && sign_byte != 1) {
2660                         assert(bits == 8);
2661                         assert(dst.regClass() == v2b || dst.regClass() == v1);
2662                         uint8_t ext_swiz[4] = {4, 5, 6, 7};
2663                         uint8_t ext = dst.physReg().byte() == 2 ? bperm_b7_sign : bperm_b5_sign;
2664                         memset(ext_swiz + dst.physReg().byte() + 1, ext, dst.bytes() - 1);
2665                         create_bperm(bld, ext_swiz, dst, Operand::zero());
2666                      }
2667                   } else {
2668                      SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa();
2669                      sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
2670                   }
2671                }
2672                break;
2673             }
2674             case aco_opcode::p_insert: {
2675                assert(instr->operands[1].isConstant());
2676                assert(instr->operands[2].isConstant());
2677                if (instr->definitions[0].regClass() == s1)
2678                   assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
2679                Definition dst = instr->definitions[0];
2680                Operand op = instr->operands[0];
2681                unsigned bits = instr->operands[2].constantValue();
2682                unsigned index = instr->operands[1].constantValue();
2683                unsigned offset = index * bits;
2684 
2685                bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
2686                if (dst.regClass() == s1) {
2687                   if (offset == (32 - bits)) {
2688                      bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
2689                               Operand::c32(offset));
2690                   } else if (offset == 0) {
2691                      bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2692                               Operand::c32(bits << 16));
2693                   } else {
2694                      bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
2695                               Operand::c32(bits << 16));
2696                      bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),
2697                               Operand(dst.physReg(), s1), Operand::c32(offset));
2698                   }
2699                } else if (dst.regClass() == v1 || !has_sdwa) {
2700                   if (offset == (dst.bytes() * 8u - bits) &&
2701                       (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
2702                      bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
2703                   } else if (offset == 0 && (dst.regClass() == v1 || program->gfx_level <= GFX7)) {
2704                      bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2705                   } else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
2706                      bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
2707                         SubdwordSel(bits / 8, offset / 8, false);
2708                   } else if (program->gfx_level >= GFX11) {
2709                      uint8_t swiz[] = {4, 5, 6, 7};
2710                      for (unsigned i = 0; i < dst.bytes(); i++)
2711                         swiz[dst.physReg().byte() + i] = bperm_0;
2712                      for (unsigned i = 0; i < bits / 8; i++)
2713                         swiz[dst.physReg().byte() + i + offset / 8] = op.physReg().byte() + i;
2714                      create_bperm(bld, swiz, dst, op);
2715                   } else {
2716                      bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
2717                      bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
2718                               Operand(dst.physReg(), v1));
2719                   }
2720                } else {
2721                   assert(dst.regClass() == v2b);
2722                   bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
2723                      ->sdwa()
2724                      .sel[1] = SubdwordSel::ubyte;
2725                }
2726                break;
2727             }
2728             case aco_opcode::p_init_scratch: {
2729                assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
2730                if (!program->config->scratch_bytes_per_wave)
2731                   break;
2732 
2733                Operand scratch_addr = instr->operands[0];
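                    /* If no address operand was given, read it from the scratch address symbols.
                     * For non-compute HW stages the operand is a pointer to the scratch base, so
                     * the actual 64-bit address is loaded first. */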
2734                if (scratch_addr.isUndefined()) {
2735                   PhysReg reg = instr->definitions[0].physReg();
2736                   bld.sop1(aco_opcode::p_load_symbol, Definition(reg, s1),
2737                            Operand::c32(aco_symbol_scratch_addr_lo));
2738                   bld.sop1(aco_opcode::p_load_symbol, Definition(reg.advance(4), s1),
2739                            Operand::c32(aco_symbol_scratch_addr_hi));
2740                   scratch_addr.setFixed(reg);
2741                } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
2742                   bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
2743                            Operand::zero());
2744                   scratch_addr.setFixed(instr->definitions[0].physReg());
2745                }
2746 
2747                hw_init_scratch(bld, instr->definitions[0], scratch_addr, instr->operands[1]);
2748                break;
2749             }
2750             case aco_opcode::p_jump_to_epilog: {
2751                if (pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx)) {
2752                   bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
2753                }
2754                bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
2755                break;
2756             }
2757             case aco_opcode::p_interp_gfx11: {
2758                assert(instr->definitions[0].regClass() == v1 ||
2759                       instr->definitions[0].regClass() == v2b);
2760                assert(instr->operands[0].regClass() == v1.as_linear());
2761                assert(instr->operands[1].isConstant());
2762                assert(instr->operands[2].isConstant());
2763                assert(instr->operands.back().physReg() == m0);
2764                Definition dst = instr->definitions[0];
2765                PhysReg lin_vgpr = instr->operands[0].physReg();
2766                unsigned attribute = instr->operands[1].constantValue();
2767                unsigned component = instr->operands[2].constantValue();
2768                uint16_t dpp_ctrl = 0;
2769                Operand coord1, coord2;
2770                if (instr->operands.size() == 6) {
2771                   assert(instr->operands[3].regClass() == v1);
2772                   assert(instr->operands[4].regClass() == v1);
2773                   coord1 = instr->operands[3];
2774                   coord2 = instr->operands[4];
2775                } else {
2776                   assert(instr->operands[3].isConstant());
2777                   dpp_ctrl = instr->operands[3].constantValue();
2778                }
2779 
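                    /* Load the packed parameter into the linear VGPR. With a constant DPP control
                     * (the 5-operand form) the value is simply moved with a DPP mov; otherwise it
                     * is interpolated with the v_interp_p10/v_interp_p2 sequence using the
                     * barycentric coordinates. */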
2780                bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
2781                           attribute, component);
2782 
2783                Operand p(lin_vgpr, v1);
2784                Operand dst_op(dst.physReg(), v1);
2785                if (instr->operands.size() == 5) {
2786                   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
2787                } else if (dst.regClass() == v2b) {
2788                   bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
2789                                     coord1, p);
2790                   bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
2791                                     coord2, dst_op);
2792                } else {
2793                   bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
2794                                     p);
2795                   bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
2796                                     dst_op);
2797                }
2798                break;
2799             }
2800             case aco_opcode::p_dual_src_export_gfx11: {
2801                PhysReg dst0 = instr->definitions[0].physReg();
2802                PhysReg dst1 = instr->definitions[1].physReg();
2803                Definition exec_tmp = instr->definitions[2];
2804                Definition not_vcc_tmp = instr->definitions[3];
2805                Definition clobber_vcc = instr->definitions[4];
2806                Definition clobber_scc = instr->definitions[5];
2807 
2808                assert(exec_tmp.regClass() == bld.lm);
2809                assert(not_vcc_tmp.regClass() == bld.lm);
2810                assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
2811                assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
2812 
2813                bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
2814                         Operand(exec, bld.lm));
2815                bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
2816                         Operand(exec, bld.lm));
2817 
2818                uint8_t enabled_channels = 0;
2819                Operand mrt0[4], mrt1[4];
2820 
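                    /* Build a lane mask with all even lanes set (0b...0101): vcc selects the even
                     * lanes, its negation the odd lanes. */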
2821                bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg(), s1),
2822                         Operand::c32(0x55555555));
2823                if (ctx.program->wave_size == 64)
2824                   bld.sop1(aco_opcode::s_mov_b32, Definition(clobber_vcc.physReg().advance(4), s1),
2825                            Operand::c32(0x55555555));
2826 
2827                Operand src_even = Operand(clobber_vcc.physReg(), bld.lm);
2828 
2829                bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even);
2830 
2831                Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm);
2832 
2833                for (unsigned i = 0; i < 4; i++) {
2834                   if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
2835                      mrt0[i] = instr->operands[i];
2836                      mrt1[i] = instr->operands[i + 4];
2837                      continue;
2838                   }
2839 
2840                   Operand src0 = instr->operands[i];
2841                   Operand src1 = instr->operands[i + 4];
2842 
2843                   /*      | even lanes | odd lanes
2844                    * mrt0 | src0 even  | src1 even
2845                    * mrt1 | src0 odd   | src1 odd
2846                    */
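                       /* dpp_row_xmask(1) makes each lane read from lane (id ^ 1), i.e. adjacent
                        * even/odd lanes swap values. */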
2847                   bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0,
2848                                src_even, dpp_row_xmask(1));
2849                   bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1,
2850                                    src_odd, dpp_row_xmask(1));
2851 
2852                   mrt0[i] = Operand(dst0, v1);
2853                   mrt1[i] = Operand(dst1, v1);
2854 
2855                   enabled_channels |= 1 << i;
2856 
2857                   dst0 = dst0.advance(4);
2858                   dst1 = dst1.advance(4);
2859                }
2860 
2861                bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
2862                         Operand(exec_tmp.physReg(), bld.lm));
2863 
2864                /* Force export all channels when everything is undefined. */
2865                if (!enabled_channels)
2866                   enabled_channels = 0xf;
2867 
2868                bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
2869                        V_008DFC_SQ_EXP_MRT + 21, false);
2870                bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
2871                        V_008DFC_SQ_EXP_MRT + 22, false);
2872                break;
2873             }
2874             case aco_opcode::p_end_with_regs: {
2875                end_with_regs_block_index = block->index;
2876                break;
2877             }
2878             default: break;
2879             }
2880          } else if (instr->isBranch()) {
2881             Pseudo_branch_instruction* branch = &instr->branch();
2882             const uint32_t target = branch->target[0];
2883             const bool uniform_branch = !((branch->opcode == aco_opcode::p_cbranch_z ||
2884                                            branch->opcode == aco_opcode::p_cbranch_nz) &&
2885                                           branch->operands[0].physReg() == exec);
2886 
2887             /* Check if the branch instruction can be removed.
2888              * This is beneficial when executing the next block with an empty exec mask
2889              * is faster than the branch instruction itself.
2890              *
2891              * Override this judgement when:
2892              * - The application prefers to remove control flow
2893              * - The compiler stack knows that it's a divergent branch always taken
2894              */
2895             const bool prefer_remove =
2896                branch->selection_control_remove && ctx.program->gfx_level >= GFX10;
2897             bool can_remove = block->index < target;
2898             unsigned num_scalar = 0;
2899             unsigned num_vector = 0;
2900 
2901             /* Check the instructions between branch and target */
2902             for (unsigned i = block->index + 1; i < branch->target[0]; i++) {
2903                /* Uniform conditional branches must not be ignored if they
2904                 * are about to jump over actual instructions */
2905                if (uniform_branch && !program->blocks[i].instructions.empty())
2906                   can_remove = false;
2907 
2908                if (!can_remove)
2909                   break;
2910 
2911                for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
2912                   if (inst->isSOPP()) {
2913                      /* Discard early exits, loop breaks, and loop continues should work fine
2914                       * with an empty exec mask.
2915                       */
2916                      bool is_break_continue =
2917                         program->blocks[i].kind & (block_kind_break | block_kind_continue);
2918                      bool discard_early_exit =
2919                         inst->sopp().block != -1 &&
2920                         (program->blocks[inst->sopp().block].kind & block_kind_discard_early_exit);
2921                      if ((inst->opcode != aco_opcode::s_cbranch_scc0 &&
2922                           inst->opcode != aco_opcode::s_cbranch_scc1) ||
2923                          (!discard_early_exit && !is_break_continue))
2924                         can_remove = false;
2925                   } else if (inst->isSALU()) {
2926                      num_scalar++;
2927                   } else if (inst->isVALU() || inst->isVINTRP()) {
2928                      if (inst->opcode == aco_opcode::v_writelane_b32 ||
2929                          inst->opcode == aco_opcode::v_writelane_b32_e64) {
2930                         /* writelane ignores exec; writing inactive lanes results in UB. */
2931                         can_remove = false;
2932                      }
2933                      num_vector++;
2934                      /* VALU instructions which write SGPRs are always executed on GFX10+ */
2935                      if (ctx.program->gfx_level >= GFX10) {
2936                         for (Definition& def : inst->definitions) {
2937                            if (def.regClass().type() == RegType::sgpr)
2938                               num_scalar++;
2939                         }
2940                      }
2941                   } else if (inst->isEXP()) {
2942                      /* Export instructions with exec=0 can hang some GFX10+ hardware (unclear on old GPUs). */
2943                      can_remove = false;
2944                   } else if (inst->isVMEM() || inst->isFlatLike() || inst->isDS() ||
2945                              inst->isLDSDIR()) {
2946                      // TODO: GFX6-9 can use vskip
2947                      can_remove = prefer_remove;
2948                   } else if (inst->isSMEM()) {
2949                      /* SMEM instructions are at least as expensive as branches */
2950                      can_remove = prefer_remove;
2951                   } else if (inst->isBarrier()) {
2952                      can_remove = prefer_remove;
2953                   } else {
2954                      can_remove = false;
2955                      assert(false && "Pseudo instructions should be lowered by this point.");
2956                   }
2957 
2958                   if (!prefer_remove) {
2959                      /* Don't remove the branch if the skipped instructions are estimated to be
2960                       * too expensive. The estimate is ignored when the shader prefers flattening.
2961                       */
2962                      unsigned est_cycles;
2963                      if (ctx.program->gfx_level >= GFX10)
2964                         est_cycles = num_scalar * 2 + num_vector;
2965                      else
2966                         est_cycles = num_scalar * 4 + num_vector * 4;
2967 
2968                      if (est_cycles > 16)
2969                         can_remove = false;
2970                   }
2971 
2972                   if (!can_remove)
2973                      break;
2974                }
2975             }
2976 
2977             if (can_remove)
2978                continue;
2979 
2980             /* emit branch instruction */
2981             switch (instr->opcode) {
2982             case aco_opcode::p_branch:
2983                assert(block->linear_succs[0] == target);
2984                bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);
2985                break;
2986             case aco_opcode::p_cbranch_nz:
2987                assert(block->linear_succs[1] == target);
2988                if (branch->operands[0].physReg() == exec)
2989                   bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);
2990                else if (branch->operands[0].physReg() == vcc)
2991                   bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);
2992                else {
2993                   assert(branch->operands[0].physReg() == scc);
2994                   bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);
2995                }
2996                break;
2997             case aco_opcode::p_cbranch_z:
2998                assert(block->linear_succs[1] == target);
2999                if (branch->operands[0].physReg() == exec)
3000                   bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);
3001                else if (branch->operands[0].physReg() == vcc)
3002                   bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);
3003                else {
3004                   assert(branch->operands[0].physReg() == scc);
3005                   bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);
3006                }
3007                break;
3008             default: unreachable("Unknown Pseudo branch instruction!");
3009             }
3010 
3011          } else if (instr->isReduction()) {
3012             Pseudo_reduction_instruction& reduce = instr->reduction();
3013             emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
3014                            reduce.operands[1].physReg(),    // tmp
3015                            reduce.definitions[1].physReg(), // stmp
3016                            reduce.operands[2].physReg(),    // vtmp
3017                            reduce.definitions[2].physReg(), // sitmp
3018                            reduce.operands[0], reduce.definitions[0]);
3019          } else if (instr->isBarrier()) {
3020             Pseudo_barrier_instruction& barrier = instr->barrier();
3021 
3022             /* Anything larger than a workgroup isn't possible. Anything
3023              * smaller requires no instructions and this pseudo instruction
3024              * would only be included to control optimizations. */
3025             bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
3026                                   program->workgroup_size > program->wave_size;
3027 
3028             bld.insert(std::move(instr));
3029             if (emit_s_barrier)
3030                bld.sopp(aco_opcode::s_barrier);
3031          } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
3032             float_mode new_mode = block->fp_mode;
3033             new_mode.round16_64 = fp_round_ne;
3034             bool set_round = new_mode.round != block->fp_mode.round;
3035 
3036             emit_set_mode(bld, new_mode, set_round, false);
3037 
3038             instr->opcode = aco_opcode::v_cvt_f16_f32;
3039             ctx.instructions.emplace_back(std::move(instr));
3040 
3041             emit_set_mode(bld, block->fp_mode, set_round, false);
3042          } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
3043             lower_image_sample(&ctx, instr);
3044             ctx.instructions.emplace_back(std::move(instr));
3045          } else {
3046             ctx.instructions.emplace_back(std::move(instr));
3047          }
3048       }
3049 
3050       /* Send the ordered section done message from this block if it's needed in this block, but
3051        * instr_after_end_idx() points beyond the end of its instructions. This may commonly happen
3052        * if the common post-dominator of multiple end locations turns out to be an empty block.
3053        */
3054       if (block_idx == pops_done_msg_bounds.end_block_idx() &&
3055           pops_done_msg_bounds.instr_after_end_idx() >= block->instructions.size()) {
3056          bld.sopp(aco_opcode::s_sendmsg, -1, sendmsg_ordered_ps_done);
3057       }
3058 
3059       block->instructions = std::move(ctx.instructions);
3060    }
3061 
3062    /* If the block with p_end_with_regs is not the last block (p_exit_early_if may have appended
3063     * an exit block after it), create an exit block for it to branch to.
3064     */
3065    int last_block_index = program->blocks.size() - 1;
3066    if (end_with_regs_block_index >= 0 && end_with_regs_block_index != last_block_index) {
3067       Block* exit_block = program->create_and_insert_block();
3068       Block* end_with_regs_block = &program->blocks[end_with_regs_block_index];
3069       exit_block->linear_preds.push_back(end_with_regs_block->index);
3070       end_with_regs_block->linear_succs.push_back(exit_block->index);
3071 
3072       Builder bld(program, end_with_regs_block);
3073       bld.sopp(aco_opcode::s_branch, exit_block->index);
3074 
3075       /* Let the insert-waitcnt pass add the waitcnt in the exit block; otherwise it would be
3076        * added after the s_branch, where it would never be executed.
3077        */
3078       end_with_regs_block->kind &= ~block_kind_end_with_regs;
3079       exit_block->kind |= block_kind_end_with_regs;
3080    }
3081 }
3082 
3083 } // namespace aco
3084