1 /*
2  * Copyright © 2018 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "aco_builder.h"
8 #include "aco_ir.h"
9 
10 #include "common/sid.h"
11 
12 #include <map>
13 #include <vector>
14 
15 namespace aco {
16 
17 namespace {
18 
19 struct lower_context {
20    Program* program;
21    Block* block;
22    std::vector<aco_ptr<Instruction>> instructions;
23 };
24 
25 /* Class for obtaining where s_sendmsg(MSG_ORDERED_PS_DONE) must be sent in a Primitive Ordered
26  * Pixel Shader on GFX9-10.3.
27  *
28  * MSG_ORDERED_PS_DONE must be sent once after the ordered section is done along all execution paths
29  * from the POPS packer ID hardware register setting to s_endpgm. It is, however, also okay to send
30  * it if the packer ID is not going to be set at all by the wave, so some conservativeness is fine.
31  *
32  * For simplicity, the message is sent from top-level blocks, where dominance and post-dominance
33  * checks for any location in the shader are trivial. Also for simplicity, it is sent regardless
34  * of whether the POPS packer ID hardware register has actually been set up.
35  *
36  * Note that there can be multiple interlock end instructions in the shader.
37  * SPV_EXT_fragment_shader_interlock requires OpEndInvocationInterlockEXT to be executed exactly
38  * once by the invocation. However, there may be, for instance, multiple ordered sections, and which
39  * one will be executed may depend on divergent control flow (some lanes may execute one ordered
40  * section, other lanes may execute another). MSG_ORDERED_PS_DONE, however, is sent via a scalar
41  * instruction, so it must be ensured that the message is sent after the last ordered section in the
42  * entire wave.
43  */
44 class gfx9_pops_done_msg_bounds {
45 public:
46    explicit gfx9_pops_done_msg_bounds() = default;
47 
48    explicit gfx9_pops_done_msg_bounds(const Program* const program)
49    {
50       /* Find the top-level location after the last ordered section end pseudo-instruction in the
51        * program.
52        * Consider `p_pops_gfx9_overlapped_wave_wait_done` a boundary too - make sure the message
53        * isn't sent if any wait hasn't been fully completed yet (if a begin-end-begin situation
54        * occurs somehow, as the location of `p_pops_gfx9_ordered_section_done` is controlled by the
55        * application) for safety, assuming that waits are the only things that need the packer
56        * hardware register to be set at some point during or before them, and it won't be set
57        * anymore after the last wait.
58        */
59       int last_top_level_block_idx = -1;
60       for (int block_idx = (int)program->blocks.size() - 1; block_idx >= 0; block_idx--) {
61          const Block& block = program->blocks[block_idx];
62          if (block.kind & block_kind_top_level) {
63             last_top_level_block_idx = block_idx;
64          }
65          for (size_t instr_idx = block.instructions.size() - 1; instr_idx + size_t(1) > 0;
66               instr_idx--) {
67             const aco_opcode opcode = block.instructions[instr_idx]->opcode;
68             if (opcode == aco_opcode::p_pops_gfx9_ordered_section_done ||
69                 opcode == aco_opcode::p_pops_gfx9_overlapped_wave_wait_done) {
70                end_block_idx_ = last_top_level_block_idx;
71                /* The same block if it's already a top-level block, or the beginning of the next
72                 * top-level block.
73                 */
74                instr_after_end_idx_ = block_idx == end_block_idx_ ? instr_idx + 1 : 0;
75                break;
76             }
77          }
78          if (end_block_idx_ != -1) {
79             break;
80          }
81       }
82    }
83 
84    /* If this is not -1, during the normal execution flow (not early exiting), MSG_ORDERED_PS_DONE
85     * must be sent in this block.
86     */
87    int end_block_idx() const { return end_block_idx_; }
88 
89    /* If end_block_idx() is an existing block, during the normal execution flow (not early exiting),
90     * MSG_ORDERED_PS_DONE must be sent before this instruction in the block end_block_idx().
91     * If this is out of the bounds of the instructions in the end block, it must be sent at the
92     * end of that block.
93     */
94    size_t instr_after_end_idx() const { return instr_after_end_idx_; }
95 
96    /* Whether an instruction doing early exit (such as discard) needs to send MSG_ORDERED_PS_DONE
97     * before actually ending the program.
98     */
99    bool early_exit_needs_done_msg(const int block_idx, const size_t instr_idx) const
100    {
101       return block_idx <= end_block_idx_ &&
102              (block_idx != end_block_idx_ || instr_idx < instr_after_end_idx_);
103    }
104 
105 private:
106    /* Initialize to an empty range for which "is inside" comparisons will fail for any
107     * block.
108     */
109    int end_block_idx_ = -1;
110    size_t instr_after_end_idx_ = 0;
111 };
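/* Illustrative usage sketch (variable names are hypothetical; the real call sites are in the
 * lowering pass later in this file):
 *
 *    gfx9_pops_done_msg_bounds pops_done(program);
 *    ...
 *    if (block_idx == pops_done.end_block_idx() && instr_idx == pops_done.instr_after_end_idx())
 *       // normal flow: send MSG_ORDERED_PS_DONE here
 *    if (pops_done.early_exit_needs_done_msg(block_idx, instr_idx))
 *       // early exit (e.g. discard): send MSG_ORDERED_PS_DONE before ending the program
 */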
112 
113 void
114 copy_constant_sgpr(Builder& bld, Definition dst, uint64_t constant)
115 {
116    if (dst.regClass() == s1) {
117       uint32_t imm = constant;
118       Operand op = Operand::get_const(bld.program->gfx_level, imm, 4);
119       if (op.isLiteral()) {
120          if (imm >= 0xffff8000 || imm <= 0x7fff) {
121             bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
122             return;
123          }
124 
125          Operand rev_op = Operand::get_const(bld.program->gfx_level, util_bitreverse(imm), 4);
126          if (!rev_op.isLiteral()) {
127             bld.sop1(aco_opcode::s_brev_b32, dst, rev_op);
128             return;
129          }
130 
131          unsigned start = (ffs(imm) - 1) & 0x1f;
132          unsigned size = util_bitcount(imm) & 0x1f;
133          if (BITFIELD_RANGE(start, size) == imm) {
134             bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
135             return;
136          }
137 
138          if (bld.program->gfx_level >= GFX9) {
139             Operand op_lo = Operand::c32(int32_t(int16_t(imm)));
140             Operand op_hi = Operand::c32(int32_t(int16_t(imm >> 16)));
141             if (!op_lo.isLiteral() && !op_hi.isLiteral()) {
142                bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op_lo, op_hi);
143                return;
144             }
145          }
146       }
147 
148       bld.sop1(aco_opcode::s_mov_b32, dst, op);
149       return;
150    }
151 
152    assert(dst.regClass() == s2);
153 
154    bool can_use_mov = Operand::is_constant_representable(constant, 8, true, false);
155    if (can_use_mov && !Operand::c64(constant).isLiteral()) {
156       bld.sop1(aco_opcode::s_mov_b64, dst, Operand::c64(constant));
157       return;
158    }
159 
160    unsigned start = (ffsll(constant) - 1) & 0x3f;
161    unsigned size = util_bitcount64(constant) & 0x3f;
162    if (BITFIELD64_RANGE(start, size) == constant) {
163       bld.sop2(aco_opcode::s_bfm_b64, dst, Operand::c32(size), Operand::c32(start));
164       return;
165    }
166 
167    uint64_t rev = ((uint64_t)util_bitreverse(constant) << 32) | util_bitreverse(constant >> 32);
168    if (Operand::is_constant_representable(rev, 8, true, false)) {
169       bld.sop1(aco_opcode::s_brev_b64, dst, Operand::c64(rev));
170       return;
171    }
172 
173    if (can_use_mov) {
174       bld.sop1(aco_opcode::s_mov_b64, dst, Operand::c64(constant));
175       return;
176    }
177 
178    uint32_t derep = 0;
179    bool can_use_rep = bld.program->gfx_level >= GFX9;
180    for (unsigned i = 0; can_use_rep && i < 32; i++) {
181       uint32_t lo = (constant >> (i * 2)) & 0x1;
182       uint32_t hi = (constant >> ((i * 2) + 1)) & 0x1;
183       can_use_rep &= lo == hi;
184       derep |= lo << i;
185    }
186    if (can_use_rep) {
187       bld.sop1(aco_opcode::s_bitreplicate_b64_b32, dst, Operand::c32(derep));
188       return;
189    }
190 
191    copy_constant_sgpr(bld, Definition(dst.physReg(), s1), (uint32_t)constant);
192    copy_constant_sgpr(bld, Definition(dst.physReg().advance(4), s1), constant >> 32);
193 }
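/* Worked examples for the strategies above (illustrative only):
 * - copy_constant_sgpr(bld, Definition(dst, s1), 0x00ffff00): not an inline constant and out of
 *   s_movk_i32 range, but a contiguous run of 16 set bits starting at bit 8, so it becomes
 *   "s_bfm_b32 dst, 16, 8" instead of an s_mov_b32 with a 32-bit literal.
 * - copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'0000'ffff'0000ull) (a wave64 exec mask
 *   used by the reductions below): every even/odd bit pair is equal, so on GFX9+ this becomes
 *   "s_bitreplicate_b64_b32 exec, 0xff00ff00"; on older chips it falls back to two s_bfm_b32.
 */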
194 
195 /* used by handle_operands() indirectly through Builder::copy */
196 uint8_t int8_mul_table[512] = {
197    0, 20,  1,  1,   1,  2,   1,  3,   1,  4,   1, 5,   1,  6,   1,  7,   1,  8,   1,  9,
198    1, 10,  1,  11,  1,  12,  1,  13,  1,  14,  1, 15,  1,  16,  1,  17,  1,  18,  1,  19,
199    1, 20,  1,  21,  1,  22,  1,  23,  1,  24,  1, 25,  1,  26,  1,  27,  1,  28,  1,  29,
200    1, 30,  1,  31,  1,  32,  1,  33,  1,  34,  1, 35,  1,  36,  1,  37,  1,  38,  1,  39,
201    1, 40,  1,  41,  1,  42,  1,  43,  1,  44,  1, 45,  1,  46,  1,  47,  1,  48,  1,  49,
202    1, 50,  1,  51,  1,  52,  1,  53,  1,  54,  1, 55,  1,  56,  1,  57,  1,  58,  1,  59,
203    1, 60,  1,  61,  1,  62,  1,  63,  1,  64,  5, 13,  2,  33,  17, 19,  2,  34,  3,  23,
204    2, 35,  11, 53,  2,  36,  7,  47,  2,  37,  3, 25,  2,  38,  7,  11,  2,  39,  53, 243,
205    2, 40,  3,  27,  2,  41,  17, 35,  2,  42,  5, 17,  2,  43,  3,  29,  2,  44,  15, 23,
206    2, 45,  7,  13,  2,  46,  3,  31,  2,  47,  5, 19,  2,  48,  19, 59,  2,  49,  3,  33,
207    2, 50,  7,  51,  2,  51,  15, 41,  2,  52,  3, 35,  2,  53,  11, 33,  2,  54,  23, 27,
208    2, 55,  3,  37,  2,  56,  9,  41,  2,  57,  5, 23,  2,  58,  3,  39,  2,  59,  7,  17,
209    2, 60,  9,  241, 2,  61,  3,  41,  2,  62,  5, 25,  2,  63,  35, 245, 2,  64,  3,  43,
210    5, 26,  9,  43,  3,  44,  7,  19,  10, 39,  3, 45,  4,  34,  11, 59,  3,  46,  9,  243,
211    4, 35,  3,  47,  22, 53,  7,  57,  3,  48,  5, 29,  10, 245, 3,  49,  4,  37,  9,  45,
212    3, 50,  7,  241, 4,  38,  3,  51,  7,  22,  5, 31,  3,  52,  7,  59,  7,  242, 3,  53,
213    4, 40,  7,  23,  3,  54,  15, 45,  4,  41,  3, 55,  6,  241, 9,  47,  3,  56,  13, 13,
214    5, 34,  3,  57,  4,  43,  11, 39,  3,  58,  5, 35,  4,  44,  3,  59,  6,  243, 7,  245,
215    3, 60,  5,  241, 7,  26,  3,  61,  4,  46,  5, 37,  3,  62,  11, 17,  4,  47,  3,  63,
216    5, 38,  5,  243, 3,  64,  7,  247, 9,  50,  5, 39,  4,  241, 33, 37,  6,  33,  13, 35,
217    4, 242, 5,  245, 6,  247, 7,  29,  4,  51,  5, 41,  5,  246, 7,  249, 3,  240, 11, 19,
218    5, 42,  3,  241, 4,  245, 25, 29,  3,  242, 5, 43,  4,  246, 3,  243, 17, 58,  17, 43,
219    3, 244, 5,  249, 6,  37,  3,  245, 2,  240, 5, 45,  2,  241, 21, 23,  2,  242, 3,  247,
220    2, 243, 5,  251, 2,  244, 29, 61,  2,  245, 3, 249, 2,  246, 17, 29,  2,  247, 9,  55,
221    1, 240, 1,  241, 1,  242, 1,  243, 1,  244, 1, 245, 1,  246, 1,  247, 1,  248, 1,  249,
222    1, 250, 1,  251, 1,  252, 1,  253, 1,  254, 1, 255};
223 
224 aco_opcode
225 get_reduce_opcode(amd_gfx_level gfx_level, ReduceOp op)
226 {
227    /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
228     * 32-bit opcodes (VOP2), which allows removing the temporary VGPR and using
229     * DPP with the arithmetic instructions. This requires sign-extension.
230     */
231    switch (op) {
232    case iadd8:
233    case iadd16:
234       if (gfx_level >= GFX10) {
235          return aco_opcode::v_add_u32;
236       } else if (gfx_level >= GFX8) {
237          return aco_opcode::v_add_u16;
238       } else {
239          return aco_opcode::v_add_co_u32;
240       }
241       break;
242    case imul8:
243    case imul16:
244       if (gfx_level >= GFX10) {
245          return aco_opcode::v_mul_lo_u16_e64;
246       } else if (gfx_level >= GFX8) {
247          return aco_opcode::v_mul_lo_u16;
248       } else {
249          return aco_opcode::v_mul_u32_u24;
250       }
251       break;
252    case fadd16: return aco_opcode::v_add_f16;
253    case fmul16: return aco_opcode::v_mul_f16;
254    case imax8:
255    case imax16:
256       if (gfx_level >= GFX10) {
257          return aco_opcode::v_max_i32;
258       } else if (gfx_level >= GFX8) {
259          return aco_opcode::v_max_i16;
260       } else {
261          return aco_opcode::v_max_i32;
262       }
263       break;
264    case imin8:
265    case imin16:
266       if (gfx_level >= GFX10) {
267          return aco_opcode::v_min_i32;
268       } else if (gfx_level >= GFX8) {
269          return aco_opcode::v_min_i16;
270       } else {
271          return aco_opcode::v_min_i32;
272       }
273       break;
274    case umin8:
275    case umin16:
276       if (gfx_level >= GFX10) {
277          return aco_opcode::v_min_u32;
278       } else if (gfx_level >= GFX8) {
279          return aco_opcode::v_min_u16;
280       } else {
281          return aco_opcode::v_min_u32;
282       }
283       break;
284    case umax8:
285    case umax16:
286       if (gfx_level >= GFX10) {
287          return aco_opcode::v_max_u32;
288       } else if (gfx_level >= GFX8) {
289          return aco_opcode::v_max_u16;
290       } else {
291          return aco_opcode::v_max_u32;
292       }
293       break;
294    case fmin16: return aco_opcode::v_min_f16;
295    case fmax16: return aco_opcode::v_max_f16;
296    case iadd32: return gfx_level >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
297    case imul32: return aco_opcode::v_mul_lo_u32;
298    case fadd32: return aco_opcode::v_add_f32;
299    case fmul32: return aco_opcode::v_mul_f32;
300    case imax32: return aco_opcode::v_max_i32;
301    case imin32: return aco_opcode::v_min_i32;
302    case umin32: return aco_opcode::v_min_u32;
303    case umax32: return aco_opcode::v_max_u32;
304    case fmin32: return aco_opcode::v_min_f32;
305    case fmax32: return aco_opcode::v_max_f32;
306    case iand8:
307    case iand16:
308    case iand32: return aco_opcode::v_and_b32;
309    case ixor8:
310    case ixor16:
311    case ixor32: return aco_opcode::v_xor_b32;
312    case ior8:
313    case ior16:
314    case ior32: return aco_opcode::v_or_b32;
315    case iadd64: return aco_opcode::num_opcodes;
316    case imul64: return aco_opcode::num_opcodes;
317    case fadd64: return aco_opcode::v_add_f64_e64;
318    case fmul64: return aco_opcode::v_mul_f64_e64;
319    case imin64: return aco_opcode::num_opcodes;
320    case imax64: return aco_opcode::num_opcodes;
321    case umin64: return aco_opcode::num_opcodes;
322    case umax64: return aco_opcode::num_opcodes;
323    case fmin64: return aco_opcode::v_min_f64_e64;
324    case fmax64: return aco_opcode::v_max_f64_e64;
325    case iand64: return aco_opcode::num_opcodes;
326    case ior64: return aco_opcode::num_opcodes;
327    case ixor64: return aco_opcode::num_opcodes;
328    default: return aco_opcode::num_opcodes;
329    }
330 }
331 
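/* aco_opcode::num_opcodes is used as a sentinel meaning "no single VALU opcode exists for this
 * reduction" - that is the case for the 64-bit integer reductions, which emit_op()/emit_dpp_op()
 * below expand via emit_int64_op()/emit_int64_dpp_op() instead.
 */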
332 bool
333 is_vop3_reduce_opcode(aco_opcode opcode)
334 {
335    /* 64-bit reductions are VOP3. */
336    if (opcode == aco_opcode::num_opcodes)
337       return true;
338 
339    return instr_info.format[(int)opcode] == Format::VOP3;
340 }
341 
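/* Small wrapper around Builder::vadd32, which picks between carry-less and carry-writing 32-bit
 * adds depending on the target: if a carry definition was created, pin it to vcc since no SGPR
 * has been allocated for it here.
 */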
342 void
343 emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
344 {
345    Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
346    if (instr->definitions.size() >= 2) {
347       assert(instr->definitions[1].regClass() == bld.lm);
348       instr->definitions[1].setFixed(vcc);
349    }
350 }
351 
352 void
353 emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
354                   PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
355                   unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
356 {
357    Builder bld(ctx->program, &ctx->instructions);
358    Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
359    Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
360    Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
361    Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
362    Operand src1_64 = Operand(src1_reg, v2);
363    Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
364    Operand vtmp_op64 = Operand(vtmp_reg, v2);
365    if (op == iadd64) {
366       if (ctx->program->gfx_level >= GFX10) {
367          if (identity)
368             bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
369          bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
370                       bound_ctrl);
371          bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], Definition(vcc, bld.lm), vtmp_op[0],
372                   src1[0]);
373       } else {
374          bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], Definition(vcc, bld.lm), src0[0], src1[0],
375                       dpp_ctrl, row_mask, bank_mask, bound_ctrl);
376       }
377       bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], Definition(vcc, bld.lm), src0[1], src1[1],
378                    Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
379    } else if (op == iand64) {
380       bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
381                    bound_ctrl);
382       bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
383                    bound_ctrl);
384    } else if (op == ior64) {
385       bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
386                    bound_ctrl);
387       bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
388                    bound_ctrl);
389    } else if (op == ixor64) {
390       bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
391                    bound_ctrl);
392       bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
393                    bound_ctrl);
394    } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
395       aco_opcode cmp = aco_opcode::num_opcodes;
396       switch (op) {
397       case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
398       case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
399       case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
400       case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
401       default: break;
402       }
403 
404       if (identity) {
405          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
406          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
407       }
408       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
409                    bound_ctrl);
410       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
411                    bound_ctrl);
412 
413       bld.vopc(cmp, Definition(vcc, bld.lm), vtmp_op64, src1_64);
414       bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
415       bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
416    } else if (op == imul64) {
417       /* t4 = dpp(x_hi)
418        * t1 = umul_lo(t4, y_lo)
419        * t3 = dpp(x_lo)
420        * t0 = umul_lo(t3, y_hi)
421        * t2 = iadd(t0, t1)
422        * t5 = umul_hi(t3, y_lo)
423        * res_hi = iadd(t2, t5)
424        * res_lo = umul_lo(t3, y_lo)
425        * Requires that res_hi != src0[0] and res_hi != src1[0]
426        * and that vtmp[0] != res_hi.
427        */
428       if (identity)
429          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
430       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
431                    bound_ctrl);
432       bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
433       if (identity)
434          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
435       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
436                    bound_ctrl);
437       bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
438       emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
439       if (identity)
440          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
441       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
442                    bound_ctrl);
443       bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
444       emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
445       if (identity)
446          bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
447       bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
448                    bound_ctrl);
449       bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
450    }
451 }
452 
453 void
454 emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
455               ReduceOp op)
456 {
457    Builder bld(ctx->program, &ctx->instructions);
458    Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
459    RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
460    Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
461    Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
462    Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
463    Operand src1_64 = Operand(src1_reg, v2);
464 
465    if (src0_rc == s1 &&
466        (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
467       assert(vtmp.reg() != 0);
468       bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
469       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
470       src0_reg = vtmp;
471       src0[0] = Operand(vtmp, v1);
472       src0[1] = Operand(PhysReg{vtmp + 1}, v1);
473       src0_64 = Operand(vtmp, v2);
474    } else if (src0_rc == s1 && op == iadd64) {
475       assert(vtmp.reg() != 0);
476       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
477       src0[1] = Operand(PhysReg{vtmp + 1}, v1);
478    }
479 
480    if (op == iadd64) {
481       if (ctx->program->gfx_level >= GFX10) {
482          bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], Definition(vcc, bld.lm), src0[0], src1[0]);
483       } else {
484          bld.vop2(aco_opcode::v_add_co_u32, dst[0], Definition(vcc, bld.lm), src0[0], src1[0]);
485       }
486       bld.vop2(aco_opcode::v_addc_co_u32, dst[1], Definition(vcc, bld.lm), src0[1], src1[1],
487                Operand(vcc, bld.lm));
488    } else if (op == iand64) {
489       bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
490       bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
491    } else if (op == ior64) {
492       bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
493       bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
494    } else if (op == ixor64) {
495       bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
496       bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
497    } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
498       aco_opcode cmp = aco_opcode::num_opcodes;
499       switch (op) {
500       case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
501       case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
502       case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
503       case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
504       default: break;
505       }
506 
507       bld.vopc(cmp, Definition(vcc, bld.lm), src0_64, src1_64);
508       bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
509       bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
510    } else if (op == imul64) {
511       if (src1_reg == dst_reg) {
512          /* it's fine if src0==dst but not if src1==dst */
513          std::swap(src0_reg, src1_reg);
514          std::swap(src0[0], src1[0]);
515          std::swap(src0[1], src1[1]);
516          std::swap(src0_64, src1_64);
517       }
518       assert(!(src0_reg == src1_reg));
519       /* t1 = umul_lo(x_hi, y_lo)
520        * t0 = umul_lo(x_lo, y_hi)
521        * t2 = iadd(t0, t1)
522        * t5 = umul_hi(x_lo, y_lo)
523        * res_hi = iadd(t2, t5)
524        * res_lo = umul_lo(x_lo, y_lo)
525        * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
526        */
527       Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
528       Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
529       Operand tmp0_op = src0[1];
530       Operand tmp1_op = src1[1];
531       bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
532       bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
533       emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
534       bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
535       emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
536       bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
537    }
538 }
539 
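/* If the reduction maps to a VOP2 opcode, the DPP swizzle is applied directly to the arithmetic
 * instruction. VOP3 opcodes can't take DPP, so the swizzled source is first copied into vtmp with
 * v_mov_b32 (dpp) and the VOP3 instruction then reads vtmp; when an identity is given, vtmp is
 * preloaded with it so lanes that the DPP read leaves unwritten still combine correctly.
 */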
540 void
541 emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
542             ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
543             bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
544 {
545    Builder bld(ctx->program, &ctx->instructions);
546    RegClass rc = RegClass(RegType::vgpr, size);
547    Definition dst(dst_reg, rc);
548    Operand src0(src0_reg, rc);
549    Operand src1(src1_reg, rc);
550 
551    aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
552    bool vop3 = is_vop3_reduce_opcode(opcode);
553 
554    if (!vop3) {
555       if (opcode == aco_opcode::v_add_co_u32)
556          bld.vop2_dpp(opcode, dst, Definition(vcc, bld.lm), src0, src1, dpp_ctrl, row_mask,
557                       bank_mask, bound_ctrl);
558       else
559          bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
560       return;
561    }
562 
563    if (opcode == aco_opcode::num_opcodes) {
564       emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
565                         bound_ctrl, identity);
566       return;
567    }
568 
569    if (identity)
570       bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
571    if (identity && size >= 2)
572       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);
573 
574    for (unsigned i = 0; i < size; i++)
575       bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
576                    Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
577 
578    bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
579 }
580 
581 void
582 emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
583         ReduceOp op, unsigned size)
584 {
585    Builder bld(ctx->program, &ctx->instructions);
586    RegClass rc = RegClass(RegType::vgpr, size);
587    Definition dst(dst_reg, rc);
588    Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
589    Operand src1(src1_reg, rc);
590 
591    aco_opcode opcode = get_reduce_opcode(ctx->program->gfx_level, op);
592    bool vop3 = is_vop3_reduce_opcode(opcode);
593 
594    if (opcode == aco_opcode::num_opcodes) {
595       emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
596       return;
597    }
598 
599    if (vop3) {
600       bld.vop3(opcode, dst, src0, src1);
601    } else if (opcode == aco_opcode::v_add_co_u32) {
602       bld.vop2(opcode, dst, Definition(vcc, bld.lm), src0, src1);
603    } else {
604       bld.vop2(opcode, dst, src0, src1);
605    }
606 }
607 
608 void
609 emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
610              unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
611 {
612    Builder bld(ctx->program, &ctx->instructions);
613    for (unsigned i = 0; i < size; i++) {
614       bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
615                    Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
616    }
617 }
618 
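/* ds_pattern is the DS_SWIZZLE_B32 offset field: with bit 15 set it selects quad-perm mode (the
 * low 8 bits hold four 2-bit lane selects applied within each group of 4 lanes, see
 * dpp_quad_perm()); with bit 15 clear it is bit-masked mode, where the source lane is computed as
 * ((lane & and_mask) | or_mask) ^ xor_mask, see ds_pattern_bitmode(). (Summary of the swizzle
 * encoding; consult the ISA documentation for the authoritative definition.)
 */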
619 void
620 emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
621 {
622    for (unsigned i = 0; i < size; i++) {
623       bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
624              Operand(PhysReg{src + i}, v1), ds_pattern);
625    }
626 }
627 
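/* Expands the p_reduce/p_inclusive_scan/p_exclusive_scan pseudo-instructions. tmp is the VGPR
 * accumulator holding the partial results, stmp holds the saved exec mask, vtmp is VGPR scratch
 * for DPP/swizzle copies, and sitmp is SGPR scratch for identities and readlane results.
 */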
628 void
629 emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
630                PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
631 {
632    assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
633    assert(cluster_size <= ctx->program->wave_size);
634 
635    Builder bld(ctx->program, &ctx->instructions);
636 
637    Operand identity[2];
638    identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
639    identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
640    Operand vcndmask_identity[2] = {identity[0], identity[1]};
641 
642    /* First, copy the source to tmp and set inactive lanes to the identity */
643    bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
644             Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));
645 
646    /* On GFX10+ v_writelane_b32/v_cndmask_b32_e64 can take a literal */
647    if (ctx->program->gfx_level < GFX10) {
648       for (unsigned i = 0; i < src.size(); i++) {
649          /* p_exclusive_scan uses identity for v_writelane_b32 */
650          if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan) {
651             bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
652             identity[i] = Operand(PhysReg{sitmp + i}, s1);
653 
654             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
655             vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
656          } else if (identity[i].isLiteral()) {
657             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
658             vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
659          }
660       }
661    }
662 
663    for (unsigned i = 0; i < src.size(); i++) {
664       bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
665                    vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
666                    Operand(stmp, bld.lm));
667    }
668 
669    if (reduce_op == iadd8 || reduce_op == imul8 || reduce_op == imax8 || reduce_op == imin8 ||
670        reduce_op == umin8 || reduce_op == umax8 || reduce_op == ixor8 || reduce_op == ior8 ||
671        reduce_op == iand8) {
672       if (ctx->program->gfx_level >= GFX8 && ctx->program->gfx_level < GFX11) {
673          aco_ptr<Instruction> sdwa{
674             create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
675          sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
676          sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
677          bool sext = reduce_op == imin8 || reduce_op == imax8;
678          sdwa->sdwa().sel[0] = SubdwordSel(1, 0, sext);
679          sdwa->sdwa().dst_sel = SubdwordSel::dword;
680          bld.insert(std::move(sdwa));
681       } else {
682          aco_opcode opcode;
683 
684          if (reduce_op == imin8 || reduce_op == imax8)
685             opcode = aco_opcode::v_bfe_i32;
686          else
687             opcode = aco_opcode::v_bfe_u32;
688 
689          bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
690                   Operand::c32(8u));
691       }
692    } else if (reduce_op == iadd16 || reduce_op == imul16 || reduce_op == imax16 ||
693               reduce_op == imin16 || reduce_op == umin16 || reduce_op == umax16 ||
694               reduce_op == ixor16 || reduce_op == ior16 || reduce_op == iand16 ||
695               reduce_op == fadd16 || reduce_op == fmul16 || reduce_op == fmin16 ||
696               reduce_op == fmax16) {
697       bool is_add_cmp = reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
698                         reduce_op == umin16 || reduce_op == umax16;
699       if (ctx->program->gfx_level >= GFX10 && ctx->program->gfx_level < GFX11 && is_add_cmp) {
700          aco_ptr<Instruction> sdwa{
701             create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
702          sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
703          sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
704          bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
705          sdwa->sdwa().sel[0] = SubdwordSel(2, 0, sext);
706          sdwa->sdwa().dst_sel = SubdwordSel::dword;
707          bld.insert(std::move(sdwa));
708       } else if (ctx->program->gfx_level <= GFX7 ||
709                  (ctx->program->gfx_level >= GFX11 && is_add_cmp)) {
710          aco_opcode opcode;
711 
712          if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
713             opcode = aco_opcode::v_bfe_i32;
714          else
715             opcode = aco_opcode::v_bfe_u32;
716 
717          bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
718                   Operand::c32(16u));
719       }
720    }
721 
722    bool reduction_needs_last_op = false;
723    switch (op) {
724    case aco_opcode::p_reduce:
725       if (cluster_size == 1)
726          break;
727 
728       if (ctx->program->gfx_level <= GFX7) {
729          reduction_needs_last_op = true;
730          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
731          if (cluster_size == 2)
732             break;
733          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
734          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
735          if (cluster_size == 4)
736             break;
737          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
738          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
739          if (cluster_size == 8)
740             break;
741          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
742          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
743          if (cluster_size == 16)
744             break;
745          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
746          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
747          if (cluster_size == 32)
748             break;
749          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
750          for (unsigned i = 0; i < src.size(); i++)
751             bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
752                          Operand::zero());
753          // TODO: it would be more effective to do the last reduction step on SALU
754          emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
755          reduction_needs_last_op = false;
756          break;
757       }
758 
759       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
760                   0xf, false);
761       if (cluster_size == 2)
762          break;
763       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
764                   0xf, false);
765       if (cluster_size == 4)
766          break;
767       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
768                   false);
769       if (cluster_size == 8)
770          break;
771       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
772       if (cluster_size == 16)
773          break;
774 
775       if (ctx->program->gfx_level >= GFX10) {
776          /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
777          for (unsigned i = 0; i < src.size(); i++)
778             bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
779                      Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());
780 
781          if (cluster_size == 32) {
782             reduction_needs_last_op = true;
783             break;
784          }
785 
786          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
787          for (unsigned i = 0; i < src.size(); i++)
788             bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
789                          Operand::zero());
790          // TODO: it would be more effective to do the last reduction step on SALU
791          emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
792          break;
793       }
794 
795       if (cluster_size == 32) {
796          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
797          reduction_needs_last_op = true;
798          break;
799       }
800       assert(cluster_size == 64);
801       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
802                   false);
803       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
804                   false);
805       break;
806    case aco_opcode::p_exclusive_scan:
807       if (ctx->program->gfx_level >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
808          /* shift rows right */
809          emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);
810 
811          /* fill in the gaps in rows 1 and 3 */
812          copy_constant_sgpr(bld, Definition(exec, bld.lm), 0x0001'0000'0001'0000ull);
813          for (unsigned i = 0; i < src.size(); i++) {
814             Instruction* perm =
815                bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
816                         Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
817                         Operand::c32(0xffffffffu))
818                   .instr;
819             perm->valu().opsel = 1; /* FI (Fetch Inactive) */
820          }
821          copy_constant_sgpr(bld, Definition(exec, bld.lm), UINT64_MAX);
822 
823          if (ctx->program->wave_size == 64) {
824             /* fill in the gap in row 2 */
825             for (unsigned i = 0; i < src.size(); i++) {
826                bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
827                             Operand::c32(31u));
828                bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
829                              Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
830             }
831          }
832          std::swap(tmp, vtmp);
833       } else if (ctx->program->gfx_level >= GFX8) {
834          emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
835       } else {
836          // TODO: use LDS on CS with a single write and shifted read
837          /* wavefront shift_right by 1 on SI/CI */
838          emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
839          emit_ds_swizzle(bld, tmp, tmp, src.size(),
840                          ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
841          copy_constant_sgpr(bld, Definition(exec, s2), 0x1010'1010'1010'1010ull);
842          for (unsigned i = 0; i < src.size(); i++)
843             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
844                      Operand(PhysReg{tmp + i}, v1));
845 
846          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
847          emit_ds_swizzle(bld, tmp, tmp, src.size(),
848                          ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
849          copy_constant_sgpr(bld, Definition(exec, s2), 0x0100'0100'0100'0100ull);
850          for (unsigned i = 0; i < src.size(); i++)
851             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
852                      Operand(PhysReg{tmp + i}, v1));
853 
854          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
855          emit_ds_swizzle(bld, tmp, tmp, src.size(),
856                          ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
857          copy_constant_sgpr(bld, Definition(exec, s2), 0x0001'0000'0001'0000ull);
858          for (unsigned i = 0; i < src.size(); i++)
859             bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
860                      Operand(PhysReg{tmp + i}, v1));
861 
862          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
863          for (unsigned i = 0; i < src.size(); i++) {
864             bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
865                           Operand(PhysReg{vtmp + i}, v1));
866             bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
867                          Operand::zero());
868             bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
869                           Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
870             identity[i] = Operand::zero(); /* prevent further uses of identity */
871          }
872          std::swap(tmp, vtmp);
873       }
874 
875       for (unsigned i = 0; i < src.size(); i++) {
876          if (!identity[i].isConstant() ||
877              identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
878             if (ctx->program->gfx_level < GFX10)
879                assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
880                       identity[i].physReg() == PhysReg{sitmp + i});
881             bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
882                           Operand(PhysReg{tmp + i}, v1));
883          }
884       }
885       FALLTHROUGH;
886    case aco_opcode::p_inclusive_scan:
887       assert(cluster_size == ctx->program->wave_size);
888       if (ctx->program->gfx_level <= GFX7) {
889          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
890          copy_constant_sgpr(bld, Definition(exec, s2), 0xaaaa'aaaa'aaaa'aaaaull);
891          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
892 
893          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
894          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
895          copy_constant_sgpr(bld, Definition(exec, s2), 0xcccc'cccc'cccc'ccccull);
896          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
897 
898          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
899          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
900          copy_constant_sgpr(bld, Definition(exec, s2), 0xf0f0'f0f0'f0f0'f0f0ull);
901          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
902 
903          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
904          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
905          copy_constant_sgpr(bld, Definition(exec, s2), 0xff00'ff00'ff00'ff00ull);
906          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
907 
908          copy_constant_sgpr(bld, Definition(exec, s2), UINT64_MAX);
909          emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
910          copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'0000'ffff'0000ull);
911          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
912 
913          for (unsigned i = 0; i < src.size(); i++)
914             bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
915                          Operand::c32(31u));
916          copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
917          emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
918          break;
919       }
920 
921       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
922                   identity);
923       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
924                   identity);
925       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
926                   identity);
927       emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
928                   identity);
929       if (ctx->program->gfx_level >= GFX10) {
930          copy_constant_sgpr(bld, Definition(exec, bld.lm), 0xffff'0000'ffff'0000ull);
931          for (unsigned i = 0; i < src.size(); i++) {
932             Instruction* perm =
933                bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
934                         Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
935                         Operand::c32(0xffffffffu))
936                   .instr;
937             perm->valu().opsel = 1; /* FI (Fetch Inactive) */
938          }
939          emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
940 
941          if (ctx->program->wave_size == 64) {
942             copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
943             for (unsigned i = 0; i < src.size(); i++)
944                bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
945                             Operand::c32(31u));
946             emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
947          }
948       } else {
949          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
950                      false, identity);
951          emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
952                      false, identity);
953       }
954       break;
955    default: unreachable("Invalid reduction mode");
956    }
957 
958    if (op == aco_opcode::p_reduce) {
959       if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
960          bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
961          emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
962          return;
963       }
964 
965       if (reduction_needs_last_op)
966          emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
967    }
968 
969    /* restore exec */
970    bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
971 
972    if (dst.regClass().type() == RegType::sgpr) {
973       for (unsigned k = 0; k < src.size(); k++) {
974          bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
975                       Operand::c32(ctx->program->wave_size - 1));
976       }
977    } else if (dst.physReg() != tmp) {
978       for (unsigned k = 0; k < src.size(); k++) {
979          bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
980                   Operand(PhysReg{tmp + k}, v1));
981       }
982    }
983 }
984 
985 void
986 adjust_bpermute_dst(Builder& bld, Definition dst, Operand input_data)
987 {
988    /* RA assumes that the result is always in the low part of the register, so we have to shift
989     * if it's not there already.
990     */
991    if (input_data.physReg().byte()) {
992       unsigned right_shift = input_data.physReg().byte() * 8;
993       bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
994                Operand(dst.physReg(), dst.regClass()));
995    }
996 }
997 
998 void
999 emit_bpermute_permlane(Builder& bld, aco_ptr<Instruction>& instr)
1000 {
1001    /* Emulates proper bpermute on GFX11 in wave64 mode.
1002     *
1003     * Similar to emit_gfx10_wave64_bpermute, but uses the new
1004     * v_permlane64_b32 instruction to swap data between lo and hi halves.
1005     */
1006 
1007    assert(bld.program->gfx_level >= GFX11);
1008    assert(bld.program->wave_size == 64);
1009 
1010    Definition dst = instr->definitions[0];
1011    Definition tmp_exec = instr->definitions[1];
1012    Definition clobber_scc = instr->definitions[2];
1013    Operand tmp_op = instr->operands[0];
1014    Operand index_x4 = instr->operands[1];
1015    Operand input_data = instr->operands[2];
1016    Operand same_half = instr->operands[3];
1017 
1018    assert(dst.regClass() == v1);
1019    assert(tmp_exec.regClass() == bld.lm);
1020    assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1021    assert(same_half.regClass() == bld.lm);
1022    assert(tmp_op.regClass() == v1.as_linear());
1023    assert(index_x4.regClass() == v1);
1024    assert(input_data.regClass().type() == RegType::vgpr);
1025    assert(input_data.bytes() <= 4);
1026 
1027    Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());
1028 
1029    /* Permute the input within the same half-wave. */
1030    bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1031 
1032    /* Save EXEC and enable all lanes. */
1033    bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
1034             Operand::c32(-1u), Operand(exec, s2));
1035 
1036    /* Copy input data from other half to current half's linear VGPR. */
1037    bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);
1038 
1039    /* Permute the input from the other half-wave, write to linear VGPR. */
1040    bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);
1041 
1042    /* Restore saved EXEC. */
1043    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1044 
1045    /* Select correct permute result. */
1046    bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
1047                 same_half);
1048 
1049    adjust_bpermute_dst(bld, dst, input_data);
1050 }
1051 
1052 void
1053 emit_bpermute_shared_vgpr(Builder& bld, aco_ptr<Instruction>& instr)
1054 {
1055    /* Emulates proper bpermute on GFX10 in wave64 mode.
1056     *
1057     * This is necessary because on GFX10 the bpermute instruction only works
1058     * on half waves (you can think of it as having a cluster size of 32), so we
1059     * manually swap the data between the two halves using two shared VGPRs.
1060     */
1061 
1062    assert(bld.program->gfx_level >= GFX10 && bld.program->gfx_level <= GFX10_3);
1063    assert(bld.program->wave_size == 64);
1064 
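   /* Shared VGPRs are addressed past the wave's regularly allocated VGPRs, so the first one sits
    * at the (aligned) allocated count; the +256 is because PhysReg values of 256 and up denote
    * VGPRs in ACO. */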
1065    unsigned shared_vgpr_reg_0 = align(bld.program->config->num_vgprs, 4) + 256;
1066    Definition dst = instr->definitions[0];
1067    Definition tmp_exec = instr->definitions[1];
1068    Definition clobber_scc = instr->definitions[2];
1069    Operand index_x4 = instr->operands[0];
1070    Operand input_data = instr->operands[1];
1071    Operand same_half = instr->operands[2];
1072 
1073    assert(dst.regClass() == v1);
1074    assert(tmp_exec.regClass() == bld.lm);
1075    assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
1076    assert(same_half.regClass() == bld.lm);
1077    assert(index_x4.regClass() == v1);
1078    assert(input_data.regClass().type() == RegType::vgpr);
1079    assert(input_data.bytes() <= 4);
1080    assert(dst.physReg() != index_x4.physReg());
1081    assert(dst.physReg() != input_data.physReg());
1082    assert(tmp_exec.physReg() != same_half.physReg());
1083 
1084    PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
1085    PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);
1086 
1087    /* Permute the input within the same half-wave */
1088    bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
1089 
1090    /* HI: Copy data from high lanes 32-63 to shared vgpr */
1091    bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
1092                 dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
1093    /* Save EXEC */
1094    bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
1095    /* Set EXEC to enable LO lanes only */
1096    copy_constant_sgpr(bld, Definition(exec, s2), 0x0000'0000'ffff'ffffull);
1097    /* LO: Copy data from low lanes 0-31 to shared vgpr */
1098    bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
1099    /* LO: bpermute shared vgpr (high lanes' data) */
1100    bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
1101           Operand(shared_vgpr_hi, v1));
1102    /* Set EXEC to enable HI lanes only */
1103    copy_constant_sgpr(bld, Definition(exec, s2), 0xffff'ffff'0000'0000ull);
1104    /* HI: bpermute shared vgpr (low lanes' data) */
1105    bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
1106           Operand(shared_vgpr_lo, v1));
1107 
1108    /* Only enable lanes which use the other half's data */
1109    bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
1110             Operand(tmp_exec.physReg(), s2), same_half);
1111    /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
1112    bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
1113                 0x3, 0xf, false);
1114    /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
1115    bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
1116                 0xc, 0xf, false);
1117 
1118    /* Restore saved EXEC */
1119    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
1120 
1121    adjust_bpermute_dst(bld, dst, input_data);
1122 }
1123 
1124 void
1125 emit_bpermute_readlane(Builder& bld, aco_ptr<Instruction>& instr)
1126 {
1127    /* Emulates bpermute using readlane instructions */
1128 
1129    Operand index = instr->operands[0];
1130    Operand input = instr->operands[1];
1131    Definition dst = instr->definitions[0];
1132    Definition temp_exec = instr->definitions[1];
1133    Definition clobber_vcc = instr->definitions[2];
1134 
1135    assert(dst.regClass() == v1);
1136    assert(temp_exec.regClass() == bld.lm);
1137    assert(clobber_vcc.regClass() == bld.lm);
1138    assert(clobber_vcc.physReg() == vcc);
1139    assert(index.regClass() == v1);
1140    assert(index.physReg() != dst.physReg());
1141    assert(input.regClass().type() == RegType::vgpr);
1142    assert(input.bytes() <= 4);
1143    assert(input.physReg() != dst.physReg());
1144 
1145    /* Save original EXEC */
1146    bld.sop1(Builder::s_mov, temp_exec, Operand(exec, bld.lm));
1147 
1148    /* An "unrolled loop" that is executed once per lane.
1149     * This takes only a few instructions per lane, as opposed to a "real" loop
1150     * with branching, where the branch instruction alone would take 16+ cycles.
1151     */
1152    for (unsigned n = 0; n < bld.program->wave_size; ++n) {
1153       /* Activate the lane which has N for its source index */
1154       if (bld.program->gfx_level >= GFX10)
1155          bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), Operand::c32(n), index);
1156       else
1157          bld.vopc(aco_opcode::v_cmpx_eq_u32, clobber_vcc, Definition(exec, bld.lm), Operand::c32(n),
1158                   index);
1159       /* Read the data from lane N */
1160       bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
1161       /* On the active lane, move the data we read from lane N to the destination VGPR */
1162       bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
1163       /* Restore original EXEC */
1164       bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(temp_exec.physReg(), bld.lm));
1165    }
1166 
1167    adjust_bpermute_dst(bld, dst, input);
1168 }
1169 
1170 struct copy_operation {
1171    Operand op;
1172    Definition def;
1173    unsigned bytes;
1174    union {
1175       uint8_t uses[8];
1176       uint64_t is_used = 0;
1177    };
1178 };
1179 
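/* Splits off one piece of the copy `src` starting at byte `offset`: the piece is the
 * largest power-of-two size that respects the alignment of both registers, stays
 * within `max_size` and the remaining bytes of the copy, and (unless `ignore_uses` is
 * set) does not mix bytes that still have pending uses with bytes that do not. The
 * resulting sub-definition and sub-operand are returned through `def` and `op`.
 */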
1180 void
1181 split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
1182            const copy_operation& src, bool ignore_uses, unsigned max_size)
1183 {
1184    PhysReg def_reg = src.def.physReg();
1185    PhysReg op_reg = src.op.physReg();
1186    def_reg.reg_b += offset;
1187    op_reg.reg_b += offset;
1188 
1189    /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10, and on GFX11
1190     * v_lshrrev_b64 doesn't get dual issued. */
1191    if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
1192        src.def.regClass().type() == RegType::vgpr)
1193       max_size = MIN2(max_size, 4);
1194    unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
1195 
1196    /* make sure the size is a power of two and reg % bytes == 0 */
1197    unsigned bytes = 1;
1198    for (; bytes <= max_size; bytes *= 2) {
1199       unsigned next = bytes * 2u;
1200       bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
1201                           offset + next <= src.bytes && next <= max_size;
1202       if (!src.op.isConstant() && can_increase)
1203          can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
1204       for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
1205          can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
1206       if (!can_increase)
1207          break;
1208    }
1209 
1210    *def = Definition(def_reg, src.def.regClass().resize(bytes));
1211    if (src.op.isConstant()) {
1212       assert(bytes >= 1 && bytes <= 8);
1213       uint64_t val = src.op.constantValue64() >> (offset * 8u);
1214       *op = Operand::get_const(ctx->program->gfx_level, val, bytes);
1215    } else {
1216       RegClass op_cls = src.op.regClass().resize(bytes);
1217       *op = Operand(op_reg, op_cls);
1218       op->setTemp(Temp(src.op.tempId(), op_cls));
1219    }
1220 }
1221 
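/* Returns a mask of the bytes of range `a` that overlap range `b`, with bit i
 * corresponding to byte a_start + i. For example, a_start=0, a_size=4, b_start=2,
 * b_size=4 overlap on a's bytes 2-3, giving 0b1100.
 */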
1222 uint32_t
1223 get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
1224 {
1225    int intersection_start = MAX2(b_start - a_start, 0);
1226    int intersection_end = MAX2(b_start + b_size - a_start, 0);
1227    if (intersection_start >= a_size || intersection_end == 0)
1228       return 0;
1229 
1230    uint32_t mask = u_bit_consecutive(0, a_size);
1231    return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
1232 }
1233 
1234 /* src1 provides bytes 0-3; dst/src0 provides bytes 4-7. */
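/* Each swiz[i] selects the byte written to byte i of dst, using v_perm_b32's
 * numbering: selectors 0-3 pick a byte of src1 and 4-7 pick a byte of src0. With the
 * default src0 (dst itself), the identity swizzle {4, 5, 6, 7} leaves dst unchanged.
 */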
1235 void
1236 create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1,
1237              Operand src0 = Operand(v1))
1238 {
1239    uint32_t swiz_packed =
1240       swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24);
1241 
1242    dst = Definition(PhysReg(dst.physReg().reg()), v1);
1243    if (!src1.isConstant())
1244       src1 = Operand(PhysReg(src1.physReg().reg()), v1);
1245    if (src0.isUndefined())
1246       src0 = Operand(dst.physReg(), v1);
1247    else if (!src0.isConstant())
1248       src0 = Operand(PhysReg(src0.physReg().reg()), v1);
1249    bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed));
1250 }
1251 
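/* Copies a 16-bit operand into a 16-bit slot of a VGPR on GFX11+ using v_mov_b16 with
 * opsel, or v_add_f16 with +0 when the operand is already a 16-bit float inline
 * constant (which gives a smaller encoding).
 */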
1252 void
1253 emit_v_mov_b16(Builder& bld, Definition dst, Operand op)
1254 {
1255    /* v_mov_b16 uses 32bit inline constants. */
1256    if (op.isConstant()) {
1257       if (!op.isLiteral() && op.physReg() >= 240) {
1258          /* v_add_f16 is smaller because it can use 16bit fp inline constants. */
1259          Instruction* instr = bld.vop2_e64(aco_opcode::v_add_f16, dst, op, Operand::zero());
1260          instr->valu().opsel[3] = dst.physReg().byte() == 2;
1261          return;
1262       }
1263       op = Operand::c32((int32_t)(int16_t)op.constantValue());
1264    }
1265 
1266    Instruction* instr = bld.vop1(aco_opcode::v_mov_b16, dst, op);
1267    instr->valu().opsel[0] = op.physReg().byte() == 2;
1268    instr->valu().opsel[3] = dst.physReg().byte() == 2;
1269 
1270    if (op.physReg().reg() < 256 && instr->valu().opsel[0])
1271       instr->format = asVOP3(instr->format);
1272 }
1273 
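/* Writes the constant `op` into `dst`, forwarding SGPR destinations to
 * copy_constant_sgpr and otherwise picking the cheapest VALU form available on this
 * gfx_level: v_mov_b32, v_bfrev_b32 of a bit-reversed inline constant, 64-bit shifts
 * for v2, and SDWA / opsel / and-or masking sequences for sub-dword destinations.
 */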
1274 void
1275 copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
1276 {
1277    assert(op.bytes() == dst.bytes());
1278 
1279    if (dst.regClass().type() == RegType::sgpr)
1280       return copy_constant_sgpr(bld, dst, op.constantValue64());
1281 
1282    bool dual_issue_mov = ctx->program->gfx_level >= GFX11 && ctx->program->wave_size == 64 &&
1283                          ctx->program->workgroup_size > 32;
1284    if (dst.bytes() == 4 && op.isLiteral() && !dual_issue_mov) {
1285       uint32_t imm = op.constantValue();
1286       Operand rev_op = Operand::get_const(ctx->program->gfx_level, util_bitreverse(imm), 4);
1287       if (!rev_op.isLiteral()) {
1288          bld.vop1(aco_opcode::v_bfrev_b32, dst, rev_op);
1289          return;
1290       }
1291    }
1292 
1293    if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->gfx_level >= GFX8)
1294       op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */
1295 
1296    if (dst.regClass() == v2) {
1297       if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
1298          bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
1299       } else {
1300          assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
1301          bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
1302       }
1303    } else if (dst.regClass() == v1) {
1304       bld.vop1(aco_opcode::v_mov_b32, dst, op);
1305    } else {
1306       assert(dst.regClass() == v1b || dst.regClass() == v2b);
1307 
1308       bool use_sdwa = ctx->program->gfx_level >= GFX9 && ctx->program->gfx_level < GFX11;
1309       if (dst.regClass() == v1b && use_sdwa) {
1310          uint8_t val = op.constantValue();
1311          Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
1312          if (op32.isLiteral()) {
1313             uint32_t a = (uint32_t)int8_mul_table[val * 2];
1314             uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
1315             bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
1316                           Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
1317                           Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
1318          } else {
1319             bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
1320          }
1321       } else if (dst.regClass() == v1b && ctx->program->gfx_level >= GFX10) {
1322          Operand fop = Operand::c32(fui(float(op.constantValue())));
1323          Operand offset = Operand::c32(dst.physReg().byte());
1324          Operand def_op(PhysReg(dst.physReg().reg()), v1);
1325          bld.vop3(aco_opcode::v_cvt_pk_u8_f32, dst, fop, offset, def_op);
1326       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1327          emit_v_mov_b16(bld, dst, op);
1328       } else if (dst.regClass() == v2b && use_sdwa && !op.isLiteral()) {
1329          if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
1330             /* use v_mov_b32 to avoid possible issues with denormal flushing or
1331              * NaN. v_add_f16 is still needed for float constants. */
1332             uint32_t val32 = (int32_t)(int16_t)op.constantValue();
1333             bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
1334          } else {
1335             bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
1336          }
1337       } else if (dst.regClass() == v2b && ctx->program->gfx_level >= GFX10) {
1338          op = Operand::c32(op.constantValue());
1339          Instruction* instr = bld.vop3(aco_opcode::v_add_u16_e64, dst, op, Operand::c32(0));
1340          instr->valu().opsel[3] = dst.physReg().byte() == 2;
1341       } else {
1342          uint32_t offset = dst.physReg().byte() * 8u;
1343          uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
1344          uint32_t val = (op.constantValue() << offset) & mask;
1345          dst = Definition(PhysReg(dst.physReg().reg()), v1);
1346          Operand def_op(dst.physReg(), v1);
1347          if (val != mask)
1348             bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
1349          if (val != 0)
1350             bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
1351       }
1352    }
1353 }
1354 
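/* Emits the parts of `copy` whose destination bytes are no longer needed as a source
 * by other pending copies (uses[offset] == 0), using the widest legal move for each
 * piece. Sets *preserve_scc when scc is written through s_cmp_lg_i32 and returns
 * whether anything was emitted.
 */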
1355 bool
1356 do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
1357         PhysReg scratch_sgpr)
1358 {
1359    bool did_copy = false;
1360    for (unsigned offset = 0; offset < copy.bytes;) {
1361       if (copy.uses[offset]) {
1362          offset++;
1363          continue;
1364       }
1365 
1366       Definition def;
1367       Operand op;
1368       split_copy(ctx, offset, &def, &op, copy, false, 8);
1369 
1370       if (def.physReg() == scc) {
1371          bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
1372          *preserve_scc = true;
1373       } else if (op.isConstant()) {
1374          copy_constant(ctx, bld, def, op);
1375       } else if (def.regClass() == v1) {
1376          bld.vop1(aco_opcode::v_mov_b32, def, op);
1377       } else if (def.regClass() == v2) {
1378          bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
1379       } else if (def.regClass() == s1) {
1380          bld.sop1(aco_opcode::s_mov_b32, def, op);
1381       } else if (def.regClass() == s2) {
1382          bld.sop1(aco_opcode::s_mov_b64, def, op);
1383       } else if (def.regClass() == v1b && ctx->program->gfx_level >= GFX11) {
1384          uint8_t swiz[] = {4, 5, 6, 7};
1385          swiz[def.physReg().byte()] = op.physReg().byte();
1386          create_bperm(bld, swiz, def, op);
1387       } else if (def.regClass() == v2b && ctx->program->gfx_level >= GFX11) {
1388          emit_v_mov_b16(bld, def, op);
1389       } else if (def.regClass().is_subdword()) {
1390          bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
1391       } else {
1392          unreachable("unsupported copy");
1393       }
1394 
1395       did_copy = true;
1396       offset += def.bytes();
1397    }
1398    return did_copy;
1399 }
1400 
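/* Swaps sub-dword values on GFX11+, where SDWA is not available: bytes within one
 * VGPR are swapped with a v_perm_b32 swizzle, 16-bit halves with v_swap_b16 (or a
 * three-instruction v_xor_b16 sequence for v128 and above), and single bytes spread
 * across two VGPRs are first gathered into one VGPR and then swapped back.
 */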
1401 void
1402 swap_subdword_gfx11(Builder& bld, Definition def, Operand op)
1403 {
1404    if (def.physReg().reg() == op.physReg().reg()) {
1405       assert(def.bytes() != 2); /* handled by caller */
1406       uint8_t swiz[] = {4, 5, 6, 7};
1407       std::swap(swiz[def.physReg().byte()], swiz[op.physReg().byte()]);
1408       create_bperm(bld, swiz, def, Operand::zero());
1409       return;
1410    }
1411 
1412    if (def.bytes() == 2) {
1413       Operand def_as_op = Operand(def.physReg(), def.regClass());
1414       Definition op_as_def = Definition(op.physReg(), op.regClass());
1415       /* v_swap_b16 is not officially supported as VOP3, so it can't be used with v128-255.
1416        * Tests show that VOP3 appears to work correctly, but according to AMD that should
1417        * not be relied on.
1418        */
1419       if (def.physReg() < (256 + 128) && op.physReg() < (256 + 128)) {
1420          Instruction* instr = bld.vop1(aco_opcode::v_swap_b16, def, op_as_def, op, def_as_op);
1421          instr->valu().opsel[0] = op.physReg().byte();
1422          instr->valu().opsel[3] = def.physReg().byte();
1423       } else {
1424          Instruction* instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
1425          instr->valu().opsel[0] = op.physReg().byte();
1426          instr->valu().opsel[1] = def_as_op.physReg().byte();
1427          instr->valu().opsel[3] = def.physReg().byte();
1428          instr = bld.vop3(aco_opcode::v_xor_b16, op_as_def, op, def_as_op);
1429          instr->valu().opsel[0] = op.physReg().byte();
1430          instr->valu().opsel[1] = def_as_op.physReg().byte();
1431          instr->valu().opsel[3] = op_as_def.physReg().byte();
1432          instr = bld.vop3(aco_opcode::v_xor_b16, def, op, def_as_op);
1433          instr->valu().opsel[0] = op.physReg().byte();
1434          instr->valu().opsel[1] = def_as_op.physReg().byte();
1435          instr->valu().opsel[3] = def.physReg().byte();
1436       }
1437    } else {
1438       PhysReg op_half = op.physReg();
1439       op_half.reg_b &= ~1;
1440 
1441       PhysReg def_other_half = def.physReg();
1442       def_other_half.reg_b &= ~1;
1443       def_other_half.reg_b ^= 2;
1444 
1445       /* We can only swap individual bytes within a single VGPR, so temporarily move both bytes
1446        * into the same VGPR.
1447        */
1448       swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1449       swap_subdword_gfx11(bld, def, Operand(def_other_half.advance(op.physReg().byte() & 1), v1b));
1450       swap_subdword_gfx11(bld, Definition(def_other_half, v2b), Operand(op_half, v2b));
1451    }
1452 }
1453 
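/* Swaps the register contents of copy.op and copy.def, splitting the copy into pieces
 * as needed: v_swap_b32 on GFX9+, xor swaps otherwise, with scc handled through the
 * scratch SGPR. On GFX8+ a final do_copy fixes up bytes that were swapped but only
 * needed to move in one direction.
 */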
1454 void
1455 do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
1456         Pseudo_instruction* pi)
1457 {
1458    unsigned offset = 0;
1459 
1460    if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
1461        (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
1462       /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
1463        * swap */
1464       PhysReg op = copy.op.physReg();
1465       PhysReg def = copy.def.physReg();
1466       op.reg_b &= ~0x3;
1467       def.reg_b &= ~0x3;
1468 
1469       copy_operation tmp;
1470       tmp.op = Operand(op, v1);
1471       tmp.def = Definition(def, v1);
1472       tmp.bytes = 4;
1473       memset(tmp.uses, 1, 4);
1474       do_swap(ctx, bld, tmp, preserve_scc, pi);
1475 
1476       op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1477       def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
1478       tmp.op = Operand(op, v1b);
1479       tmp.def = Definition(def, v1b);
1480       tmp.bytes = 1;
1481       tmp.uses[0] = 1;
1482       do_swap(ctx, bld, tmp, preserve_scc, pi);
1483 
1484       offset = copy.bytes;
1485    }
1486 
1487    for (; offset < copy.bytes;) {
1488       Definition def;
1489       Operand op;
1490       unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
1491       split_copy(ctx, offset, &def, &op, copy, true, max_size);
1492 
1493       assert(op.regClass() == def.regClass());
1494       Operand def_as_op = Operand(def.physReg(), def.regClass());
1495       Definition op_as_def = Definition(op.physReg(), op.regClass());
1496       if (ctx->program->gfx_level >= GFX9 && def.regClass() == v1) {
1497          bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
1498       } else if (def.regClass() == v1) {
1499          assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
1500          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1501          bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
1502          bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1503       } else if (op.physReg() == scc || def.physReg() == scc) {
1504          /* we need to swap scc and another sgpr */
1505          assert(!preserve_scc);
1506 
1507          PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
1508 
1509          bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1510          bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
1511                   Operand::zero());
1512          bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
1513       } else if (def.regClass() == s1) {
1514          if (preserve_scc) {
1515             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
1516             bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
1517             bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
1518          } else {
1519             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1520             bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
1521             bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
1522          }
1523       } else if (def.regClass() == s2) {
1524          if (preserve_scc)
1525             bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
1526          bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1527          bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
1528          bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
1529          if (preserve_scc)
1530             bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
1531                      Operand::zero());
1532       } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
1533          bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
1534                   Operand::c32(2u));
1535       } else {
1536          assert(def.regClass().is_subdword());
1537          if (ctx->program->gfx_level >= GFX11) {
1538             swap_subdword_gfx11(bld, def, op);
1539          } else {
1540             bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1541             bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
1542             bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
1543          }
1544       }
1545 
1546       offset += def.bytes();
1547    }
1548 
1549    if (ctx->program->gfx_level <= GFX7)
1550       return;
1551 
1552    /* fixup in case we swapped bytes we shouldn't have */
1553    copy_operation tmp_copy = copy;
1554    tmp_copy.op.setFixed(copy.def.physReg());
1555    tmp_copy.def.setFixed(copy.op.physReg());
1556    do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
1557 }
1558 
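/* Packs the 16-bit values `lo` and `hi` into the full VGPR `def`. Uses a single
 * v_pack_b32_f16 or v_alignbyte_b32 when that is bit-exact and encodable, and
 * otherwise assembles the two halves with shifts, masks and 16-bit moves.
 */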
1559 void
1560 do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
1561 {
1562    assert(ctx->program->gfx_level >= GFX8);
1563 
1564    if (lo.isConstant() && hi.isConstant()) {
1565       copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
1566       return;
1567    }
1568 
1569    /* v_pack_b32_f16 can be used for bit exact copies if:
1570     * - fp16 input denorms are enabled, otherwise they get flushed to zero
1571     * - signalling input NaNs are kept, which is the case with IEEE_MODE=0
1572     *   GFX12+ always quiets signalling NaNs, IEEE_MODE was removed
1573     */
1574    bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
1575                        (ctx->program->gfx_level >= GFX10 ||
1576                         (ctx->program->gfx_level >= GFX9 && !lo.isLiteral() && !hi.isLiteral())) &&
1577                        ctx->program->gfx_level < GFX12;
1578 
1579    if (can_use_pack) {
1580       Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
1581       /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
1582       instr->valu().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
1583       return;
1584    }
1585 
1586    /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
1587    if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
1588        (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() ||
1589                                                     ctx->program->gfx_level >= GFX10)))) {
1590       if (hi.isConstant())
1591          bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo,
1592                   Operand::c32(2u));
1593       else
1594          bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
1595       return;
1596    }
1597 
1598    Definition def_lo = Definition(def.physReg(), v2b);
1599    Definition def_hi = Definition(def.physReg().advance(2), v2b);
1600 
1601    if (lo.isConstant()) {
1602       /* move hi and zero low bits */
1603       if (hi.physReg().byte() == 0)
1604          bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1605       else
1606          bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
1607       if (lo.constantValue())
1608          bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
1609                   Operand(def.physReg(), v1));
1610       return;
1611    }
1612    if (hi.isConstant()) {
1613       /* move lo and zero high bits */
1614       if (lo.physReg().byte() == 2)
1615          bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1616       else if (ctx->program->gfx_level >= GFX11)
1617          bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo);
1618       else
1619          bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
1620       if (hi.constantValue())
1621          bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
1622                   Operand(def.physReg(), v1));
1623       return;
1624    }
1625 
1626    if (lo.physReg().reg() == def.physReg().reg()) {
1627       /* lo is in the high bits of def */
1628       assert(lo.physReg().byte() == 2);
1629       bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
1630       lo.setFixed(def.physReg());
1631    } else if (hi.physReg() == def.physReg()) {
1632       /* hi is in the low bits of def */
1633       assert(hi.physReg().byte() == 0);
1634       bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
1635       hi.setFixed(def.physReg().advance(2));
1636    } else if (ctx->program->gfx_level >= GFX8) {
1637       /* Either lo or hi can be placed with just a v_mov. SDWA is not needed, because
1638        * op.physReg().byte()==def.physReg().byte() and the other half will be overwritten.
1639        */
1640       assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
1641       Operand& op = lo.physReg().byte() == 0 ? lo : hi;
1642       PhysReg reg = def.physReg().advance(op.physReg().byte());
1643       bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
1644       op.setFixed(reg);
1645    }
1646 
1647    /* either hi or lo are already placed correctly */
1648    if (ctx->program->gfx_level >= GFX11) {
1649       if (lo.physReg().reg() == def.physReg().reg())
1650          emit_v_mov_b16(bld, def_hi, hi);
1651       else
1652          emit_v_mov_b16(bld, def_lo, lo);
1653    } else {
1654       if (lo.physReg().reg() == def.physReg().reg())
1655          bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
1656       else
1657          bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
1658    }
1659 }
1660 
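/* Tries to merge `copy` with the copy_map entry that starts right after it in the
 * destination, e.g. turning two adjacent dword copies into one 64-bit copy, provided
 * the combined size (at most 8 bytes), the register alignment and, for constants,
 * the encodability of the merged value allow it.
 */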
1661 void
1662 try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
1663                     copy_operation& copy)
1664 {
1665    // TODO try more relaxed alignment for subdword copies
1666    unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
1667    unsigned next_op_align = next_def_align;
1668    if (copy.def.regClass().type() == RegType::vgpr)
1669       next_def_align = MIN2(next_def_align, 4);
1670    if (copy.op.regClass().type() == RegType::vgpr)
1671       next_op_align = MIN2(next_op_align, 4);
1672 
1673    if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
1674        (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
1675       return;
1676 
1677    auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
1678    if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
1679        copy.op.isConstant() != other->second.op.isConstant())
1680       return;
1681 
1682    /* don't create 64-bit copies before GFX10 */
1683    if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
1684        ctx->program->gfx_level < GFX10)
1685       return;
1686 
1687    unsigned new_size = copy.bytes + other->second.bytes;
1688    if (copy.op.isConstant()) {
1689       uint64_t val =
1690          copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
1691       if (!util_is_power_of_two_or_zero(new_size))
1692          return;
1693       if (!Operand::is_constant_representable(val, new_size, true,
1694                                               copy.def.regClass().type() == RegType::vgpr))
1695          return;
1696       copy.op = Operand::get_const(ctx->program->gfx_level, val, new_size);
1697    } else {
1698       if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
1699          return;
1700       copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
1701    }
1702 
1703    copy.bytes = new_size;
1704    copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
1705    copy_map.erase(other);
1706 }
1707 
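/* Lowers a set of parallel copies into actual moves and swaps. The copies form a
 * location-transfer graph: copies whose destination bytes are no longer needed as
 * sources are emitted first (the acyclic part), and whatever remains consists of
 * cycles, which are broken with register swaps, splitting self-intersecting copies
 * along the way.
 */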
1708 void
1709 handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
1710                 amd_gfx_level gfx_level, Pseudo_instruction* pi)
1711 {
1712    Builder bld(ctx->program, &ctx->instructions);
1713    unsigned num_instructions_before = ctx->instructions.size();
1714    aco_ptr<Instruction> mov;
1715    bool writes_scc = false;
1716 
1717    /* count the number of uses for each dst reg */
1718    for (auto it = copy_map.begin(); it != copy_map.end();) {
1719 
1720       if (it->second.def.physReg() == scc)
1721          writes_scc = true;
1722 
1723       assert(!pi->needs_scratch_reg || it->second.def.physReg() != pi->scratch_sgpr);
1724 
1725       /* if src and dst reg are the same, remove operation */
1726       if (it->first == it->second.op.physReg()) {
1727          it = copy_map.erase(it);
1728          continue;
1729       }
1730 
1731       /* split large copies */
1732       if (it->second.bytes > 8) {
1733          assert(!it->second.op.isConstant());
1734          assert(!it->second.def.regClass().is_subdword());
1735          RegClass rc = it->second.def.regClass().resize(it->second.def.bytes() - 8);
1736          Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
1737          rc = it->second.op.regClass().resize(it->second.op.bytes() - 8);
1738          Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
1739          copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
1740          copy_map[hi_def.physReg()] = copy;
1741          assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
1742          it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().resize(8));
1743          it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().resize(8));
1744          it->second.bytes = 8;
1745       }
1746 
1747       try_coalesce_copies(ctx, copy_map, it->second);
1748 
1749       /* check if the definition reg is used by another copy operation */
1750       for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1751          if (copy.second.op.isConstant())
1752             continue;
1753          for (uint16_t i = 0; i < it->second.bytes; i++) {
1754             /* distance might underflow */
1755             unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
1756             if (distance < copy.second.bytes)
1757                it->second.uses[i] += 1;
1758          }
1759       }
1760 
1761       ++it;
1762    }
1763 
1764    /* first, handle paths in the location transfer graph */
1765    bool preserve_scc = pi->needs_scratch_reg && pi->scratch_sgpr != scc && !writes_scc;
1766    bool skip_partial_copies = true;
1767    for (auto it = copy_map.begin();;) {
1768       if (copy_map.empty()) {
1769          ctx->program->statistics[aco_statistic_copies] +=
1770             ctx->instructions.size() - num_instructions_before;
1771          return;
1772       }
1773       if (it == copy_map.end()) {
1774          if (!skip_partial_copies)
1775             break;
1776          skip_partial_copies = false;
1777          it = copy_map.begin();
1778       }
1779 
1780       /* check if we can pack one register at once */
1781       if (it->first.byte() == 0 && it->second.bytes == 2) {
1782          PhysReg reg_hi = it->first.advance(2);
1783          std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1784          if (other != copy_map.end() && other->second.bytes == 2) {
1785             /* check if the target register is otherwise unused */
1786             bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
1787                                                      other->second.op.physReg() == it->first);
1788             bool unused_hi = !other->second.is_used ||
1789                              (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
1790             if (unused_lo && unused_hi) {
1791                Operand lo = it->second.op;
1792                Operand hi = other->second.op;
1793                do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
1794                copy_map.erase(it);
1795                copy_map.erase(other);
1796 
1797                for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
1798                   for (uint16_t i = 0; i < other2.second.bytes; i++) {
1799                      /* distance might underflow */
1800                      unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
1801                      unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
1802                      if (distance_lo < 2 || distance_hi < 2)
1803                         other2.second.uses[i] -= 1;
1804                   }
1805                }
1806                it = copy_map.begin();
1807                continue;
1808             }
1809          }
1810       }
1811 
1812       /* optimize constant copies to an aligned sgpr pair that's otherwise unused. */
1813       if (it->first <= exec && (it->first % 2) == 0 && it->second.bytes == 4 &&
1814           it->second.op.isConstant() && !it->second.is_used) {
1815          PhysReg reg_hi = it->first.advance(4);
1816          std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
1817          if (other != copy_map.end() && other->second.bytes == 4 && other->second.op.isConstant() &&
1818              !other->second.is_used) {
1819             uint64_t constant =
1820                it->second.op.constantValue64() | (other->second.op.constantValue64() << 32);
1821             copy_constant_sgpr(bld, Definition(it->first, s2), constant);
1822             copy_map.erase(it);
1823             copy_map.erase(other);
1824             it = copy_map.begin();
1825             continue;
1826          }
1827       }
1828 
1829       /* find portions where the target reg is not used as operand for any other copy */
1830       if (it->second.is_used) {
1831          if (it->second.op.isConstant() || skip_partial_copies) {
1832             /* we have to skip constants until is_used=0.
1833              * we also skip partial copies at the beginning to help coalescing */
1834             ++it;
1835             continue;
1836          }
1837 
1838          unsigned has_zero_use_bytes = 0;
1839          for (unsigned i = 0; i < it->second.bytes; i++)
1840             has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
1841 
1842          if (has_zero_use_bytes) {
1843             /* Skipping partial copying and doing a v_swap_b32 and then fixup
1844              * copies is usually beneficial for sub-dword copies, but if doing
1845              * a partial copy allows further copies, it should be done instead. */
1846             bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
1847             for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
1848                if (partial_copy)
1849                   break;
1850                for (uint16_t i = 0; i < copy.second.bytes; i++) {
1851                   /* distance might underflow */
1852                   unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
1853                   if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
1854                       !it->second.uses[distance])
1855                      partial_copy = true;
1856                }
1857             }
1858 
1859             if (!partial_copy) {
1860                ++it;
1861                continue;
1862             }
1863          } else {
1864             /* full target reg is used: register swapping needed */
1865             ++it;
1866             continue;
1867          }
1868       }
1869 
1870       bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
1871       skip_partial_copies = did_copy;
1872       std::pair<PhysReg, copy_operation> copy = *it;
1873 
1874       if (it->second.is_used == 0) {
1875          /* the target reg is not used as operand for any other copy, so we
1876           * copied to all of it */
1877          copy_map.erase(it);
1878          it = copy_map.begin();
1879       } else {
1880          /* we only performed some portions of this copy, so split it to only
1881           * leave the portions that still need to be done */
1882          copy_operation original = it->second; /* the map insertion below can overwrite this */
1883          copy_map.erase(it);
1884          for (unsigned offset = 0; offset < original.bytes;) {
1885             if (original.uses[offset] == 0) {
1886                offset++;
1887                continue;
1888             }
1889             Definition def;
1890             Operand op;
1891             split_copy(ctx, offset, &def, &op, original, false, 8);
1892 
1893             copy_operation new_copy = {op, def, def.bytes()};
1894             for (unsigned i = 0; i < new_copy.bytes; i++)
1895                new_copy.uses[i] = original.uses[i + offset];
1896             copy_map[def.physReg()] = new_copy;
1897 
1898             offset += def.bytes();
1899          }
1900 
1901          it = copy_map.begin();
1902       }
1903 
1904       /* Reduce the number of uses of the operand reg by one. Do this after
1905        * splitting the copy or removing it in case the copy writes to its own
1906        * operand (for example, v[7:8] = v[8:9]) */
1907       if (did_copy && !copy.second.op.isConstant()) {
1908          for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
1909             for (uint16_t i = 0; i < other.second.bytes; i++) {
1910                /* distance might underflow */
1911                unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
1912                if (distance < copy.second.bytes && !copy.second.uses[distance])
1913                   other.second.uses[i] -= 1;
1914             }
1915          }
1916       }
1917    }
1918 
1919    /* all target regs are needed as an operand somewhere, which means all entries are part of a cycle */
1920    unsigned largest = 0;
1921    for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
1922       largest = MAX2(largest, op.second.bytes);
1923 
1924    while (!copy_map.empty()) {
1925 
1926       /* Perform larger swaps first, because larger swaps can make other
1927        * swaps unnecessary. */
1928       auto it = copy_map.begin();
1929       for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
1930          if (it2->second.bytes > it->second.bytes) {
1931             it = it2;
1932             if (it->second.bytes == largest)
1933                break;
1934          }
1935       }
1936 
1937       /* should already be done */
1938       assert(!it->second.op.isConstant());
1939 
1940       assert(it->second.op.isFixed());
1941       assert(it->second.def.regClass() == it->second.op.regClass());
1942 
1943       if (it->first == it->second.op.physReg()) {
1944          copy_map.erase(it);
1945          continue;
1946       }
1947 
1948       if (it->second.def.getTemp().type() == RegType::sgpr) {
1949          assert(it->second.def.physReg() != pi->scratch_sgpr);
1950          assert(pi->needs_scratch_reg);
1951          assert(!preserve_scc || pi->scratch_sgpr != scc);
1952       }
1953 
1954       /* to resolve the cycle, we have to swap the src reg with the dst reg */
1955       copy_operation swap = it->second;
1956 
1957       /* if this is self-intersecting, we have to split it because
1958        * self-intersecting swaps don't make sense */
1959       PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
1960       if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
1961          unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);
1962 
1963          copy_operation remaining;
1964          src.reg_b += offset;
1965          dst.reg_b += offset;
1966          remaining.bytes = swap.bytes - offset;
1967          memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
1968          remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
1969          remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
1970          copy_map[dst] = remaining;
1971 
1972          memset(swap.uses + offset, 0, swap.bytes - offset);
1973          swap.bytes = offset;
1974       }
1975 
1976       /* GFX6-7 can only swap full registers */
1977       assert(ctx->program->gfx_level > GFX7 || (swap.bytes % 4) == 0);
1978 
1979       do_swap(ctx, bld, swap, preserve_scc, pi);
1980 
1981       /* remove from map */
1982       copy_map.erase(it);
1983 
1984       /* change the operand reg of the target's uses and split uses if needed */
1985       uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
1986       for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
1987          if (target->second.op.physReg() == swap.def.physReg() &&
1988              swap.bytes == target->second.bytes) {
1989             target->second.op.setFixed(swap.op.physReg());
1990             break;
1991          }
1992 
1993          uint32_t imask =
1994             get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
1995                                   target->second.op.physReg().reg_b, target->second.bytes);
1996 
1997          if (!imask)
1998             continue;
1999 
2000          int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;
2001 
2002          /* split and update the middle (the portion that reads the swap's
2003           * definition) to read the swap's operand instead */
2004          int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
2005          int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
2006          int before_bytes = MAX2(-offset, 0);
2007          int after_bytes = MAX2(target_op_end - swap_def_end, 0);
2008          int middle_bytes = target->second.bytes - before_bytes - after_bytes;
2009 
2010          if (after_bytes) {
2011             unsigned after_offset = before_bytes + middle_bytes;
2012             assert(after_offset > 0);
2013             copy_operation copy;
2014             copy.bytes = after_bytes;
2015             memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
2016             RegClass rc = target->second.op.regClass().resize(after_bytes);
2017             copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
2018             copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
2019             copy_map[copy.def.physReg()] = copy;
2020          }
2021 
2022          if (middle_bytes) {
2023             copy_operation copy;
2024             copy.bytes = middle_bytes;
2025             memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
2026             RegClass rc = target->second.op.regClass().resize(middle_bytes);
2027             copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
2028             copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
2029             copy_map[copy.def.physReg()] = copy;
2030          }
2031 
2032          if (before_bytes) {
2033             copy_operation copy;
2034             target->second.bytes = before_bytes;
2035             RegClass rc = target->second.op.regClass().resize(before_bytes);
2036             target->second.op = Operand(target->second.op.physReg(), rc);
2037             target->second.def = Definition(target->second.def.physReg(), rc);
2038             memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
2039          }
2040 
2041          /* break early since we know each byte of the swap's definition is used
2042           * at most once */
2043          bytes_left &= ~imask;
2044          if (!bytes_left)
2045             break;
2046       }
2047    }
2048    ctx->program->statistics[aco_statistic_copies] +=
2049       ctx->instructions.size() - num_instructions_before;
2050 }
2051 
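/* Copies of linear VGPRs have to happen in all lanes, so the same copy set is lowered
 * twice: once with the current exec mask and once with exec inverted, saving and
 * restoring scc around the s_not when the scratch SGPR is not scc itself.
 */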
2052 void
2053 handle_operands_linear_vgpr(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
2054                             amd_gfx_level gfx_level, Pseudo_instruction* pi)
2055 {
2056    Builder bld(ctx->program, &ctx->instructions);
2057 
2058    for (auto& copy : copy_map) {
2059       copy.second.op =
2060          Operand(copy.second.op.physReg(), RegClass::get(RegType::vgpr, copy.second.op.bytes()));
2061       copy.second.def = Definition(copy.second.def.physReg(),
2062                                    RegClass::get(RegType::vgpr, copy.second.def.bytes()));
2063    }
2064 
2065    std::map<PhysReg, copy_operation> second_map(copy_map);
2066    handle_operands(second_map, ctx, gfx_level, pi);
2067 
2068    assert(pi->needs_scratch_reg);
2069    PhysReg scratch_sgpr = pi->scratch_sgpr;
2070    if (scratch_sgpr != scc) {
2071       bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));
2072       pi->scratch_sgpr = scc;
2073    }
2074    bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
2075 
2076    handle_operands(copy_map, ctx, gfx_level, pi);
2077 
2078    bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), Operand(exec, bld.lm));
2079    if (scratch_sgpr != scc) {
2080       bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
2081                Operand::zero());
2082       pi->scratch_sgpr = scratch_sgpr;
2083    }
2084 
2085    ctx->program->statistics[aco_statistic_copies] += scratch_sgpr == scc ? 2 : 4;
2086 }
2087 
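/* Programs the float rounding and denormal modes: GFX10+ has dedicated s_round_mode
 * and s_denorm_mode instructions, while older GPUs update the MODE hardware register
 * through s_setreg_imm32_b32.
 */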
2088 void
2089 emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
2090 {
2091    if (bld.program->gfx_level >= GFX10) {
2092       if (set_round)
2093          bld.sopp(aco_opcode::s_round_mode, new_mode.round);
2094       if (set_denorm)
2095          bld.sopp(aco_opcode::s_denorm_mode, new_mode.denorm);
2096    } else if (set_round || set_denorm) {
2097       /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
2098       bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(new_mode.val), (7 << 11) | 1);
2099    }
2100 }
2101 
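/* Emits the mode setup needed at the start of `block`: at the program entry whenever
 * the block's mode differs from (or cannot be derived from) the config default, and
 * at top-level blocks whenever a linear predecessor ran with a different mode.
 */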
2102 void
2103 emit_set_mode_from_block(Builder& bld, Program& program, Block* block)
2104 {
2105    float_mode initial;
2106    initial.val = program.config->float_mode;
2107 
2108    bool initial_unknown =
2109       (program.info.merged_shader_compiled_separately && program.stage.sw == SWStage::GS) ||
2110       (program.info.merged_shader_compiled_separately && program.stage.sw == SWStage::TCS);
2111    bool is_start = block->index == 0;
2112    bool set_round = is_start && (initial_unknown || block->fp_mode.round != initial.round);
2113    bool set_denorm = is_start && (initial_unknown || block->fp_mode.denorm != initial.denorm);
2114    if (block->kind & block_kind_top_level) {
2115       for (unsigned pred : block->linear_preds) {
2116          if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
2117             set_round = true;
2118          if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
2119             set_denorm = true;
2120       }
2121    }
2122    /* only allow changing modes at top-level blocks so this doesn't break
2123     * the "jump over empty blocks" optimization */
2124    assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
2125    emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
2126 }
2127 
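/* Rewrites the strict-WQM image-sample pseudo form: when NSA (non-sequential address)
 * encoding can hold the address components, they are passed as separate VADDR
 * operands; otherwise they are copied into the contiguous linear VGPR. The MIMG
 * instruction is then rebuilt with the final operand count.
 */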
2128 void
2129 lower_image_sample(lower_context* ctx, aco_ptr<Instruction>& instr)
2130 {
2131    Operand linear_vgpr = instr->operands[3];
2132 
2133    unsigned nsa_size = ctx->program->dev.max_nsa_vgprs;
2134    unsigned vaddr_size = linear_vgpr.size();
2135    unsigned num_copied_vgprs = instr->operands.size() - 4;
2136    nsa_size = num_copied_vgprs > 0 && (ctx->program->gfx_level >= GFX11 || vaddr_size <= nsa_size)
2137                  ? nsa_size
2138                  : 0;
2139 
2140    Operand vaddr[16];
2141    unsigned num_vaddr = 0;
2142 
2143    if (nsa_size) {
2144       assert(num_copied_vgprs <= nsa_size);
2145       for (unsigned i = 0; i < num_copied_vgprs; i++)
2146          vaddr[num_vaddr++] = instr->operands[4 + i];
2147       for (unsigned i = num_copied_vgprs; i < std::min(vaddr_size, nsa_size); i++)
2148          vaddr[num_vaddr++] = Operand(linear_vgpr.physReg().advance(i * 4), v1);
2149       if (vaddr_size > nsa_size) {
2150          RegClass rc = RegClass::get(RegType::vgpr, (vaddr_size - nsa_size) * 4);
2151          vaddr[num_vaddr++] = Operand(PhysReg(linear_vgpr.physReg().advance(nsa_size * 4)), rc);
2152       }
2153    } else {
2154       PhysReg reg = linear_vgpr.physReg();
2155       std::map<PhysReg, copy_operation> copy_operations;
2156       for (unsigned i = 4; i < instr->operands.size(); i++) {
2157          Operand arg = instr->operands[i];
2158          Definition def(reg, RegClass::get(RegType::vgpr, arg.bytes()));
2159          copy_operations[def.physReg()] = {arg, def, def.bytes()};
2160          reg = reg.advance(arg.bytes());
2161       }
2162       vaddr[num_vaddr++] = linear_vgpr;
2163 
2164       Pseudo_instruction pi = {};
2165       handle_operands(copy_operations, ctx, ctx->program->gfx_level, &pi);
2166    }
2167 
2168    instr->mimg().strict_wqm = false;
2169 
2170    if ((3 + num_vaddr) > instr->operands.size()) {
2171       Instruction* new_instr =
2172          create_instruction(instr->opcode, Format::MIMG, 3 + num_vaddr, instr->definitions.size());
2173       std::copy(instr->definitions.cbegin(), instr->definitions.cend(),
2174                 new_instr->definitions.begin());
2175       new_instr->operands[0] = instr->operands[0];
2176       new_instr->operands[1] = instr->operands[1];
2177       new_instr->operands[2] = instr->operands[2];
2178       memcpy((uint8_t*)new_instr + sizeof(Instruction), (uint8_t*)instr.get() + sizeof(Instruction),
2179              sizeof(MIMG_instruction) - sizeof(Instruction));
2180       instr.reset(new_instr);
2181    } else {
2182       while (instr->operands.size() > (3 + num_vaddr))
2183          instr->operands.pop_back();
2184    }
2185    std::copy(vaddr, vaddr + num_vaddr, std::next(instr->operands.begin(), 3));
2186 }
2187 
2188 } /* end namespace */
2189 
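/* Initializes the hardware scratch base: adds the scratch offset to the scratch
 * address and writes the sum either into FLAT_SCRATCH_LO/HI via s_setreg_b32 (GFX10+)
 * or directly into the flat_scr_lo/flat_scr_hi registers on older GPUs.
 */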
2190 void
2191 hw_init_scratch(Builder& bld, Definition def, Operand scratch_addr, Operand scratch_offset)
2192 {
2193    /* Since we know what the high 16 bits of scratch_hi are, we can set all the high 16
2194     * bits in the same instruction that adds the carry.
2195     */
2196    Operand hi_add = Operand::c32(0xffff0000 - S_008F04_SWIZZLE_ENABLE_GFX6(1));
2197    Operand scratch_addr_lo(scratch_addr.physReg(), s1);
2198    Operand scratch_addr_hi(scratch_addr_lo.physReg().advance(4), s1);
2199 
2200    if (bld.program->gfx_level >= GFX10) {
2201       PhysReg scratch_lo = def.physReg();
2202       PhysReg scratch_hi = def.physReg().advance(4);
2203 
2204       bld.sop2(aco_opcode::s_add_u32, Definition(scratch_lo, s1), Definition(scc, s1),
2205                scratch_addr_lo, scratch_offset);
2206       bld.sop2(aco_opcode::s_addc_u32, Definition(scratch_hi, s1), Definition(scc, s1),
2207                scratch_addr_hi, hi_add, Operand(scc, s1));
2208 
2209       /* "((size - 1) << 11) | register" (FLAT_SCRATCH_LO/HI is encoded as register
2210        * 20/21) */
2211       bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_lo, s1), (31 << 11) | 20);
2212       bld.sopk(aco_opcode::s_setreg_b32, Operand(scratch_hi, s1), (31 << 11) | 21);
2213    } else {
2214       bld.sop2(aco_opcode::s_add_u32, Definition(flat_scr_lo, s1), Definition(scc, s1),
2215                scratch_addr_lo, scratch_offset);
2216       bld.sop2(aco_opcode::s_addc_u32, Definition(flat_scr_hi, s1), Definition(scc, s1),
2217                scratch_addr_hi, hi_add, Operand(scc, s1));
2218    }
2219 }
2220 
2221 void
2222 lower_to_hw_instr(Program* program)
2223 {
2224    gfx9_pops_done_msg_bounds pops_done_msg_bounds;
2225    if (program->has_pops_overlapped_waves_wait && program->gfx_level < GFX11) {
2226       pops_done_msg_bounds = gfx9_pops_done_msg_bounds(program);
2227    }
2228 
2229    Block* discard_exit_block = NULL;
2230    Block* discard_pops_done_and_exit_block = NULL;
2231 
2232    int end_with_regs_block_index = -1;
2233 
2234    bool should_dealloc_vgprs = dealloc_vgprs(program);
2235 
2236    for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
2237       Block* block = &program->blocks[block_idx];
2238       lower_context ctx;
2239       ctx.program = program;
2240       ctx.block = block;
2241       ctx.instructions.reserve(block->instructions.size());
2242       Builder bld(program, &ctx.instructions);
2243 
2244       emit_set_mode_from_block(bld, *program, block);
2245 
2246       for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
2247          aco_ptr<Instruction>& instr = block->instructions[instr_idx];
2248 
2249          /* Send the ordered section done message from the middle of the block if needed (if the
2250           * ordered section is ended by an instruction inside this block).
2251           * Also make sure the done message is sent if it's needed in case early exit happens for
2252           * any reason.
2253           */
2254          if ((block_idx == pops_done_msg_bounds.end_block_idx() &&
2255               instr_idx == pops_done_msg_bounds.instr_after_end_idx()) ||
2256              (instr->opcode == aco_opcode::s_endpgm &&
2257               pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx))) {
2258             bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
2259          }
2260 
2261          aco_ptr<Instruction> mov;
2262          if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test &&
2263              instr->opcode != aco_opcode::p_debug_info) {
2264             Pseudo_instruction* pi = &instr->pseudo();
2265 
2266             switch (instr->opcode) {
2267             case aco_opcode::p_extract_vector: {
2268                PhysReg reg = instr->operands[0].physReg();
2269                Definition& def = instr->definitions[0];
2270                reg.reg_b += instr->operands[1].constantValue() * def.bytes();
2271 
2272                if (reg == def.physReg())
2273                   break;
2274 
2275                RegClass op_rc = def.regClass().is_subdword()
2276                                    ? def.regClass()
2277                                    : RegClass(instr->operands[0].getTemp().type(), def.size());
2278                std::map<PhysReg, copy_operation> copy_operations;
2279                copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
2280                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2281                break;
2282             }
2283             case aco_opcode::p_create_vector:
2284             case aco_opcode::p_start_linear_vgpr: {
2285                if (instr->operands.empty())
2286                   break;
2287 
2288                std::map<PhysReg, copy_operation> copy_operations;
2289                PhysReg reg = instr->definitions[0].physReg();
2290 
2291                for (const Operand& op : instr->operands) {
2292                   RegClass rc = RegClass::get(instr->definitions[0].regClass().type(), op.bytes());
2293                   if (op.isConstant()) {
2294                      const Definition def = Definition(reg, rc);
2295                      copy_operations[reg] = {op, def, op.bytes()};
2296                      reg.reg_b += op.bytes();
2297                      continue;
2298                   }
2299                   if (op.isUndefined()) {
2300                      // TODO: coalesce subdword copies if dst byte is 0
2301                      reg.reg_b += op.bytes();
2302                      continue;
2303                   }
2304 
2305                   RegClass rc_def = op.regClass().is_subdword() ? op.regClass() : rc;
2306                   const Definition def = Definition(reg, rc_def);
2307                   copy_operations[def.physReg()] = {op, def, op.bytes()};
2308                   reg.reg_b += op.bytes();
2309                }
2310                handle_operands(copy_operations, &ctx, program->gfx_level, pi);
2311                break;
2312             }
2313             case aco_opcode::p_split_vector: {
               std::map<PhysReg, copy_operation> copy_operations;
               PhysReg reg = instr->operands[0].physReg();

               for (const Definition& def : instr->definitions) {
                  RegClass rc_op = def.regClass().is_subdword()
                                      ? def.regClass()
                                      : instr->operands[0].getTemp().regClass().resize(def.bytes());
                  const Operand op = Operand(reg, rc_op);
                  copy_operations[def.physReg()] = {op, def, def.bytes()};
                  reg.reg_b += def.bytes();
               }
               handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               break;
            }
            case aco_opcode::p_parallelcopy: {
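               /* Gather all copies and resolve them together so that swaps and overlapping moves
                * are handled correctly; linear VGPRs (whole-wave values) take a dedicated path.
                */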
               std::map<PhysReg, copy_operation> copy_operations;
               bool linear_vgpr = false;
               bool non_linear_vgpr = false;
               for (unsigned j = 0; j < instr->operands.size(); j++) {
                  assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
                  copy_operations[instr->definitions[j].physReg()] = {
                     instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
                  linear_vgpr |= instr->definitions[j].regClass().is_linear_vgpr();
                  non_linear_vgpr |= !instr->definitions[j].regClass().is_linear_vgpr();
               }
               assert(!linear_vgpr || !non_linear_vgpr);
               if (linear_vgpr)
                  handle_operands_linear_vgpr(copy_operations, &ctx, program->gfx_level, pi);
               else
                  handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               break;
            }
            case aco_opcode::p_exit_early_if_not: {
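               /* If the SCC or EXEC condition is false, branch to a shared discard/exit block
                * (created lazily at the end of the program).
                */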
               /* don't bother with an early exit near the end of the program */
               if ((block->instructions.size() - 1 - instr_idx) <= 5 &&
                    block->instructions.back()->opcode == aco_opcode::s_endpgm) {
                  unsigned null_exp_dest =
                     program->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL;
                  bool ignore_early_exit = true;

                  for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
                     const aco_ptr<Instruction>& instr2 = block->instructions[k];
                     if (instr2->opcode == aco_opcode::s_endpgm ||
                         instr2->opcode == aco_opcode::p_logical_end)
                        continue;
                     else if (instr2->opcode == aco_opcode::exp &&
                              instr2->exp().dest == null_exp_dest &&
                              instr2->exp().enabled_mask == 0)
                        continue;
                     else if (instr2->opcode == aco_opcode::p_parallelcopy &&
                              instr2->definitions[0].isFixed() &&
                              instr2->definitions[0].physReg() == exec)
                        continue;
                     else if (instr2->opcode == aco_opcode::s_sendmsg &&
                              instr2->salu().imm == sendmsg_dealloc_vgprs)
                        continue;

                     ignore_early_exit = false;
                  }

                  if (ignore_early_exit)
                     break;
               }

               const bool discard_sends_pops_done =
                  pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx);

               Block* discard_block =
                  discard_sends_pops_done ? discard_pops_done_and_exit_block : discard_exit_block;
               if (!discard_block) {
                  discard_block = program->create_and_insert_block();
                  discard_block->kind = block_kind_discard_early_exit;
                  if (discard_sends_pops_done) {
                     discard_pops_done_and_exit_block = discard_block;
                  } else {
                     discard_exit_block = discard_block;
                  }
                  block = &program->blocks[block_idx];

                  /* sendmsg(dealloc_vgprs) releases scratch, so it isn't safe if there is an
                   * in-progress scratch store. */
                  wait_imm wait;
                  if (should_dealloc_vgprs && uses_scratch(program))
                     wait.vs = 0;

                  bld.reset(discard_block);
                  if (program->has_pops_overlapped_waves_wait &&
                      (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
                     /* If this discard early exit potentially exits the POPS ordered section, do
                      * the waitcnt necessary before resuming overlapping waves as the normal
                      * waitcnt insertion doesn't work in a discard early exit block.
                      */
                     if (program->gfx_level >= GFX10)
                        wait.vs = 0;
                     wait.vm = 0;
                     if (program->has_smem_buffer_or_global_loads)
                        wait.lgkm = 0;
                     wait.build_waitcnt(bld);
                  }
                  if (discard_sends_pops_done)
                     bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);

                  unsigned target = V_008DFC_SQ_EXP_NULL;
                  if (program->gfx_level >= GFX11)
                     target =
                        program->has_color_exports ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_MRTZ;
                  if (program->stage == fragment_fs)
                     bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
                             target, false, true, true);

                  wait.build_waitcnt(bld);
                  if (should_dealloc_vgprs)
                     bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);

                  bld.sopp(aco_opcode::s_endpgm);

                  bld.reset(&ctx.instructions);
               }

               assert(instr->operands[0].physReg() == scc || instr->operands[0].physReg() == exec);
               if (instr->operands[0].physReg() == scc)
                  bld.sopp(aco_opcode::s_cbranch_scc0, discard_block->index);
               else
                  bld.sopp(aco_opcode::s_cbranch_execz, discard_block->index);

               discard_block->linear_preds.push_back(block->index);
               block->linear_succs.push_back(discard_block->index);
               break;
            }
            case aco_opcode::p_spill: {
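               /* Spill each SGPR dword (or constant dword) into consecutive lanes of the linear
                * VGPR, starting at the given lane offset.
                */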
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->operands[2].size(); i++) {
                  Operand src =
                     instr->operands[2].isConstant()
                        ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
                        : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
                  bld.writelane(Definition(instr->operands[0].physReg(), v1), src,
                                Operand::c32(instr->operands[1].constantValue() + i),
                                instr->operands[0]);
               }
               break;
            }
            case aco_opcode::p_reload: {
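               /* Read the spilled dwords back from the lanes of the linear VGPR into SGPRs. */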
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->definitions[0].size(); i++)
                  bld.readlane(Definition(PhysReg{instr->definitions[0].physReg() + i}, s1),
                               instr->operands[0],
                               Operand::c32(instr->operands[1].constantValue() + i));
               break;
            }
            case aco_opcode::p_as_uniform: {
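               /* SGPR or constant sources are simply copied; VGPR sources are made uniform with
                * one v_readfirstlane_b32 per dword.
                */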
               if (instr->operands[0].isConstant() ||
                   instr->operands[0].regClass().type() == RegType::sgpr) {
                  std::map<PhysReg, copy_operation> copy_operations;
                  copy_operations[instr->definitions[0].physReg()] = {
                     instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
                  handle_operands(copy_operations, &ctx, program->gfx_level, pi);
               } else {
                  assert(instr->operands[0].regClass().type() == RegType::vgpr);
                  assert(instr->definitions[0].regClass().type() == RegType::sgpr);
                  assert(instr->operands[0].size() == instr->definitions[0].size());
                  for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                     bld.vop1(aco_opcode::v_readfirstlane_b32,
                              Definition(PhysReg{instr->definitions[0].physReg() + i}, s1),
                              Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
                  }
               }
               break;
            }
            case aco_opcode::p_pops_gfx9_add_exiting_wave_id: {
               bld.sop2(aco_opcode::s_add_i32, instr->definitions[0], instr->definitions[1],
                        Operand(pops_exiting_wave_id, s1), instr->operands[0]);
               break;
            }
            case aco_opcode::p_bpermute_readlane: {
               emit_bpermute_readlane(bld, instr);
               break;
            }
            case aco_opcode::p_bpermute_shared_vgpr: {
               emit_bpermute_shared_vgpr(bld, instr);
               break;
            }
            case aco_opcode::p_bpermute_permlane: {
               emit_bpermute_permlane(bld, instr);
               break;
            }
            case aco_opcode::p_constaddr: {
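               /* Materialize a PC-relative address: read the PC, then add the constant offset to
                * the low dword (on GFX12, the high half of the PC is sign-extended first).
                */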
               unsigned id = instr->definitions[0].tempId();
               PhysReg reg = instr->definitions[0].physReg();
               bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
               if (ctx.program->gfx_level >= GFX12)
                  bld.sop1(aco_opcode::s_sext_i32_i16, Definition(reg.advance(4), s1), Operand(reg.advance(4), s1));
               bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), instr->definitions[1],
                        Operand(reg, s1), instr->operands[0], Operand::c32(id));
               /* s_addc_u32 not needed because the program is in a 32-bit VA range */
               break;
            }
            case aco_opcode::p_resume_shader_address: {
               /* Find index of resume block. */
               unsigned resume_idx = instr->operands[0].constantValue();
               unsigned resume_block_idx = 0;
               for (Block& resume_block : program->blocks) {
                  if (resume_block.kind & block_kind_resume) {
                     if (resume_idx == 0) {
                        resume_block_idx = resume_block.index;
                        break;
                     }
                     resume_idx--;
                  }
               }
               assert(resume_block_idx != 0);
               unsigned id = instr->definitions[0].tempId();
               PhysReg reg = instr->definitions[0].physReg();
               bld.sop1(aco_opcode::p_resumeaddr_getpc, instr->definitions[0], Operand::c32(id));
               if (ctx.program->gfx_level >= GFX12)
                  bld.sop1(aco_opcode::s_sext_i32_i16, Definition(reg.advance(4), s1), Operand(reg.advance(4), s1));
               bld.sop2(aco_opcode::p_resumeaddr_addlo, Definition(reg, s1), instr->definitions[1],
                        Operand(reg, s1), Operand::c32(resume_block_idx), Operand::c32(id));
               /* s_addc_u32 not needed because the program is in a 32-bit VA range */
               break;
            }
            case aco_opcode::p_extract: {
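               /* Bitfield extract: operands are the source, the element index, the bit size and a
                * sign-extend flag. Pick the cheapest sequence (sext/pack/shift/bfe for SGPRs,
                * shift/bfe/cvt/SDWA/byte-permute for VGPRs).
                */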
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               assert(instr->operands[3].isConstant());
               if (instr->definitions[0].regClass() == s1)
                  assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
               Definition dst = instr->definitions[0];
               Operand op = instr->operands[0];
               unsigned bits = instr->operands[2].constantValue();
               unsigned index = instr->operands[1].constantValue();
               unsigned offset = index * bits;
               bool signext = !instr->operands[3].constantEquals(0);

               if (dst.regClass() == s1) {
                  if (offset == 0 && signext && (bits == 8 || bits == 16)) {
                     bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
                              dst, op);
                  } else if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
                     bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
                  } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16 &&
                             !signext) {
                     bld.sop2(aco_opcode::s_pack_hh_b32_b16, dst, op, Operand::zero());
                  } else if (offset == (32 - bits)) {
                     bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
                              instr->definitions[1], op, Operand::c32(offset));
                  } else {
                     bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
                              instr->definitions[1], op, Operand::c32((bits << 16) | offset));
                  }
               } else if (dst.regClass() == v1) {
                  if (op.physReg().byte()) {
                     offset += op.physReg().byte() * 8;
                     op = Operand(PhysReg(op.physReg().reg()), v1);
                  }
                  assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
                  if (offset == (32 - bits) && op.regClass() != s1) {
                     bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
                              Operand::c32(offset), op);
                  } else if (offset == 0 && bits == 16 && ctx.program->gfx_level >= GFX11) {
                     bld.vop1(signext ? aco_opcode::v_cvt_i32_i16 : aco_opcode::v_cvt_u32_u16, dst,
                              op);
                  } else {
                     bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
                              Operand::c32(offset), Operand::c32(bits));
                  }
               } else {
                  assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
                         op.regClass() == v1b);
                  if (ctx.program->gfx_level >= GFX11) {
                     unsigned op_vgpr_byte = op.physReg().byte() + offset / 8;
                     unsigned sign_byte = op_vgpr_byte + bits / 8 - 1;

                     uint8_t swiz[4] = {4, 5, 6, 7};
                     swiz[dst.physReg().byte()] = op_vgpr_byte;
                     if (bits == 16 && dst.bytes() >= 2)
                        swiz[dst.physReg().byte() + 1] = op_vgpr_byte + 1;
                     for (unsigned i = bits / 8; i < dst.bytes(); i++) {
                        uint8_t ext = bperm_0;
                        if (signext) {
                           if (sign_byte == 1)
                              ext = bperm_b1_sign;
                           else if (sign_byte == 3)
                              ext = bperm_b3_sign;
                           else /* replicate so sign-extension can be done later */
                              ext = sign_byte;
                        }
                        swiz[dst.physReg().byte() + i] = ext;
                     }
                     create_bperm(bld, swiz, dst, op);

                     if (signext && sign_byte != 3 && sign_byte != 1) {
                        assert(bits == 8);
                        assert(dst.regClass() == v2b || dst.regClass() == v1);
                        uint8_t ext_swiz[4] = {4, 5, 6, 7};
                        uint8_t ext = dst.physReg().byte() == 2 ? bperm_b7_sign : bperm_b5_sign;
                        memset(ext_swiz + dst.physReg().byte() + 1, ext, dst.bytes() - 1);
                        create_bperm(bld, ext_swiz, dst, Operand::zero());
                     }
                  } else {
                     SDWA_instruction& sdwa = bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa();
                     sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
                  }
               }
               break;
            }
            case aco_opcode::p_insert: {
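               /* Bitfield insert: place the low `bits` of the source at bit `offset` of the
                * definition, zero-filling the definition's remaining bits.
                */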
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               if (instr->definitions[0].regClass() == s1)
                  assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
               Definition dst = instr->definitions[0];
               Operand op = instr->operands[0];
               unsigned bits = instr->operands[2].constantValue();
               unsigned index = instr->operands[1].constantValue();
               unsigned offset = index * bits;

               bool has_sdwa = program->gfx_level >= GFX8 && program->gfx_level < GFX11;
               if (dst.regClass() == s1) {
                  if (ctx.program->gfx_level >= GFX9 && offset == 0 && bits == 16) {
                     bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, op, Operand::zero());
                  } else if (ctx.program->gfx_level >= GFX9 && offset == 16 && bits == 16) {
                     bld.sop2(aco_opcode::s_pack_ll_b32_b16, dst, Operand::zero(), op);
                  } else if (offset == (32 - bits)) {
                     bld.sop2(aco_opcode::s_lshl_b32, dst, instr->definitions[1], op,
                              Operand::c32(offset));
                  } else if (offset == 0) {
                     bld.sop2(aco_opcode::s_bfe_u32, dst, instr->definitions[1], op,
                              Operand::c32(bits << 16));
                  } else {
                     bld.sop2(aco_opcode::s_bfe_u32, dst, instr->definitions[1], op,
                              Operand::c32(bits << 16));
                     bld.sop2(aco_opcode::s_lshl_b32, dst, instr->definitions[1],
                              Operand(dst.physReg(), s1), Operand::c32(offset));
                  }
               } else if (dst.regClass() == v1 || !has_sdwa) {
                  if (offset == (dst.bytes() * 8u - bits) && dst.regClass() == v1) {
                     bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
                  } else if (offset == 0 && dst.regClass() == v1) {
                     bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
                  } else if (has_sdwa && (op.regClass() != s1 || program->gfx_level >= GFX9)) {
                     bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().dst_sel =
                        SubdwordSel(bits / 8, offset / 8, false);
                  } else if (program->gfx_level >= GFX11) {
                     uint8_t swiz[] = {4, 5, 6, 7};
                     for (unsigned i = 0; i < dst.bytes(); i++)
                        swiz[dst.physReg().byte() + i] = bperm_0;
                     for (unsigned i = 0; i < bits / 8; i++)
                        swiz[dst.physReg().byte() + i + offset / 8] = op.physReg().byte() + i;
                     create_bperm(bld, swiz, dst, op);
                  } else {
                     bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
                     bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
                              Operand(dst.physReg(), v1));
                  }
               } else {
                  assert(dst.regClass() == v2b);
                  if (!offset) {
                     bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op)->sdwa().sel[0] =
                        SubdwordSel::ubyte;
                  } else if (program->gfx_level >= GFX9) {
                     bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
                        ->sdwa()
                        .sel[1] = SubdwordSel::ubyte;
                  } else {
                     assert(offset == 8);
                     Definition dst_hi = Definition(dst.physReg().advance(1), v1b);
                     bld.vop1_sdwa(aco_opcode::v_mov_b32, dst_hi, op)->sdwa().sel[0] =
                        SubdwordSel::ubyte;
                     uint32_t c = ~(BITFIELD_MASK(offset) << (dst.physReg().byte() * 8));
                     bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(c),
                              Operand(PhysReg(op.physReg().reg()), v1));
                  }
               }
               break;
            }
            case aco_opcode::p_init_scratch: {
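               /* Program the scratch base for this wave: resolve the scratch address (from the
                * internal symbols or by loading it from memory) and hand it to hw_init_scratch().
                */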
               assert(program->gfx_level >= GFX8 && program->gfx_level <= GFX10_3);
               if (!program->config->scratch_bytes_per_wave)
                  break;

               Operand scratch_addr = instr->operands[0];
               if (scratch_addr.isUndefined()) {
                  PhysReg reg = instr->definitions[0].physReg();
                  bld.sop1(aco_opcode::p_load_symbol, Definition(reg, s1),
                           Operand::c32(aco_symbol_scratch_addr_lo));
                  bld.sop1(aco_opcode::p_load_symbol, Definition(reg.advance(4), s1),
                           Operand::c32(aco_symbol_scratch_addr_hi));
                  scratch_addr.setFixed(reg);
               } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
                  bld.smem(aco_opcode::s_load_dwordx2, instr->definitions[0], scratch_addr,
                           Operand::zero());
                  scratch_addr.setFixed(instr->definitions[0].physReg());
               }

               hw_init_scratch(bld, instr->definitions[0], scratch_addr, instr->operands[1]);
               break;
            }
            case aco_opcode::p_jump_to_epilog: {
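               /* Jump to the epilog by setting the PC to the given address, sending the POPS done
                * message first if this exit may leave the ordered section.
                */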
               if (pops_done_msg_bounds.early_exit_needs_done_msg(block_idx, instr_idx)) {
                  bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
               }
               bld.sop1(aco_opcode::s_setpc_b64, instr->operands[0]);
               break;
            }
            case aco_opcode::p_interp_gfx11: {
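               /* GFX11+ interpolation: load the attribute data from LDS into a linear VGPR, then
                * either broadcast the wanted value with a DPP mov (no barycentric coordinates) or
                * run the two-stage v_interp_p10/p2 sequence with the given coordinates.
                */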
               assert(instr->definitions[0].regClass() == v1 ||
                      instr->definitions[0].regClass() == v2b);
               assert(instr->operands[0].regClass() == v1.as_linear());
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               assert(instr->operands.back().physReg() == m0);
               Definition dst = instr->definitions[0];
               PhysReg lin_vgpr = instr->operands[0].physReg();
               unsigned attribute = instr->operands[1].constantValue();
               unsigned component = instr->operands[2].constantValue();
               uint16_t dpp_ctrl = 0;
               bool high_16bits = false;
               Operand coord1, coord2;
               if (instr->operands.size() == 7) {
                  assert(instr->operands[3].isConstant());
                  high_16bits = instr->operands[3].constantValue();
                  assert(instr->operands[4].regClass() == v1);
                  assert(instr->operands[5].regClass() == v1);
                  coord1 = instr->operands[4];
                  coord2 = instr->operands[5];
               } else {
                  assert(instr->operands[3].isConstant());
                  dpp_ctrl = instr->operands[3].constantValue();
               }

               bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, v1), Operand(m0, s1),
                          attribute, component);

               Operand p(lin_vgpr, v1);
               Operand dst_op(dst.physReg(), v1);
               if (instr->operands.size() == 5) {
                  bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
               } else if (dst.regClass() == v2b) {
                  bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, Definition(dst), p,
                                    coord1, p, high_16bits ? 0x5 : 0);
                  bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, Definition(dst), p,
                                    coord2, dst_op, high_16bits ? 0x1 : 0);
               } else {
                  bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, Definition(dst), p, coord1,
                                    p);
                  bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, coord2,
                                    dst_op);
               }
               break;
            }
            case aco_opcode::p_dual_src_export_gfx11: {
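               /* GFX11 dual-source blending: in whole-quad mode, exchange src0/src1 across
                * even/odd lane pairs with v_cndmask+DPP (see the table below) so that the two
                * dual-source exports get their expected per-lane layout, then export both.
                */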
               PhysReg dst0 = instr->definitions[0].physReg();
               PhysReg dst1 = instr->definitions[1].physReg();
               Definition exec_tmp = instr->definitions[2];
               Definition not_vcc_tmp = instr->definitions[3];
               Definition clobber_vcc = instr->definitions[4];
               Definition clobber_scc = instr->definitions[5];

               assert(exec_tmp.regClass() == bld.lm);
               assert(not_vcc_tmp.regClass() == bld.lm);
               assert(clobber_vcc.regClass() == bld.lm && clobber_vcc.physReg() == vcc);
               assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);

               bld.sop1(Builder::s_mov, Definition(exec_tmp.physReg(), bld.lm),
                        Operand(exec, bld.lm));
               bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
                        Operand(exec, bld.lm));

               uint8_t enabled_channels = 0;
               Operand mrt0[4], mrt1[4];

               copy_constant_sgpr(bld, clobber_vcc, 0x5555'5555'5555'5555ull);

               Operand src_even = Operand(clobber_vcc.physReg(), bld.lm);

               bld.sop1(Builder::s_not, not_vcc_tmp, clobber_scc, src_even);

               Operand src_odd = Operand(not_vcc_tmp.physReg(), bld.lm);

               for (unsigned i = 0; i < 4; i++) {
                  if (instr->operands[i].isUndefined() && instr->operands[i + 4].isUndefined()) {
                     mrt0[i] = instr->operands[i];
                     mrt1[i] = instr->operands[i + 4];
                     continue;
                  }

                  Operand src0 = instr->operands[i];
                  Operand src1 = instr->operands[i + 4];

                  /*      | even lanes | odd lanes
                   * mrt0 | src0 even  | src1 even
                   * mrt1 | src0 odd   | src1 odd
                   */
                  bld.vop2_dpp(aco_opcode::v_cndmask_b32, Definition(dst0, v1), src1, src0,
                               src_even, dpp_row_xmask(1));
                  bld.vop2_e64_dpp(aco_opcode::v_cndmask_b32, Definition(dst1, v1), src0, src1,
                                   src_odd, dpp_row_xmask(1));

                  mrt0[i] = Operand(dst0, v1);
                  mrt1[i] = Operand(dst1, v1);

                  enabled_channels |= 1 << i;

                  dst0 = dst0.advance(4);
                  dst1 = dst1.advance(4);
               }

               bld.sop1(Builder::s_mov, Definition(exec, bld.lm),
                        Operand(exec_tmp.physReg(), bld.lm));

               /* Force export all channels when everything is undefined. */
               if (!enabled_channels)
                  enabled_channels = 0xf;

               bld.exp(aco_opcode::exp, mrt0[0], mrt0[1], mrt0[2], mrt0[3], enabled_channels,
                       V_008DFC_SQ_EXP_MRT + 21, false);
               bld.exp(aco_opcode::exp, mrt1[0], mrt1[1], mrt1[2], mrt1[3], enabled_channels,
                       V_008DFC_SQ_EXP_MRT + 22, false);
               break;
            }
            case aco_opcode::p_end_with_regs: {
               end_with_regs_block_index = block->index;
               break;
            }
            case aco_opcode::p_shader_cycles_hi_lo_hi: {
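               /* Read SHADER_CYCLES hi, then lo, then hi again with s_getreg_b32 (the immediate
                * encodes the hwreg id, bit offset 0 and size 32); reading the high half twice
                * allows detecting a wrap of the low half between the two reads.
                */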
               unsigned shader_cycles_lo = 29;
               unsigned shader_cycles_hi = 30;
               bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[0],
                        ((32 - 1) << 11) | shader_cycles_hi);
               bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[1],
                        ((32 - 1) << 11) | shader_cycles_lo);
               bld.sopk(aco_opcode::s_getreg_b32, instr->definitions[2],
                        ((32 - 1) << 11) | shader_cycles_hi);
               break;
            }
            default: break;
            }
         } else if (instr->isReduction()) {
            Pseudo_reduction_instruction& reduce = instr->reduction();
            emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
                           reduce.operands[1].physReg(),    // tmp
                           reduce.definitions[1].physReg(), // stmp
                           reduce.operands[2].physReg(),    // vtmp
                           reduce.definitions[2].physReg(), // sitmp
                           reduce.operands[0], reduce.definitions[0]);
         } else if (instr->isBarrier()) {
            Pseudo_barrier_instruction& barrier = instr->barrier();

            /* Anything larger than a workgroup isn't possible. Anything
             * smaller requires no instructions and this pseudo instruction
             * would only be included to control optimizations. */
            bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
                                  program->workgroup_size > program->wave_size;

            bld.insert(std::move(instr));
            if (emit_s_barrier && ctx.program->gfx_level >= GFX12) {
               bld.sop1(aco_opcode::s_barrier_signal, Operand::c32(-1));
               bld.sopp(aco_opcode::s_barrier_wait, UINT16_MAX);
            } else if (emit_s_barrier) {
               bld.sopp(aco_opcode::s_barrier);
            }
         } else if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtne ||
                    instr->opcode == aco_opcode::p_s_cvt_f16_f32_rtne) {
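            /* Force round-to-nearest-even for this conversion by temporarily switching the float
             * mode, then restore the block's mode afterwards.
             */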
            float_mode new_mode = block->fp_mode;
            new_mode.round16_64 = fp_round_ne;
            bool set_round = new_mode.round != block->fp_mode.round;

            emit_set_mode(bld, new_mode, set_round, false);

            if (instr->opcode == aco_opcode::p_v_cvt_f16_f32_rtne)
               instr->opcode = aco_opcode::v_cvt_f16_f32;
            else
               instr->opcode = aco_opcode::s_cvt_f16_f32;
            ctx.instructions.emplace_back(std::move(instr));

            emit_set_mode(bld, block->fp_mode, set_round, false);
         } else if (instr->opcode == aco_opcode::p_v_cvt_pk_u8_f32) {
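            /* v_cvt_pk_u8_f32 writes only the byte selected by its second operand, so pass the
             * destination byte index and the current destination value to merge into.
             */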
            Definition def = instr->definitions[0];
            VALU_instruction& valu =
               bld.vop3(aco_opcode::v_cvt_pk_u8_f32, def, instr->operands[0],
                        Operand::c32(def.physReg().byte()), Operand(def.physReg(), v1))
                  ->valu();
            valu.abs = instr->valu().abs;
            valu.neg = instr->valu().neg;
         } else if (instr->isMIMG() && instr->mimg().strict_wqm) {
            lower_image_sample(&ctx, instr);
            ctx.instructions.emplace_back(std::move(instr));
         } else {
            ctx.instructions.emplace_back(std::move(instr));
         }
      }

      /* Send the ordered section done message from this block if it's needed in this block, but
       * instr_after_end_idx() points beyond the end of its instructions. This may commonly happen
       * if the common post-dominator of multiple end locations turns out to be an empty block.
       */
      if (block_idx == pops_done_msg_bounds.end_block_idx() &&
          pops_done_msg_bounds.instr_after_end_idx() >= block->instructions.size()) {
         bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
      }

      block->instructions = std::move(ctx.instructions);
   }

   /* If the block with p_end_with_regs is not the last block (p_exit_early_if_not may have
    * appended a discard exit block at the end of the program), create an exit block for it to
    * branch to.
    */
   int last_block_index = program->blocks.size() - 1;
   if (end_with_regs_block_index >= 0 && end_with_regs_block_index != last_block_index) {
      Block* exit_block = program->create_and_insert_block();
      Block* end_with_regs_block = &program->blocks[end_with_regs_block_index];
      exit_block->linear_preds.push_back(end_with_regs_block->index);
      end_with_regs_block->linear_succs.push_back(exit_block->index);

      Builder bld(program, end_with_regs_block);
      bld.sopp(aco_opcode::s_branch, exit_block->index);

      /* Move the end_with_regs kind to the exit block so the waitcnt insertion pass adds the
       * waitcnt there; otherwise it would be placed after the s_branch, where it would never
       * execute.
       */
      end_with_regs_block->kind &= ~block_kind_end_with_regs;
      exit_block->kind |= block_kind_end_with_regs;
   }

   program->progress = CompilationProgress::after_lower_to_hw;
}

} // namespace aco