• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 #include "helpers.h"
7 
8 using namespace aco;
9 
/*
 * to_hw_instr.swap_subdword: emits p_parallelcopy pseudo-instructions whose
 * definitions and operands are byte- or word-sized regions of v0, v1, v128 and
 * v129, once for each of GFX8, GFX9 and GFX11. Operand i is the source of
 * definition i. The comment lines in front of each case spell out the
 * instruction sequence expected for that case on each level.
 * NOTE(review): the `//~`, `//!` and `//>>` comments look like directives read
 * by the test harness rather than documentation -- confirm before rewording
 * any of them.
 */
10 BEGIN_TEST(to_hw_instr.swap_subdword)
      /* PhysReg 256 and 257 print as v[0] and v[1] in the expected output,
       * 256 + 128 and 256 + 129 as v[128] and v[129]. Every alias starts at
       * byte 0 of its register; the reg_b adjustments below move it. */
11    PhysReg v0_lo{256};
12    PhysReg v0_hi{256};
13    PhysReg v0_b1{256};
14    PhysReg v0_b3{256};
15    PhysReg v1_lo{257};
16    PhysReg v1_hi{257};
17    PhysReg v1_b1{257};
18    PhysReg v1_b3{257};
19    PhysReg v128_lo{256 + 128};
20    PhysReg v128_hi{256 + 128};
21    PhysReg v129_lo{256 + 129};
22    PhysReg v129_hi{256 + 129};
      /* _b1, _hi and _b3 start 1, 2 and 3 bytes into the dword, which the
       * expected output shows as bit offsets 8, 16 and 24. */
23    v0_hi.reg_b += 2;
24    v1_hi.reg_b += 2;
25    v0_b1.reg_b += 1;
26    v1_b1.reg_b += 1;
27    v0_b3.reg_b += 3;
28    v1_b3.reg_b += 3;
29    v128_hi.reg_b += 2;
30    v129_hi.reg_b += 2;
31 
32    for (amd_gfx_level lvl : {GFX8, GFX9, GFX11}) {
33       if (!setup_cs(NULL, lvl))
34          continue;
35 
         /* Case 0: exchange the two 16-bit halves of v0. */
36       //~gfx(8|9|11)>> p_unit_test 0
37       //~gfx8! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
38       //~gfx(9|11)! v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16]
39       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
40       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
41                  Operand(v0_hi, v2b), Operand(v0_lo, v2b));
42 
         /* Case 1: v0 = v1 (32-bit) while v1[0:16] = v0[0:16]. */
43       //~gfx(8|9|11)! p_unit_test 1
44       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
45       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
46       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
47       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
48       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
49       //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
50       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
51       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
52                  Operand(v1_lo, v1), Operand(v0_lo, v2b));
53 
         /* Case 2: v0 = v1 (32-bit) while both halves of v1 receive v0[0:16]. */
54       //~gfx(8|9|11)! p_unit_test 2
55       //~gfx[89]! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
56       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_sel:uword1 dst_preserve src0_sel:uword0
57       //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
58       //~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
59       //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
60       //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi
61       //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi
62       //~gfx11! v2b: %0:v[0][0:16],  v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][0:16]
63       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
64       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
65                  Definition(v1_hi, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b),
66                  Operand(v0_lo, v2b));
67 
         /* Case 3: v0 = v1 (32-bit) while v1[24:32] = v0[24:32]. */
68       //~gfx(8|9|11)! p_unit_test 3
69       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
70       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
71       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
72       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
73       //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0
74       //~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2
75       //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 %0:v[0][0:16]
76       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504
77       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
78       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_b3, v1b),
79                  Operand(v1_lo, v1), Operand(v0_b3, v1b));
80 
         /* Case 4: v0 = v1 (32-bit) while v1[0:8] = v0[0:8]. */
81       //~gfx(8|9|11)! p_unit_test 4
82       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
83       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
84       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
85       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
86       //~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
87       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
88       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104
89       //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
90       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
91       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b),
92                  Operand(v1_lo, v1), Operand(v0_lo, v1b));
93 
         /* Case 5: v0[0:8] = v1[0:8] and v0[16:24] = v1[16:24] while v1 = v0 (32-bit). */
94       //~gfx(8|9|11)! p_unit_test 5
95       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
96       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1]
97       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
98       //~gfx(9|11)! v1: %0:v[1],  v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
99       //~gfx[89]! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
100       //~gfx[89]! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
101       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060104
102       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x3060504
103       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
104       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v0_hi, v1b),
105                  Definition(v1_lo, v1), Operand(v1_lo, v1b), Operand(v1_hi, v1b),
106                  Operand(v0_lo, v1));
107 
          /* Case 6: each half of v0 comes from the matching half of v1 while v1 = v0 (32-bit). */
108       //~gfx(8|9|11)! p_unit_test 6
109       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
110       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
111       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
112       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
113       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
114       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
115                  Definition(v1_lo, v1), Operand(v1_lo, v2b), Operand(v1_hi, v2b),
116                  Operand(v0_lo, v1));
117 
          /* Case 7: as case 6, but the halves of v1 are crossed on their way into v0. */
118       //~gfx(8|9|11)! p_unit_test 7
119       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
120       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1]
121       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
122       //~gfx(9|11)! v1: %0:v[1],  v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
123       //~gfx(8|9|11)! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
124       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
125       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
126                  Definition(v1_lo, v1), Operand(v1_hi, v2b), Operand(v1_lo, v2b),
127                  Operand(v0_lo, v1));
128 
          /* Case 8: exchange the low three bytes (v3b) of v0 and v1. */
129       //~gfx(8|9|11)! p_unit_test 8
130       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
131       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
132       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
133       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
134       //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
135       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
136       //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
137       //~gfx11! v2b: %0:v[0][0:16],  v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
138       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
139       //~gfx11! v2b: %0:v[0][0:16],  v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
140       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
141       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b),
142                  Operand(v1_lo, v3b), Operand(v0_lo, v3b));
143 
          /* Case 9: case 8 plus v0[24:32] = v1[24:32]. */
144       //~gfx(8|9|11)! p_unit_test 9
145       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
146       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
147       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
148       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
149       //~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
150       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x3060504
151       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
152       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b),
153                  Definition(v0_b3, v1b), Operand(v1_lo, v3b), Operand(v0_lo, v3b),
154                  Operand(v1_b3, v1b));
155 
          /* Case 10: exchange the 16-bit regions that start at byte 1 of v0 and v1. */
156       //~gfx(8|9|11)! p_unit_test 10
157       //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
158       //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
159       //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
160       //~gfx11! v2b: %0:v[0][16:32],  v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi
161       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
162       //~gfx11! v2b: %0:v[0][16:32],  v2b: %0:v[1][0:16] = v_swap_b16 %0:v[1][0:16], %0:v[0][16:32] opsel_hi
163       //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
164       //~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
165       //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
166       //~gfx11! v2b: %0:v[0][0:16],  v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
167       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506
168       //~gfx11! v2b: %0:v[0][0:16],  v2b: %0:v[1][16:32] = v_swap_b16 hi(%0:v[1][16:32]), %0:v[0][0:16]
169       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
170       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b),
171                  Operand(v1_b1, v2b), Operand(v0_b1, v2b));
172 
          /* Case 11: v0 = constant 42 (32-bit) while v1[0:16] = v0[16:32]. */
173       //~gfx(8|9|11)! p_unit_test 11
174       //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1
175       //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 hi(%0:v[0][16:32])
176       //~gfx(8|9|11)! v1: %0:v[0] = v_mov_b32 42
177       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
178       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
179                  Operand::c32(42u), Operand(v0_hi, v2b));
180 
          /* Case 12: exchange bytes 1 and 3 of v0. */
181       //~gfx(8|9|11)! p_unit_test 12
182       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
183       //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
184       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
185       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
186       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
187       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v1b), Definition(v0_b3, v1b),
188                  Operand(v0_b3, v1b), Operand(v0_b1, v1b));
189 
          /* Case 13: exchange v128[0:16] with v129[16:32]. */
190       //~gfx(8|9|11)! p_unit_test 13
191       //~gfx[89]! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0
192       //~gfx[89]! v2b: %0:v[128][0:16] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword0 dst_preserve src0_sel:uword1 src1_sel:uword0
193       //~gfx[89]! v2b: %0:v[129][16:32] = v_xor_b32 %0:v[129][16:32], %0:v[128][0:16] dst_sel:uword1 dst_preserve src0_sel:uword1 src1_sel:uword0
194       //~gfx11! v2b: %0:v[128][0:16] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16]
195       //~gfx11! v2b: %0:v[129][16:32] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16] opsel_hi
196       //~gfx11! v2b: %0:v[128][0:16] = v_xor_b16 hi(%0:v[129][16:32]), %0:v[128][0:16]
197       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
198       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v128_lo, v2b), Definition(v129_hi, v2b),
199                  Operand(v129_hi, v2b), Operand(v128_lo, v2b));
200 
          /* Case 14: exchange v128[16:32] with v129[0:16]. */
201       //~gfx(8|9|11)! p_unit_test 14
202       //~gfx[89]! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1
203       //~gfx[89]! v2b: %0:v[128][16:32] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword1 dst_preserve src0_sel:uword0 src1_sel:uword1
204       //~gfx[89]! v2b: %0:v[129][0:16] = v_xor_b32 %0:v[129][0:16], %0:v[128][16:32] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword1
205       //~gfx11! v2b: %0:v[128][16:32] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32]) opsel_hi
206       //~gfx11! v2b: %0:v[129][0:16] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32])
207       //~gfx11! v2b: %0:v[128][16:32] = v_xor_b16 %0:v[129][0:16], hi(%0:v[128][16:32]) opsel_hi
208       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
209       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v128_hi, v2b), Definition(v129_lo, v2b),
210                  Operand(v129_lo, v2b), Operand(v128_hi, v2b));
211 
212       //~gfx11! s_sendmsg sendmsg(dealloc_vgprs)
213       //~gfx(8|9|11)! s_endpgm
214 
215       finish_to_hw_instr_test();
216    }
217 END_TEST
218 
/*
 * to_hw_instr.subdword_constant: p_parallelcopy cases in which at least one
 * source is a constant (Operand::c16, Operand::c8 or Operand::zero), run for
 * GFX9, GFX10 and GFX11. Destinations include byte- and word-sized regions of
 * v0 and v1.
 * NOTE(review): the `//!`, `//~` and `//>>` comment lines look like
 * expected-output directives for the test harness -- confirm before editing.
 */
219 BEGIN_TEST(to_hw_instr.subdword_constant)
       /* PhysReg 256 and 257 print as v[0] and v[1] in the expected output. The
        * _hi aliases start 2 bytes into their register, v0_b1 starts 1 byte in. */
220    PhysReg v0_lo{256};
221    PhysReg v0_hi{256};
222    PhysReg v0_b1{256};
223    PhysReg v1_lo{257};
224    PhysReg v1_hi{257};
225    v0_hi.reg_b += 2;
226    v0_b1.reg_b += 1;
227    v1_hi.reg_b += 2;
228 
229    for (amd_gfx_level lvl : {GFX9, GFX10, GFX11}) {
230       if (!setup_cs(NULL, lvl))
231          continue;
232 
233       /* 16-bit pack */
          /* The 16-bit constant 0x3800 is printed as 0.5 in the expected output. */
234       //>> p_unit_test 0
235       //! v1: %_:v[0] = v_pack_b32_f16 0.5, hi(%_:v[1][16:32])
236       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
237       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
238                  Operand::c16(0x3800), Operand(v1_hi, v2b));
239 
240       //! p_unit_test 1
241       //~gfx9! v2b: %0:v[0][16:32] = v_and_b32 0xffff0000, %0:v[1][16:32]
242       //~gfx9! v1: %0:v[0] = v_or_b32 0x4205, %0:v[0]
243       //~gfx(10|11)! v1: %_:v[0] = v_pack_b32_f16 0x4205, hi(%_:v[1][16:32])
244       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
245       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
246                  Operand::c16(0x4205), Operand(v1_hi, v2b));
247 
          /* Case 2: the high half of v0 is written from v0's own low half,
           * which the constant overwrites in the same copy. */
248       //! p_unit_test 2
249       //~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
250       //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
251       //~gfx(10|11)! v1: %0:v[0] = v_pack_b32_f16 0x4205, %0:v[0][0:16]
252       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
253       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
254                  Operand::c16(0x4205), Operand(v0_lo, v2b));
255 
          /* Cases 3-5: both halves are constants; a single 32-bit move is expected. */
256       //! p_unit_test 3
257       //! v1: %_:v[0] = v_mov_b32 0x3c003800
258       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
259       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
260                  Operand::c16(0x3800), Operand::c16(0x3c00));
261 
262       //! p_unit_test 4
263       //! v1: %_:v[0] = v_mov_b32 0x43064205
264       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
265       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
266                  Operand::c16(0x4205), Operand::c16(0x4306));
267 
268       //! p_unit_test 5
269       //! v1: %_:v[0] = v_mov_b32 0x38004205
270       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
271       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
272                  Operand::c16(0x4205), Operand::c16(0x3800));
273 
274       /* 16-bit copy */
275       //! p_unit_test 6
276       //~gfx(9|10)! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:dword
277       //~gfx11! v2b: %0:v[0][0:16] = v_add_f16 0.5, 0
278       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
279       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x3800));
280 
281       //! p_unit_test 7
282       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
283       //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
284       //~gfx10! v2b: %_:v[0][0:16] = v_add_u16_e64 0x4205, 0
285       //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0x4205
286       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
287       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x4205));
288 
289       //! p_unit_test 8
290       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
291       //~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0]
292       //~gfx10! v2b: %_:v[0][16:32] = v_add_u16_e64 0x4205, 0 opsel_hi
293       //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0x4205 opsel_hi
294       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
295       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0x4205));
296 
          /* Cases 9 and 10: the 16-bit destination starts at byte 1 of v0; the
           * expected output writes it as two separate byte-sized results. */
297       //! p_unit_test 9
298       //~gfx(9|10)! v1b: %_:v[0][8:16] = v_mov_b32 0 dst_sel:ubyte1 dst_preserve src0_sel:dword
299       //~gfx(9|10)! v1b: %_:v[0][16:24] = v_mov_b32 56 dst_sel:ubyte2 dst_preserve src0_sel:dword
300       //~gfx11! v1b: %_:v[0][8:16] = v_cvt_pk_u8_f32 0, 1, %_:v[0]
301       //~gfx11! v1b: %_:v[0][16:24] = v_cvt_pk_u8_f32 0x42600000, 2, %_:v[0]
302       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
303       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Operand::c16(0x3800));
304 
305       //! p_unit_test 10
306       //~gfx(9|10)! v1b: %_:v[0][8:16] = v_mov_b32 5 dst_sel:ubyte1 dst_preserve src0_sel:dword
307       //~gfx(9|10)! v1b: %_:v[0][16:24] = v_mul_u32_u24 2, 33 dst_sel:ubyte2 dst_preserve src0_sel:dword src1_sel:dword
308       //~gfx11! v1b: %_:v[0][8:16] = v_cvt_pk_u8_f32 0x40a00000, 1, %_:v[0]
309       //~gfx11! v1b: %_:v[0][16:24] = v_cvt_pk_u8_f32 0x42840000, 2, %_:v[0]
310       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
311       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Operand::c16(0x4205));
312 
313       /* 8-bit copy */
314       //! p_unit_test 11
315       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mul_u32_u24 2, 33 dst_sel:ubyte0 dst_preserve src0_sel:dword src1_sel:dword
316       //~gfx11! v1b: %_:v[0][0:8] = v_cvt_pk_u8_f32 0x42840000, 0, %_:v[0]
317       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
318       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::c8(0x42));
319 
320       /* 32-bit and 8-bit copy */
321       //! p_unit_test 12
322       //! v1: %_:v[0] = v_mov_b32 0
323       //~gfx(9|10)! v1b: %_:v[1][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword
324       //~gfx11! v1b: %_:v[1][0:8] = v_cvt_pk_u8_f32 0, 0, %_:v[1]
325       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
326       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b),
327                  Operand::zero(), Operand::zero(1));
328 
          /* Cases 13-17: single constant copies of 0x00ff, 0xff00, zero and 0xff
           * into a low or high half or the low byte of v0. */
329       //! p_unit_test 13
330       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
331       //~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0]
332       //~gfx10! v2b: %_:v[0][0:16] = v_add_u16_e64 0xff, 0
333       //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0xff
334       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
335       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x00ff));
336 
337       //! p_unit_test 14
338       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
339       //~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0]
340       //~gfx10! v2b: %_:v[0][16:32] = v_add_u16_e64 0xff00, 0 opsel_hi
341       //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0xffffff00 opsel_hi
342       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
343       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0xff00));
344 
345       //! p_unit_test 15
346       //~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword
347       //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0
348       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
349       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::zero(2));
350 
351       //! p_unit_test 16
352       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 -1 dst_sel:ubyte0 dst_preserve src0_sel:dword
353       //~gfx11! v1b: %_:v[0][0:8] = v_cvt_pk_u8_f32 0x437f0000, 0, %_:v[0]
354       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16u));
355       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::c8(0xff));
356 
357       //! p_unit_test 17
358       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword
359       //~gfx11! v1b: %_:v[0][0:8] = v_cvt_pk_u8_f32 0, 0, %_:v[0]
360       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17u));
361       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::zero(1));
362 
363       //~gfx11! s_sendmsg sendmsg(dealloc_vgprs)
364       //! s_endpgm
365 
366       finish_to_hw_instr_test();
367    }
368 END_TEST
369 
/*
 * to_hw_instr.self_intersecting_swap: GFX9-only case with a single
 * p_parallelcopy in which the 64-bit destination v[1:2] is written from
 * v[2:3] (so it shares v2 with its own source) while v3 and v7 are written
 * from v7 and v1. The expected lowering is the three v_swap_b32 instructions
 * listed below.
 * NOTE(review): the `//!` and `//>>` comment lines look like expected-output
 * directives for the test harness -- confirm before editing.
 */
370 BEGIN_TEST(to_hw_instr.self_intersecting_swap)
       /* Nothing is emitted if the GFX9 setup fails. */
371    if (!setup_cs(NULL, GFX9))
372       return;
373 
       /* PhysReg 257, 258, 259 and 263 print as v[1], v[2], v[3] and v[7]. */
374    PhysReg reg_v1{257};
375    PhysReg reg_v2{258};
376    PhysReg reg_v3{259};
377    PhysReg reg_v7{263};
378 
379    //>> p_unit_test 0
380    //! v1: %0:v[1],  v1: %0:v[2] = v_swap_b32 %0:v[2], %0:v[1]
381    //! v1: %0:v[2],  v1: %0:v[3] = v_swap_b32 %0:v[3], %0:v[2]
382    //! v1: %0:v[3],  v1: %0:v[7] = v_swap_b32 %0:v[7], %0:v[3]
383    //! s_endpgm
384    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
       /* Definitions and operands pair up positionally, giving: */
385    // v[1:2] = v[2:3]
386    // v3 = v7
387    // v7 = v1
388    bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v1, v2), Definition(reg_v3, v1),
389               Definition(reg_v7, v1), Operand(reg_v2, v2), Operand(reg_v7, v1),
390               Operand(reg_v1, v1));
391 
392    finish_to_hw_instr_test();
393 END_TEST
394 
395 BEGIN_TEST(to_hw_instr.extract)
396    PhysReg s0_lo{0};
397    PhysReg s1_lo{1};
398    PhysReg v0_lo{256};
399    PhysReg v1_lo{257};
400 
401    for (amd_gfx_level lvl : {GFX7, GFX8, GFX9, GFX11}) {
402       for (unsigned is_signed = 0; is_signed <= 1; is_signed++) {
403          if (!setup_cs(NULL, lvl, CHIP_UNKNOWN, is_signed ? "_signed" : "_unsigned"))
404             continue;
405 
406 #define EXT(idx, size)                                                                             \
407    bld.pseudo(aco_opcode::p_extract, Definition(v0_lo, v1), Operand(v1_lo, v1), Operand::c32(idx), \
408               Operand::c32(size), Operand::c32(is_signed));
409 
410          //; funcs['v_bfe'] = lambda _: 'v_bfe_i32' if variant.endswith('_signed') else 'v_bfe_u32'
411          //; funcs['v_shr'] = lambda _: 'v_ashrrev_i32' if variant.endswith('_signed') else 'v_lshrrev_b32'
412          //; funcs['s_bfe'] = lambda _: 's_bfe_i32' if variant.endswith('_signed') else 's_bfe_u32'
413          //; funcs['s_shr'] = lambda _: 's_ashr_i32' if variant.endswith('_signed') else 's_lshr_b32'
414          //; funcs['byte'] = lambda n: '%cbyte%s' % ('s' if variant.endswith('_signed') else 'u', n)
415 
416          //>> p_unit_test 0
417          bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
418          //! v1: %_:v[0] = @v_bfe %_:v[1], 0, 8
419          EXT(0, 8)
420          //! v1: %_:v[0] = @v_bfe %_:v[1], 8, 8
421          EXT(1, 8)
422          //! v1: %_:v[0] = @v_bfe %_:v[1], 16, 8
423          EXT(2, 8)
424          //! v1: %_:v[0] = @v_shr 24, %_:v[1]
425          EXT(3, 8)
426          //~gfx(7|8|9)_.*! v1: %_:v[0] = @v_bfe %_:v[1], 0, 16
427          //~gfx11_unsigned! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1]
428          //~gfx11_signed! v1: %_:v[0] = v_cvt_i32_i16 %_:v[1]
429          EXT(0, 16)
430          //! v1: %_:v[0] = @v_shr 16, %_:v[1]
431          EXT(1, 16)
432 
433 #undef EXT
434 
435 #define EXT(idx, size)                                                                             \
436    bld.pseudo(aco_opcode::p_extract, Definition(s0_lo, s1), Definition(scc, s1),                   \
437               Operand(s1_lo, s1), Operand::c32(idx), Operand::c32(size), Operand::c32(is_signed));
438 
439          //>> p_unit_test 2
440          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
441          //~gfx.*_unsigned! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80000
442          //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i8 %_:s[1]
443          EXT(0, 8)
444          //! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80008
445          EXT(1, 8)
446          //! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80010
447          EXT(2, 8)
448          //! s1: %_:s[0],  s1: %_:scc = @s_shr %_:s[1], 24
449          EXT(3, 8)
450          //~gfx(7|8)_unsigned! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x100000
451          //~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0
452          //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i16 %_:s[1]
453          EXT(0, 16)
454          //~gfx(7,8)_unsigned! s1: %_:s[0],  s1: %_:scc = @s_shr %_:s[1], 16
455          //~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_hh_b32_b16 %_:s[1], 0
456          //~gfx.*_signed! s1: %_:s[0],  s1: %_:scc = @s_shr %_:s[1], 16
457          EXT(1, 16)
458 
459 #undef EXT
460 
461 #define EXT(idx, src_b)                                                                            \
462    bld.pseudo(aco_opcode::p_extract, Definition(v0_lo, v2b), Operand(v1_lo.advance(src_b), v2b),   \
463               Operand::c32(idx), Operand::c32(8u), Operand::c32(is_signed));
464 
465          //>> p_unit_test 4
466          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
467          //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0)
468          //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c00
469          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060000
470          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
471          if (lvl != GFX7)
472             EXT(0, 0)
473          //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2)
474          //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c02
475          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060202
476          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
477          if (lvl != GFX7)
478             EXT(0, 2)
479          //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1)
480          //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c01
481          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060801
482          if (lvl != GFX7)
483             EXT(1, 0)
484          //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3)
485          //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c03
486          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060903
487          if (lvl != GFX7)
488             EXT(1, 2)
489 
490 #undef EXT
491 
492          finish_to_hw_instr_test();
493 
494          //~gfx11_.*! s_sendmsg sendmsg(dealloc_vgprs)
495          //! s_endpgm
496       }
497    }
498 END_TEST
499 
500 BEGIN_TEST(to_hw_instr.insert)
501    PhysReg s0_lo{0};
502    PhysReg s1_lo{1};
503    PhysReg v0_lo{256};
504    PhysReg v1_lo{257};
505 
506    for (amd_gfx_level lvl : {GFX7, GFX8, GFX9, GFX11}) {
507       if (!setup_cs(NULL, lvl))
508          continue;
509 
510 #define INS(idx, size)                                                                             \
511    bld.pseudo(aco_opcode::p_insert, Definition(v0_lo, v1), Operand(v1_lo, v1), Operand::c32(idx),  \
512               Operand::c32(size));
513 
514       //>> p_unit_test 0
515       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
516       //! v1: %_:v[0] = v_bfe_u32 %_:v[1], 0, 8
517       INS(0, 8)
518       //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8
519       //~gfx7! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0]
520       //~gfx(8|9)! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte1 src0_sel:dword
521       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0c000c
522       INS(1, 8)
523       //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8
524       //~gfx7! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[0]
525       //~gfx(8|9)! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte2 src0_sel:dword
526       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000c0c
527       INS(2, 8)
528       //! v1: %0:v[0] = v_lshlrev_b32 24, %0:v[1]
529       INS(3, 8)
530       //! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 16
531       INS(0, 16)
532       //! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[1]
533       INS(1, 16)
534 
535 #undef INS
536 
537 #define INS(idx, size)                                                                             \
538    bld.pseudo(aco_opcode::p_insert, Definition(s0_lo, s1), Definition(scc, s1),                    \
539               Operand(s1_lo, s1), Operand::c32(idx), Operand::c32(size));
540 
541       //>> p_unit_test 1
542       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
543       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
544       INS(0, 8)
545       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
546       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[0], 8
547       INS(1, 8)
548       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
549       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[0], 16
550       INS(2, 8)
551       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[1], 24
552       INS(3, 8)
553       //~gfx(7|8)! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000
554       //~gfx(9|11)! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0
555       INS(0, 16)
556       //~gfx(7|8)! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[1], 16
557       //~gfx(9|11)! s1: %_:s[0] = s_pack_ll_b32_b16 0, %_:s[1]
558       INS(1, 16)
559 
560 #undef INS
561 
562 #define INS(idx, def_b, op_b)                                                                      \
563    bld.pseudo(aco_opcode::p_insert, Definition(v0_lo.advance(def_b), v2b),                         \
564               Operand(v1_lo.advance(op_b), v2b), Operand::c32(idx), Operand::c32(8u));
565 
566       //>> p_unit_test 2
567       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
568       //~gfx(8|9)! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:ubyte0
569       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c00
570       if (lvl != GFX7)
571          INS(0, 0, 0)
572       //~gfx(8|9)! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:ubyte0
573       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000504
574       if (lvl != GFX7)
575          INS(0, 2, 0)
576       //~gfx(8|9)! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:ubyte2
577       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c02
578       if (lvl != GFX7)
579          INS(0, 0, 2)
580       //~gfx(8|9)! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:ubyte2
581       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc020504
582       if (lvl != GFX7)
583          INS(0, 2, 2)
584       //~gfx8! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][0:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte0
585       //~gfx8! v2b: %0:v[0][0:16] = v_and_b32 0xffffff00, %0:v[1]
586       //~gfx9! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0
587       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706000c
588       if (lvl != GFX7)
589          INS(1, 0, 0)
590       //~gfx8! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][0:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte0
591       //~gfx8! v2b: %0:v[0][16:32] = v_and_b32 0xff00ffff, %0:v[1]
592       //~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0
593       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0504
594       if (lvl != GFX7)
595          INS(1, 2, 0)
596       //~gfx8! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][16:32] dst_sel:ubyte1 dst_preserve src0_sel:ubyte2
597       //~gfx8! v2b: %0:v[0][0:16] = v_and_b32 0xffffff00, %0:v[1]
598       //~gfx9! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte2
599       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706020c
600       if (lvl != GFX7)
601          INS(1, 0, 2)
602       //~gfx8! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][16:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte2
603       //~gfx8! v2b: %0:v[0][16:32] = v_and_b32 0xff00ffff, %0:v[1]
604       //~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte2
605       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x20c0504
606       if (lvl != GFX7)
607          INS(1, 2, 2)
608 #undef INS
609 
610       finish_to_hw_instr_test();
611 
612       //~gfx11! s_sendmsg sendmsg(dealloc_vgprs)
613       //! s_endpgm
614    }
615 END_TEST
616 
617 BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc)
       /* Checks the lowering of a one-dword linear-VGPR copy (v[1] -> v[0]) on GFX10 when m0 is
        * supplied as the scratch SGPR.
        *
        * Per the expected output below, the v_mov_b32 is emitted twice, with exec inverted by
        * s_not_b64 after each one. Since s_not_b64 also defines scc, scc is stashed in m0 before
        * the first inversion and rebuilt from m0 with s_cmp_lg_i32 at the very end.
        *
        * The check comments in this test are expected-output patterns for the test framework,
        * not free-form comments; do not reword them.
        */
618    if (!setup_cs(NULL, GFX10))
619       return;
620
       /* Physical registers 256 and 257 show up as v[0] and v[1] in the expected output. */
621    PhysReg v0_lo{256};
622    PhysReg v1_lo{257};
623
624    //>> p_unit_test 0
625    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
626
627    /* It would be better if the scc=s0 copy was done later, but handle_operands() is complex
628     * enough
629     */
       /* NOTE(review): the p_parallelcopy below only has the single v[1] -> v[0] pair and no
        * scc <- s0 copy, so the remark above seems to describe a different form of this test;
        * confirm whether it is stale.
        */
630
631    //! v1: %0:v[0] = v_mov_b32 %0:v[1]
632    //! s1: %0:m0 = s_mov_b32 %0:scc
633    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
634    //! v1: %0:v[0] = v_mov_b32 %0:v[1]
635    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
636    //! s1: %0:scc = s_cmp_lg_i32 %0:m0, 0
637    Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1.as_linear()),
638                                    Operand(v1_lo, v1.as_linear()));
       /* Hand the lowering m0 as its scratch register; the expectations above use it to hold scc. */
639    instr->pseudo().scratch_sgpr = m0;
640    instr->pseudo().needs_scratch_reg = true;
641
642    finish_to_hw_instr_test();
643 END_TEST
644 
645 BEGIN_TEST(to_hw_instr.swap_linear_vgpr)
       /* Checks the lowering of a swap between two one-dword linear VGPRs (v[0] <-> v[1]) on
        * GFX10.
        *
        * The expected output is a v_swap_b32 emitted twice, with exec inverted by s_not_b64 after
        * each one. scratch_sgpr is set to scc, and no separate scc save/restore is expected.
        *
        * The check comments in this test are expected-output patterns for the test framework,
        * not free-form comments; do not reword them.
        */
646    if (!setup_cs(NULL, GFX10))
647       return;
648
       /* Physical registers 256 and 257 show up as v[0] and v[1] in the expected output. */
649    PhysReg reg_v0{256};
650    PhysReg reg_v1{257};
651    RegClass v1_linear = v1.as_linear();
652
653    //>> p_unit_test 0
654    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
655
656    //! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
657    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
658    //! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
659    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
       /* Operands pair up with definitions by position: v[0] <- v[1] and v[1] <- v[0]. */
660    Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear),
661                                    Definition(reg_v1, v1_linear), Operand(reg_v1, v1_linear),
662                                    Operand(reg_v0, v1_linear));
       /* NOTE(review): scc as scratch_sgpr presumably means scc need not be preserved; confirm. */
663    instr->pseudo().scratch_sgpr = scc;
664    instr->pseudo().needs_scratch_reg = true;
665
666    finish_to_hw_instr_test();
667 END_TEST
668 
669 BEGIN_TEST(to_hw_instr.copy_linear_vgpr_v3)
       /* Checks the lowering of a three-dword linear-VGPR copy (v[4-6] -> v[0-2]) on GFX10.
        *
        * The expected output splits the copy into a 64-bit move (v_lshrrev_b64 with a shift of 0)
        * for v[0-1] plus a v_mov_b32 for v[2]. That pair is emitted twice, with exec inverted by
        * s_not_b64 after each pass.
        *
        * The check comments in this test are expected-output patterns for the test framework,
        * not free-form comments; do not reword them.
        */
670    if (!setup_cs(NULL, GFX10))
671       return;
672
       /* Physical registers 256 and 256 + 4 show up as v[0] and v[4] in the expected output. */
673    PhysReg reg_v0{256};
674    PhysReg reg_v4{256 + 4};
675    RegClass v3_linear = v3.as_linear();
676
677    //>> p_unit_test 0
678    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
679
680    //! v2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
681    //! v1: %0:v[2] = v_mov_b32 %0:v[6]
682    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
683    //! v2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
684    //! v1: %0:v[2] = v_mov_b32 %0:v[6]
685    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
686    Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v3_linear),
687                                    Operand(reg_v4, v3_linear));
       /* NOTE(review): scc as scratch_sgpr presumably means scc need not be preserved; confirm. */
688    instr->pseudo().scratch_sgpr = scc;
689    instr->pseudo().needs_scratch_reg = true;
690
691    finish_to_hw_instr_test();
692 END_TEST
693 
694 BEGIN_TEST(to_hw_instr.copy_linear_vgpr_coalesce)
       /* Checks that two adjacent one-dword linear-VGPR copies (v[4] -> v[0] and v[5] -> v[1])
        * on GFX10 are merged into one 64-bit move.
        *
        * The expected output is a single v_lshrrev_b64 with a shift of 0 from v[4-5] to v[0-1],
        * emitted twice with exec inverted by s_not_b64 after each one.
        *
        * The check comments in this test are expected-output patterns for the test framework,
        * not free-form comments; do not reword them.
        */
695    if (!setup_cs(NULL, GFX10))
696       return;
697
698    PhysReg reg_v0{256};
699    PhysReg reg_v1{256 + 1};
700    PhysReg reg_v4{256 + 4};
701    PhysReg reg_v5{256 + 5};
702    RegClass v1_linear = v1.as_linear();
703
704    //>> p_unit_test 0
705    //! v2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
706    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
707    //! v2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
708    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
709    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
710
       /* Two separate v1 definitions/operands, paired by position: v[0] <- v[4], v[1] <- v[5]. */
711    Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear),
712                                    Definition(reg_v1, v1_linear), Operand(reg_v4, v1_linear),
713                                    Operand(reg_v5, v1_linear));
       /* NOTE(review): scc as scratch_sgpr presumably means scc need not be preserved; confirm. */
714    instr->pseudo().scratch_sgpr = scc;
715    instr->pseudo().needs_scratch_reg = true;
716
717    finish_to_hw_instr_test();
718 END_TEST
719 
720 BEGIN_TEST(to_hw_instr.pack2x16_constant)
       /* Checks the lowering of p_parallelcopy instructions that write both 16-bit halves of v[0]
        * at once, one half from a half of v[1] and the other from a 16-bit constant, on GFX10 and
        * GFX11.
        *
        * Each case starts with a numbered p_unit_test marker. The expected output after the
        * marker is the one instruction the copy should lower to.
        *
        * The check comments in this test are expected-output patterns for the test framework,
        * not free-form comments; do not reword them.
        */
721    PhysReg v0_lo{256};
722    PhysReg v0_hi{256};
723    PhysReg v1_lo{257};
724    PhysReg v1_hi{257};
       /* The *_hi registers start two bytes into the dword, i.e. they name bits [16:32]. */
725    v0_hi.reg_b += 2;
726    v1_hi.reg_b += 2;
727
728    for (amd_gfx_level lvl : {GFX10, GFX11}) {
          /* Skip any level for which setup_cs() reports failure. */
729       if (!setup_cs(NULL, lvl))
730          continue;
731
732       /* prevent usage of v_pack_b32_f16 */
733       program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
734
          /* Case 0: v[0].lo <- v[1].hi, v[0].hi <- constant 0x3800. */
735       //>> p_unit_test 0
736       //! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
737       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
738       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
739                  Operand(v1_hi, v2b), Operand::c16(0x3800));
740
          /* Case 1: v[0].lo <- v[1].hi, v[0].hi <- 0. */
741       //! p_unit_test 1
742       //! v2b: %_:v[0][0:16] = v_lshrrev_b32 16, %_:v[1][16:32]
743       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
744       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
745                  Operand(v1_hi, v2b), Operand::zero(2));
746
          /* Case 2: v[0].lo <- v[1].lo, v[0].hi <- 0. Expected instruction differs per level. */
747       //! p_unit_test 2
748       //~gfx10! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
749       //~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16]
750       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
751       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
752                  Operand(v1_lo, v2b), Operand::zero(2));
753
          /* Case 3: v[0].lo <- 0, v[0].hi <- v[1].hi. */
754       //! p_unit_test 3
755       //! v2b: %_:v[0][16:32] = v_and_b32 0xffff0000, %_:v[1][16:32]
756       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
757       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
758                  Operand::zero(2), Operand(v1_hi, v2b));
759
          /* Case 4: v[0].lo <- 0, v[0].hi <- v[1].lo. */
760       //! p_unit_test 4
761       //! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16]
762       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
763       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
764                  Operand::zero(2), Operand(v1_lo, v2b));
765
          /* Expected program end after the last case; the s_sendmsg line is GFX11-only. */
766       //~gfx11! s_sendmsg sendmsg(dealloc_vgprs)
767       //! s_endpgm
768
769       finish_to_hw_instr_test();
770    }
771 END_TEST
772 
773 BEGIN_TEST(to_hw_instr.mov_b16_sgpr_src)
       /* Checks the lowering of a 16-bit extract from an SGPR into a VGPR on GFX11: element 1 of
        * s[0] (bits [16:32]) is extracted into the low half of v[0].
        *
        * Two things are verified. The printed output must contain a v_mov_b16 reading the high
        * half of s[0], and every v_mov_b16 left in block 0 must have the VOP3 form of VOP1.
        *
        * The check comments in this test are expected-output patterns for the test framework,
        * not free-form comments; do not reword them.
        */
774    if (!setup_cs(NULL, GFX11))
775       return;
776
777    //>> p_unit_test 0
778    //! v2b: %0:v[0][0:16] = v_mov_b16 hi(%0:s[0][16:32])
779    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
       /* PhysReg(256) is v[0] and PhysReg(0) is s[0] in the expected output above. */
780    bld.pseudo(aco_opcode::p_extract_vector, Definition(PhysReg(256), v2b), Operand(PhysReg(0), s1),
781               Operand::c32(1));
782
783    //! s_sendmsg sendmsg(dealloc_vgprs)
784    //! s_endpgm
785
786    finish_to_hw_instr_test();
787
       /* The printed form alone does not pin the instruction format, so inspect the lowered
        * instructions directly. NOTE(review): presumably the SGPR/high-half source is what makes
        * the VOP3 form mandatory; confirm.
        */
788    for (aco_ptr<Instruction>& instr : program->blocks[0].instructions) {
789       if (instr->opcode == aco_opcode::v_mov_b16 && instr->format != asVOP3(Format::VOP1)) {
790          fail_test("v_mov_b16 must be VOP3");
791          return;
792       }
793    }
794 END_TEST
795