• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 #include "helpers.h"
25 
26 using namespace aco;
27 
/*
 * to_hw_instr.swap_subdword
 *
 * Builds p_parallelcopy, p_create_vector and p_split_vector pseudo
 * instructions whose definitions and operands are sub-dword pieces (v1b, v2b,
 * v3b, v6b) or whole dwords of v0..v3, many of which swap or overlap each
 * other. Every case is introduced by a p_unit_test marker carrying the case
 * index; the check comments placed in front of a case list the instructions
 * expected for it. The first loop covers GFX6/GFX7, the second GFX8, GFX9 and
 * GFX11.
 */
28 BEGIN_TEST(to_hw_instr.swap_subdword)
      /* PhysReg 256..259 show up as v[0]..v[3] in the checks below. All
       * variants of a register start at byte 0; the _hi/_b1/_b3 ones are moved
       * to their byte offset right after the declarations. */
29    PhysReg v0_lo{256};
30    PhysReg v0_hi{256};
31    PhysReg v0_b1{256};
32    PhysReg v0_b3{256};
33    PhysReg v1_lo{257};
34    PhysReg v1_hi{257};
35    PhysReg v1_b1{257};
36    PhysReg v1_b3{257};
37    PhysReg v2_lo{258};
38    PhysReg v3_lo{259};
      /* _hi starts at byte 2 (printed as [16:32]), _b1 at byte 1, _b3 at byte 3. */
39    v0_hi.reg_b += 2;
40    v1_hi.reg_b += 2;
41    v0_b1.reg_b += 1;
42    v1_b1.reg_b += 1;
43    v0_b3.reg_b += 3;
44    v1_b3.reg_b += 3;
45 
      /* GFX6 and GFX7. A level for which setup_cs() returns false is skipped. */
46    for (unsigned i = GFX6; i <= GFX7; i++) {
47       if (!setup_cs(NULL, (amd_gfx_level)i))
48          continue;
49 
         /* 0: swap the low 16 bits of v0 and v1. */
50       //~gfx[67]>>  p_unit_test 0
51       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
52       //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
53       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
54       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
55       bld.pseudo(aco_opcode::p_parallelcopy,
56                  Definition(v0_lo, v2b), Definition(v1_lo, v2b),
57                  Operand(v1_lo, v2b), Operand(v0_lo, v2b));
58 
         /* 1: dword v0 = { v1.lo, v0.lo }. */
59       //~gfx[67]! p_unit_test 1
60       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
61       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
62       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
63       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
64       bld.pseudo(aco_opcode::p_create_vector,
65                  Definition(v0_lo, v1),
66                  Operand(v1_lo, v2b), Operand(v0_lo, v2b));
67 
         /* 2: 6-byte vector at v0 = { v1.lo, v0.lo, v2.lo }. */
68       //~gfx[67]! p_unit_test 2
69       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
70       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
71       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
72       //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16]
73       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
74       bld.pseudo(aco_opcode::p_create_vector,
75                  Definition(v0_lo, v6b), Operand(v1_lo, v2b),
76                  Operand(v0_lo, v2b), Operand(v2_lo, v2b));
77 
         /* 3: two-dword vector at v0 = { v1.lo, v0.lo, v2.lo, v3.lo }. */
78       //~gfx[67]! p_unit_test 3
79       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
80       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
81       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
82       //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[2][0:16]
83       //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[3][0:16], %0:v[1][16:32], 2
84       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
85       bld.pseudo(aco_opcode::p_create_vector,
86                  Definition(v0_lo, v2),
87                  Operand(v1_lo, v2b), Operand(v0_lo, v2b),
88                  Operand(v2_lo, v2b), Operand(v3_lo, v2b));
89 
         /* 4: two-dword vector at v0 = { v1.lo, v2.lo, v0.lo, v3.lo }. */
90       //~gfx[67]! p_unit_test 4
91       //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
92       //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[2][0:16], %0:v[1][16:32], 2
93       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
94       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:16], %0:v[0][16:32], 2
95       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
96       //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
97       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
98       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
99       bld.pseudo(aco_opcode::p_create_vector,
100                  Definition(v0_lo, v2),
101                  Operand(v1_lo, v2b), Operand(v2_lo, v2b),
102                  Operand(v0_lo, v2b), Operand(v3_lo, v2b));
103 
         /* 5: split dword v0 into v1.lo and v0.lo. */
104       //~gfx[67]! p_unit_test 5
105       //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
106       //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
107       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
108       bld.pseudo(aco_opcode::p_split_vector,
109                  Definition(v1_lo, v2b), Definition(v0_lo, v2b),
110                  Operand(v0_lo, v1));
111 
         /* 6: split the 6-byte vector at v0 into v1.lo, v0.lo and v2.lo. */
112       //~gfx[67]! p_unit_test 6
113       //~gfx[67]! v2b: %0:v[2][0:16] = v_mov_b32 %0:v[1][0:16]
114       //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
115       //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
116       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
117       bld.pseudo(aco_opcode::p_split_vector,
118                  Definition(v1_lo, v2b), Definition(v0_lo, v2b),
119                  Definition(v2_lo, v2b), Operand(v0_lo, v6b));
120 
         /* 7: split the two-dword vector at v0 into v1.lo, v0.lo, v2.lo, v3.lo. */
121       //~gfx[67]! p_unit_test 7
122       //~gfx[67]! v2b: %0:v[2][0:16] = v_mov_b32 %0:v[1][0:16]
123       //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
124       //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
125       //~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[2][16:32]
126       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
127       bld.pseudo(aco_opcode::p_split_vector,
128                  Definition(v1_lo, v2b), Definition(v0_lo, v2b),
129                  Definition(v2_lo, v2b), Definition(v3_lo, v2b),
130                  Operand(v0_lo, v2));
131 
         /* 8: split the two-dword vector at v0 into v1.lo, v2.lo, v0.lo, v3.lo. */
132       //~gfx[67]! p_unit_test 8
133       //~gfx[67]! v2b: %0:v[2][0:16] = v_lshrrev_b32 16, %0:v[0][16:32]
134       //~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
135       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
136       //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
137       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
138       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
139       bld.pseudo(aco_opcode::p_split_vector,
140                  Definition(v1_lo, v2b), Definition(v2_lo, v2b),
141                  Definition(v0_lo, v2b), Definition(v3_lo, v2b),
142                  Operand(v0_lo, v2));
143 
         /* 9: swap byte 0 of v0 with byte 0 of v1. */
144       //~gfx[67]! p_unit_test 9
145       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
146       //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
147       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
148       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
149       bld.pseudo(aco_opcode::p_parallelcopy,
150                  Definition(v0_lo, v1b), Definition(v1_lo, v1b),
151                  Operand(v1_lo, v1b), Operand(v0_lo, v1b));
152 
         /* 10: v0.lo = { v1 byte 0, v0 byte 0 }. */
153       //~gfx[67]! p_unit_test 10
154       //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
155       //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
156       //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
157       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
158       bld.pseudo(aco_opcode::p_create_vector,
159                  Definition(v0_lo, v2b),
160                  Operand(v1_lo, v1b), Operand(v0_lo, v1b));
161 
         /* 11: 3-byte vector at v0 = { v1 byte 0, v0 byte 0, v2 byte 0 }. */
162       //~gfx[67]! p_unit_test 11
163       //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
164       //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
165       //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
166       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
167       //~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
168       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
169       bld.pseudo(aco_opcode::p_create_vector,
170                  Definition(v0_lo, v3b), Operand(v1_lo, v1b),
171                  Operand(v0_lo, v1b), Operand(v2_lo, v1b));
172 
         /* 12: dword v0 = { v1 byte 0, v0 byte 0, v2 byte 0, v3 byte 0 }. */
173       //~gfx[67]! p_unit_test 12
174       //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
175       //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
176       //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
177       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
178       //~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
179       //~gfx[67]! v3b: %0:v[0][8:32] = v_lshlrev_b32 8, %0:v[0][0:24]
180       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:8], %0:v[0][8:32], 1
181       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
182       bld.pseudo(aco_opcode::p_create_vector,
183                  Definition(v0_lo, v1),
184                  Operand(v1_lo, v1b), Operand(v0_lo, v1b),
185                  Operand(v2_lo, v1b), Operand(v3_lo, v1b));
186 
         /* 13: dword v0 built from four copies of its own byte 0. m0 is set as
          * the scratch SGPR; the checks expect it to hold the 0x1000001
          * multiplier. */
187       //~gfx[67]! p_unit_test 13
188       //~gfx[67]! v1b: %0:v[0][0:8] = v_and_b32 0xff, %0:v[0][0:8]
189       //~gfx[67]! v2b: %0:v[0][0:16] = v_mul_u32_u24 0x101, %0:v[0][0:8]
190       //~gfx[67]! v2b: %0:v[0][0:16] = v_and_b32 0xffff, %0:v[0][0:16]
191       //~gfx[67]! v3b: %0:v[0][0:24] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8]
192       //~gfx[67]! v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24]
193       //~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001
194       //~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8]
195       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
196       Instruction* pseudo = bld.pseudo(aco_opcode::p_create_vector,
197                                        Definition(v0_lo, v1),
198                                        Operand(v0_lo, v1b), Operand(v0_lo, v1b),
199                                        Operand(v0_lo, v1b), Operand(v0_lo, v1b));
200       pseudo->pseudo().scratch_sgpr = m0;
201 
         /* 14: split v0.lo into v1 byte 0 and v0 byte 0. */
202       //~gfx[67]! p_unit_test 14
203       //~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8]
204       //~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16]
205       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
206       bld.pseudo(aco_opcode::p_split_vector,
207                  Definition(v1_lo, v1b), Definition(v0_lo, v1b),
208                  Operand(v0_lo, v2b));
209 
         /* 15: split dword v0 into byte 0 of v1, v0, v2 and v3. */
210       //~gfx[67]! p_unit_test 15
211       //~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8]
212       //~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16]
213       //~gfx[67]! v1b: %0:v[2][0:8] = v_lshrrev_b32 16, %0:v[1][16:24]
214       //~gfx[67]! v1b: %0:v[3][0:8] = v_lshrrev_b32 24, %0:v[1][24:32]
215       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
216       bld.pseudo(aco_opcode::p_split_vector,
217                  Definition(v1_lo, v1b), Definition(v0_lo, v1b),
218                  Definition(v2_lo, v1b), Definition(v3_lo, v1b),
219                  Operand(v0_lo, v1));
220 
221       //~gfx[67]! s_endpgm
222 
         /* NOTE(review): presumably runs the lowering and emits the program for
          * comparison against the checks above - defined outside this file. */
223       finish_to_hw_instr_test();
224    }
225 
      /* GFX8, GFX9 and GFX11. The checks here differ per level: gfx8/gfx9
       * lines carry dst_sel/src_sel modifiers, gfx9/gfx11 use v_swap_b32 and
       * gfx11 uses v_perm_b32 and 16-bit add/sub with opsel. */
226    for (amd_gfx_level lvl : {GFX8, GFX9, GFX11}) {
227       if (!setup_cs(NULL, lvl))
228          continue;
229 
         /* 0: swap the two halves of v0. */
230       //~gfx(8|9|11)>> p_unit_test 0
231       //~gfx8! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
232       //~gfx(9|11)! v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16]
233       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
234       bld.pseudo(aco_opcode::p_parallelcopy,
235                  Definition(v0_lo, v2b), Definition(v0_hi, v2b),
236                  Operand(v0_hi, v2b), Operand(v0_lo, v2b));
237 
         /* 1: v0 <- v1 (dword), v1.lo <- v0.lo. */
238       //~gfx(8|9|11)! p_unit_test 1
239       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
240       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
241       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
242       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
243       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
244       //~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi
245       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
246       bld.pseudo(aco_opcode::p_parallelcopy,
247                  Definition(v0_lo, v1), Definition(v1_lo, v2b),
248                  Operand(v1_lo, v1), Operand(v0_lo, v2b));
249 
         /* 2: v0 <- v1 (dword), v1.lo <- v0.lo and v1.hi <- v0.lo. */
250       //~gfx(8|9|11)! p_unit_test 2
251       //~gfx[89]! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
252       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_sel:uword1 dst_preserve src0_sel:uword0
253       //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
254       //~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
255       //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
256       //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[1][16:32]), 0 opsel_hi
257       //~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 %0:v[0][0:16], 0 opsel_hi
258       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
259       //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
260       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
261       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
262       bld.pseudo(aco_opcode::p_parallelcopy,
263                  Definition(v0_lo, v1), Definition(v1_lo, v2b), Definition(v1_hi, v2b),
264                  Operand(v1_lo, v1), Operand(v0_lo, v2b), Operand(v0_lo, v2b));
265 
         /* 3: v0 <- v1 (dword), v1 byte 3 <- v0 byte 3. */
266       //~gfx(8|9|11)! p_unit_test 3
267       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
268       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
269       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
270       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
271       //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0
272       //~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2
273       //~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 %0:v[0][0:16], 0
274       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504
275       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
276       bld.pseudo(aco_opcode::p_parallelcopy,
277                  Definition(v0_lo, v1), Definition(v1_b3, v1b),
278                  Operand(v1_lo, v1), Operand(v0_b3, v1b));
279 
         /* 4: v0 <- v1 (dword), v1 byte 0 <- v0 byte 0. */
280       //~gfx(8|9|11)! p_unit_test 4
281       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
282       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
283       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
284       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
285       //~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
286       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
287       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104
288       //~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi
289       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
290       bld.pseudo(aco_opcode::p_parallelcopy,
291                  Definition(v0_lo, v1), Definition(v1_lo, v1b),
292                  Operand(v1_lo, v1), Operand(v0_lo, v1b));
293 
         /* 5: v0 byte 0 <- v1 byte 0, v0 byte 2 <- v1 byte 2, v1 <- v0 (dword). */
294       //~gfx(8|9|11)! p_unit_test 5
295       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
296       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1]
297       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
298       //~gfx(9|11)! v1: %0:v[1],  v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
299       //~gfx[89]! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
300       //~gfx[89]! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
301       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060104
302       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x3060504
303       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
304       bld.pseudo(aco_opcode::p_parallelcopy,
305                  Definition(v0_lo, v1b), Definition(v0_hi, v1b), Definition(v1_lo, v1),
306                  Operand(v1_lo, v1b), Operand(v1_hi, v1b), Operand(v0_lo, v1));
307 
         /* 6: v0.lo <- v1.lo, v0.hi <- v1.hi, v1 <- v0: a whole-dword swap. */
308       //~gfx(8|9|11)! p_unit_test 6
309       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
310       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
311       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
312       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
313       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
314       bld.pseudo(aco_opcode::p_parallelcopy,
315                  Definition(v0_lo, v2b), Definition(v0_hi, v2b), Definition(v1_lo, v1),
316                  Operand(v1_lo, v2b), Operand(v1_hi, v2b), Operand(v0_lo, v1));
317 
         /* 7: as case 6 but with the halves crossed (v0.lo <- v1.hi,
          * v0.hi <- v1.lo). */
318       //~gfx(8|9|11)! p_unit_test 7
319       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
320       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1]
321       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
322       //~gfx(9|11)! v1: %0:v[1],  v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
323       //~gfx(8|9|11)! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
324       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
325       bld.pseudo(aco_opcode::p_parallelcopy,
326                  Definition(v0_lo, v2b), Definition(v0_hi, v2b), Definition(v1_lo, v1),
327                  Operand(v1_hi, v2b), Operand(v1_lo, v2b), Operand(v0_lo, v1));
328 
         /* 8: swap the low 3 bytes of v0 and v1. */
329       //~gfx(8|9|11)! p_unit_test 8
330       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
331       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
332       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
333       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
334       //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
335       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
336       //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
337       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
338       //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
339       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
340       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
341       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
342       //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
343       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
344       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
345       bld.pseudo(aco_opcode::p_parallelcopy,
346                  Definition(v0_lo, v3b), Definition(v1_lo, v3b),
347                  Operand(v1_lo, v3b), Operand(v0_lo, v3b));
348 
         /* 9: case 8 plus v0 byte 3 <- v1 byte 3. */
349       //~gfx(8|9|11)! p_unit_test 9
350       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
351       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
352       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
353       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
354       //~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
355       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x3060504
356       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
357       bld.pseudo(aco_opcode::p_parallelcopy,
358                  Definition(v0_lo, v3b), Definition(v1_lo, v3b), Definition(v0_b3, v1b),
359                  Operand(v1_lo, v3b), Operand(v0_lo, v3b), Operand(v1_b3, v1b));
360 
         /* 10: swap the 16-bit values starting at byte 1 of v0 and of v1. */
361       //~gfx(8|9|11)! p_unit_test 10
362       //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
363       //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
364       //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
365       //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
366       //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16]
367       //~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
368       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
369       //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
370       //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16]
371       //~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
372       //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
373       //~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
374       //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
375       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
376       //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
377       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
378       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506
379       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
380       //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
381       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
382       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
383       bld.pseudo(aco_opcode::p_parallelcopy,
384                  Definition(v0_b1, v2b), Definition(v1_b1, v2b),
385                  Operand(v1_b1, v2b), Operand(v0_b1, v2b));
386 
         /* 11: v0 <- constant 42 (dword), v1.lo <- v0.hi. The checks expect
          * the read of v0.hi to come before v0 is overwritten. */
387       //~gfx(8|9|11)! p_unit_test 11
388       //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1
389       //~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 hi(%0:v[0][16:32]), 0
390       //~gfx(8|9|11)! v1: %0:v[0] = v_mov_b32 42
391       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
392       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
393                  Operand::c32(42u), Operand(v0_hi, v2b));
394 
         /* 12: swap bytes 1 and 3 of v0. */
395       //~gfx(8|9|11)! p_unit_test 12
396       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
397       //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
398       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
399       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
400       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
401       bld.pseudo(aco_opcode::p_parallelcopy,
402                  Definition(v0_b1, v1b), Definition(v0_b3, v1b),
403                  Operand(v0_b3, v1b), Operand(v0_b1, v1b));
404 
405       //~gfx(8|9|11)! s_endpgm
406 
407       finish_to_hw_instr_test();
408    }
409 END_TEST
410 
/*
 * to_hw_instr.subdword_constant
 *
 * Builds p_parallelcopy pseudo instructions that write 8-bit and 16-bit
 * constants (optionally alongside a register source or a 32-bit constant)
 * into sub-dword destinations of v0/v1, for GFX9, GFX10 and GFX11. Cases 0-12
 * go into the first block; cases 13-17 go into a second block whose fp_mode
 * is changed first. The check comments in front of each case list the
 * instructions expected for it.
 */
411 BEGIN_TEST(to_hw_instr.subdword_constant)
      /* PhysReg 256/257 show up as v[0]/v[1] in the checks below. */
412    PhysReg v0_lo{256};
413    PhysReg v0_hi{256};
414    PhysReg v0_b1{256};
415    PhysReg v1_lo{257};
416    PhysReg v1_hi{257};
      /* _hi starts at byte 2 (printed as [16:32]), _b1 at byte 1. */
417    v0_hi.reg_b += 2;
418    v0_b1.reg_b += 1;
419    v1_hi.reg_b += 2;
420 
      /* A level for which setup_cs() returns false is skipped. */
421    for (amd_gfx_level lvl : {GFX9, GFX10, GFX11}) {
422       if (!setup_cs(NULL, lvl))
423          continue;
424 
425       /* 16-bit pack */
         /* 0-2: one half of v0 gets a 16-bit constant, the other half a
          * register half (v1.hi in cases 0/1, v0.lo in case 2). */
426       //>> p_unit_test 0
427       //! v1: %_:v[0] = v_pack_b32_f16 0.5, hi(%_:v[1][16:32])
428       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
429       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
430                  Operand::c16(0x3800), Operand(v1_hi, v2b));
431 
432       //! p_unit_test 1
433       //~gfx9! v2b: %0:v[0][16:32] = v_and_b32 0xffff0000, %0:v[1][16:32]
434       //~gfx9! v1: %0:v[0] = v_or_b32 0x4205, %0:v[0]
435       //~gfx(10|11)! v1: %_:v[0] = v_pack_b32_f16 0x4205, hi(%_:v[1][16:32])
436       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
437       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
438                  Operand::c16(0x4205), Operand(v1_hi, v2b));
439 
440       //! p_unit_test 2
441       //~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
442       //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
443       //~gfx(10|11)! v1: %0:v[0] = v_pack_b32_f16 0x4205, %0:v[0][0:16]
444       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
445       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
446                  Operand::c16(0x4205), Operand(v0_lo, v2b));
447 
         /* 3-5: both halves of v0 are constants; the checks expect a single
          * v_mov_b32 of the combined 32-bit value. */
448       //! p_unit_test 3
449       //! v1: %_:v[0] = v_mov_b32 0x3c003800
450       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
451       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
452                  Operand::c16(0x3800), Operand::c16(0x3c00));
453 
454       //! p_unit_test 4
455       //! v1: %_:v[0] = v_mov_b32 0x43064205
456       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
457       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
458                  Operand::c16(0x4205), Operand::c16(0x4306));
459 
460       //! p_unit_test 5
461       //! v1: %_:v[0] = v_mov_b32 0x38004205
462       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
463       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
464                  Operand::c16(0x4205), Operand::c16(0x3800));
465 
466       /* 16-bit copy */
         /* 6-8: a single 16-bit constant into v0.lo (6, 7) or v0.hi (8). */
467       //! p_unit_test 6
468       //~gfx(9|10)! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:dword
469       //~gfx11! v2b: %_:v[0][0:16] = v_pack_b32_f16 0.5, hi(%_:v[0][16:32])
470       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
471       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x3800));
472 
473       //! p_unit_test 7
474       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
475       //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
476       //~gfx(10|11)! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32])
477       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
478       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x4205));
479 
480       //! p_unit_test 8
481       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
482       //~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0]
483       //~gfx(10|11)! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205
484       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
485       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0x4205));
486 
         /* 9-10: 16-bit constant into the destination starting at byte 1 of
          * v0; the checks expect it to be written one byte at a time. */
487       //! p_unit_test 9
488       //~gfx(9|10)! v1b: %_:v[0][8:16] = v_mov_b32 0 dst_sel:ubyte1 dst_preserve src0_sel:dword
489       //~gfx(9|10)! v1b: %_:v[0][16:24] = v_mov_b32 56 dst_sel:ubyte2 dst_preserve src0_sel:dword
490       //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c04
491       //~gfx11! v1: %_:v[0] = v_and_b32 0xff00ffff, %_:v[0]
492       //~gfx11! v1: %_:v[0] = v_or_b32 0x380000, %_:v[0]
493       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
494       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Operand::c16(0x3800));
495 
496       //! p_unit_test 10
497       //~gfx(9|10)! v1b: %_:v[0][8:16] = v_mov_b32 5 dst_sel:ubyte1 dst_preserve src0_sel:dword
498       //~gfx(9|10)! v1b: %_:v[0][16:24] = v_mul_u32_u24 2, 33 dst_sel:ubyte2 dst_preserve src0_sel:dword src1_sel:dword
499       //~gfx11! v1: %_:v[0] = v_and_b32 0xffff00ff, %_:v[0]
500       //~gfx11! v1: %_:v[0] = v_or_b32 0x500, %_:v[0]
501       //~gfx11! v1: %_:v[0] = v_and_b32 0xff00ffff, %_:v[0]
502       //~gfx11! v1: %_:v[0] = v_or_b32 0x420000, %_:v[0]
503       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
504       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Operand::c16(0x4205));
505 
506       /* 8-bit copy */
507       //! p_unit_test 11
508       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mul_u32_u24 2, 33 dst_sel:ubyte0 dst_preserve src0_sel:dword src1_sel:dword
509       //~gfx11! v1: %_:v[0] = v_and_b32 0xffffff00, %_:v[0]
510       //~gfx11! v1: %_:v[0] = v_or_b32 0x42, %_:v[0]
511       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
512       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::c8(0x42));
513 
514       /* 32-bit and 8-bit copy */
515       //! p_unit_test 12
516       //! v1: %_:v[0] = v_mov_b32 0
517       //~gfx(9|10)! v1b: %_:v[1][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword
518       //~gfx11! v1: %_:v[1] = v_perm_b32 %_:v[1], 0, 0x706050c
519       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
520       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b),
521                  Operand::zero(), Operand::zero(1));
522 
         /* Continue in a newly created block and record it as the linear
          * successor of block 0 (and block 0 as its linear predecessor). */
523       bld.reset(program->create_and_insert_block());
524       program->blocks[0].linear_succs.push_back(1);
525       program->blocks[1].linear_preds.push_back(0);
526 
527       /* Prevent usage of v_pack_b32_f16, so we use v_perm_b32 instead. */
528       program->blocks[1].fp_mode.denorm16_64 = fp_denorm_flush;
529 
         /* 13-17: constants made only of 0x00 and 0xff bytes, emitted into
          * the new block. */
530       //>> p_unit_test 13
531       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
532       //~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0]
533       //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d
534       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
535       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b),
536                  Operand::c16(0x00ff));
537 
538       //! p_unit_test 14
539       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
540       //~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0]
541       //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504
542       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
543       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b),
544                  Operand::c16(0xff00));
545 
546       //! p_unit_test 15
547       //~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword
548       //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0c
549       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
550       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b),
551                  Operand::zero(2));
552 
553       //! p_unit_test 16
554       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 -1 dst_sel:ubyte0 dst_preserve src0_sel:dword
555       //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x706050d
556       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16u));
557       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b),
558                  Operand::c8(0xff));
559 
560       //! p_unit_test 17
561       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword
562       //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x706050c
563       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17u));
564       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b),
565                  Operand::zero(1));
566 
567       //! s_endpgm
568 
         /* NOTE(review): presumably runs the lowering and emits the program for
          * comparison against the checks above - defined outside this file. */
569       finish_to_hw_instr_test();
570    }
571 END_TEST
572 
/*
 * to_hw_instr.self_intersecting_swap
 *
 * Builds a single p_parallelcopy on GFX9 made of three copies: v[1:2] = v[2:3],
 * v3 = v7 and v7 = v1. The two-dword copy writes v2 while also reading it, and the
 * registers v1, v2, v3 and v7 together form one cycle. The patterns below expect the
 * result to be three v_swap_b32 instructions followed by s_endpgm.
 *
 * NOTE(review): the lines starting with "//>>" and "//!" appear to be expected-output
 * patterns consumed by the test harness rather than ordinary comments; keep them
 * verbatim.
 */
573 BEGIN_TEST(to_hw_instr.self_intersecting_swap)
       /* Nothing to test when setup_cs() reports failure for GFX9. */
574    if (!setup_cs(NULL, GFX9))
575       return;
576 
       /* PhysReg 256 + n is printed as v[n] in the patterns below (257 -> v[1], 263 -> v[7]). */
577    PhysReg reg_v1{257};
578    PhysReg reg_v2{258};
579    PhysReg reg_v3{259};
580    PhysReg reg_v7{263};
581 
582    //>> p_unit_test 0
583    //! v1: %0:v[1],  v1: %0:v[2] = v_swap_b32 %0:v[2], %0:v[1]
584    //! v1: %0:v[2],  v1: %0:v[3] = v_swap_b32 %0:v[3], %0:v[2]
585    //! v1: %0:v[3],  v1: %0:v[7] = v_swap_b32 %0:v[7], %0:v[3]
586    //! s_endpgm
587    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
588    //v[1:2] = v[2:3]
589    //v3 = v7
590    //v7 = v1
       /* Definitions and operands are paired by position: the first definition (two dwords
        * starting at v1) receives the first operand (two dwords starting at v2), and so on.
        */
591    bld.pseudo(aco_opcode::p_parallelcopy,
592               Definition(reg_v1, v2), Definition(reg_v3, v1), Definition(reg_v7, v1),
593               Operand(reg_v2, v2), Operand(reg_v7, v1), Operand(reg_v1, v1));
594 
595    finish_to_hw_instr_test();
596 END_TEST
597 
/*
 * to_hw_instr.extract
 *
 * Emits p_extract pseudo-instructions and lists the instructions expected after
 * lowering. The body runs for GFX7, GFX8, GFX9 and GFX11, and for each level once with
 * is_signed = 0 (variant suffix "_unsigned") and once with is_signed = 1 ("_signed").
 *
 * Three groups are emitted, each through its own EXT macro:
 *   - p_unit_test 0: 32-bit VGPR extract, v0 = extract(v1, idx, size)
 *   - p_unit_test 2: SGPR extract, s0 = extract(s1, idx, size), with scc as a second
 *     definition
 *   - p_unit_test 4: 8-bit extract into the 16-bit (v2b) low half of v0, reading a
 *     16-bit source located src_b bytes into v1
 *
 * NOTE(review): lines beginning with "//>>", "//!", "//~" and "//;" appear to be
 * expected-output patterns and helper definitions read by the test harness, not
 * ordinary comments; keep them verbatim.
 */
598 BEGIN_TEST(to_hw_instr.extract)
599    PhysReg s0_lo{0};
600    PhysReg s1_lo{1};
601    PhysReg v0_lo{256};
602    PhysReg v1_lo{257};
603 
604    for (amd_gfx_level lvl : {GFX7, GFX8, GFX9, GFX11}) {
605    for (unsigned is_signed = 0; is_signed <= 1; is_signed++) {
          /* Skip this level/variant combination when setup_cs() reports failure. */
606       if (!setup_cs(NULL, lvl, CHIP_UNKNOWN, is_signed ? "_signed" : "_unsigned"))
607          continue;
608 
          /* EXT(idx, size): 32-bit VGPR form, v0 = p_extract(v1, idx, size, is_signed). */
609 #define EXT(idx, size)                                                                             \
610    bld.pseudo(aco_opcode::p_extract, Definition(v0_lo, v1), Operand(v1_lo, v1), Operand::c32(idx), \
611               Operand::c32(size), Operand::c32(is_signed));
612 
          /* The funcs[...] entries below pick the signed or unsigned spelling of an opcode or
           * byte selector depending on whether the variant name ends in "_signed". They are
           * presumably what the @v_bfe, @v_shr, @s_bfe, @s_shr and @byte(n) references in the
           * patterns resolve to.
           */
613       //; funcs['v_bfe'] = lambda _: 'v_bfe_i32' if variant.endswith('_signed') else 'v_bfe_u32'
614       //; funcs['v_shr'] = lambda _: 'v_ashrrev_i32' if variant.endswith('_signed') else 'v_lshrrev_b32'
615       //; funcs['s_bfe'] = lambda _: 's_bfe_i32' if variant.endswith('_signed') else 's_bfe_u32'
616       //; funcs['s_shr'] = lambda _: 's_ashr_i32' if variant.endswith('_signed') else 's_lshr_b32'
617       //; funcs['byte'] = lambda n: '%cbyte%s' % ('s' if variant.endswith('_signed') else 'u', n)
618 
619       //>> p_unit_test 0
620       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
621       //! v1: %_:v[0] = @v_bfe %_:v[1], 0, 8
622       EXT(0, 8)
623       //! v1: %_:v[0] = @v_bfe %_:v[1], 8, 8
624       EXT(1, 8)
625       //! v1: %_:v[0] = @v_bfe %_:v[1], 16, 8
626       EXT(2, 8)
627       //! v1: %_:v[0] = @v_shr 24, %_:v[1]
628       EXT(3, 8)
629       //! v1: %_:v[0] = @v_bfe %_:v[1], 0, 16
630       EXT(0, 16)
631       //! v1: %_:v[0] = @v_shr 16, %_:v[1]
632       EXT(1, 16)
633 
634       #undef EXT
635 
          /* EXT(idx, size): SGPR form, s0 = p_extract(s1, idx, size, is_signed), with scc
           * listed as an additional definition.
           */
636 #define EXT(idx, size)                                                                             \
637    bld.pseudo(aco_opcode::p_extract, Definition(s0_lo, s1), Definition(scc, s1),                   \
638               Operand(s1_lo, s1), Operand::c32(idx), Operand::c32(size), Operand::c32(is_signed));
639 
640       //>> p_unit_test 2
641       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
642       //~gfx.*_unsigned! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80000
643       //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i8 %_:s[1]
644       EXT(0, 8)
645       //! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80008
646       EXT(1, 8)
647       //! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80010
648       EXT(2, 8)
649       //! s1: %_:s[0],  s1: %_:scc = @s_shr %_:s[1], 24
650       EXT(3, 8)
651       //~gfx.*_unsigned! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x100000
652       //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i16 %_:s[1]
653       EXT(0, 16)
654       //! s1: %_:s[0],  s1: %_:scc = @s_shr %_:s[1], 16
655       EXT(1, 16)
656 
657       #undef EXT
658 
          /* EXT(idx, src_b): 16-bit form. Extracts byte idx (the size operand is fixed at 8)
           * from the v2b source that starts src_b bytes into v1, writing the v2b low half of
           * v0.
           */
659 #define EXT(idx, src_b)                                                                            \
660    bld.pseudo(aco_opcode::p_extract, Definition(v0_lo, v2b), Operand(v1_lo.advance(src_b), v2b),   \
661               Operand::c32(idx), Operand::c32(8u), Operand::c32(is_signed));
662 
663       //>> p_unit_test 4
664       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
665       //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 0, 8
666       //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0)
667       //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c00
668       //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060000
669       //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
670       EXT(0, 0)
671       //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2)
672       //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c02
673       //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060202
674       //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
          /* The src_b = 2 cases are not emitted on GFX7, and no gfx7 pattern is listed for them. */
675       if (lvl != GFX7)
676          EXT(0, 2)
677       //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 8, 8
678       //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1)
679       //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c01
680       //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060801
681       EXT(1, 0)
682       //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3)
683       //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c03
684       //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060903
685       if (lvl != GFX7)
686          EXT(1, 2)
687 
688       #undef EXT
689 
690       finish_to_hw_instr_test();
691 
692       //! s_endpgm
693    }
694    }
695 END_TEST
696 
/*
 * to_hw_instr.insert
 *
 * Emits p_insert pseudo-instructions and lists the instructions expected after
 * lowering, once for each of GFX7, GFX8, GFX9 and GFX11.
 *
 * Three groups are emitted, each through its own INS macro:
 *   - p_unit_test 0: 32-bit VGPR insert, v0 = insert(v1, idx, size)
 *   - p_unit_test 1: SGPR insert, s0 = insert(s1, idx, size), with scc as a second
 *     definition
 *   - p_unit_test 2: 8-bit insert from the 16-bit (v2b) low half of v1 into the 16-bit
 *     destination located def_b bytes into v0
 *
 * NOTE(review): lines beginning with "//>>", "//!" and "//~" appear to be
 * expected-output patterns read by the test harness, not ordinary comments; keep them
 * verbatim.
 */
697 BEGIN_TEST(to_hw_instr.insert)
698    PhysReg s0_lo{0};
699    PhysReg s1_lo{1};
700    PhysReg v0_lo{256};
701    PhysReg v1_lo{257};
702 
703    for (amd_gfx_level lvl : {GFX7, GFX8, GFX9, GFX11}) {
          /* Skip this level when setup_cs() reports failure. */
704       if (!setup_cs(NULL, lvl))
705          continue;
706 
          /* INS(idx, size): 32-bit VGPR form, v0 = p_insert(v1, idx, size). */
707 #define INS(idx, size)                                                                             \
708    bld.pseudo(aco_opcode::p_insert, Definition(v0_lo, v1), Operand(v1_lo, v1), Operand::c32(idx),  \
709               Operand::c32(size));
710 
711       //>> p_unit_test 0
712       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
713       //! v1: %_:v[0] = v_bfe_u32 %_:v[1], 0, 8
714       INS(0, 8)
715       //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8
716       //~gfx7! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0]
717       //~gfx(8|9)! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte1 src0_sel:dword
718       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0c000c
719       INS(1, 8)
720       //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8
721       //~gfx7! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[0]
722       //~gfx(8|9)! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte2 src0_sel:dword
723       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000c0c
724       INS(2, 8)
725       //! v1: %0:v[0] = v_lshlrev_b32 24, %0:v[1]
726       INS(3, 8)
727       //! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 16
728       INS(0, 16)
729       //! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[1]
730       INS(1, 16)
731 
732       #undef INS
733 
          /* INS(idx, size): SGPR form, s0 = p_insert(s1, idx, size), with scc listed as an
           * additional definition.
           */
734 #define INS(idx, size)                                                                             \
735    bld.pseudo(aco_opcode::p_insert, Definition(s0_lo, s1), Definition(scc, s1),                    \
736               Operand(s1_lo, s1), Operand::c32(idx), Operand::c32(size));
737 
738       //>> p_unit_test 1
739       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
740       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
741       INS(0, 8)
742       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
743       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[0], 8
744       INS(1, 8)
745       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
746       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[0], 16
747       INS(2, 8)
748       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[1], 24
749       INS(3, 8)
750       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000
751       INS(0, 16)
752       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[1], 16
753       INS(1, 16)
754 
755       #undef INS
756 
          /* INS(idx, def_b): 16-bit form. Inserts the low byte of the v2b source in v1 (the
           * size operand is fixed at 8) at byte position idx of the v2b destination that
           * starts def_b bytes into v0.
           */
757 #define INS(idx, def_b)                                                                            \
758    bld.pseudo(aco_opcode::p_insert, Definition(v0_lo.advance(def_b), v2b), Operand(v1_lo, v2b),    \
759               Operand::c32(idx), Operand::c32(8u));
760 
761       //>> p_unit_test 2
762       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
763       //~gfx7! v2b: %_:v[0][0:16] = v_bfe_u32 %_:v[1][0:16], 0, 8
764       //~gfx(8|9)! v2b: %0:v[0][0:16] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0
765       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c00
766       INS(0, 0)
767       //~gfx(8|9)! v2b: %0:v[0][16:32] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0
768       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000504
          /* The def_b = 2 cases are not emitted on GFX7, and no gfx7 pattern is listed for them. */
769       if (lvl != GFX7)
770          INS(0, 2)
771       //~gfx7! v2b: %_:v[0][0:16] = v_lshlrev_b32 8, %_:v[1][0:16]
772       //~gfx(8|9)! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0
773       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706000c
774       INS(1, 0)
775       //~gfx(8|9)! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0
776       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0504
777       if (lvl != GFX7)
778          INS(1, 2)
779 
780       #undef INS
781 
782       finish_to_hw_instr_test();
783 
784       //! s_endpgm
785    }
786 END_TEST
787 
/*
 * to_hw_instr.copy_linear_vgpr_scc
 *
 * Builds one p_parallelcopy on GFX10 that both sets scc from s0 and copies linear VGPR
 * v1 into linear VGPR v0, with m0 given as the scratch SGPR. The patterns below expect:
 * scc derived from s0 with s_cmp_lg_i32, scc saved into m0, the v_mov_b32 issued twice
 * with an s_not_b64 of exec after each, and finally scc rebuilt from m0.
 *
 * NOTE(review): the lines starting with "//>>" and "//!" appear to be expected-output
 * patterns consumed by the test harness rather than ordinary comments; keep them
 * verbatim.
 */
788 BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc)
789    if (!setup_cs(NULL, GFX10))
790       return;
791 
       /* reg_s1 and v0_b3 are declared here but not referenced by the copy below. */
792    PhysReg reg_s0{0};
793    PhysReg reg_s1{1};
794    PhysReg v0_lo{256};
795    PhysReg v0_b3{256};
796    v0_b3.reg_b += 3;
797    PhysReg v1_lo{257};
798 
799    //>> p_unit_test 0
800    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
801 
802    /* It would be better if the scc=s0 copy was done later, but handle_operands() is complex
803     * enough
804     */
805 
806    //! s1: %0:scc = s_cmp_lg_i32 %0:s[0], 0
807    //! s1: %0:m0 = s_mov_b32 %0:scc
808    //! lv1: %0:v[0] = v_mov_b32 %0:v[1]
809    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
810    //! lv1: %0:v[0] = v_mov_b32 %0:v[1]
811    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
812    //! s1: %0:scc = s_cmp_lg_i32 %0:m0, 0
       /* Definitions and operands are paired by position: scc <- s0 and linear v0 <- linear v1. */
813    Instruction *instr = bld.pseudo(
814       aco_opcode::p_parallelcopy,
815       Definition(scc, s1), Definition(v0_lo, v1.as_linear()),
816       Operand(reg_s0, s1), Operand(v1_lo, v1.as_linear()));
       /* m0 is the scratch SGPR; the patterns above show scc being parked in it. */
817    instr->pseudo().scratch_sgpr = m0;
818 
819    finish_to_hw_instr_test();
820 END_TEST
821 
/*
 * to_hw_instr.swap_linear_vgpr
 *
 * Builds one p_parallelcopy on GFX10 that exchanges the linear VGPRs v0 and v1
 * (v0 <- v1 and v1 <- v0), with m0 given as the scratch SGPR.
 *
 * NOTE(review): the only pattern in this test is "//>> p_unit_test 0"; no pattern
 * describes the instructions the swap is lowered to. Confirm whether that is
 * intentional or whether expected-output lines are missing.
 */
822 BEGIN_TEST(to_hw_instr.swap_linear_vgpr)
823    if (!setup_cs(NULL, GFX10))
824       return;
825 
826    PhysReg reg_v0{256};
827    PhysReg reg_v1{257};
       /* Linear variant of the one-dword VGPR class, used for both sides of the swap. */
828    RegClass v1_linear = v1.as_linear();
829 
830    //>> p_unit_test 0
831    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
832 
       /* Definitions and operands are paired by position, so each register receives the other. */
833    Instruction *instr = bld.pseudo(
834       aco_opcode::p_parallelcopy,
835       Definition(reg_v0, v1_linear), Definition(reg_v1, v1_linear),
836       Operand(reg_v1, v1_linear), Operand(reg_v0, v1_linear));
837    instr->pseudo().scratch_sgpr = m0;
838 
839    finish_to_hw_instr_test();
840 END_TEST
841 
/*
 * to_hw_instr.pack2x16_alignbyte_constant
 *
 * Builds one p_parallelcopy on GFX10 that fills both 16-bit halves of v0: the low half
 * from the high half of v1 and the high half from the 16-bit constant 0x3800. The
 * pattern below expects a single v_alignbyte_b32 taking the constant, v[1][16:32] and a
 * byte shift of 2, followed by s_endpgm.
 *
 * NOTE(review): the lines starting with "//>>" and "//!" appear to be expected-output
 * patterns consumed by the test harness rather than ordinary comments; keep them
 * verbatim.
 */
842 BEGIN_TEST(to_hw_instr.pack2x16_alignbyte_constant)
843    PhysReg v0_lo{256};
844    PhysReg v0_hi{256};
845    PhysReg v1_hi{257};
       /* Move the *_hi registers two bytes in, i.e. to the upper 16 bits of v0 and v1. */
846    v0_hi.reg_b += 2;
847    v1_hi.reg_b += 2;
848 
849    if (!setup_cs(NULL, GFX10))
850       return;
851 
852    /* prevent usage of v_pack_b32_f16 */
853    program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
854 
855    //>> p_unit_test 0
856    //! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
857    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
       /* Definitions and operands are paired by position: v0.lo <- v1.hi and v0.hi <- 0x3800. */
858    bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
859               Operand(v1_hi, v2b), Operand::c16(0x3800));
860 
861    //! s_endpgm
862 
863    finish_to_hw_instr_test();
864 END_TEST
865