• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 #include "helpers.h"
25 
26 using namespace aco;
27 
28 BEGIN_TEST(to_hw_instr.swap_subdword)
   /*
    * Builds p_parallelcopy, p_create_vector and p_split_vector pseudo
    * instructions that use sub-dword register classes (v1b, v2b, v3b, v6b)
    * alongside v1/v2, first for GFX6/GFX7 and then for GFX8/GFX9/GFX11.
    *
    * Every case is introduced by a p_unit_test marker carrying the case index
    * and followed by the pseudo instruction under test. Each loop iteration
    * ends with finish_to_hw_instr_test().
    *
    * NOTE(review): the comment lines that begin with two slashes directly
    * followed by '!', '>>' or '~' look like expected-output patterns read by
    * the test framework rather than ordinary comments - confirm against the
    * framework before editing or reordering them.
    *
    * Registers 256..259 show up as v[0]..v[3] in the expected output. The
    * _hi/_b1/_b3 variants name the same register with reg_b advanced by
    * 2/1/3, which the expected output prints as bit offsets 16/8/24.
    */
29    PhysReg v0_lo{256};
30    PhysReg v0_hi{256};
31    PhysReg v0_b1{256};
32    PhysReg v0_b3{256};
33    PhysReg v1_lo{257};
34    PhysReg v1_hi{257};
35    PhysReg v1_b1{257};
36    PhysReg v1_b3{257};
37    PhysReg v2_lo{258};
38    PhysReg v3_lo{259};
39    v0_hi.reg_b += 2;
40    v1_hi.reg_b += 2;
41    v0_b1.reg_b += 1;
42    v1_b1.reg_b += 1;
43    v0_b3.reg_b += 3;
44    v1_b3.reg_b += 3;
45 
   /* GFX6 and GFX7; an iteration is skipped when setup_cs() returns false. */
46    for (unsigned i = GFX6; i <= GFX7; i++) {
47       if (!setup_cs(NULL, (amd_gfx_level)i))
48          continue;
49 
      /* Cases 0-8: v2b pieces (parallelcopy, then create_vector, then
       * split_vector). Definitions and operands are paired in the order they
       * are listed. */
50       //~gfx[67]>>  p_unit_test 0
51       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
52       //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
53       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
54       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
55       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v1_lo, v2b),
56                  Operand(v1_lo, v2b), Operand(v0_lo, v2b));
57 
58       //~gfx[67]! p_unit_test 1
59       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
60       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
61       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
62       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
63       bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v2b),
64                  Operand(v0_lo, v2b));
65 
66       //~gfx[67]! p_unit_test 2
67       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
68       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
69       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
70       //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[2][0:16]
71       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
72       bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v6b), Operand(v1_lo, v2b),
73                  Operand(v0_lo, v2b), Operand(v2_lo, v2b));
74 
75       //~gfx[67]! p_unit_test 3
76       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
77       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[1][0:16], %0:v[0][16:32], 2
78       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
79       //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[2][0:16]
80       //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[3][0:16], %0:v[1][16:32], 2
81       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
82       bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2), Operand(v1_lo, v2b),
83                  Operand(v0_lo, v2b), Operand(v2_lo, v2b), Operand(v3_lo, v2b));
84 
85       //~gfx[67]! p_unit_test 4
86       //~gfx[67]! v2b: %0:v[1][16:32] = v_lshlrev_b32 16, %0:v[1][0:16]
87       //~gfx[67]! v1: %0:v[1] = v_alignbyte_b32 %0:v[2][0:16], %0:v[1][16:32], 2
88       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
89       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:16], %0:v[0][16:32], 2
90       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
91       //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
92       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
93       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
94       bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2), Operand(v1_lo, v2b),
95                  Operand(v2_lo, v2b), Operand(v0_lo, v2b), Operand(v3_lo, v2b));
96 
97       //~gfx[67]! p_unit_test 5
98       //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
99       //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
100       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
101       bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
102                  Operand(v0_lo, v1));
103 
104       //~gfx[67]! p_unit_test 6
105       //~gfx[67]! v2b: %0:v[2][0:16] = v_mov_b32 %0:v[1][0:16]
106       //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
107       //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
108       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
109       bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
110                  Definition(v2_lo, v2b), Operand(v0_lo, v6b));
111 
112       //~gfx[67]! p_unit_test 7
113       //~gfx[67]! v2b: %0:v[2][0:16] = v_mov_b32 %0:v[1][0:16]
114       //~gfx[67]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16]
115       //~gfx[67]! v2b: %0:v[0][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
116       //~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[2][16:32]
117       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
118       bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v0_lo, v2b),
119                  Definition(v2_lo, v2b), Definition(v3_lo, v2b), Operand(v0_lo, v2));
120 
121       //~gfx[67]! p_unit_test 8
122       //~gfx[67]! v2b: %0:v[2][0:16] = v_lshrrev_b32 16, %0:v[0][16:32]
123       //~gfx[67]! v2b: %0:v[3][0:16] = v_lshrrev_b32 16, %0:v[1][16:32]
124       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
125       //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
126       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
127       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
128       bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v2b), Definition(v2_lo, v2b),
129                  Definition(v0_lo, v2b), Definition(v3_lo, v2b), Operand(v0_lo, v2));
130 
      /* Cases 9-15: the same three pseudo opcodes, now with v1b pieces. */
131       //~gfx[67]! p_unit_test 9
132       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
133       //~gfx[67]! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
134       //~gfx[67]! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
135       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
136       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v1_lo, v1b),
137                  Operand(v1_lo, v1b), Operand(v0_lo, v1b));
138 
139       //~gfx[67]! p_unit_test 10
140       //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
141       //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
142       //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
143       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
144       bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v2b), Operand(v1_lo, v1b),
145                  Operand(v0_lo, v1b));
146 
147       //~gfx[67]! p_unit_test 11
148       //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
149       //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
150       //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
151       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
152       //~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
153       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
154       bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v3b), Operand(v1_lo, v1b),
155                  Operand(v0_lo, v1b), Operand(v2_lo, v1b));
156 
157       //~gfx[67]! p_unit_test 12
158       //~gfx[67]! v1b: %0:v[1][24:32] = v_lshlrev_b32 24, %0:v[1][0:8]
159       //~gfx[67]! v2b: %0:v[1][0:16] = v_alignbyte_b32 %0:v[0][0:8], %0:v[1][24:32], 3
160       //~gfx[67]! v2b: %0:v[0][0:16] = v_mov_b32 %0:v[1][0:16]
161       //~gfx[67]! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
162       //~gfx[67]! v3b: %0:v[0][0:24] = v_alignbyte_b32 %0:v[2][0:8], %0:v[0][16:32], 2
163       //~gfx[67]! v3b: %0:v[0][8:32] = v_lshlrev_b32 8, %0:v[0][0:24]
164       //~gfx[67]! v1: %0:v[0] = v_alignbyte_b32 %0:v[3][0:8], %0:v[0][8:32], 1
165       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
166       bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v1_lo, v1b),
167                  Operand(v0_lo, v1b), Operand(v2_lo, v1b), Operand(v3_lo, v1b));
168 
      /* Case 13 passes the same v1b operand four times and is the only case
       * that keeps the returned instruction, in order to set its scratch_sgpr
       * to m0; the expected output for this case includes a move into m0. */
169       //~gfx[67]! p_unit_test 13
170       //~gfx[67]! v1b: %0:v[0][0:8] = v_and_b32 0xff, %0:v[0][0:8]
171       //~gfx[67]! v2b: %0:v[0][0:16] = v_mul_u32_u24 0x101, %0:v[0][0:8]
172       //~gfx[67]! v2b: %0:v[0][0:16] = v_and_b32 0xffff, %0:v[0][0:16]
173       //~gfx[67]! v3b: %0:v[0][0:24] = v_cvt_pk_u16_u32 %0:v[0][0:16], %0:v[0][0:8]
174       //~gfx[67]! v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24]
175       //~gfx[67]! s1: %0:m0 = s_mov_b32 0x1000001
176       //~gfx[67]! v1: %0:v[0] = v_mul_lo_u32 %0:m0, %0:v[0][0:8]
177       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
178       Instruction* pseudo =
179          bld.pseudo(aco_opcode::p_create_vector, Definition(v0_lo, v1), Operand(v0_lo, v1b),
180                     Operand(v0_lo, v1b), Operand(v0_lo, v1b), Operand(v0_lo, v1b));
181       pseudo->pseudo().scratch_sgpr = m0;
182 
183       //~gfx[67]! p_unit_test 14
184       //~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8]
185       //~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16]
186       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
187       bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v1b), Definition(v0_lo, v1b),
188                  Operand(v0_lo, v2b));
189 
190       //~gfx[67]! p_unit_test 15
191       //~gfx[67]! v1b: %0:v[1][0:8] = v_mov_b32 %0:v[0][0:8]
192       //~gfx[67]! v1b: %0:v[0][0:8] = v_lshrrev_b32 8, %0:v[1][8:16]
193       //~gfx[67]! v1b: %0:v[2][0:8] = v_lshrrev_b32 16, %0:v[1][16:24]
194       //~gfx[67]! v1b: %0:v[3][0:8] = v_lshrrev_b32 24, %0:v[1][24:32]
195       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
196       bld.pseudo(aco_opcode::p_split_vector, Definition(v1_lo, v1b), Definition(v0_lo, v1b),
197                  Definition(v2_lo, v1b), Definition(v3_lo, v1b), Operand(v0_lo, v1));
198 
199       //~gfx[67]! s_endpgm
200 
201       finish_to_hw_instr_test();
202    }
203 
   /* GFX8, GFX9 and GFX11: p_parallelcopy cases only, now also using the
    * _hi/_b1/_b3 sub-register offsets. The case index restarts at 0. */
204    for (amd_gfx_level lvl : {GFX8, GFX9, GFX11}) {
205       if (!setup_cs(NULL, lvl))
206          continue;
207 
208       //~gfx(8|9|11)>> p_unit_test 0
209       //~gfx8! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
210       //~gfx(9|11)! v1: %0:v[0] = v_pack_b32_f16 hi(%0:v[0][16:32]), %0:v[0][0:16]
211       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
212       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
213                  Operand(v0_hi, v2b), Operand(v0_lo, v2b));
214 
215       //~gfx(8|9|11)! p_unit_test 1
216       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
217       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
218       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
219       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
220       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
221       //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
222       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
223       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
224                  Operand(v1_lo, v1), Operand(v0_lo, v2b));
225 
226       //~gfx(8|9|11)! p_unit_test 2
227       //~gfx[89]! v2b: %0:v[0][16:32] = v_mov_b32 %0:v[1][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
228       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][0:16] dst_sel:uword1 dst_preserve src0_sel:uword0
229       //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
230       //~gfx[89]! v2b: %0:v[0][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
231       //~gfx[89]! v2b: %0:v[1][0:16] = v_xor_b32 %0:v[1][0:16], %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:uword0
232       //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 hi(%0:v[1][16:32]) opsel_hi
233       //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 %0:v[0][0:16] opsel_hi
234       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
235       //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
236       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], %0:v[1][0:16]
237       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
238       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
239                  Definition(v1_hi, v2b), Operand(v1_lo, v1), Operand(v0_lo, v2b),
240                  Operand(v0_lo, v2b));
241 
242       //~gfx(8|9|11)! p_unit_test 3
243       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
244       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
245       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
246       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
247       //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0
248       //~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2
249       //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 %0:v[0][0:16]
250       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504
251       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
252       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_b3, v1b),
253                  Operand(v1_lo, v1), Operand(v0_b3, v1b));
254 
255       //~gfx(8|9|11)! p_unit_test 4
256       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
257       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
258       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
259       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
260       //~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
261       //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1
262       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104
263       //~gfx11! v2b: %0:v[1][16:32] = v_mov_b16 hi(%0:v[0][16:32]) opsel_hi
264       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
265       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b),
266                  Operand(v1_lo, v1), Operand(v0_lo, v1b));
267 
268       //~gfx(8|9|11)! p_unit_test 5
269       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
270       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1]
271       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
272       //~gfx(9|11)! v1: %0:v[1],  v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
273       //~gfx[89]! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1
274       //~gfx[89]! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
275       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060104
276       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x3060504
277       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
278       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v0_hi, v1b),
279                  Definition(v1_lo, v1), Operand(v1_lo, v1b), Operand(v1_hi, v1b),
280                  Operand(v0_lo, v1));
281 
282       //~gfx(8|9|11)! p_unit_test 6
283       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
284       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
285       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
286       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
287       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
288       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
289                  Definition(v1_lo, v1), Operand(v1_lo, v2b), Operand(v1_hi, v2b),
290                  Operand(v0_lo, v1));
291 
292       //~gfx(8|9|11)! p_unit_test 7
293       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
294       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[0], %0:v[1]
295       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[0], %0:v[1]
296       //~gfx(9|11)! v1: %0:v[1],  v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1]
297       //~gfx(8|9|11)! v1: %0:v[0] = v_alignbyte_b32 %0:v[0][0:16], %0:v[0][16:32], 2
298       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
299       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
300                  Definition(v1_lo, v1), Operand(v1_hi, v2b), Operand(v1_lo, v2b),
301                  Operand(v0_lo, v1));
302 
303       //~gfx(8|9|11)! p_unit_test 8
304       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
305       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
306       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
307       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
308       //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
309       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
310       //~gfx[89]! v1b: %0:v[1][24:32] = v_xor_b32 %0:v[1][24:32], %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte3
311       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
312       //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
313       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
314       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
315       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
316       //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
317       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
318       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
319       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b),
320                  Operand(v1_lo, v3b), Operand(v0_lo, v3b));
321 
322       //~gfx(8|9|11)! p_unit_test 9
323       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
324       //~gfx8! v1: %0:v[0] = v_xor_b32 %0:v[1], %0:v[0]
325       //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0]
326       //~gfx(9|11)! v1: %0:v[0],  v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
327       //~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3
328       //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x3060504
329       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
330       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b),
331                  Definition(v0_b3, v1b), Operand(v1_lo, v3b), Operand(v0_lo, v3b),
332                  Operand(v1_b3, v1b));
333 
334       //~gfx(8|9|11)! p_unit_test 10
335       //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
336       //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
337       //~gfx[89]! v1b: %0:v[1][8:16] = v_xor_b32 %0:v[1][8:16], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 src1_sel:ubyte1
338       //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
339       //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16]
340       //~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
341       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
342       //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
343       //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16]
344       //~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi
345       //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
346       //~gfx[89]! v1b: %0:v[0][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
347       //~gfx[89]! v1b: %0:v[1][16:24] = v_xor_b32 %0:v[1][16:24], %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 src1_sel:ubyte2
348       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
349       //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
350       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
351       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506
352       //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
353       //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi
354       //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32])
355       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
356       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Definition(v1_b1, v2b),
357                  Operand(v1_b1, v2b), Operand(v0_b1, v2b));
358 
359       //~gfx(8|9|11)! p_unit_test 11
360       //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][16:32] dst_sel:uword0 dst_preserve src0_sel:uword1
361       //~gfx11! v2b: %0:v[1][0:16] = v_mov_b16 hi(%0:v[0][16:32])
362       //~gfx(8|9|11)! v1: %0:v[0] = v_mov_b32 42
363       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
364       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v2b),
365                  Operand::c32(42u), Operand(v0_hi, v2b));
366 
367       //~gfx(8|9|11)! p_unit_test 12
368       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
369       //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
370       //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1
371       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704
372       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
373       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v1b), Definition(v0_b3, v1b),
374                  Operand(v0_b3, v1b), Operand(v0_b1, v1b));
375 
376       //~gfx(8|9|11)! s_endpgm
377 
378       finish_to_hw_instr_test();
379    }
380 END_TEST
381 
382 BEGIN_TEST(to_hw_instr.subdword_constant)
   /*
    * p_parallelcopy cases whose operands include constants (Operand::c16,
    * Operand::c8, Operand::zero), written to sub-dword or dword definitions
    * of v0/v1. Runs for GFX9, GFX10 and GFX11; an iteration is skipped when
    * setup_cs() returns false, and each iteration ends with
    * finish_to_hw_instr_test().
    *
    * NOTE(review): the comment lines that begin with two slashes directly
    * followed by '!', '>>' or '~' look like expected-output patterns read by
    * the test framework rather than ordinary comments - confirm against the
    * framework before editing or reordering them.
    *
    * Registers 256 and 257 show up as v[0] and v[1] in the expected output.
    * The _hi/_b1 variants name the same register with reg_b advanced by 2/1,
    * which the expected output prints as bit offsets 16/8.
    */
383    PhysReg v0_lo{256};
384    PhysReg v0_hi{256};
385    PhysReg v0_b1{256};
386    PhysReg v1_lo{257};
387    PhysReg v1_hi{257};
388    v0_hi.reg_b += 2;
389    v0_b1.reg_b += 1;
390    v1_hi.reg_b += 2;
391 
392    for (amd_gfx_level lvl : {GFX9, GFX10, GFX11}) {
393       if (!setup_cs(NULL, lvl))
394          continue;
395 
396       /* 16-bit pack */
397       //>> p_unit_test 0
398       //! v1: %_:v[0] = v_pack_b32_f16 0.5, hi(%_:v[1][16:32])
399       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
400       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
401                  Operand::c16(0x3800), Operand(v1_hi, v2b));
402 
403       //! p_unit_test 1
404       //~gfx9! v2b: %0:v[0][16:32] = v_and_b32 0xffff0000, %0:v[1][16:32]
405       //~gfx9! v1: %0:v[0] = v_or_b32 0x4205, %0:v[0]
406       //~gfx(10|11)! v1: %_:v[0] = v_pack_b32_f16 0x4205, hi(%_:v[1][16:32])
407       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
408       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
409                  Operand::c16(0x4205), Operand(v1_hi, v2b));
410 
411       //! p_unit_test 2
412       //~gfx9! v2b: %0:v[0][16:32] = v_lshlrev_b32 16, %0:v[0][0:16]
413       //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
414       //~gfx(10|11)! v1: %0:v[0] = v_pack_b32_f16 0x4205, %0:v[0][0:16]
415       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
416       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
417                  Operand::c16(0x4205), Operand(v0_lo, v2b));
418 
      /* Cases 3-5: both 16-bit halves of v0 come from constants; the expected
       * output is a single 32-bit move of the combined value. */
419       //! p_unit_test 3
420       //! v1: %_:v[0] = v_mov_b32 0x3c003800
421       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
422       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
423                  Operand::c16(0x3800), Operand::c16(0x3c00));
424 
425       //! p_unit_test 4
426       //! v1: %_:v[0] = v_mov_b32 0x43064205
427       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
428       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
429                  Operand::c16(0x4205), Operand::c16(0x4306));
430 
431       //! p_unit_test 5
432       //! v1: %_:v[0] = v_mov_b32 0x38004205
433       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
434       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
435                  Operand::c16(0x4205), Operand::c16(0x3800));
436 
437       /* 16-bit copy */
438       //! p_unit_test 6
439       //~gfx(9|10)! v2b: %_:v[0][0:16] = v_add_f16 0.5, 0 dst_sel:uword0 dst_preserve src0_sel:uword0 src1_sel:dword
440       //~gfx11! v2b: %0:v[0][0:16] = v_add_f16 0.5, 0
441       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6u));
442       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x3800));
443 
444       //! p_unit_test 7
445       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
446       //~gfx9! v1: %_:v[0] = v_or_b32 0x4205, %_:v[0]
447       //~gfx10! v2b: %_:v[0][0:16] = v_pack_b32_f16 0x4205, hi(%_:v[0][16:32])
448       //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0x4205
449       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7u));
450       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x4205));
451 
452       //! p_unit_test 8
453       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
454       //~gfx9! v1: %_:v[0] = v_or_b32 0x42050000, %_:v[0]
455       //~gfx10! v2b: %_:v[0][16:32] = v_pack_b32_f16 %_:v[0][0:16], 0x4205
456       //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0x4205 opsel_hi
457       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8u));
458       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0x4205));
459 
      /* Cases 9-10: the v2b definition starts at v0_b1 (reg_b advanced by 1). */
460       //! p_unit_test 9
461       //~gfx(9|10)! v1b: %_:v[0][8:16] = v_mov_b32 0 dst_sel:ubyte1 dst_preserve src0_sel:dword
462       //~gfx(9|10)! v1b: %_:v[0][16:24] = v_mov_b32 56 dst_sel:ubyte2 dst_preserve src0_sel:dword
463       //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c04
464       //~gfx11! v1: %_:v[0] = v_and_b32 0xff00ffff, %_:v[0]
465       //~gfx11! v1: %_:v[0] = v_or_b32 0x380000, %_:v[0]
466       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u));
467       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Operand::c16(0x3800));
468 
469       //! p_unit_test 10
470       //~gfx(9|10)! v1b: %_:v[0][8:16] = v_mov_b32 5 dst_sel:ubyte1 dst_preserve src0_sel:dword
471       //~gfx(9|10)! v1b: %_:v[0][16:24] = v_mul_u32_u24 2, 33 dst_sel:ubyte2 dst_preserve src0_sel:dword src1_sel:dword
472       //~gfx11! v1: %_:v[0] = v_and_b32 0xffff00ff, %_:v[0]
473       //~gfx11! v1: %_:v[0] = v_or_b32 0x500, %_:v[0]
474       //~gfx11! v1: %_:v[0] = v_and_b32 0xff00ffff, %_:v[0]
475       //~gfx11! v1: %_:v[0] = v_or_b32 0x420000, %_:v[0]
476       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10u));
477       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v2b), Operand::c16(0x4205));
478 
479       /* 8-bit copy */
480       //! p_unit_test 11
481       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mul_u32_u24 2, 33 dst_sel:ubyte0 dst_preserve src0_sel:dword src1_sel:dword
482       //~gfx11! v1: %_:v[0] = v_and_b32 0xffffff00, %_:v[0]
483       //~gfx11! v1: %_:v[0] = v_or_b32 0x42, %_:v[0]
484       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11u));
485       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::c8(0x42));
486 
487       /* 32-bit and 8-bit copy */
488       //! p_unit_test 12
489       //! v1: %_:v[0] = v_mov_b32 0
490       //~gfx(9|10)! v1b: %_:v[1][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword
491       //~gfx11! v1: %_:v[1] = v_perm_b32 %_:v[1], 0, 0x706050c
492       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u));
493       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b),
494                  Operand::zero(), Operand::zero(1));
495 
      /* Create a second block, reset the builder onto it and record it as the
       * linear successor of block 0. The remaining cases (13-17) are built
       * after this reset. */
496       bld.reset(program->create_and_insert_block());
497       program->blocks[0].linear_succs.push_back(1);
498       program->blocks[1].linear_preds.push_back(0);
499 
500       /* Prevent usage of v_pack_b32_f16, so we use v_perm_b32 instead. */
501       program->blocks[1].fp_mode.denorm16_64 = fp_denorm_flush;
502 
503       //>> p_unit_test 13
504       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0]
505       //~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0]
506       //~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d
507       //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0xff
508       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u));
509       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x00ff));
510 
511       //! p_unit_test 14
512       //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0]
513       //~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0]
514       //~gfx10! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504
515       //~gfx11! v2b: %0:v[0][16:32] = v_mov_b16 0xffffff00 opsel_hi
516       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u));
517       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0xff00));
518 
519       //! p_unit_test 15
520       //~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword
521       //~gfx11! v2b: %0:v[0][0:16] = v_mov_b16 0
522       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u));
523       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::zero(2));
524 
525       //! p_unit_test 16
526       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 -1 dst_sel:ubyte0 dst_preserve src0_sel:dword
527       //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x706050d
528       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16u));
529       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::c8(0xff));
530 
531       //! p_unit_test 17
532       //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword
533       //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x706050c
534       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17u));
535       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::zero(1));
536 
537       //! s_endpgm
538 
539       finish_to_hw_instr_test();
540    }
541 END_TEST
542 
/* Test: lowering of a single GFX9 p_parallelcopy whose copies chain through v1, v2, v3 and v7
 * (v[1:2] = v[2:3], v3 = v7, v7 = v1). The patterns below expect exactly three v_swap_b32
 * instructions followed by s_endpgm.
 * NOTE(review): the "//>>" and "//!" comments appear to be expected-output patterns read by the
 * test harness rather than prose; confirm before rewording them.
 */
543 BEGIN_TEST(to_hw_instr.self_intersecting_swap)
       /* Nothing to check when setup_cs() fails for GFX9. */
544    if (!setup_cs(NULL, GFX9))
545       return;
546 
       /* PhysReg 257, 258, 259 and 263 are printed as v[1], v[2], v[3] and v[7] in the patterns. */
547    PhysReg reg_v1{257};
548    PhysReg reg_v2{258};
549    PhysReg reg_v3{259};
550    PhysReg reg_v7{263};
551 
552    //>> p_unit_test 0
553    //! v1: %0:v[1],  v1: %0:v[2] = v_swap_b32 %0:v[2], %0:v[1]
554    //! v1: %0:v[2],  v1: %0:v[3] = v_swap_b32 %0:v[3], %0:v[2]
555    //! v1: %0:v[3],  v1: %0:v[7] = v_swap_b32 %0:v[7], %0:v[3]
556    //! s_endpgm
557    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
558    // v[1:2] = v[2:3]
559    // v3 = v7
560    // v7 = v1
       /* Three definitions followed by three operands, in the same order as the plain comments
        * above: the first pair is two dwords wide (v2), the other two are single dwords (v1). */
561    bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v1, v2), Definition(reg_v3, v1),
562               Definition(reg_v7, v1), Operand(reg_v2, v2), Operand(reg_v7, v1),
563               Operand(reg_v1, v1));
564 
565    finish_to_hw_instr_test();
566 END_TEST
567 
/* Test: lowering of p_extract, run for GFX7, GFX8, GFX9 and GFX11 and, per level, once with
 * is_signed = 0 (setup_cs() suffix "_unsigned") and once with is_signed = 1 ("_signed").
 * Three operand shapes are covered by three successive definitions of the EXT macro: dword
 * VGPRs, SGPRs with an additional scc definition, and 16-bit (v2b) VGPR halves.
 * NOTE(review): the "//>>", "//!", "//~" and "//;" comments appear to be expected-output
 * patterns and helper definitions read by the test harness, not prose; confirm before editing.
 */
568 BEGIN_TEST(to_hw_instr.extract)
569    PhysReg s0_lo{0};
570    PhysReg s1_lo{1};
571    PhysReg v0_lo{256};
572    PhysReg v1_lo{257};
573 
574    for (amd_gfx_level lvl : {GFX7, GFX8, GFX9, GFX11}) {
575       for (unsigned is_signed = 0; is_signed <= 1; is_signed++) {
             /* Skip this level/signedness combination when setup_cs() fails. */
576          if (!setup_cs(NULL, lvl, CHIP_UNKNOWN, is_signed ? "_signed" : "_unsigned"))
577             continue;
578 
    /* EXT(idx, size): p_extract from the dword at v1 into the dword at v0, with idx, size and
     * is_signed passed as 32-bit constant operands. */
579 #define EXT(idx, size)                                                                             \
580    bld.pseudo(aco_opcode::p_extract, Definition(v0_lo, v1), Operand(v1_lo, v1), Operand::c32(idx), \
581               Operand::c32(size), Operand::c32(is_signed));
582 
             /* Pattern helpers: each lambda picks the signed or unsigned spelling from the variant
              * suffix; the patterns below refer to them as @v_bfe, @v_shr, @s_bfe, @s_shr, @byte. */
583          //; funcs['v_bfe'] = lambda _: 'v_bfe_i32' if variant.endswith('_signed') else 'v_bfe_u32'
584          //; funcs['v_shr'] = lambda _: 'v_ashrrev_i32' if variant.endswith('_signed') else 'v_lshrrev_b32'
585          //; funcs['s_bfe'] = lambda _: 's_bfe_i32' if variant.endswith('_signed') else 's_bfe_u32'
586          //; funcs['s_shr'] = lambda _: 's_ashr_i32' if variant.endswith('_signed') else 's_lshr_b32'
587          //; funcs['byte'] = lambda n: '%cbyte%s' % ('s' if variant.endswith('_signed') else 'u', n)
588 
589          //>> p_unit_test 0
590          bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
591          //! v1: %_:v[0] = @v_bfe %_:v[1], 0, 8
592          EXT(0, 8)
593          //! v1: %_:v[0] = @v_bfe %_:v[1], 8, 8
594          EXT(1, 8)
595          //! v1: %_:v[0] = @v_bfe %_:v[1], 16, 8
596          EXT(2, 8)
597          //! v1: %_:v[0] = @v_shr 24, %_:v[1]
598          EXT(3, 8)
599          //~gfx(7|8|9)_.*! v1: %_:v[0] = @v_bfe %_:v[1], 0, 16
600          //~gfx11_unsigned! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1]
601          //~gfx11_signed! v1: %_:v[0] = v_cvt_i32_i16 %_:v[1]
602          EXT(0, 16)
603          //! v1: %_:v[0] = @v_shr 16, %_:v[1]
604          EXT(1, 16)
605 
606 #undef EXT
607 
    /* EXT(idx, size), SGPR form: p_extract from s1 into s0, with scc as a second definition. */
608 #define EXT(idx, size)                                                                             \
609    bld.pseudo(aco_opcode::p_extract, Definition(s0_lo, s1), Definition(scc, s1),                   \
610               Operand(s1_lo, s1), Operand::c32(idx), Operand::c32(size), Operand::c32(is_signed));
611 
612          //>> p_unit_test 2
613          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
614          //~gfx.*_unsigned! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80000
615          //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i8 %_:s[1]
616          EXT(0, 8)
617          //! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80008
618          EXT(1, 8)
619          //! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x80010
620          EXT(2, 8)
621          //! s1: %_:s[0],  s1: %_:scc = @s_shr %_:s[1], 24
622          EXT(3, 8)
623          //~gfx(7|8)_unsigned! s1: %_:s[0],  s1: %_:scc = @s_bfe %_:s[1], 0x100000
624          //~gfx(9|11)_unsigned! s1: %_:s[0] = s_pack_ll_b32_b16 %_:s[1], 0
625          //~gfx.*_signed! s1: %_:s[0] = s_sext_i32_i16 %_:s[1]
626          EXT(0, 16)
627          //! s1: %_:s[0],  s1: %_:scc = @s_shr %_:s[1], 16
628          EXT(1, 16)
629 
630 #undef EXT
631 
    /* EXT(idx, src_b), 16-bit form: 8-bit p_extract into the v2b at v0_lo, reading the v2b of v1
     * that starts src_b bytes into the register. */
632 #define EXT(idx, src_b)                                                                            \
633    bld.pseudo(aco_opcode::p_extract, Definition(v0_lo, v2b), Operand(v1_lo.advance(src_b), v2b),   \
634               Operand::c32(idx), Operand::c32(8u), Operand::c32(is_signed));
635 
636          //>> p_unit_test 4
637          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
638          //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 0, 8
639          //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0)
640          //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c00
641          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060000
642          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
643          EXT(0, 0)
644          //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2)
645          //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c02
646          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060202
647          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04
             /* The src_b = 2 cases are not emitted on GFX7; the patterns above have no gfx7 line. */
648          if (lvl != GFX7)
649             EXT(0, 2)
650          //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 8, 8
651          //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1)
652          //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c01
653          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060801
654          EXT(1, 0)
655          //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3)
656          //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c03
657          //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060903
658          if (lvl != GFX7)
659             EXT(1, 2)
660 
661 #undef EXT
662 
663          finish_to_hw_instr_test();
664 
665          //! s_endpgm
666       }
667    }
668 END_TEST
669 
/* Test: lowering of p_insert, run for GFX7, GFX8, GFX9 and GFX11.
 * Three operand shapes are covered by three successive definitions of the INS macro: dword
 * VGPRs, SGPRs with an additional scc definition, and 16-bit (v2b) VGPR halves.
 * NOTE(review): the "//>>", "//!" and "//~" comments appear to be expected-output patterns read
 * by the test harness, not prose; confirm before editing them.
 */
670 BEGIN_TEST(to_hw_instr.insert)
671    PhysReg s0_lo{0};
672    PhysReg s1_lo{1};
673    PhysReg v0_lo{256};
674    PhysReg v1_lo{257};
675 
676    for (amd_gfx_level lvl : {GFX7, GFX8, GFX9, GFX11}) {
          /* Skip this level when setup_cs() fails. */
677       if (!setup_cs(NULL, lvl))
678          continue;
679 
    /* INS(idx, size): p_insert from the dword at v1 into the dword at v0, with idx and size
     * passed as 32-bit constant operands. */
680 #define INS(idx, size)                                                                             \
681    bld.pseudo(aco_opcode::p_insert, Definition(v0_lo, v1), Operand(v1_lo, v1), Operand::c32(idx),  \
682               Operand::c32(size));
683 
684       //>> p_unit_test 0
685       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
686       //! v1: %_:v[0] = v_bfe_u32 %_:v[1], 0, 8
687       INS(0, 8)
688       //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8
689       //~gfx7! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0]
690       //~gfx(8|9)! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte1 src0_sel:dword
691       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0c000c
692       INS(1, 8)
693       //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8
694       //~gfx7! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[0]
695       //~gfx(8|9)! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte2 src0_sel:dword
696       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000c0c
697       INS(2, 8)
698       //! v1: %0:v[0] = v_lshlrev_b32 24, %0:v[1]
699       INS(3, 8)
700       //! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 16
701       INS(0, 16)
702       //! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[1]
703       INS(1, 16)
704 
705 #undef INS
706 
    /* INS(idx, size), SGPR form: p_insert from s1 into s0, with scc as a second definition. */
707 #define INS(idx, size)                                                                             \
708    bld.pseudo(aco_opcode::p_insert, Definition(s0_lo, s1), Definition(scc, s1),                    \
709               Operand(s1_lo, s1), Operand::c32(idx), Operand::c32(size));
710 
711       //>> p_unit_test 1
712       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
713       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
714       INS(0, 8)
715       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
716       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[0], 8
717       INS(1, 8)
718       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x80000
719       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[0], 16
720       INS(2, 8)
721       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[1], 24
722       INS(3, 8)
723       //! s1: %_:s[0],  s1: %_:scc = s_bfe_u32 %_:s[1], 0x100000
724       INS(0, 16)
725       //! s1: %_:s[0],  s1: %_:scc = s_lshl_b32 %_:s[1], 16
726       INS(1, 16)
727 
728 #undef INS
729 
    /* INS(idx, def_b), 16-bit form: 8-bit p_insert from the v2b at v1_lo into the v2b of v0 that
     * starts def_b bytes into the register. */
730 #define INS(idx, def_b)                                                                            \
731    bld.pseudo(aco_opcode::p_insert, Definition(v0_lo.advance(def_b), v2b), Operand(v1_lo, v2b),    \
732               Operand::c32(idx), Operand::c32(8u));
733 
734       //>> p_unit_test 2
735       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
736       //~gfx7! v2b: %_:v[0][0:16] = v_bfe_u32 %_:v[1][0:16], 0, 8
737       //~gfx(8|9)! v2b: %0:v[0][0:16] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0
738       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c00
739       INS(0, 0)
740       //~gfx(8|9)! v2b: %0:v[0][16:32] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0
741       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000504
          /* The def_b = 2 cases are not emitted on GFX7; the patterns above have no gfx7 line. */
742       if (lvl != GFX7)
743          INS(0, 2)
744       //~gfx7! v2b: %_:v[0][0:16] = v_lshlrev_b32 8, %_:v[1][0:16]
745       //~gfx(8|9)! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0
746       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706000c
747       INS(1, 0)
748       //~gfx(8|9)! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0
749       //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0504
750       if (lvl != GFX7)
751          INS(1, 2)
752 
753 #undef INS
754 
755       finish_to_hw_instr_test();
756 
757       //! s_endpgm
758    }
759 END_TEST
760 
/* Test: a GFX10 p_parallelcopy that copies s0 into scc together with a linear VGPR copy
 * v0 <- v1, with m0 assigned as the pseudo instruction's scratch_sgpr.
 * The patterns expect scc to be set from s0 first and saved to m0, then the VGPR move twice
 * with an s_not_b64 of exec after each, and finally scc recomputed from m0.
 * NOTE(review): the "//>>" and "//!" comments appear to be expected-output patterns read by the
 * test harness, not prose; confirm before editing them.
 */
761 BEGIN_TEST(to_hw_instr.copy_linear_vgpr_scc)
       /* Nothing to check when setup_cs() fails for GFX10. */
762    if (!setup_cs(NULL, GFX10))
763       return;
764 
765    PhysReg reg_s0{0};
766    PhysReg v0_lo{256};
       /* NOTE(review): v0_b3 is declared and offset by 3 bytes here but is not referenced by any
        * later statement in this test. */
767    PhysReg v0_b3{256};
768    v0_b3.reg_b += 3;
769    PhysReg v1_lo{257};
770 
771    //>> p_unit_test 0
772    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
773 
774    /* It would be better if the scc=s0 copy was done later, but handle_operands() is complex
775     * enough
776     */
777 
778    //! s1: %0:scc = s_cmp_lg_i32 %0:s[0], 0
779    //! s1: %0:m0 = s_mov_b32 %0:scc
780    //! lv1: %0:v[0] = v_mov_b32 %0:v[1]
781    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
782    //! lv1: %0:v[0] = v_mov_b32 %0:v[1]
783    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
784    //! s1: %0:scc = s_cmp_lg_i32 %0:m0, 0
       /* Definitions: scc and the linear-class v0. Operands: s0 and the linear-class v1. */
785    Instruction* instr =
786       bld.pseudo(aco_opcode::p_parallelcopy, Definition(scc, s1), Definition(v0_lo, v1.as_linear()),
787                  Operand(reg_s0, s1), Operand(v1_lo, v1.as_linear()));
       /* m0 is the register that the patterns above show holding the saved scc value. */
788    instr->pseudo().scratch_sgpr = m0;
789 
790    finish_to_hw_instr_test();
791 END_TEST
792 
/* Test: a GFX10 p_parallelcopy that exchanges two linear-class VGPRs (v0 <- v1 and v1 <- v0).
 * The patterns expect v_swap_b32 twice, each followed by an s_not_b64 of exec.
 * NOTE(review): the "//>>" and "//!" comments appear to be expected-output patterns read by the
 * test harness, not prose; confirm before editing them.
 */
793 BEGIN_TEST(to_hw_instr.swap_linear_vgpr)
       /* Nothing to check when setup_cs() fails for GFX10. */
794    if (!setup_cs(NULL, GFX10))
795       return;
796 
797    PhysReg reg_v0{256};
798    PhysReg reg_v1{257};
       /* Linear variant of the one-dword VGPR class; printed as "lv1" in the patterns below. */
799    RegClass v1_linear = v1.as_linear();
800 
801    //>> p_unit_test 0
802    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
803 
804    //! lv1: %0:v[0],  lv1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
805    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
806    //! lv1: %0:v[0],  lv1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0]
807    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
       /* Definitions v0, v1 are paired with operands v1, v0, i.e. the two registers trade values. */
808    Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v1_linear),
809                                    Definition(reg_v1, v1_linear), Operand(reg_v1, v1_linear),
810                                    Operand(reg_v0, v1_linear));
811    instr->pseudo().scratch_sgpr = m0;
812 
813    finish_to_hw_instr_test();
814 END_TEST
815 
/* Test: a GFX10 p_parallelcopy of a three-dword linear VGPR value from v[4-6] to v[0-2].
 * The patterns expect the copy to be split into a two-dword v_lshrrev_b64 by 0 for v[0-1] and
 * a v_mov_b32 for v[2], each emitted twice with an s_not_b64 of exec after every instance.
 * NOTE(review): the "//>>" and "//!" comments appear to be expected-output patterns read by the
 * test harness, not prose; confirm before editing them.
 */
816 BEGIN_TEST(to_hw_instr.copy_linear_vgpr_v3)
       /* Nothing to check when setup_cs() fails for GFX10. */
817    if (!setup_cs(NULL, GFX10))
818       return;
819 
820    PhysReg reg_v0{256};
821    PhysReg reg_v4{256 + 4};
       /* Linear variant of the three-dword VGPR class. */
822    RegClass v3_linear = v3.as_linear();
823 
824    //>> p_unit_test 0
825    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
826 
827    //! lv2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
828    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
829    //! lv2: %0:v[0-1] = v_lshrrev_b64 0, %0:v[4-5]
830    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
831    //! lv1: %0:v[2] = v_mov_b32 %0:v[6]
832    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
833    //! lv1: %0:v[2] = v_mov_b32 %0:v[6]
834    //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
835    Instruction* instr = bld.pseudo(aco_opcode::p_parallelcopy, Definition(reg_v0, v3_linear),
836                                    Operand(reg_v4, v3_linear));
837    instr->pseudo().scratch_sgpr = m0;
838 
839    finish_to_hw_instr_test();
840 END_TEST
841 
/* Test: p_parallelcopy instructions that write both 16-bit halves of v0 at once, one half from
 * a 16-bit half of v1 and the other from a 16-bit constant, on GFX10 and GFX11.
 * Each case below is expected to lower to a single instruction (see its pattern).
 * NOTE(review): the "//>>", "//!" and "//~" comments appear to be expected-output patterns read
 * by the test harness, not prose; confirm before editing them.
 */
842 BEGIN_TEST(to_hw_instr.pack2x16_constant)
       /* The *_hi registers are the *_lo registers advanced by two bytes (upper 16-bit half). */
843    PhysReg v0_lo{256};
844    PhysReg v0_hi{256};
845    PhysReg v1_lo{257};
846    PhysReg v1_hi{257};
847    v0_hi.reg_b += 2;
848    v1_hi.reg_b += 2;
849 
850    for (amd_gfx_level lvl : {GFX10, GFX11}) {
          /* Skip this level when setup_cs() fails. */
851       if (!setup_cs(NULL, lvl))
852          continue;
853 
854       /* prevent usage of v_pack_b32_f16 */
855       program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
856 
          /* Case 0: low half <- high half of v1, high half <- constant 0x3800. */
857       //>> p_unit_test 0
858       //! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2
859       bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
860       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
861                  Operand(v1_hi, v2b), Operand::c16(0x3800));
862 
          /* Case 1: low half <- high half of v1, high half <- zero. */
863       //! p_unit_test 1
864       //! v2b: %_:v[0][0:16] = v_lshrrev_b32 16, %_:v[1][16:32]
865       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
866       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
867                  Operand(v1_hi, v2b), Operand::zero(2));
868 
          /* Case 2: low half <- low half of v1, high half <- zero. */
869       //! p_unit_test 2
870       //~gfx10! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16]
871       //~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16]
872       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
873       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
874                  Operand(v1_lo, v2b), Operand::zero(2));
875 
          /* Case 3: low half <- zero, high half <- high half of v1. */
876       //! p_unit_test 3
877       //! v2b: %_:v[0][16:32] = v_and_b32 0xffff0000, %_:v[1][16:32]
878       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
879       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
880                  Operand::zero(2), Operand(v1_hi, v2b));
881 
          /* Case 4: low half <- zero, high half <- low half of v1. */
882       //! p_unit_test 4
883       //! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16]
884       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
885       bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b),
886                  Operand::zero(2), Operand(v1_lo, v2b));
887 
888       //! s_endpgm
889 
890       finish_to_hw_instr_test();
891    }
892 END_TEST
893