• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 #include "helpers.h"
25 
26 using namespace aco;
27 
// Verifies that the optimizer folds fneg()/fabs() into VALU source modifiers
// (e.g. "-%b", "|%a|") on GFX9 and GFX10, and turns add-of-negated-operand
// into v_sub/v_subrev. The //>>, //! and //~gfxN! comment lines are expected
// output patterns consumed by the test framework (see helpers.h) — they are
// assertions, not documentation, and must not be reworded.
28 BEGIN_TEST(optimize.neg)
29    for (unsigned i = GFX9; i <= GFX10; i++) {
30       //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
31       if (!setup_cs("v1 v1 s1 s1", (amd_gfx_level)i))
32          continue;
33 
34       //! v1: %res0 = v_mul_f32 %a, -%b
35       //! p_unit_test 0, %res0
36       Temp neg_b = fneg(inputs[1]);
37       writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
38 
      // On GFX9 the negation of an operand used with a literal constant is not
      // folded (kept as a separate v_mul by -1.0); GFX10 folds it.
39       //~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
40       //~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
41       //~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
42       //! p_unit_test 1, %res1
43       Temp neg_a = fneg(inputs[0]);
44       writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));
45 
      // Double negation cancels out entirely.
46       //! v1: %res2 = v_mul_f32 %a, %b
47       //! p_unit_test 2, %res2
48       Temp neg_neg_a = fneg(neg_a);
49       writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
50 
      // abs() of a negated value discards the negation.
51       //! v1: %res3 = v_mul_f32 |%a|, %b
52       //! p_unit_test 3, %res3
53       Temp abs_neg_a = fabs(neg_a);
54       writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
55 
      // neg(abs()) keeps both modifiers: -|%a|.
56       //! v1: %res4 = v_mul_f32 -|%a|, %b
57       //! p_unit_test 4, %res4
58       Temp abs_a = fabs(inputs[0]);
59       Temp neg_abs_a = fneg(abs_a);
60       writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
61 
      // Source modifiers must also combine with DPP instructions.
62       //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
63       //! p_unit_test 5, %res5
64       writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
65 
      // add(-a, b) / add(b, -a) become v_subrev / v_sub instead of using a
      // negation modifier.
66       //! v1: %res6 = v_subrev_f32 %a, %b
67       //! p_unit_test 6, %res6
68       writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
69 
70       //! v1: %res7 = v_sub_f32 %b, %a
71       //! p_unit_test 7, %res7
72       writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
73 
      // fneg of an SGPR copied into a VGPR still folds into the VALU source.
74       //! v1: %res8 = v_mul_f32 %a, -%c
75       //! p_unit_test 8, %res8
76       Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
77       writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
78 
      // NOTE(review): the two pattern checks below are deliberately disabled
      // (extra "// " prefix), so result 9 is emitted but not verified —
      // presumably abs(neg(abs)) is not yet optimized as hoped; confirm
      // against upstream before re-enabling.
79       // //! v1: %res9 = v_mul_f32 |%neg_a|, %b
80       // //! p_unit_test 9, %res9
81       Temp abs_neg_abs_a = fabs(neg_abs_a);
82       writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));
83 
84       finish_opt_test();
85    }
86 END_TEST
87 
// Verifies folding of output modifiers: multiplies by 0.5/2/4 become omod
// (*0.5, *2, *4) and med3(0, 1, x) becomes the clamp flag — and checks the
// cases where folding must NOT happen (denormals preserved, signed-zero
// preservation, multiple users, non-FP instructions). The //! / //>> comment
// lines are expected-output patterns parsed by the test framework.
88 BEGIN_TEST(optimize.output_modifiers)
89    //>> v1: %a, v1: %b = p_startpgm
90    if (!setup_cs("v1 v1", GFX9))
91       return;
92 
   // Flushing 16/64-bit denormals is required for the f16 omod folds below.
93    program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
94 
95    /* 32-bit modifiers */
96 
97    //! v1: %res0 = v_add_f32 %a, %b *0.5
98    //! p_unit_test 0, %res0
99    Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
100    writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));
101 
102    //! v1: %res1 = v_add_f32 %a, %b *2
103    //! p_unit_test 1, %res1
104    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
105    writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
106 
107    //! v1: %res2 = v_add_f32 %a, %b *4
108    //! p_unit_test 2, %res2
109    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
110    writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));
111 
   // med3(0.0, 1.0, x) is the canonical clamp-to-[0,1] form.
112    //! v1: %res3 = v_add_f32 %a, %b clamp
113    //! p_unit_test 3, %res3
114    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
115    writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
116                         Operand::c32(0x3f800000u), tmp));
117 
118    //! v1: %res4 = v_add_f32 %a, %b *2 clamp
119    //! p_unit_test 4, %res4
120    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
121    tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
122    writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
123                         Operand::c32(0x3f800000u), tmp));
124 
125    /* 16-bit modifiers */
126 
   // 0x3800/0x4000/0x4400 are the fp16 encodings of 0.5/2.0/4.0.
127    //! v2b: %res5 = v_add_f16 %a, %b *0.5
128    //! p_unit_test 5, %res5
129    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
130    writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));
131 
132    //! v2b: %res6 = v_add_f16 %a, %b *2
133    //! p_unit_test 6, %res6
134    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
135    writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
136 
137    //! v2b: %res7 = v_add_f16 %a, %b *4
138    //! p_unit_test 7, %res7
139    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
140    writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));
141 
142    //! v2b: %res8 = v_add_f16 %a, %b clamp
143    //! p_unit_test 8, %res8
144    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
145    writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
146                         Operand::c16(0x3c00u), tmp));
147 
148    //! v2b: %res9 = v_add_f16 %a, %b *2 clamp
149    //! p_unit_test 9, %res9
150    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
151    tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000), tmp);
152    writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
153                         Operand::c16(0x3c00u), tmp));
154 
155    /* clamping is done after omod */
156 
   // clamp-then-multiply cannot be fused (hardware applies omod before clamp),
   // so both instructions must survive.
157    //! v1: %res10_tmp = v_add_f32 %a, %b clamp
158    //! v1: %res10 = v_mul_f32 2.0, %res10_tmp
159    //! p_unit_test 10, %res10
160    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
161    tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
162                   tmp);
163    writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
164 
165    /* unsupported instructions */
166 
   // v_xor_b32 has no output modifiers, so *2 must stay a separate multiply.
167    //! v1: %res11_tmp = v_xor_b32 %a, %b
168    //! v1: %res11 = v_mul_f32 2.0, %res11_tmp
169    //! p_unit_test 11, %res11
170    tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
171    writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
172 
173    /* several users */
174 
   // The add result is observed directly, so folding omod into it would
   // change the other user's value.
175    //! v1: %res12_tmp = v_add_f32 %a, %b
176    //! p_unit_test %res12_tmp
177    //! v1: %res12 = v_mul_f32 2.0, %res12_tmp
178    //! p_unit_test 12, %res12
179    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
180    bld.pseudo(aco_opcode::p_unit_test, tmp);
181    writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
182 
   // The multiply result itself is dead; only the unmodified add survives.
183    //! v1: %res13 = v_add_f32 %a, %b
184    //! p_unit_test 13, %res13
185    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
186    bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
187    writeout(13, tmp);
188 
189    /* omod has no effect if denormals are enabled but clamp is fine */
190 
191    //>> BB1
192    //! /* logical preds: / linear preds: / kind: uniform, */
193    program->next_fp_mode.denorm32 = fp_denorm_keep;
194    program->next_fp_mode.denorm16_64 = fp_denorm_flush;
195    bld.reset(program->create_and_insert_block());
196 
   // NOTE(review): the second pattern refers to %res13_tmp although the temp
   // is named %res14_tmp on the line above — looks like a typo; the checker
   // binds %res13_tmp as a fresh wildcard so the test still passes, but the
   // check is weaker than intended. TODO confirm against upstream.
197    //! v1: %res14_tmp = v_add_f32 %a, %b
198    //! v1: %res14 = v_mul_f32 2.0, %res13_tmp
199    //! p_unit_test 14, %res14
200    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
201    writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
202 
203    //! v1: %res15 = v_add_f32 %a, %b clamp
204    //! p_unit_test 15, %res15
205    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
206    writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
207                          Operand::c32(0x3f800000u), tmp));
208 
209    //>> BB2
210    //! /* logical preds: / linear preds: / kind: uniform, */
211    program->next_fp_mode.denorm32 = fp_denorm_flush;
212    program->next_fp_mode.denorm16_64 = fp_denorm_keep;
213    bld.reset(program->create_and_insert_block());
214 
   // NOTE(review): same suspected typo as above — %res15_tmp vs %res16_tmp.
215    //! v2b: %res16_tmp = v_add_f16 %a, %b
216    //! v2b: %res16 = v_mul_f16 2.0, %res15_tmp
217    //! p_unit_test 16, %res16
218    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
219    writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
220 
221    //! v2b: %res17 = v_add_f16 %a, %b clamp
222    //! p_unit_test 17, %res17
223    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
224    writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
225                          Operand::c16(0x3c00u), tmp));
226 
227    /* omod flushes -0.0 to +0.0 */
228 
   // With signed-zero/inf/nan preservation enabled, omod must not be folded
   // (it flushes -0.0); clamp is still allowed.
229    //>> BB3
230    //! /* logical preds: / linear preds: / kind: uniform, */
231    program->next_fp_mode.denorm32 = fp_denorm_keep;
232    program->next_fp_mode.denorm16_64 = fp_denorm_keep;
233    program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;
234    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
235    bld.reset(program->create_and_insert_block());
236 
237    //! v1: %res18_tmp = v_add_f32 %a, %b
238    //! v1: %res18 = v_mul_f32 2.0, %res18_tmp
239    //! p_unit_test 18, %res18
240    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
241    writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
242    //! v1: %res19 = v_add_f32 %a, %b clamp
243    //! p_unit_test 19, %res19
244    tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
245    writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
246                          Operand::c32(0x3f800000u), tmp));
247 
   // Same check for the 16/64-bit flag.
248    //>> BB4
249    //! /* logical preds: / linear preds: / kind: uniform, */
250    program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
251    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;
252    bld.reset(program->create_and_insert_block());
253    //! v2b: %res20_tmp = v_add_f16 %a, %b
254    //! v2b: %res20 = v_mul_f16 2.0, %res20_tmp
255    //! p_unit_test 20, %res20
256    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
257    writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
258    //! v2b: %res21 = v_add_f16 %a, %b clamp
259    //! p_unit_test 21, %res21
260    tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
261    writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
262                          Operand::c16(0x3c00u), tmp));
263 
264    finish_opt_test();
265 END_TEST
266 
create_subbrev_co(Operand op0,Operand op1,Operand op2)267 Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
268 {
269    return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.def(bld.lm), op0, op1, op2);
270 }
271 
// Verifies that (v_subbrev_co 0, 0, cond) — which materializes 0/-1 from a
// divergent bool — combined with v_and_b32 is rewritten into a single
// v_cndmask_b32, including the cases where the combination must not happen
// (SGPR operand on GFX9, multiple users). The //! / //~gfxN! lines are
// expected-output patterns parsed by the test framework.
272 BEGIN_TEST(optimize.cndmask)
273    for (unsigned i = GFX9; i <= GFX10; i++) {
274       //>> v1: %a, s1: %b, s2: %c = p_startpgm
275       if (!setup_cs("v1 s1 s2", (amd_gfx_level)i))
276          continue;
277 
278       Temp subbrev;
279 
      // and(a, subbrev(0,0,c)) -> cndmask(0, a, c)
280       //! v1: %res0 = v_cndmask_b32 0, %a, %c
281       //! p_unit_test 0, %res0
282       subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
283       writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
284 
      // Also works with an inline-constant operand.
285       //! v1: %res1 = v_cndmask_b32 0, 42, %c
286       //! p_unit_test 1, %res1
287       subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
288       writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));
289 
      // With an SGPR operand (%b), GFX9 cannot form the cndmask; GFX10 can.
290       //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
291       //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
292       //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
293       //! p_unit_test 2, %res2
294       subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
295       writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
296 
      // The subbrev has a second user (the xor), so it must be kept; only the
      // final and() is turned into a cndmask.
297       //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
298       //! v1: %xor = v_xor_b32 %a, %subbrev1
299       //! v1: %res3 = v_cndmask_b32 0, %xor, %c
300       //! p_unit_test 3, %res3
301       subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
302       Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
303       writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
304 
      // 0 - cndmask(0, 1, c) is another way to materialize the 0/-1 mask and
      // must be recognized too.
305       //! v1: %res4 = v_cndmask_b32 0, %a, %c
306       //! p_unit_test 4, %res4
307       Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
308                                   Operand::c32(1u), Operand(inputs[2]));
309       Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
310       writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
311 
312       finish_opt_test();
313    }
314 END_TEST
315 
// Verifies fusing of shift+add chains: s_lshl3_add_u32 (GFX9+),
// v_lshl_add_u32 (GFX9+) and v_mad_u32_u24 (GFX8, when the shifted operand is
// provably 24/16-bit). The //! / //~gfxN! lines are expected-output patterns
// parsed by the test framework; //~gfx(9|10)! is a regex over variant names.
316 BEGIN_TEST(optimize.add_lshl)
317    for (unsigned i = GFX8; i <= GFX10; i++) {
318       //>> s1: %a, v1: %b = p_startpgm
319       if (!setup_cs("s1 v1", (amd_gfx_level)i))
320          continue;
321 
322       Temp shift;
323 
      // Scalar shift-by-3 + add folds into s_lshl3_add_u32 on GFX9/10.
324       //~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
325       //~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
326       //~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
327       //! p_unit_test 0, %res0
328       shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
329                        Operand::c32(3u));
330       writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
331                            Operand::c32(4u)));
332 
      // The shift has several users (scalar add and vector add); both fused
      // forms must still be produced where profitable.
333       //~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
334       //~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
335       //~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
336       //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
337       //~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
338       //~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
339       //~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
340       //! p_unit_test 1, %res1
341       shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
342                        Operand::c32(3u));
343       Temp sadd =
344          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
345       Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
346       writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));
347 
      // Scalar shift + vector add -> v_lshl_add_u32 on GFX9/10.
348       //~gfx8! s1: %lshl2 = s_lshl_b32 %a, 3
349       //~gfx8! v1: %res2,  s2: %_ = v_add_co_u32 %lshl2, %b
350       //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
351       //! p_unit_test 2, %res2
352       Temp lshl =
353          bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), Operand(inputs[0]), Operand::c32(3u));
354       writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
355 
      // A shift amount of 7 is too large for v_mad_u32_u24 on GFX8 even with a
      // 24-bit operand.
356       //~gfx8! s1: %lshl3 = s_lshl_b32 (is24bit)%a, 7
357       //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
358       //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
359       //! p_unit_test 3, %res3
360       Operand a_24bit = Operand(inputs[0]);
361       a_24bit.set24bit(true);
362       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(7u));
363       writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
364 
      // When the add's carry-out is used, fusing is not allowed.
365       //! s1: %lshl4 = s_lshl_b32 (is24bit)%a, 3
366       //~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
367       //~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
368       //! p_unit_test 4, %carry
369       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
370       Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
371       writeout(4, carry);
372 
      // Non-constant shift amount: fusable into v_lshl_add_u32 (GFX9/10) but
      // not into v_mad_u32_u24 (GFX8).
373       //~gfx8! s1: %lshl5 = s_lshl_b32 (is24bit)%a, (is24bit)%a
374       //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
375       //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
376       //! p_unit_test 5, %res5
377       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, a_24bit);
378       writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
379 
      // On GFX8 a small shift of a 24-bit value becomes a multiply-add:
      // (a << 3) + b == a * 8 + b.
380       //~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
381       //~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
382       //! p_unit_test 6, %res6
383       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_24bit, Operand::c32(3u));
384       writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
385 
      // A 16-bit operand allows a larger shift: (a << 4) + b == a * 16 + b.
386       //~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
387       //~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
388       //! p_unit_test 7, %res7
389       Operand a_16bit = Operand(inputs[0]);
390       a_16bit.set16bit(true);
391       lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), a_16bit, Operand::c32(4u));
392       writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));
393 
394       finish_opt_test();
395    }
396 END_TEST
397 
// Verifies folding of v_add into v_bcnt_u32_b32's second source (the bcnt
// addend): bcnt(x, 0) + y -> bcnt(x, y), including cases where the fold must
// not happen (VGPR addend, carry-out used). The //! / //~gfxN! comment lines
// are expected-output patterns parsed by the test framework.
398 BEGIN_TEST(optimize.bcnt)
399    for (unsigned i = GFX8; i <= GFX10; i++) {
400       //>> v1: %a, s1: %b = p_startpgm
401       if (!setup_cs("v1 s1", (amd_gfx_level)i))
402          continue;
403 
404       Temp bcnt;
405 
      // bcnt(a, 0) + a -> bcnt(a, a): the VGPR addend folds into the VOP3 form.
406       //! v1: %res0 = v_bcnt_u32_b32 %a, %a
407       //! p_unit_test 0, %res0
408       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
409       writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
410 
      // SGPR and constant addends fold as well.
411       //! v1: %res1 = v_bcnt_u32_b32 %a, %b
412       //! p_unit_test 1, %res1
413       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
414       writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));
415 
416       //! v1: %res2 = v_bcnt_u32_b32 %a, 42
417       //! p_unit_test 2, %res2
418       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
419       writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));
420 
      // Counting an SGPR (%b) and adding a VGPR: not folded, the add survives.
      // NOTE(review): the pattern names the temp %bnct3 but references %bcnt3
      // below (same for %bnct4/%bcnt4) — looks like a transposition typo; the
      // checker binds the mismatched name as a fresh wildcard, so the check is
      // weaker than intended. TODO confirm against upstream.
421       //! v1: %bnct3 = v_bcnt_u32_b32 %b, 0
422       //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
423       //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
424       //! p_unit_test 3, %res3
425       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
426       writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));
427 
      // The carry-out of the add is used, so the fold is not allowed.
428       //! v1: %bnct4 = v_bcnt_u32_b32 %a, 0
429       //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
430       //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
431       //! p_unit_test 4, %carry
432       bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
433       Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
434       writeout(4, carry);
435 
436       finish_opt_test();
437    }
438 END_TEST
439 
// One parameter set for the optimize.clamp test: which min/max/med3 opcodes
// to use and the clamp bounds. Members are initialized positionally by the
// clamp_configs table below, so their order must not change.
440 struct clamp_config {
441    const char *name;
442    aco_opcode min, max, med3;
443    Operand lb, ub;
444 };
445 
// Clamp ranges exercised by optimize.clamp, covering f32/f16/u32/u16/i32/i16
// min/max pairs. The hex constants are the bit patterns of the commented
// bounds (e.g. 0x40800000 == 4.0f, 0x4400 == fp16 4.0, 0xBC00 == fp16 -1.0).
// The "name" field becomes the test-variant suffix passed to setup_cs.
446 static const clamp_config clamp_configs[] = {
447    /* 0.0, 4.0 */
448    {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
449     Operand::zero(), Operand::c32(0x40800000u)},
450    {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
451     Operand::c16(0u), Operand::c16(0x4400)},
452    /* -1.0, 0.0 */
453    {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
454     Operand::c32(0xbf800000u), Operand::zero()},
455    {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
456     Operand::c16(0xBC00), Operand::c16(0u)},
457    /* 0, 3 */
458    {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
459     Operand::zero(), Operand::c32(3u)},
460    {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
461     Operand::c16(0u), Operand::c16(3u)},
462    {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
463     Operand::zero(), Operand::c32(3u)},
464    {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
465     Operand::c16(0u), Operand::c16(3u)},
466    /* -5, 0 */
467    {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
468     Operand::c32(0xfffffffbu), Operand::zero()},
469    {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
470     Operand::c16(0xfffbu), Operand::c16(0u)},
471 };
472 
// Verifies min(ub, max(lb, x)) -> med3(ub, lb, x) fusion for every
// clamp_config above, plus the cases where fusion must not happen (bounds in
// the wrong order, non-constant bounds) and the interaction with the
// "precise" flag. The @name placeholders in the //! patterns are substituted
// from the "cfg:" line printed below; //~f(16|32)! restricts a pattern to the
// float variants.
473 BEGIN_TEST(optimize.clamp)
474    for (clamp_config cfg : clamp_configs) {
475       if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
476          continue;
477 
      // Publish the per-config opcode names and bounds so the patterns below
      // can reference them as @min/@max/@med3/@lb/@ub.
478       //! cfg: @match_func(min max med3 lb ub)
479       fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
480       fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
481       fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
482       aco_print_operand(&cfg.lb, output);
483       fprintf(output, " ");
484       aco_print_operand(&cfg.ub, output);
485       fprintf(output, "\n");
486 
487       //>> v1: %a, v1: %b, v1: %c = p_startpgm
488 
      // min(ub, max(lb, a)) and max(lb, min(ub, a)) both fuse to med3.
489       //! v1: %res0 = @med3 @ub, @lb, %a
490       //! p_unit_test 0, %res0
491       writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
492                            bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
493 
494       //! v1: %res1 = @med3 @lb, @ub, %a
495       //! p_unit_test 1, %res1
496       writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
497                            bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));
498 
499       /* min constant must be greater than max constant */
500       //! v1: %res2_tmp = @min @lb, %a
501       //! v1: %res2 = @max @ub, %res2_tmp
502       //! p_unit_test 2, %res2
503       writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
504                            bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));
505 
506       //! v1: %res3_tmp = @max @ub, %a
507       //! v1: %res3 = @min @lb, %res3_tmp
508       //! p_unit_test 3, %res3
509       writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
510                            bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));
511 
512       /* needs two constants */
513 
      // With one or both bounds non-constant, no med3 is formed.
514       //! v1: %res4_tmp = @max @lb, %a
515       //! v1: %res4 = @min %b, %res4_tmp
516       //! p_unit_test 4, %res4
517       writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
518                            bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));
519 
520       //! v1: %res5_tmp = @max %b, %a
521       //! v1: %res5 = @min @ub, %res5_tmp
522       //! p_unit_test 5, %res5
523       writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
524                            bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));
525 
526       //! v1: %res6_tmp = @max %c, %a
527       //! v1: %res6 = @min %b, %res6_tmp
528       //! p_unit_test 6, %res6
529       writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
530                            bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));
531 
532       /* correct NaN behaviour with precise */
533       if (cfg.min == aco_opcode::v_min_f16 || cfg.min == aco_opcode::v_min_f32) {
534          //~f(16|32)! v1: %res7 = @med3 @ub, @lb, %a
535          //~f(16|32)! p_unit_test 7, %res7
536          Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
537          max.def(0).setPrecise(true);
538          Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
         // NOTE(review): this sets precise on "max" a second time — it looks
         // like it was meant to be min.def(0).setPrecise(true) so that both
         // halves of the clamp are precise; confirm against upstream before
         // changing, as fixing it may alter which fusion the pattern expects.
539          max.def(0).setPrecise(true);
540          writeout(7, min);
541 
         // Only the min is precise: fusion to med3 is not allowed, and the
         // precise flag must be preserved on the surviving min.
542          //~f(16|32)! v1: (precise)%res8_tmp = @min @ub, %a
543          //~f(16|32)! v1: %res8 = @max @lb, %res8_tmp
544          //~f(16|32)! p_unit_test 8, %res8
545          min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
546          min.def(0).setPrecise(true);
547          writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
548       }
549 
550       finish_opt_test();
551    }
552 END_TEST
553 
// Verifies rewriting of (isnan(a) || cmp) into a single unordered comparison
// and (!isnan(a) && cmp) into an ordered one, across f16/f32/f64, plus the
// cases that must not be rewritten (operand order, mismatched operands, NaN
// constants). The //! lines are expected-output patterns parsed by the test
// framework.
554 BEGIN_TEST(optimize.const_comparison_ordering)
555    //>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
556    if (!setup_cs("v1 v1 v2 v1", GFX9))
557       return;
558 
559    /* optimize to unordered comparison */
   // (a != a) || (4.0 < a)  ->  !(4.0 >= a)  (unordered nge)
560    //! s2: %res0 = v_cmp_nge_f32 4.0, %a
561    //! p_unit_test 0, %res0
562    writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
563                         bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
564                         bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
565                                  Operand::c32(0x40800000u), inputs[0])));
566 
   // An already-unordered comparison is absorbed unchanged.
567    //! s2: %res1 = v_cmp_nge_f32 4.0, %a
568    //! p_unit_test 1, %res1
569    writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
570                         bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
571                         bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
572                                  Operand::c32(0x40800000u), inputs[0])));
573 
   // Works when the constant is a literal materialized through a copy.
574    //! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
575    //! p_unit_test 2, %res2
576    writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
577                         bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
578                         bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
579                                  bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
580 
581    /* optimize to ordered comparison */
   // (a == a) && !(4.0 >= a)  ->  4.0 < a  (ordered lt)
582    //! s2: %res3 = v_cmp_lt_f32 4.0, %a
583    //! p_unit_test 3, %res3
584    writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
585                         bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
586                         bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
587                                  Operand::c32(0x40800000u), inputs[0])));
588 
589    //! s2: %res4 = v_cmp_lt_f32 4.0, %a
590    //! p_unit_test 4, %res4
591    writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
592                         bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
593                         bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
594                                  Operand::c32(0x40800000u), inputs[0])));
595 
596    //! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
597    //! p_unit_test 5, %res5
598    writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
599                         bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
600                         bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
601                                  bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));
602 
603    /* similar but unoptimizable expressions */
   // and(isnan, ordered-cmp) is not the ordered/unordered pattern: untouched.
604    //! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a
605    //! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a
606    //! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0
607    //! p_unit_test 6, %res6
608    Temp src1 =
609       bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
610    Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
611    writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
612 
   // or(!isnan, unordered-cmp) likewise stays as-is.
613    //! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a
614    //! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a
615    //! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
616    //! p_unit_test 7, %res7
617    src1 =
618       bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
619    src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
620    writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
621 
   // The NaN test and the comparison use different operands (%a vs %d):
   // rewriting would change semantics, so nothing is combined.
622    //! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d
623    //! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a
624    //! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0
625    //! p_unit_test 8, %res8
626    src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]);
627    src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
628    writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
629 
630    //! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a
631    //! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d
632    //! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0
633    //! p_unit_test 9, %res9
634    src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
635    src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);
636    writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
637 
638    /* bit sizes */
   // The rewrite also applies to f16 (low half of %b) and f64 (%c) compares.
639    //! s2: %res10 = v_cmp_nge_f16 4.0, %b
640    //! p_unit_test 10, %res10
641    Temp input1_16 =
642       bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero());
643    writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
644                          bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),
645                          bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u),
646                                   input1_16)));
647 
648    //! s2: %res11 = v_cmp_nge_f64 4.0, %c
649    //! p_unit_test 11, %res11
650    writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
651                          bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),
652                          bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm),
653                                   Operand::c64(0x4010000000000000u), inputs[2])));
654 
655    /* NaN */
   // When the compare constant is itself NaN, converting to an unordered
   // comparison would change the result, so the pattern must be left alone.
   // (nan64 is all-ones, which disassembles as -1 in the pattern below.)
656    uint16_t nan16 = 0x7e00;
657    uint32_t nan32 = 0x7fc00000;
658    uint64_t nan64 = 0xffffffffffffffffllu;
659 
660    //! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a
661    //! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a
662    //! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0
663    //! p_unit_test 12, %res12
664    src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]);
665    src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);
666    writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
667 
668    //! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a
669    //! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a
670    //! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0
671    //! p_unit_test 13, %res13
672    src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]);
673    src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
674    writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
675 
676    //! s2: %tmp14_0 = v_cmp_lt_f64 -1, %a
677    //! s2: %tmp14_1 = v_cmp_neq_f64 %a, %a
678    //! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0
679    //! p_unit_test 14, %res14
680    src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[0]);
681    src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[0], inputs[0]);
682    writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));
683 
684    finish_opt_test();
685 END_TEST
686 
BEGIN_TEST(optimize.add3)
   /* Checks that two chained v_add_u32 are fused into v_add3_u32 on GFX9,
    * and that a clamp modifier on either add blocks the fusion. */
   //>> v1: %a, v1: %b, v1: %c = p_startpgm
   if (!setup_cs("v1 v1 v1", GFX9))
      return;

   /* plain add(a, add(b, c)) -> v_add3_u32 */
   //! v1: %res0 = v_add3_u32 %a, %b, %c
   //! p_unit_test 0, %res0
   Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   /* clamp on the inner add: must NOT be fused (v_add3_u32 can't represent it) */
   //! v1: %tmp1 = v_add_u32 %b, %c clamp
   //! v1: %res1 = v_add_u32 %a, %tmp1
   //! p_unit_test 1, %res1
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp.instr->vop3().clamp = true;
   writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   /* clamp on the outer add: must NOT be fused either */
   //! v1: %tmp2 = v_add_u32 %b, %c
   //! v1: %res2 = v_add_u32 %a, %tmp2 clamp
   //! p_unit_test 2, %res2
   tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
   tmp.instr->vop3().clamp = true;
   writeout(2, tmp);

   finish_opt_test();
END_TEST
714 
BEGIN_TEST(optimize.minmax)
   /* Checks folding of fneg through min/max: -min(x, y) == max(-x, -y),
    * followed by combining min/max chains into v_max3_f32. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a = p_startpgm
      if (!setup_cs("v1", (amd_gfx_level)i))
         continue;

      /* max(0, -min(0, -a)): the two fnegs cancel when pushed through the min */
      //! v1: %res0 = v_max3_f32 0, -0, %a
      //! p_unit_test 0, %res0
      Temp xor0 = fneg(inputs[0]);
      Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);
      Temp xor1 = fneg(min);
      writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));

      /* max(0, -min(0, a)): a single fneg survives as a source modifier */
      //! v1: %res1 = v_max3_f32 0, -0, -%a
      //! p_unit_test 1, %res1
      min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));
      xor1 = fneg(min);
      writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));

      finish_opt_test();
   }
END_TEST
737 
BEGIN_TEST(optimize.mad_32_24)
   /* Checks that v_mul_u32_u24 + add is combined into v_mad_u32_u24,
    * but only when the add's carry output is unused. */
   for (unsigned i = GFX8; i <= GFX9; i++) {
      //>> v1: %a, v1: %b, v1: %c = p_startpgm
      if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
         continue;

      /* carry def unused -> fused into v_mad_u32_u24 */
      //! v1: %res0 = v_mad_u32_u24 %b, %c, %a
      //! p_unit_test 0, %res0
      Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));

      /* carry def is written out -> fusion must be blocked */
      //! v1: %res1_tmp = v_mul_u32_u24 %b, %c
      //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
      //! p_unit_test 1, %res1
      mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());

      finish_opt_test();
   }
END_TEST
758 
BEGIN_TEST(optimize.add_lshlrev)
   /* Checks combining (x << c) + y: into v_lshl_add_u32 on GFX9+, and on GFX8
    * into v_mad_u32_u24 when the shifted operand is known to be 16/24-bit
    * (the shift then fits a 24-bit multiply by 1 << c). */
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, s1: %c = p_startpgm
      if (!setup_cs("v1 v1 s1", (amd_gfx_level)i))
         continue;

      Temp lshl;

      /* no bit-size knowledge: GFX8 keeps the two instructions */
      //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
      //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
      //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
      //! p_unit_test 0, %res0
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
      writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* 24-bit operand but shift of 7: 24-bit value << 7 may exceed 24 bits,
       * so no v_mad_u32_u24 on GFX8 */
      //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
      //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
      //! p_unit_test 1, %res1
      Operand a_24bit = Operand(inputs[0]);
      a_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
      writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* variable shift amount: GFX8 can't fuse */
      //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
      //! p_unit_test 2, %res2
      Operand b_24bit = Operand(inputs[1]);
      b_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* 24-bit operand, small shift: GFX8 turns it into a*8 + b */
      //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
      //! p_unit_test 3, %res3
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* 16-bit operand: shift of 4 also fits the 24-bit multiply */
      //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
      //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
      //! p_unit_test 4, %res4
      Operand a_16bit = Operand(inputs[0]);
      a_16bit.set16bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
      writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      /* SGPR operand via VOP3-encoded shift also combines */
      //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
      //! p_unit_test 5, %res5
      Operand c_24bit = Operand(inputs[2]);
      c_24bit.set24bit(true);
      lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));

      finish_opt_test();
   }
END_TEST
817 
/* Arithmetic wrapper applied to the tested value in optimize.denorm_propagation.
 * Values are used as indices into denorm_op_names. */
enum denorm_op {
   denorm_mul1 = 0,    /* x * 1.0 */
   denorm_fneg = 1,    /* -x */
   denorm_fabs = 2,    /* |x| */
   denorm_fnegabs = 3, /* -|x| */
};
824 
/* Printable names for denorm_op, indexed by enum value; used to build the
 * per-configuration test variant name and expected output. */
static const char *denorm_op_names[] = {
   "mul1",
   "fneg",
   "fabs",
   "fnegabs",
};
831 
/* One configuration exercised by optimize.denorm_propagation. */
struct denorm_config {
   bool flush;     /* flush (fp_denorm_flush) vs keep 32-bit denormals */
   unsigned op;    /* denorm_op wrapper applied to the value */
   aco_opcode src; /* producer instruction, or num_opcodes for none */
   aco_opcode dest; /* consumer instruction, or num_opcodes for none */
};
838 
srcdest_op_name(aco_opcode op)839 static const char *srcdest_op_name(aco_opcode op)
840 {
841    switch (op) {
842    case aco_opcode::v_cndmask_b32:
843       return "cndmask";
844    case aco_opcode::v_min_f32:
845       return "min";
846    case aco_opcode::v_rcp_f32:
847       return "rcp";
848    default:
849       return "none";
850    }
851 }
852 
emit_denorm_srcdest(aco_opcode op,Temp val)853 static Temp emit_denorm_srcdest(aco_opcode op, Temp val)
854 {
855    switch (op) {
856    case aco_opcode::v_cndmask_b32:
857       return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
858    case aco_opcode::v_min_f32:
859       return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
860    case aco_opcode::v_rcp_f32:
861       return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
862    default:
863       return val;
864    }
865 }
866 
BEGIN_TEST(optimize.denorm_propagation)
   /* Checks whether fneg/fabs/mul-by-1.0 may be folded into neighbouring
    * instructions as source modifiers, depending on the denormal mode and on
    * which producer/consumer instructions surround the value. The //; block
    * below is an embedded pattern generator evaluated by the test checker. */
   for (unsigned i = GFX8; i <= GFX9; i++) {
      /* Build all combinations of {flush,keep} x wrapper op x producer/consumer. */
      std::vector<denorm_config> configs;
      for (bool flush : {false, true}) {
         for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
            configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});

         for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
               configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
         }

         for (aco_opcode src : {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
               configs.push_back({flush, op, src, aco_opcode::num_opcodes});
         }
      }

      for (denorm_config cfg : configs) {
         /* Each configuration runs as its own named subvariant. */
         char subvariant[128];
         sprintf(subvariant, "_%s_%s_%s_%s",
                 cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
                 denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
         if (!setup_cs("v1 s2", (amd_gfx_level)i, CHIP_UNKNOWN, subvariant))
            continue;

         /* Propagation is legal if the value is produced/consumed by an
          * instruction that flushes anyway (v_rcp_f32; v_min_f32 on GFX9+),
          * or if denormals are kept. */
         bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
                              cfg.dest == aco_opcode::v_rcp_f32 || (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) ||
                              !cfg.flush;

         fprintf(output, "src, dest, op: %s %s %s\n",
                 srcdest_op_name(cfg.src), srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
         fprintf(output, "can_propagate: %u\n", can_propagate);
         //! src, dest, op: $src $dest $op
         //! can_propagate: #can_propagate
         //>> v1: %a, s2: %b = p_startpgm

         //; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
         //;             'min': 'v1: %{} = v_min_f32 0, {}',
         //;             'rcp': 'v1: %{} = v_rcp_f32 {}'}
         //; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
         //;        'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
         //;        'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
         //;        'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
         //; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}

         //; name = 'a'
         //; if src != 'none':
         //;    insert_pattern(patterns[src].format('src_res', '%'+name))
         //;    name = 'src_res'

         //; if can_propagate:
         //;    name = inline_ops[op].format(name)
         //; else:
         //;    insert_pattern(ops[op].format('op_res', name))
         //;    name = '%op_res'

         //; if dest != 'none':
         //;    insert_pattern(patterns[dest].format('dest_res', name))
         //;    name = '%dest_res'

         //; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
         //! p_unit_test 0, %res

         program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;

         /* producer -> wrapper op -> consumer -> final cndmask that is written out */
         Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
         switch (cfg.op) {
         case denorm_mul1:
            val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
            break;
         case denorm_fneg:
            val = fneg(val);
            break;
         case denorm_fabs:
            val = fabs(val);
            break;
         case denorm_fnegabs:
            val = fneg(fabs(val));
            break;
         }
         val = emit_denorm_srcdest(cfg.dest, val);
         writeout(
            0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));

         finish_opt_test();
      }
   }
END_TEST
956 
BEGIN_TEST(optimizer.dpp)
   /* Checks folding of a DPP v_mov_b32 into its user: plain combine, operand
    * swapping (DPP only applies to src0), interaction with neg/abs/clamp
    * modifiers, VCC uses, and SGPR operands that block the combine. */
   //>> v1: %a, v1: %b, s2: %c, s1: %d = p_startpgm
   if (!setup_cs("v1 v1 s2 s1", GFX10_3))
      return;

   Operand a(inputs[0]);
   Operand b(inputs[1]);
   Operand c(inputs[2]);
   Operand d(inputs[3]);

   /* basic optimization */
   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1
   //! p_unit_test 0, %res0
   Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
   writeout(0, res0);

   /* operand swapping */
   /* v_sub_f32 with the DPP value as src1 becomes v_subrev_f32 so the DPP
    * operand lands in src0 */
   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1
   //! p_unit_test 1, %res1
   Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
   writeout(1, res1);

   /* the user is itself DPP: no combine possible */
   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1
   //! p_unit_test 2, %res2
   Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
   writeout(2, res2);

   /* modifiers */
   /* neg on the DPP mov is carried into the combined instruction */
   //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
   //! p_unit_test 3, %res3
   auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp3.instr->dpp16().neg[0] = true;
   Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
   writeout(3, res3);

   /* neg on the VOP3 user is preserved through the combine */
   //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1
   //! p_unit_test 4, %res4
   Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
   res4.instr->vop3().neg[0] = true;
   writeout(4, res4);

   /* clamp on the user blocks the combine */
   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res5 = v_add_f32 %tmp5, %b clamp
   //! p_unit_test 5, %res5
   Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
   res5.instr->vop3().clamp = true;
   writeout(5, res5);

   /* abs on the user overrides neg on the DPP mov (|-x| == |x|) */
   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1
   //! p_unit_test 6, %res6
   auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp6.instr->dpp16().neg[0] = true;
   auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
   res6.instr->vop3().abs[0] = true;
   writeout(6, res6);

   /* modifiers follow their operands when the combine swaps sources */
   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1
   //! p_unit_test 7, %res7
   Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
   res7.instr->vop3().abs[0] = true;
   writeout(7, res7);

   /* vcc */
   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1
   //! p_unit_test 8, %res8
   Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
   writeout(8, res8);

   /* sgprs */
   /* an SGPR operand in the user prevents folding the DPP mov */
   //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res9 = v_add_f32 %tmp9, %d
   //! p_unit_test 9, %res9
   Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
   writeout(9, res9);

   //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1
   //! v1: %res10 = v_add_f32 %d, %tmp10
   //! p_unit_test 10, %res10
   Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10);
   writeout(10, res10);

   finish_opt_test();
END_TEST
1050 
BEGIN_TEST(optimize.dpp_prop)
   /* Checks constant/SGPR copy-propagation into DPP instructions: the copy is
    * only propagated into the DPP source (src0) when the resulting encoding is
    * legal; into src1 it propagates freely. */
   //>> v1: %a, s1: %b = p_startpgm
   if (!setup_cs("v1 s1", GFX10))
      return;

   /* inline-constant copy into the DPP operand: DPP is dropped */
   //! v1: %one = p_parallelcopy 1
   //! v1: %res0 = v_mul_f32 1, %a
   //! p_unit_test 0, %res0
   Temp one = bld.copy(bld.def(v1), Operand::c32(1));
   writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));

   /* same copy as src1: DPP on src0 (%a) is kept, copy not propagated */
   //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1
   //! p_unit_test 1, %res1
   writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));

   /* literal copy into the DPP operand: DPP dropped, literal inlined */
   //! v1: %res2 = v_mul_f32 0x12345678, %a
   //! p_unit_test 2, %res2
   Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
   writeout(2, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));

   /* literal as src1 of a DPP instruction: not representable, copy stays */
   //! v1: %literal2 = p_parallelcopy 0x12345679
   //! v1: %res3 = v_mul_f32 %a, %literal row_shl:1 bound_ctrl:1
   //! p_unit_test 3, %res3
   Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
   writeout(3, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));

   /* SGPR-to-VGPR copy into the DPP operand: DPP dropped, SGPR used directly */
   //! v1: %b_v = p_parallelcopy %b
   //! v1: %res4 = v_mul_f32 %b, %a
   //! p_unit_test 4, %res4
   Temp b_v = bld.copy(bld.def(v1), inputs[1]);
   writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));

   /* SGPR copy as src1: kept as the VGPR copy, DPP preserved */
   //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1
   //! p_unit_test 5, %res5
   writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));

   /* VOP1 DPP with an SGPR copy source: DPP dropped, SGPR used directly */
   //! v1: %res6 = v_rcp_f32 %b
   //! p_unit_test 6, %res6
   writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));

   finish_opt_test();
END_TEST
1093 
BEGIN_TEST(optimize.casts)
   /* Checks that optimizations never combine across bit-casts between 16-bit
    * and 32-bit values (u2u16 / as_uniform): modifiers, omod/clamp and
    * constants must not leak from one bit size into an instruction of the
    * other bit size. */
   //>> v1: %a, v2b: %a16 = p_startpgm
   if (!setup_cs("v1 v2b", GFX10_3))
      return;

   Temp a = inputs[0];
   Temp a16 = inputs[1];

   program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

   /* 32-bit fneg must not become a modifier on the 16-bit mul */
   //! v1: %res0_tmp = v_mul_f32 -1.0, %a
   //! v2b: %res0 = v_mul_f16 %res0_tmp, %a16
   //! p_unit_test 0, %res0
   writeout(0, fmul(u2u16(fneg(a)), a16));

   /* 16-bit fneg must not become a modifier on the 32-bit mul */
   //! v2b: %res1_tmp = v_mul_f16 -1.0, %a16
   //! v1: %res1 = v_mul_f32 %res1_tmp, %a
   //! p_unit_test 1, %res1
   writeout(1, fmul(bld.as_uniform(fneg(a16)), a));

   //! v1: %res2_tmp = v_mul_f32 -1.0, %a16
   //! v2b: %res2 = v_mul_f16 %res2_tmp, %a16
   //! p_unit_test 2, %res2
   writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(a16))), a16));

   /* fsat across a cast stays a separate med3, not a clamp modifier */
   //! v1: %res3_tmp = v_mul_f32 %a, %a
   //! v2b: %res3 = v_med3_f16 0, 1.0, %res3_tmp
   //! p_unit_test 3, %res3
   writeout(3, fsat(u2u16(fmul(a, a))));

   //! v2b: %res4_tmp = v_mul_f16 %a16, %a16
   //! v1: %res4 = v_med3_f32 0, 1.0, %res4_tmp
   //! p_unit_test 4, %res4
   writeout(4, fsat(bld.as_uniform(fmul(a16, a16))));

   /* mul-by-2.0 across a cast must not become an omod on the other size */
   //! v1: %res5_tmp = v_mul_f32 %a, %a
   //! v2b: %res5 = v_mul_f16 2.0, %res5_tmp
   //! p_unit_test 5, %res5
   writeout(5, fmul(u2u16(fmul(a, a)), bld.copy(bld.def(v2b), Operand::c16(0x4000))));

   //! v2b: %res6_tmp = v_mul_f16 %a16, %a16
   //! v1: %res6 = v_mul_f32 2.0, %res6_tmp
   //! p_unit_test 6, %res6
   writeout(6, fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000))));

   /* mul+add across a cast must not fuse into an FMA */
   //! v1: %res7_tmp = v_mul_f32 %a, %a
   //! v2b: %res7 = v_add_f16 %res7_tmp, %a16
   //! p_unit_test 7, %res7
   writeout(7, fadd(u2u16(fmul(a, a)), a16));

   //! v2b: %res8_tmp = v_mul_f16 %a16, %a16
   //! v1: %res8 = v_add_f32 %res8_tmp, %a
   //! p_unit_test 8, %res8
   writeout(8, fadd(bld.as_uniform(fmul(a16, a16)), a));

   /* fneg after a cast stays on the cast's bit size */
   //! v1: %res9_tmp = v_mul_f32 %a, %a
   //! v2b: %res9 = v_mul_f16 -1.0, %res9_tmp
   //! p_unit_test 9, %res9
   writeout(9, fneg(u2u16(fmul(a, a))));

   //! v2b: %res10_tmp = v_mul_f16 %a16, %a16
   //! v1: %res10 = v_mul_f32 -1.0, %res10_tmp
   //! p_unit_test 10, %res10
   writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(fmul(a16, a16))));

   finish_opt_test();
END_TEST
1161 
BEGIN_TEST(optimize.mad_mix.input_conv.basic)
   /* Checks that an f16->f32 conversion on an input of fmul/fadd/fma is folded
    * into v_fma_mix_f32 with a lo() 16-bit source selector. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      /* a * f2f32(a16) -> fma_mix(a, lo(a16), -0) */
      //! v1: %res0 = v_fma_mix_f32 %a, lo(%a16), -0
      //! p_unit_test 0, %res0
      writeout(0, fmul(a, f2f32(a16)));

      /* a + f2f32(a16) -> fma_mix(1.0, a, lo(a16)) */
      //! v1: %res1 = v_fma_mix_f32 1.0, %a, lo(%a16)
      //! p_unit_test 1, %res1
      writeout(1, fadd(a, f2f32(a16)));

      //! v1: %res2 = v_fma_mix_f32 1.0, lo(%a16), %a
      //! p_unit_test 2, %res2
      writeout(2, fadd(f2f32(a16), a));

      //! v1: %res3 = v_fma_mix_f32 %a, %a, lo(%a16)
      //! p_unit_test 3, %res3
      writeout(3, fma(a, a, f2f32(a16)));

      /* NOTE(review): tests 3 and 4 build identical expressions — possibly one
       * was meant to place the conversion on a different operand; confirm
       * against upstream before changing. */
      //! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16)
      //! p_unit_test 4, %res4
      writeout(4, fma(a, a, f2f32(a16)));

      finish_opt_test();
   }
END_TEST
1196 
BEGIN_TEST(optimize.mad_mix.input_conv.precision)
   /* Checks that the "precise" flag controls folding of conversions into
    * v_fma_mix_f32: a precise f16 operation must not absorb an f32->f16
    * conversion, and a precise fma only folds on GFX10 where fma_mix keeps
    * full intermediate precision. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      /* precise arithmetic */
      //~gfx9! v1: %res0_cvt = v_cvt_f32_f16 %a16
      //~gfx9! v1: (precise)%res0 = v_fma_f32 %a, %a, %res0_cvt
      //~gfx10! v1: (precise)%res0 = v_fma_mix_f32 %a, %a, lo(%a16)
      //! p_unit_test 0, %res0
      writeout(0, fma(a, a, f2f32(a16), bld.precise()));

      //! v2b: %res1_cvt = v_cvt_f16_f32 %a
      //! v2b: (precise)%res1 = v_mul_f16 %a16, %res1_cvt
      //! p_unit_test 1, %res1
      writeout(1, fmul(a16, f2f16(a), bld.precise()));

      //! v2b: %res2_cvt = v_cvt_f16_f32 %a
      //! v2b: (precise)%res2 = v_add_f16 %a16, %res2_cvt
      //! p_unit_test 2, %res2
      writeout(2, fadd(a16, f2f16(a), bld.precise()));

      //! v2b: %res3_cvt = v_cvt_f16_f32 %a
      //! v2b: (precise)%res3 = v_fma_f16 %a16, %a16, %res3_cvt
      //! p_unit_test 3, %res3
      writeout(3, fma(a16, a16, f2f16(a), bld.precise()));

      /* precise conversions */
      /* a precise f32->f16 conversion itself must also stay un-folded */
      //! v2b: (precise)%res4_cvt = v_cvt_f16_f32 %a
      //! v2b: %res4 = v_mul_f16 %a16, %res4_cvt
      //! p_unit_test 4, %res4
      writeout(4, fmul(a16, f2f16(a, bld.precise())));

      //! v2b: (precise)%res5_cvt = v_cvt_f16_f32 %a
      //! v2b: %res5 = v_add_f16 %a16, %res5_cvt
      //! p_unit_test 5, %res5
      writeout(5, fadd(a16, f2f16(a, bld.precise())));

      //! v2b: (precise)%res6_cvt = v_cvt_f16_f32 %a
      //! v2b: %res6 = v_fma_f16 %a16, %a16, %res6_cvt
      //! p_unit_test 6, %res6
      writeout(6, fma(a16, a16, f2f16(a, bld.precise())));

      finish_opt_test();
   }
END_TEST
1249 
BEGIN_TEST(optimize.mad_mix.input_conv.modifiers)
   /* Checks that neg/abs modifiers and 16-bit extract (sdwa-like) selections
    * combine correctly with f16->f32 conversions folded into v_fma_mix_f32. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      /* check whether modifiers are preserved when converting to VOP3P */
      //! v1: %res0 = v_fma_mix_f32 -%a, lo(%a16), -0
      //! p_unit_test 0, %res0
      writeout(0, fmul(fneg(a), f2f32(a16)));

      //! v1: %res1 = v_fma_mix_f32 |%a|, lo(%a16), -0
      //! p_unit_test 1, %res1
      writeout(1, fmul(fabs(a), f2f32(a16)));

      /* fneg modifiers */
      /* fneg outside or inside the conversion ends up as the same modifier */
      //! v1: %res2 = v_fma_mix_f32 %a, -lo(%a16), -0
      //! p_unit_test 2, %res2
      writeout(2, fmul(a, fneg(f2f32(a16))));

      //! v1: %res3 = v_fma_mix_f32 %a, -lo(%a16), -0
      //! p_unit_test 3, %res3
      writeout(3, fmul(a, f2f32(fneg(a16))));

      /* fabs modifiers */
      //! v1: %res4 = v_fma_mix_f32 %a, |lo(%a16)|, -0
      //! p_unit_test 4, %res4
      writeout(4, fmul(a, fabs(f2f32(a16))));

      //! v1: %res5 = v_fma_mix_f32 %a, |lo(%a16)|, -0
      //! p_unit_test 5, %res5
      writeout(5, fmul(a, f2f32(fabs(a16))));

      /* both fabs and fneg modifiers */
      /* stacked modifiers follow the usual algebra: |-x| = |x|, -|x| etc. */
      //! v1: %res6 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
      //! p_unit_test 6, %res6
      writeout(6, fmul(a, fneg(f2f32(fabs(a16)))));

      //! v1: %res7 = v_fma_mix_f32 %a, |lo(%a16)|, -0
      //! p_unit_test 7, %res7
      writeout(7, fmul(a, fabs(f2f32(fabs(a16)))));

      //! v1: %res8 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
      //! p_unit_test 8, %res8
      writeout(8, fmul(a, fneg(fabs(f2f32(fabs(a16))))));

      //! v1: %res9 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
      //! p_unit_test 9, %res9
      writeout(9, fmul(a, f2f32(fneg(fabs(a16)))));

      //! v1: %res10 = v_fma_mix_f32 %a, |lo(%a16)|, -0
      //! p_unit_test 10, %res10
      writeout(10, fmul(a, fneg(f2f32(fneg(fabs(a16))))));

      //! v1: %res11 = v_fma_mix_f32 %a, |lo(%a16)|, -0
      //! p_unit_test 11, %res11
      writeout(11, fmul(a, fabs(f2f32(fneg(fabs(a16))))));

      //! v1: %res12 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
      //! p_unit_test 12, %res12
      writeout(12, fmul(a, fneg(fabs(f2f32(fneg(fabs(a16)))))));

      /* sdwa */
      /* extracting the low/high half before conversion maps to lo()/hi() */
      //! v1: %res13 = v_fma_mix_f32 lo(%a), %a, -0
      //! p_unit_test 13, %res13
      writeout(13, fmul(f2f32(ext_ushort(a, 0)), a));

      //! v1: %res14 = v_fma_mix_f32 hi(%a), %a, -0
      //! p_unit_test 14, %res14
      writeout(14, fmul(f2f32(ext_ushort(a, 1)), a));

      /* extracting AFTER the conversion can't use the mix selectors */
      //! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword
      //! v1: %res15 = v_mul_f32 %res15_cvt, %a
      //! p_unit_test 15, %res15
      writeout(15, fmul(ext_ushort(f2f32(a), 0), a));

      //! v1: %res16_cvt = v_cvt_f32_f16 %a
      //! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword
      //! p_unit_test 16, %res16
      writeout(16, fmul(ext_ushort(f2f32(a), 1), a));

      /* byte-granular extracts have no mix selector either */
      //! v1: %res17_cvt = v_cvt_f32_f16 %a dst_sel:dword src0_sel:ubyte2
      //! v1: %res17 = v_mul_f32 %res17_cvt, %a
      //! p_unit_test 17, %res17
      writeout(17, fmul(f2f32(ext_ubyte(a, 2)), a));

      finish_opt_test();
   }
END_TEST
1344 
BEGIN_TEST(optimize.mad_mix.output_conv.basic)
   /* Checks that an f32->f16 conversion of the RESULT of fmul/fadd/fma is
    * folded into v_fma_mixlo_f16, optionally together with f16->f32 input
    * conversions. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp b16 = inputs[4];

      /* f2f16 of mul/add/fma results */
      //! v2b: %res0 = v_fma_mixlo_f16 %a, %b, -0
      //! p_unit_test 0, %res0
      writeout(0, f2f16(fmul(a, b)));

      //! v2b: %res1 = v_fma_mixlo_f16 1.0, %a, %b
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fadd(a, b)));

      //! v2b: %res2 = v_fma_mixlo_f16 %a, %b, %c
      //! p_unit_test 2, %res2
      writeout(2, f2f16(fma(a, b, c)));

      /* input and output conversions folded into the same instruction */
      //! v2b: %res3 = v_fma_mixlo_f16 lo(%a16), %b, -0
      //! p_unit_test 3, %res3
      writeout(3, f2f16(fmul(f2f32(a16), b)));

      //! v2b: %res4 = v_fma_mixlo_f16 1.0, %a, lo(%b16)
      //! p_unit_test 4, %res4
      writeout(4, f2f16(fadd(a, f2f32(b16))));

      //! v2b: %res5 = v_fma_mixlo_f16 %a, lo(%b16), %c
      //! p_unit_test 5, %res5
      writeout(5, f2f16(fma(a, f2f32(b16), c)));

      finish_opt_test();
   }
END_TEST
1386 
BEGIN_TEST(optimize.mad_mix.output_conv.precision)
   /* Checks that the "precise" flag on either the conversion or the f16
    * arithmetic prevents folding into v_fma_mixlo_f16 / fma_mix forms. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v2b: %a16 = p_startpgm
      if (!setup_cs("v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a16 = inputs[0];

      /* precise f16->f32 conversion of an f16 mul: stays separate */
      //! v2b: %res0_tmp = v_mul_f16 %a16, %a16
      //! v1: (precise)%res0 = v_cvt_f32_f16 %res0_tmp
      //! p_unit_test 0, %res0
      writeout(0, f2f32(fmul(a16, a16), bld.precise()));

      /* precise f16 mul converted to f32: stays separate as well */
      //! v2b: (precise)%res1_tmp = v_mul_f16 %a16, %a16
      //! v1: %res1 = v_cvt_f32_f16 %res1_tmp
      //! p_unit_test 1, %res1
      writeout(1, f2f32(fmul(a16, a16, bld.precise())));

      finish_opt_test();
   }
END_TEST
1410 
BEGIN_TEST(optimize.mad_mix.output_conv.modifiers)
   /* Checks that neg/abs modifiers and sub-dword extracts on the value being
    * converted block or interact correctly with the output-conversion fold
    * into v_fma_mixlo_f16. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp a16 = inputs[2];
      Temp b16 = inputs[3];

      /* fneg/fabs */
      /* a modifier between the add and the conversion: the conversion keeps
       * the modifier and no fma_mixlo is formed */
      //! v1: %res0_add = v_add_f32 %1, %2
      //! v2b: %res0 = v_cvt_f16_f32 |%res0_add|
      //! p_unit_test 0, %res0
      writeout(0, f2f16(fabs(fadd(a, b))));

      //! v1: %res1_add = v_add_f32 %1, %2
      //! v2b: %res1 = v_cvt_f16_f32 -%res1_add
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fneg(fadd(a, b))));

      //! v2b: %res2_add = v_add_f16 %3, %4
      //! v1: %res2 = v_cvt_f32_f16 |%res2_add|
      //! p_unit_test 2, %res2
      writeout(2, f2f32(fabs(fadd(a16, b16))));

      //! v2b: %res3_add = v_add_f16 %3, %4
      //! v1: %res3 = v_cvt_f32_f16 -%res3_add
      //! p_unit_test 3, %res3
      writeout(3, f2f32(fneg(fadd(a16, b16))));

      /* sdwa */
      /* extract applied after the converted result: fma_mixlo still forms,
       * extract stays separate */
      //! v2b: %res4_add = v_fma_mixlo_f16 1.0, %a, %b
      //! v2b: %res4 = p_extract %res4_add, 0, 8, 0
      //! p_unit_test 4, %res4
      writeout(4, ext_ubyte(f2f16(fadd(a, b)), 0));

      /* extract between the add and the conversion blocks the fold */
      //! v1: %res5_mul = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword
      //! v2b: %res5 = v_cvt_f16_f32 %res5_mul
      //! p_unit_test 5, %res5
      writeout(5, f2f16(ext_ushort(fadd(a, b), 0)));

      finish_opt_test();
   }
END_TEST
1459 
BEGIN_TEST(optimize.mad_mix.fma.basic)
   /* Basic mul+add -> v_fma_mix_f32 combining with mixed 16/32-bit operands,
    * including how neg/abs modifiers distribute over the fused multiply. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %c16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp c16 = inputs[4];

      /* A converted operand can feed either the multiply or the addend. */
      //! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
      //! p_unit_test 0, %res0
      writeout(0, fadd(fmul(f2f32(a16), b), c));

      //! v1: %res1 = v_fma_mix_f32 %a, %b, lo(%c16)
      //! p_unit_test 1, %res1
      writeout(1, fadd(fmul(a, b), f2f32(c16)));

      /* omod/clamp check: the add stays separate (with *2 omod) rather than
       * being folded into the mix instruction. */
      //! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, -0
      //! v1: %res2 = v_add_f32 %res2_mul, %c *2
      //! p_unit_test 2, %res2
      writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000), fadd(fmul(f2f32(a16), b), c)));

      /* neg/abs modifiers: abs of a product cancels inner negs; neg of a
       * product moves onto one factor. */
      //! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)|
      //! p_unit_test 3, %res3
      writeout(3, fadd(fmul(fneg(f2f32(a16)), b), fabs(f2f32(c16))));

      //! v1: %res4 = v_fma_mix_f32 |%a|, |%b|, lo(%c16)
      //! p_unit_test 4, %res4
      writeout(4, fadd(fabs(fmul(fneg(a), fneg(b))), f2f32(c16)));

      //! v1: %res5 = v_fma_mix_f32 %a, -%b, lo(%c16)
      //! p_unit_test 5, %res5
      writeout(5, fadd(fneg(fmul(a, b)), f2f32(c16)));

      //! v1: %res6 = v_fma_mix_f32 |%a|, -|%b|, lo(%c16)
      //! p_unit_test 6, %res6
      writeout(6, fadd(fneg(fabs(fmul(fneg(a), fneg(b)))), f2f32(c16)));

      /* output conversions: f2f16 of the fma result selects the mixlo form */
      //! v2b: %res7 = v_fma_mixlo_f16 %a, %b, %c
      //! p_unit_test 7, %res7
      writeout(7, f2f16(fadd(fmul(a, b), c)));

      finish_opt_test();
   }
END_TEST
1513 
BEGIN_TEST(optimize.mad_mix.fma.precision)
   /* Precision rules for mad-mix combining: when "precise" is requested, the
    * fused form is only allowed where it rounds identically, and 16-bit
    * arithmetic is never silently promoted to 32-bit. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp b16 = inputs[4];

      /* the optimization is precise for 32-bit on GFX9, so fusing is allowed
       * there but not on GFX10 */
      //~gfx9! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
      //~gfx10! v1: (precise)%res0_tmp = v_fma_mix_f32 lo(%a16), %b, -0
      //~gfx10! v1: %res0 = v_add_f32 %res0_tmp, %c
      //! p_unit_test 0, %res0
      writeout(0, fadd(fmul(f2f32(a16), b, bld.precise()), c));

      //~gfx9! v1: (precise)%res1 = v_fma_mix_f32 lo(%a16), %b, %c
      //~gfx10! v1: %res1_tmp = v_fma_mix_f32 lo(%a16), %b, -0
      //~gfx10! v1: (precise)%res1 = v_add_f32 %res1_tmp, %c
      //! p_unit_test 1, %res1
      writeout(1, fadd(fmul(f2f32(a16), b), c, bld.precise()));

      /* never promote 16-bit arithmetic to 32-bit */
      //! v2b: %res2_tmp = v_cvt_f16_f32 %a
      //! v2b: %res2 = v_add_f16 %res2_tmp, %b16
      //! p_unit_test 2, %res2
      writeout(2, fadd(f2f16(a), b16));

      //! v2b: %res3_tmp = v_cvt_f16_f32 %a
      //! v2b: %res3 = v_mul_f16 %res3_tmp, %b16
      //! p_unit_test 3, %res3
      writeout(3, fmul(f2f16(a), b16));

      //! v2b: %res4_tmp = v_mul_f16 %a16, %b16
      //! v1: %res4 = v_cvt_f32_f16 %res4_tmp
      //! p_unit_test 4, %res4
      writeout(4, f2f32(fmul(a16, b16)));

      //! v2b: %res5_tmp = v_add_f16 %a16, %b16
      //! v1: %res5 = v_cvt_f32_f16 %res5_tmp
      //! p_unit_test 5, %res5
      writeout(5, f2f32(fadd(a16, b16)));

      //! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, -0
      //! v2b: %res6 = v_add_f16 %res6_tmp, %a16
      //! p_unit_test 6, %res6
      writeout(6, fadd(f2f16(fmul(a, b)), a16));

      /* the f16 multiply stays f16; only the conversion of its result is
       * folded into the mix add */
      //! v2b: %res7_tmp = v_mul_f16 %a16, %b16
      //! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
      //! p_unit_test 7, %res7
      writeout(7, fadd(f2f32(fmul(a16, b16)), c));

      finish_opt_test();
   }
END_TEST
1575 
BEGIN_TEST(optimize.mad_mix.clamp)
   /* fsat must be folded into the mad-mix instruction as the clamp modifier,
    * regardless of whether the saturate is applied before or after the
    * f32->f16 output conversion. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      //! v1: %res0 = v_fma_mix_f32 lo(%a16), %a, -0 clamp
      //! p_unit_test 0, %res0
      writeout(0, fsat(fmul(f2f32(a16), a)));

      //! v2b: %res1 = v_fma_mixlo_f16 %a, %a, -0 clamp
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fsat(fmul(a, a))));

      /* same result whether fsat wraps the conversion or the multiply */
      //! v2b: %res2 = v_fma_mixlo_f16 %a, %a, -0 clamp
      //! p_unit_test 2, %res2
      writeout(2, fsat(f2f16(fmul(a, a))));

      finish_opt_test();
   }
END_TEST
1602 
BEGIN_TEST(optimize.mad_mix.cast)
   /* Bit-casts between register classes (v2b extract of a v1, p_as_uniform)
    * sit between the conversion and the arithmetic; the mad-mix combining
    * must respect them instead of fusing across an incompatible cast. */
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      /* The optimizer copy-propagates v2b=p_extract_vector(v1, 0) and p_as_uniform, so the
       * optimizer has to check compatibility.
       */

      /* a 16-bit view of an f32 conversion result cannot feed f16 arithmetic
       * as if it were the converted value */
      //! v1: %res0_cvt = v_cvt_f32_f16 %a16
      //! v2b: %res0 = v_mul_f16 %res0_cvt, %a16
      //! p_unit_test 0, %res0
      writeout(0, fmul(u2u16(f2f32(a16)), a16));

      //! v2b: %res1_cvt = v_cvt_f16_f32 %a
      //! v1: %res1 = v_mul_f32 %res1_cvt, %a
      //! p_unit_test 1, %res1
      writeout(1, fmul(bld.as_uniform(f2f16(a)), a));

      //! v2b: %res2_mul = v_mul_f16 %a16, %a16
      //! v2b: %res2 = v_cvt_f16_f32 %res2_mul
      //! p_unit_test 2, %res2
      writeout(2, f2f16(bld.as_uniform(fmul(a16, a16))));

      //! v1: %res3_mul = v_mul_f32 %a, %a
      //! v1: %res3 = v_cvt_f32_f16 %res3_mul
      //! p_unit_test 3, %res3
      writeout(3, f2f32(u2u16(fmul(a, a))));

      /* fsat through a cast: the clamp becomes a separate v_med3 of the other
       * precision instead of a clamp modifier on the mix instruction */
      //! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0
      //! v2b: %res4 = v_med3_f16 0, 1.0, %res4_mul
      //! p_unit_test 4, %res4
      writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));

      //! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0
      //! v1: %res5 = v_med3_f32 0, 1.0, %res5_mul
      //! p_unit_test 5, %res5
      writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));

      //! v1: %res6_mul = v_mul_f32 %a, %a
      //! v1: %res6 = v_fma_mix_f32 1.0, lo(%res6_mul), %a
      //! p_unit_test 6, %res6
      writeout(6, fadd(f2f32(u2u16(fmul(a, a))), a));

      //! v2b: %res7_mul = v_mul_f16 %a16, %a16
      //! v1: %res7 = v_fma_mix_f32 1.0, %res7_mul, lo(%a16)
      //! p_unit_test 7, %res7
      writeout(7, fadd(bld.as_uniform(fmul(a16, a16)), f2f32(a16)));

      /* opsel_hi should be obtained from the original opcode, not the operand regclass */
      //! v1: %res8 = v_fma_mix_f32 lo(%a16), %a16, -0
      //! p_unit_test 8, %res8
      writeout(8, fmul(f2f32(a16), a16));

      finish_opt_test();
   }
END_TEST
1666 
vop3p_constant(unsigned * idx,aco_opcode op,const char * swizzle,uint32_t val)1667 static void vop3p_constant(unsigned *idx, aco_opcode op, const char *swizzle, uint32_t val)
1668 {
1669    uint32_t halves[2] = {val & 0xffff, val >> 16};
1670    uint32_t expected = halves[swizzle[0] - 'x'] | (halves[swizzle[1] - 'x'] << 16);
1671    fprintf(output, "Expected for %u: 0x%.8x / %u\n", *idx, expected, expected);
1672 
1673    unsigned opsel_lo = swizzle[0] == 'x' ? 0x0 : 0x1;
1674    unsigned opsel_hi = swizzle[1] == 'x' ? 0x2 : 0x3;
1675    writeout((*idx)++, bld.vop3p(op, bld.def(v1), bld.copy(bld.def(v1), Operand::c32(val)),
1676                                 inputs[0], opsel_lo, opsel_hi));
1677 }
1678 
BEGIN_TEST(optimize.vop3p_constants)
   /* Exhaustive check of VOP3P constant handling: for each opcode/swizzle
    * variant, 36 constants are emitted via vop3p_constant() and the embedded
    * checker script verifies (a) that the operand the optimizer picked still
    * evaluates to the expected swizzled value, and (b) that any literal used
    * is one that genuinely cannot be encoded as an inline constant. */
   for (aco_opcode op : {aco_opcode::v_pk_add_f16, aco_opcode::v_pk_add_u16}) {
      for (const char *swizzle : {"xx", "yy", "xy", "yx"}) {
         char variant[16];
         strcpy(variant, op == aco_opcode::v_pk_add_f16 ? "_f16" : "_u16");
         strcat(variant, "_");
         strcat(variant, swizzle);

         //; for i in range(36):
         //;    insert_pattern('Expected for %u: $_ / #expected%u' % (i, i))

         //>> v1: %a = p_startpgm
         if (!setup_cs("v1", GFX10_3, CHIP_UNKNOWN, variant))
            continue;

         //; opcode = 'v_pk_add_u16' if 'u16' in variant else 'v_pk_add_f16'
         //; for i in range(36):
         //;    insert_pattern('v1: %%res%u = %s $got%u %%a' % (i, opcode, i))
         //;    insert_pattern('p_unit_test %u, %%res%u' % (i, i))
         //! s_endpgm

         //; def parse_op(op):
         //;    is_int = opcode == 'v_pk_add_u16'
         //;    op = op.rstrip(',')
         //;
         //;    mods = lambda v: v
         //;    if op.endswith('*[1,-1]'):
         //;       mods = lambda v: v ^ 0x80000000
         //;       assert(not is_int)
         //;    elif op.endswith('*[-1,1]'):
         //;       mods = lambda v: v ^ 0x00008000
         //;       assert(not is_int)
         //;    op = op.split('*')[0]
         //;
         //;    swizzle = lambda v: v
         //;    if op.endswith('.xx'):
         //;       swizzle = lambda v: ((v & 0xffff) | (v << 16)) & 0xffffffff;
         //;    elif op.endswith('.yy'):
         //;       swizzle = lambda v: (v >> 16) | (v & 0xffff0000);
         //;    elif op.endswith('.yx'):
         //;       swizzle = lambda v: ((v >> 16) | (v << 16)) & 0xffffffff;
         //;    op = op.rstrip('xy.')
         //;
         //;    val = None
         //;    if op.startswith('0x'):
         //;       val = int(op[2:], 16)
         //;    elif op == '-1.0':
         //;       val = 0xbf800000 if is_int else 0xbC00
         //;    elif op == '1.0':
         //;       val = 0x3f800000 if is_int else 0x3c00
         //;    else:
         //;       val = int(op) & 0xffffffff
         //;
         //;    return mods(swizzle(val))

         //; # Check correctness
         //; for i in range(36):
         //;    expected = globals()['expected%u' % i]
         //;    got = globals()['got%u' % i]
         //;    got_parsed = parse_op(got)
         //;    if got_parsed != expected:
         //;       raise Exception('Check %u failed: expected 0x%.8x, got 0x%.8x ("%s")' % (i, expected, got_parsed, got))

         //; # Check that all literals are ones that cannot be encoded as inline constants
         //; allowed_literals = [0x00004242, 0x0000fffe, 0x00308030, 0x0030ffff, 0x3c00ffff,
         //;                     0x42420000, 0x42424242, 0x4242c242, 0x4242ffff, 0x7ffefffe,
         //;                     0x80300030, 0xbeefdead, 0xc2424242, 0xdeadbeef, 0xfffe0000,
         //;                     0xfffe7ffe, 0xffff0030, 0xffff3c00, 0xffff4242]
         //; if opcode == 'v_pk_add_u16':
         //;    allowed_literals.extend([0x00003c00, 0x3c000000, 0x3c003c00, 0x3c00bc00, 0xbc003c00])
         //; else:
         //;    allowed_literals.extend([0x00003f80, 0x3f800000])
         //;
         //; for i in range(36):
         //;    got = globals()['got%u' % i]
         //;    if not got.startswith('0x'):
         //;       continue;
         //;    got = int(got[2:].rstrip(',').split('*')[0].split('.')[0], 16)
         //;    if got not in allowed_literals:
         //;       raise Exception('Literal check %u failed: 0x%.8x not in allowed literals' % (i, got))

         /* 4 half-word seeds x 7 combinations + 4 full-word constants x 2
          * orderings = 36 cases, matching the range(36) loops above. */
         unsigned idx = 0;
         for (uint32_t constant : {0x3C00, 0x0030, 0xfffe, 0x4242}) {
            vop3p_constant(&idx, op, swizzle, constant);
            vop3p_constant(&idx, op, swizzle, constant | 0xffff0000);
            vop3p_constant(&idx, op, swizzle, constant | (constant << 16));
            vop3p_constant(&idx, op, swizzle, constant << 16);
            vop3p_constant(&idx, op, swizzle, (constant << 16) | 0x0000ffff);
            vop3p_constant(&idx, op, swizzle, constant | ((constant ^ 0x8000) << 16));
            vop3p_constant(&idx, op, swizzle, (constant ^ 0x8000) | (constant << 16));
         }

         for (uint32_t constant : {0x3f800000u, 0xfffffffeu, 0x00000030u, 0xdeadbeefu}) {
            uint32_t lo = constant & 0xffff;
            uint32_t hi = constant >> 16;
            vop3p_constant(&idx, op, swizzle, constant);
            vop3p_constant(&idx, op, swizzle, hi | (lo << 16));
         }

         finish_opt_test();
      }
   }
END_TEST
1782