/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */
#include "helpers.h"

using namespace aco;

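/* A quick primer on the checker directives used throughout (see the test
 * framework in helpers.h for the authoritative rules): "//>>" searches
 * forward for a matching line of optimized IR, "//!" must match the next
 * line, and "//~variant!" applies a check only to the named variant (e.g.
 * gfx9). "%name" binds an SSA temporary on first use and must refer to the
 * same temporary afterwards. */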
BEGIN_TEST(optimize.neg)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, s1: %c, s1: %d = p_startpgm
      if (!setup_cs("v1 v1 s1 s1", (amd_gfx_level)i))
         continue;

      //! v1: %res0 = v_mul_f32 %a, -%b
      //! p_unit_test 0, %res0
      Temp neg_b = fneg(inputs[1]);
      writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));

      //~gfx9! v1: %neg_a = v_mul_f32 -1.0, %a
      //~gfx9! v1: %res1 = v_mul_f32 0x123456, %neg_a
      //~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
      //! p_unit_test 1, %res1
      Temp neg_a = fneg(inputs[0]);
      writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x123456u), neg_a));

      //! v1: %res2 = v_mul_f32 %a, %b
      //! p_unit_test 2, %res2
      Temp neg_neg_a = fneg(neg_a);
      writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));

      //! v1: %res3 = v_mul_f32 |%a|, %b
      //! p_unit_test 3, %res3
      Temp abs_neg_a = fabs(neg_a);
      writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));

      //! v1: %res4 = v_mul_f32 -|%a|, %b
      //! p_unit_test 4, %res4
      Temp abs_a = fabs(inputs[0]);
      Temp neg_abs_a = fneg(abs_a);
      writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));

      //~gfx9! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
      //~gfx10! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1 fi
      //! p_unit_test 5, %res5
      writeout(5,
               bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));

      //! v1: %res6 = v_subrev_f32 %a, %b
      //! p_unit_test 6, %res6
      writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));

      //! v1: %res7 = v_sub_f32 %b, %a
      //! p_unit_test 7, %res7
      writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));

      //! v1: %res8 = v_mul_f32 %a, -%c
      //! p_unit_test 8, %res8
      Temp neg_c = fneg(bld.copy(bld.def(v1), inputs[2]));
      writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));

      // //! v1: %res9 = v_mul_f32 |%neg_a|, %b
      // //! p_unit_test 9, %res9
      Temp abs_neg_abs_a = fabs(neg_abs_a);
      writeout(9, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_abs_a, inputs[1]));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.output_modifiers)
   //>> v1: %a, v1: %b = p_startpgm
   if (!setup_cs("v1 v1", GFX9))
      return;

   program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

   /* 32-bit modifiers */
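   /* Reminder: the multiplier constants below are IEEE-754 binary32 bit
    * patterns: 0x3f000000 = 0.5f, 0x40000000 = 2.0f, 0x40800000 = 4.0f and
    * 0x3f800000 = 1.0f. Multiplies by 0.5/2/4 are expected to fold into the
    * VOP3 output modifier (omod), and med3(0, 1.0, x) into the clamp bit. */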

   //! v1: %res0 = v_add_f32 %a, %b *0.5
   //! p_unit_test 0, %res0
   Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f000000u), tmp));

   //! v1: %res1 = v_add_f32 %a, %b *2
   //! p_unit_test 1, %res1
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   //! v1: %res2 = v_add_f32 %a, %b *4
   //! p_unit_test 2, %res2
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40800000u), tmp));

   //! v1: %res3 = v_add_f32 %a, %b clamp
   //! p_unit_test 3, %res3
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(3, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
                        Operand::c32(0x3f800000u), tmp));

   //! v1: %res4 = v_add_f32 %a, %b *2 clamp
   //! p_unit_test 4, %res4
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
   writeout(4, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
                        Operand::c32(0x3f800000u), tmp));

   /* 16-bit modifiers */
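   /* Same idea in binary16: 0x3800 = 0.5h, 0x4000 = 2.0h, 0x4400 = 4.0h and
    * 0x3c00 = 1.0h. */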

   //! v2b: %res5 = v_add_f16 %a, %b *0.5
   //! p_unit_test 5, %res5
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(5, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x3800u), tmp));

   //! v2b: %res6 = v_add_f16 %a, %b *2
   //! p_unit_test 6, %res6
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(6, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));

   //! v2b: %res7 = v_add_f16 %a, %b *4
   //! p_unit_test 7, %res7
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(7, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4400u), tmp));

   //! v2b: %res8 = v_add_f16 %a, %b clamp
   //! p_unit_test 8, %res8
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(8, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
                        Operand::c16(0x3c00u), tmp));

   //! v2b: %res9 = v_add_f16 %a, %b *2 clamp
   //! p_unit_test 9, %res9
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp);
   writeout(9, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
                        Operand::c16(0x3c00u), tmp));

   /* clamping is done after omod */
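   /* Since hardware applies clamp after omod, a multiply of an already
    * clamped value cannot be merged into the clamped add; the expected
    * output keeps two instructions. */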

   //! v1: %res10_tmp = v_add_f32 %a, %b clamp
   //! v1: %res10 = v_mul_f32 2.0, %res10_tmp
   //! p_unit_test 10, %res10
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   tmp = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), Operand::c32(0x3f800000u),
                  tmp);
   writeout(10, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   /* unsupported instructions */

   //! v1: %res11_tmp = v_xor_b32 %a, %b
   //! v1: %res11 = v_mul_f32 2.0, %res11_tmp
   //! p_unit_test 11, %res11
   tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], inputs[1]);
   writeout(11, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   /* several users */
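   /* omod/clamp can only be folded into the producing instruction when the
    * multiply (or clamp) is that value's sole user. */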

   //! v1: %res12_tmp = v_add_f32 %a, %b
   //! p_unit_test %res12_tmp
   //! v1: %res12 = v_mul_f32 2.0, %res12_tmp
   //! p_unit_test 12, %res12
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   bld.pseudo(aco_opcode::p_unit_test, tmp);
   writeout(12, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   //! v1: %res13 = v_add_f32 %a, %b
   //! p_unit_test 13, %res13
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
   writeout(13, tmp);

   /* omod has no effect if denormals are enabled but clamp is fine */
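   /* Rationale, as these checks encode it: omod always flushes denormal
    * results, so folding a real v_mul into omod is only value-preserving when
    * the respective denorm mode is "flush"; clamp has no such issue. */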

   //>> BB1
   //! /* logical preds: / linear preds: / kind: uniform, */
   program->next_fp_mode.denorm32 = fp_denorm_keep;
   program->next_fp_mode.denorm16_64 = fp_denorm_flush;
   bld.reset(program->create_and_insert_block());

   //! v1: %res14_tmp = v_add_f32 %a, %b
   //! v1: %res14 = v_mul_f32 2.0, %res14_tmp
   //! p_unit_test 14, %res14
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(14, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));

   //! v1: %res15 = v_add_f32 %a, %b clamp
   //! p_unit_test 15, %res15
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(15, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
                         Operand::c32(0x3f800000u), tmp));

   //>> BB2
   //! /* logical preds: / linear preds: / kind: uniform, */
   program->next_fp_mode.denorm32 = fp_denorm_flush;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   bld.reset(program->create_and_insert_block());

   //! v2b: %res16_tmp = v_add_f16 %a, %b
   //! v2b: %res16 = v_mul_f16 2.0, %res16_tmp
   //! p_unit_test 16, %res16
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(16, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));

   //! v2b: %res17 = v_add_f16 %a, %b clamp
   //! p_unit_test 17, %res17
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(17, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
                         Operand::c16(0x3c00u), tmp));

   /* omod flushes -0.0 to +0.0 */
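   /* With signed-zero preservation requested, 2.0 * (-0.0) must stay -0.0,
    * while omod would yield +0.0, so the multiply is kept; clamp is again
    * unaffected. */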

   //>> BB3
   //! /* logical preds: / linear preds: / kind: uniform, */
   program->next_fp_mode.denorm32 = fp_denorm_keep;
   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
   program->next_fp_mode.preserve_signed_zero_inf_nan32 = true;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
   bld.reset(program->create_and_insert_block());

   //! v1: %res18_tmp = v_add_f32 %a, %b
   //! v1: %res18 = v_mul_f32 2.0, %res18_tmp
   //! p_unit_test 18, %res18
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(18, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp));
   //! v1: %res19 = v_add_f32 %a, %b clamp
   //! p_unit_test 19, %res19
   tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[0], inputs[1]);
   writeout(19, bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(),
                         Operand::c32(0x3f800000u), tmp));

   //>> BB4
   //! /* logical preds: / linear preds: / kind: uniform, */
   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = true;
   bld.reset(program->create_and_insert_block());
   //! v2b: %res20_tmp = v_add_f16 %a, %b
   //! v2b: %res20 = v_mul_f16 2.0, %res20_tmp
   //! p_unit_test 20, %res20
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(20, bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp));
   //! v2b: %res21 = v_add_f16 %a, %b clamp
   //! p_unit_test 21, %res21
   tmp = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), inputs[0], inputs[1]);
   writeout(21, bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u),
                         Operand::c16(0x3c00u), tmp));

   finish_opt_test();
END_TEST

Temp
create_subbrev_co(Operand op0, Operand op1, Operand op2)
{
   return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.def(bld.lm), op0, op1, op2);
}

BEGIN_TEST(optimize.cndmask)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, s1: %b, s2: %c = p_startpgm
      if (!setup_cs("v1 s1 s2", (amd_gfx_level)i))
         continue;

      Temp subbrev;

      //! v1: %res0 = v_cndmask_b32 0, %a, %c
      //! p_unit_test 0, %res0
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));

      //! v1: %res1 = v_cndmask_b32 0, 42, %c
      //! p_unit_test 1, %res1
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(42u), subbrev));

      //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
      //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
      //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
      //! p_unit_test 2, %res2
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));

      //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
      //! v1: %xor = v_xor_b32 %a, %subbrev1
      //! v1: %res3 = v_cndmask_b32 0, %xor, %c
      //! p_unit_test 3, %res3
      subbrev = create_subbrev_co(Operand::zero(), Operand::zero(), Operand(inputs[2]));
      Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
      writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));

      //! v1: %res4 = v_cndmask_b32 0, %a, %c
      //! p_unit_test 4, %res4
      Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(),
                                  Operand::c32(1u), Operand(inputs[2]));
      Temp sub = bld.vsub32(bld.def(v1), Operand::zero(), cndmask);
      writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.add_lshl)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> s1: %a, v1: %b = p_startpgm
      if (!setup_cs("s1 v1", (amd_gfx_level)i))
         continue;

      Temp shift;

      //~gfx8! s1: %lshl0, s1: %_:scc = s_lshl_b32 %a, 3
      //~gfx8! s1: %res0, s1: %_:scc = s_add_u32 %lshl0, 4
      //~gfx(9|10)! s1: %res0, s1: %_:scc = s_lshl3_add_u32 %a, 4
      //! p_unit_test 0, %res0
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
                       Operand::c32(3u));
      writeout(0, bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift,
                           Operand::c32(4u)));

      //~gfx8! s1: %lshl1, s1: %_:scc = s_lshl_b32 %a, 3
      //~gfx8! s1: %add1, s1: %_:scc = s_add_u32 %lshl1, 4
      //~gfx8! v1: %add_co1, s2: %_ = v_add_co_u32 %lshl1, %b
      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %add1, %add_co1
      //~gfx(9|10)! s1: %lshl1, s1: %_:scc = s_lshl3_add_u32 %a, 4
      //~gfx(9|10)! v1: %lshl_add = v_lshl_add_u32 %a, 3, %b
      //~gfx(9|10)! v1: %res1 = v_add_u32 %lshl1, %lshl_add
      //! p_unit_test 1, %res1
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(inputs[0]),
                       Operand::c32(3u));
      Temp sadd =
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), shift, Operand::c32(4u));
      Temp vadd = bld.vadd32(bld.def(v1), shift, Operand(inputs[1]));
      writeout(1, bld.vadd32(bld.def(v1), sadd, vadd));

      //~gfx8! s1: %lshl2, s1: %_:scc = s_lshl_b32 %a, 3
      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 %a, 3, %b
      //! p_unit_test 2, %res2
      Temp lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
                           Operand(inputs[0]), Operand::c32(3u));
      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! s1: %lshl3, s1: %_:scc = s_lshl_b32 (is24bit)%a, 7
      //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %lshl3, %b
      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 7, %b
      //! p_unit_test 3, %res3
      Operand a_24bit = Operand(inputs[0]);
      a_24bit.set24bit(true);
      lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(7u));
      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //! s1: %lshl4, s1: %_:scc = s_lshl_b32 (is24bit)%a, 3
      //~gfx(8|9)! v1: %res4, s2: %carry = v_add_co_u32 %lshl4, %b
      //~gfx10! v1: %res4, s2: %carry = v_add_co_u32_e64 %lshl4, %b
      //! p_unit_test 4, %carry
      lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(3u));
      Temp carry = bld.vadd32(bld.def(v1), lshl, Operand(inputs[1]), true).def(1).getTemp();
      writeout(4, carry);

      //~gfx8! s1: %lshl5, s1: %_:scc = s_lshl_b32 (is24bit)%a, (is24bit)%a
      //~gfx8! v1: %res5, s2: %_ = v_add_co_u32 %lshl5, %b
      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%a, (is24bit)%a, %b
      //! p_unit_test 5, %res5
      lshl = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, a_24bit);
      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res6 = v_mad_u32_u24 (is24bit)%a, 8, %b
      //~gfx(9|10)! v1: %res6 = v_lshl_add_u32 (is24bit)%a, 3, %b
      //! p_unit_test 6, %res6
      lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_24bit, Operand::c32(3u));
      writeout(6, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res7 = v_mad_u32_u24 (is16bit)%a, 16, %b
      //~gfx(9|10)! v1: %res7 = v_lshl_add_u32 (is16bit)%a, 4, %b
      //! p_unit_test 7, %res7
      Operand a_16bit = Operand(inputs[0]);
      a_16bit.set16bit(true);
      lshl =
         bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), a_16bit, Operand::c32(4u));
      writeout(7, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.bcnt)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> v1: %a, s1: %b = p_startpgm
      if (!setup_cs("v1 s1", (amd_gfx_level)i))
         continue;

      Temp bcnt;
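
      /* v_bcnt_u32_b32 computes popcount(src0) + src1, so a following add
       * can fold into the second source while that source is still 0. */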

      //! v1: %res0 = v_bcnt_u32_b32 %a, %a
      //! p_unit_test 0, %res0
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
      writeout(0, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));

      //! v1: %res1 = v_bcnt_u32_b32 %a, %b
      //! p_unit_test 1, %res1
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
      writeout(1, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[1])));

      //! v1: %res2 = v_bcnt_u32_b32 %a, 42
      //! p_unit_test 2, %res2
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
      writeout(2, bld.vadd32(bld.def(v1), bcnt, Operand::c32(42u)));

      //! v1: %bcnt3 = v_bcnt_u32_b32 %b, 0
      //~gfx8! v1: %res3, s2: %_ = v_add_co_u32 %bcnt3, %a
      //~gfx(9|10)! v1: %res3 = v_add_u32 %bcnt3, %a
      //! p_unit_test 3, %res3
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[1]), Operand::zero());
      writeout(3, bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0])));

      //! v1: %bcnt4 = v_bcnt_u32_b32 %a, 0
      //~gfx(8|9)! v1: %add4, s2: %carry = v_add_co_u32 %bcnt4, %a
      //~gfx10! v1: %add4, s2: %carry = v_add_co_u32_e64 %bcnt4, %a
      //! p_unit_test 4, %carry
      bcnt = bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), Operand(inputs[0]), Operand::zero());
      Temp carry = bld.vadd32(bld.def(v1), bcnt, Operand(inputs[0]), true).def(1).getTemp();
      writeout(4, carry);

      finish_opt_test();
   }
END_TEST

struct clamp_config {
   const char* name;
   aco_opcode min, max, med3;
   Operand lb, ub;
};
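
/* Each config drives the min(ub, max(lb, x)) -> med3(lb, ub, x) rewrite
 * exercised below; the rewrite is only valid when lb <= ub in the respective
 * type's ordering. */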

static const clamp_config clamp_configs[] = {
   /* 0.0, 4.0 */
   {"_0,4f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::zero(), Operand::c32(0x40800000u)},
   {"_0,4f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0u), Operand::c16(0x4400u)},
   /* -1.0, 0.0 */
   {"_-1,0f32", aco_opcode::v_min_f32, aco_opcode::v_max_f32, aco_opcode::v_med3_f32,
    Operand::c32(0xbf800000u), Operand::zero()},
   {"_-1,0f16", aco_opcode::v_min_f16, aco_opcode::v_max_f16, aco_opcode::v_med3_f16,
    Operand::c16(0xbc00u), Operand::c16(0u)},
   /* 0, 3 */
   {"_0,3u32", aco_opcode::v_min_u32, aco_opcode::v_max_u32, aco_opcode::v_med3_u32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3u16", aco_opcode::v_min_u16, aco_opcode::v_max_u16, aco_opcode::v_med3_u16,
    Operand::c16(0u), Operand::c16(3u)},
   {"_0,3i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::zero(), Operand::c32(3u)},
   {"_0,3i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0u), Operand::c16(3u)},
   /* -5, 0 */
   {"_-5,0i32", aco_opcode::v_min_i32, aco_opcode::v_max_i32, aco_opcode::v_med3_i32,
    Operand::c32(0xfffffffbu), Operand::zero()},
   {"_-5,0i16", aco_opcode::v_min_i16, aco_opcode::v_max_i16, aco_opcode::v_med3_i16,
    Operand::c16(0xfffbu), Operand::c16(0u)},
};

BEGIN_TEST(optimize.clamp)
   for (clamp_config cfg : clamp_configs) {
      if (!setup_cs("v1 v1 v1", GFX9, CHIP_UNKNOWN, cfg.name))
         continue;

      //! cfg: @match_func(min max med3 lb ub)
      fprintf(output, "cfg: %s ", instr_info.name[(int)cfg.min]);
      fprintf(output, "%s ", instr_info.name[(int)cfg.max]);
      fprintf(output, "%s ", instr_info.name[(int)cfg.med3]);
      aco_print_operand(&cfg.lb, output);
      fprintf(output, " ");
      aco_print_operand(&cfg.ub, output);
      fprintf(output, "\n");

      //>> v1: %a, v1: %b, v1: %c = p_startpgm

      //! v1: %res0 = @med3 @ub, @lb, %a
      //! p_unit_test 0, %res0
      writeout(0, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));

      //! v1: %res1 = @med3 @lb, @ub, %a
      //! p_unit_test 1, %res1
      writeout(1, bld.vop2(cfg.max, bld.def(v1), cfg.lb,
                           bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0])));

      /* min constant must be greater than max constant */
      //! v1: %res2_tmp = @min @lb, %a
      //! v1: %res2 = @max @ub, %res2_tmp
      //! p_unit_test 2, %res2
      writeout(2, bld.vop2(cfg.max, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.min, bld.def(v1), cfg.lb, inputs[0])));

      //! v1: %res3_tmp = @max @ub, %a
      //! v1: %res3 = @min @lb, %res3_tmp
      //! p_unit_test 3, %res3
      writeout(3, bld.vop2(cfg.min, bld.def(v1), cfg.lb,
                           bld.vop2(cfg.max, bld.def(v1), cfg.ub, inputs[0])));

      /* needs two constants */

      //! v1: %res4_tmp = @max @lb, %a
      //! v1: %res4 = @min %b, %res4_tmp
      //! p_unit_test 4, %res4
      writeout(4, bld.vop2(cfg.min, bld.def(v1), inputs[1],
                           bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0])));

      //! v1: %res5_tmp = @max %b, %a
      //! v1: %res5 = @min @ub, %res5_tmp
      //! p_unit_test 5, %res5
      writeout(5, bld.vop2(cfg.min, bld.def(v1), cfg.ub,
                           bld.vop2(cfg.max, bld.def(v1), inputs[1], inputs[0])));

      //! v1: %res6_tmp = @max %c, %a
      //! v1: %res6 = @min %b, %res6_tmp
      //! p_unit_test 6, %res6
      writeout(6, bld.vop2(cfg.min, bld.def(v1), inputs[1],
                           bld.vop2(cfg.max, bld.def(v1), inputs[2], inputs[0])));

      /* correct NaN behaviour with precise */
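      /* For the float variants, whether min+max may still be fused into med3
       * under "precise" depends on the NaN semantics matching exactly; these
       * checks pin down which combinations the optimizer accepts. */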
      if (cfg.min == aco_opcode::v_min_f16 || cfg.min == aco_opcode::v_min_f32) {
         //~f(16|32)! v1: %res7 = @med3 @ub, @lb, %a
         //~f(16|32)! p_unit_test 7, %res7
         Builder::Result max = bld.vop2(cfg.max, bld.def(v1), cfg.lb, inputs[0]);
         max.def(0).setPrecise(true);
         Builder::Result min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, max);
         min.def(0).setPrecise(true);
         writeout(7, min);

         //~f(16|32)! v1: (precise)%res8_tmp = @min @ub, %a
         //~f(16|32)! v1: %res8 = @max @lb, %res8_tmp
         //~f(16|32)! p_unit_test 8, %res8
         min = bld.vop2(cfg.min, bld.def(v1), cfg.ub, inputs[0]);
         min.def(0).setPrecise(true);
         writeout(8, bld.vop2(cfg.max, bld.def(v1), cfg.lb, min));
      }

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.const_comparison_ordering)
   //>> v1: %a, v1: %b, v2: %c, v1: %d = p_startpgm
   if (!setup_cs("v1 v1 v2 v1", GFX9))
      return;

   /* optimize to unordered comparison */
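   /* v_cmp_neq(a, a) is "isnan(a)", and isnan(a) || (4.0 < a) is exactly the
    * IEEE negation of (4.0 >= a), i.e. the unordered v_cmp_nge(4.0, a). */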
   //! s2: %res0 = v_cmp_nge_f32 4.0, %a
   //! p_unit_test 0, %res0
   writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
                                 Operand::c32(0x40800000u), inputs[0])));

   //! s2: %res1 = v_cmp_nge_f32 4.0, %a
   //! p_unit_test 1, %res1
   writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
                                 Operand::c32(0x40800000u), inputs[0])));

   //! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
   //! p_unit_test 2, %res2
   writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
                                 bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));

   /* optimize to ordered comparison */
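   /* Dually, v_cmp_eq(a, a) is "!isnan(a)", and !isnan(a) && !(4.0 >= a)
    * reduces to the ordered v_cmp_lt(4.0, a). */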
   //! s2: %res3 = v_cmp_lt_f32 4.0, %a
   //! p_unit_test 3, %res3
   writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
                                 Operand::c32(0x40800000u), inputs[0])));

   //! s2: %res4 = v_cmp_lt_f32 4.0, %a
   //! p_unit_test 4, %res4
   writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm),
                                 Operand::c32(0x40800000u), inputs[0])));

   //! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
   //! p_unit_test 5, %res5
   writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
                        bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
                        bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm),
                                 bld.copy(bld.def(v1), Operand::c32(0x40a00000u)), inputs[0])));

   /* similar but unoptimizable expressions */
   //! s2: %tmp6_0 = v_cmp_lt_f32 4.0, %a
   //! s2: %tmp6_1 = v_cmp_neq_f32 %a, %a
   //! s2: %res6, s1: %_:scc = s_and_b64 %tmp6_1, %tmp6_0
   //! p_unit_test 6, %res6
   Temp src1 =
      bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
   Temp src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(6, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   //! s2: %tmp7_0 = v_cmp_nge_f32 4.0, %a
   //! s2: %tmp7_1 = v_cmp_eq_f32 %a, %a
   //! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
   //! p_unit_test 7, %res7
   src1 =
      bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   //! s2: %tmp8_0 = v_cmp_lt_f32 4.0, %d
   //! s2: %tmp8_1 = v_cmp_neq_f32 %a, %a
   //! s2: %res8, s1: %_:scc = s_or_b64 %tmp8_1, %tmp8_0
   //! p_unit_test 8, %res8
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[3]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(8, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   //! s2: %tmp9_0 = v_cmp_lt_f32 4.0, %a
   //! s2: %tmp9_1 = v_cmp_neq_f32 %a, %d
   //! s2: %res9, s1: %_:scc = s_or_b64 %tmp9_1, %tmp9_0
   //! p_unit_test 9, %res9
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(0x40800000u), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[3]);
   writeout(9, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   /* bit sizes */
   //! s2: %res10 = v_cmp_nge_f16 4.0, %b
   //! p_unit_test 10, %res10
   Temp input1_16 =
      bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), inputs[1], Operand::zero());
   writeout(10, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                         bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), input1_16, input1_16),
                         bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(0x4400u),
                                  input1_16)));

   //! s2: %res11 = v_cmp_nge_f64 4.0, %c
   //! p_unit_test 11, %res11
   writeout(11, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
                         bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]),
                         bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm),
                                  Operand::c64(0x4010000000000000u), inputs[2])));

   /* NaN */
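   /* 0x7e00 is a binary16 NaN, 0x7fc00000 a binary32 quiet NaN, and the
    * all-ones 64-bit pattern a binary64 NaN. v_cmp_lt against a NaN constant
    * is always false while v_cmp_nge would be always true, so the rewrite
    * must not trigger here. */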
   uint16_t nan16 = 0x7e00;
   uint32_t nan32 = 0x7fc00000;
   uint64_t nan64 = 0xffffffffffffffffllu;

   //! s2: %tmp12_0 = v_cmp_lt_f16 0x7e00, %a
   //! s2: %tmp12_1 = v_cmp_neq_f16 %a, %a
   //! s2: %res12, s1: %_:scc = s_or_b64 %tmp12_1, %tmp12_0
   //! p_unit_test 12, %res12
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand::c16(nan16), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(12, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   //! s2: %tmp13_0 = v_cmp_lt_f32 0x7fc00000, %a
   //! s2: %tmp13_1 = v_cmp_neq_f32 %a, %a
   //! s2: %res13, s1: %_:scc = s_or_b64 %tmp13_1, %tmp13_0
   //! p_unit_test 13, %res13
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand::c32(nan32), inputs[0]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]);
   writeout(13, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   //! s2: %tmp14_0 = v_cmp_lt_f64 -1, %c
   //! s2: %tmp14_1 = v_cmp_neq_f64 %c, %c
   //! s2: %res14, s1: %_:scc = s_or_b64 %tmp14_1, %tmp14_0
   //! p_unit_test 14, %res14
   src1 = bld.vopc(aco_opcode::v_cmp_lt_f64, bld.def(bld.lm), Operand::c64(nan64), inputs[2]);
   src0 = bld.vopc(aco_opcode::v_cmp_neq_f64, bld.def(bld.lm), inputs[2], inputs[2]);
   writeout(14, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc), src0, src1));

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.add3)
   //>> v1: %a, v1: %b, v1: %c = p_startpgm
   if (!setup_cs("v1 v1 v1", GFX9))
      return;

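   /* v_add3_u32 folds two chained 32-bit adds; presumably the clamp
    * (unsigned-saturate) cases below stay split because saturating one of
    * the two adds cannot be expressed in the combined instruction. */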
   //! v1: %res0 = v_add3_u32 %a, %b, %c
   //! p_unit_test 0, %res0
   Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   //! v1: %tmp1 = v_add_u32 %b, %c clamp
   //! v1: %res1 = v_add_u32 %a, %tmp1
   //! p_unit_test 1, %res1
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp->valu().clamp = true;
   writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));

   //! v1: %tmp2 = v_add_u32 %b, %c
   //! v1: %res2 = v_add_u32 %a, %tmp2 clamp
   //! p_unit_test 2, %res2
   tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
   tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
   tmp->valu().clamp = true;
   writeout(2, tmp);

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.minmax)
   for (unsigned i = GFX10_3; i <= GFX11; i++) {
      //>> v1: %a, v1: %b, v1: %c = p_startpgm
      if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
         continue;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];

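      /* Several of these rely on -max(a, b) == min(-a, -b) (and vice versa),
       * which lets fneg be pushed into the operands so that
       * min3/max3/minmax/maxmin can still form. */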
      //! v1: %res0 = v_min3_f32 %a, %b, %c
      //! p_unit_test 0, %res0
      writeout(0, fmin(c, fmin(a, b)));

      //! v1: %res1 = v_max3_f32 %a, %b, %c
      //! p_unit_test 1, %res1
      writeout(1, fmax(c, fmax(a, b)));

      //! v1: %res2 = v_min3_f32 -%a, -%b, %c
      //! p_unit_test 2, %res2
      writeout(2, fmin(c, fneg(fmax(a, b))));

      //! v1: %res3 = v_max3_f32 -%a, -%b, %c
      //! p_unit_test 3, %res3
      writeout(3, fmax(c, fneg(fmin(a, b))));

      //! v1: %res4 = v_max3_f32 -%a, %b, %c
      //! p_unit_test 4, %res4
      writeout(4, fmax(c, fneg(fmin(a, fneg(b)))));

      //~gfx10_3! v1: %res5_tmp = v_max_f32 %a, %b
      //~gfx10_3! v1: %res5 = v_min_f32 %c, %res5_tmp
      //~gfx11! v1: %res5 = v_maxmin_f32 %a, %b, %c
      //! p_unit_test 5, %res5
      writeout(5, fmin(c, fmax(a, b)));

      //~gfx10_3! v1: %res6_tmp = v_min_f32 %a, %b
      //~gfx10_3! v1: %res6 = v_max_f32 %c, %res6_tmp
      //~gfx11! v1: %res6 = v_minmax_f32 %a, %b, %c
      //! p_unit_test 6, %res6
      writeout(6, fmax(c, fmin(a, b)));

      //~gfx10_3! v1: %res7_tmp = v_min_f32 %a, %b
      //~gfx10_3! v1: %res7 = v_min_f32 %c, -%res7_tmp
      //~gfx11! v1: %res7 = v_maxmin_f32 -%a, -%b, %c
      //! p_unit_test 7, %res7
      writeout(7, fmin(c, fneg(fmin(a, b))));

      //~gfx10_3! v1: %res8_tmp = v_max_f32 %a, %b
      //~gfx10_3! v1: %res8 = v_max_f32 %c, -%res8_tmp
      //~gfx11! v1: %res8 = v_minmax_f32 -%a, -%b, %c
      //! p_unit_test 8, %res8
      writeout(8, fmax(c, fneg(fmax(a, b))));

      //~gfx10_3! v1: %res9_tmp = v_max_f32 %a, -%b
      //~gfx10_3! v1: %res9 = v_max_f32 %c, -%res9_tmp
      //~gfx11! v1: %res9 = v_minmax_f32 -%a, %b, %c
      //! p_unit_test 9, %res9
      writeout(9, fmax(c, fneg(fmax(a, fneg(b)))));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_32_24)
   for (unsigned i = GFX8; i <= GFX9; i++) {
      //>> v1: %a, v1: %b, v1: %c = p_startpgm
      if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
         continue;

      //! v1: %res0 = v_mad_u32_u24 %b, %c, %a
      //! p_unit_test 0, %res0
      Temp mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(0, bld.vadd32(bld.def(v1), inputs[0], mul));

      //! v1: %res1_tmp = v_mul_u32_u24 %b, %c
      //! v1: %_, s2: %res1 = v_add_co_u32 %a, %res1_tmp
      //! p_unit_test 1, %res1
      mul = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), inputs[1], inputs[2]);
      writeout(1, bld.vadd32(bld.def(v1), inputs[0], mul, true).def(1).getTemp());

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.add_lshlrev)
   for (unsigned i = GFX8; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, s1: %c = p_startpgm
      if (!setup_cs("v1 v1 s1", (amd_gfx_level)i))
         continue;

      Temp lshl;

      //~gfx8! v1: %lshl0 = v_lshlrev_b32 3, %a
      //~gfx8! v1: %res0, s2: %_ = v_add_co_u32 %lshl0, %b
      //~gfx(9|10)! v1: %res0 = v_lshl_add_u32 %a, 3, %b
      //! p_unit_test 0, %res0
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), Operand(inputs[0]));
      writeout(0, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %lshl1 = v_lshlrev_b32 7, (is24bit)%a
      //~gfx8! v1: %res1, s2: %_ = v_add_co_u32 %lshl1, %b
      //~gfx(9|10)! v1: %res1 = v_lshl_add_u32 (is24bit)%a, 7, %b
      //! p_unit_test 1, %res1
      Operand a_24bit = Operand(inputs[0]);
      a_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), a_24bit);
      writeout(1, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %lshl2 = v_lshlrev_b32 (is24bit)%a, (is24bit)%b
      //~gfx8! v1: %res2, s2: %_ = v_add_co_u32 %lshl2, %b
      //~gfx(9|10)! v1: %res2 = v_lshl_add_u32 (is24bit)%b, (is24bit)%a, %b
      //! p_unit_test 2, %res2
      Operand b_24bit = Operand(inputs[1]);
      b_24bit.set24bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), a_24bit, b_24bit);
      writeout(2, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res3 = v_mad_u32_u24 (is24bit)%a, 8, %b
      //~gfx(9|10)! v1: %res3 = v_lshl_add_u32 (is24bit)%a, 3, %b
      //! p_unit_test 3, %res3
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), a_24bit);
      writeout(3, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res4 = v_mad_u32_u24 (is16bit)%a, 16, %b
      //~gfx(9|10)! v1: %res4 = v_lshl_add_u32 (is16bit)%a, 4, %b
      //! p_unit_test 4, %res4
      Operand a_16bit = Operand(inputs[0]);
      a_16bit.set16bit(true);
      lshl = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), a_16bit);
      writeout(4, bld.vadd32(bld.def(v1), lshl, Operand(inputs[1])));

      //~gfx8! v1: %res5 = v_mad_u32_u24 (is24bit)%c, 16, %c
      //~gfx(9|10)! v1: %res5 = v_lshl_add_u32 (is24bit)%c, 4, %c
      //! p_unit_test 5, %res5
      Operand c_24bit = Operand(inputs[2]);
      c_24bit.set24bit(true);
      lshl = bld.vop2_e64(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(4u), c_24bit);
      writeout(5, bld.vadd32(bld.def(v1), lshl, Operand(inputs[2])));

      finish_opt_test();
   }
END_TEST

enum denorm_op {
   denorm_mul1 = 0,
   denorm_fneg = 1,
   denorm_fabs = 2,
   denorm_fnegabs = 3,
};

static const char* denorm_op_names[] = {
   "mul1",
   "fneg",
   "fabs",
   "fnegabs",
};

struct denorm_config {
   bool flush;
   unsigned op;
   aco_opcode src;
   aco_opcode dest;
};

static const char*
srcdest_op_name(aco_opcode op)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32: return "cndmask";
   case aco_opcode::v_min_f32: return "min";
   case aco_opcode::v_rcp_f32: return "rcp";
   default: return "none";
   }
}

static Temp
emit_denorm_srcdest(aco_opcode op, Temp val)
{
   switch (op) {
   case aco_opcode::v_cndmask_b32:
      return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]);
   case aco_opcode::v_min_f32:
      return bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), val);
   case aco_opcode::v_rcp_f32: return bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), val);
   default: return val;
   }
}

BEGIN_TEST(optimize.denorm_propagation)
   for (unsigned i = GFX8; i <= GFX9; i++) {
      std::vector<denorm_config> configs;
      for (bool flush : {false, true}) {
         for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
            configs.push_back({flush, op, aco_opcode::num_opcodes, aco_opcode::num_opcodes});

         for (aco_opcode dest : {aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
               configs.push_back({flush, op, aco_opcode::num_opcodes, dest});
         }

         for (aco_opcode src :
              {aco_opcode::v_cndmask_b32, aco_opcode::v_min_f32, aco_opcode::v_rcp_f32}) {
            for (denorm_op op : {denorm_mul1, denorm_fneg, denorm_fabs, denorm_fnegabs})
               configs.push_back({flush, op, src, aco_opcode::num_opcodes});
         }
      }

      for (denorm_config cfg : configs) {
         char subvariant[128];
         sprintf(subvariant, "_%s_%s_%s_%s", cfg.flush ? "flush" : "keep", srcdest_op_name(cfg.src),
                 denorm_op_names[(int)cfg.op], srcdest_op_name(cfg.dest));
         if (!setup_cs("v1 s2", (amd_gfx_level)i, CHIP_UNKNOWN, subvariant))
            continue;

         bool can_propagate = cfg.src == aco_opcode::v_rcp_f32 ||
                              (i >= GFX9 && cfg.src == aco_opcode::v_min_f32) ||
                              cfg.dest == aco_opcode::v_rcp_f32 ||
                              (i >= GFX9 && cfg.dest == aco_opcode::v_min_f32) || !cfg.flush;

         fprintf(output, "src, dest, op: %s %s %s\n", srcdest_op_name(cfg.src),
                 srcdest_op_name(cfg.dest), denorm_op_names[(int)cfg.op]);
         fprintf(output, "can_propagate: %u\n", can_propagate);
         //! src, dest, op: $src $dest $op
         //! can_propagate: #can_propagate
         //>> v1: %a, s2: %b = p_startpgm

         //; patterns = {'cndmask': 'v1: %{} = v_cndmask_b32 0, {}, %b',
         //;             'min': 'v1: %{} = v_min_f32 0, {}',
         //;             'rcp': 'v1: %{} = v_rcp_f32 {}'}
         //; ops = {'mul1': 'v1: %{} = v_mul_f32 1.0, %{}',
         //;        'fneg': 'v1: %{} = v_mul_f32 -1.0, %{}',
         //;        'fabs': 'v1: %{} = v_mul_f32 1.0, |%{}|',
         //;        'fnegabs': 'v1: %{} = v_mul_f32 -1.0, |%{}|'}
         //; inline_ops = {'mul1': '%{}', 'fneg': '-%{}', 'fabs': '|%{}|', 'fnegabs': '-|%{}|'}

         //; name = 'a'
         //; if src != 'none':
         //;    insert_pattern(patterns[src].format('src_res', '%'+name))
         //;    name = 'src_res'

         //; if can_propagate:
         //;    name = inline_ops[op].format(name)
         //; else:
         //;    insert_pattern(ops[op].format('op_res', name))
         //;    name = '%op_res'

         //; if dest != 'none':
         //;    insert_pattern(patterns[dest].format('dest_res', name))
         //;    name = '%dest_res'

         //; insert_pattern('v1: %res = v_cndmask_b32 0, {}, %b'.format(name))
         //! p_unit_test 0, %res

         program->blocks[0].fp_mode.denorm32 = cfg.flush ? fp_denorm_flush : fp_denorm_keep;

         Temp val = emit_denorm_srcdest(cfg.src, inputs[0]);
         switch (cfg.op) {
         case denorm_mul1:
            val = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x3f800000u), val);
            break;
         case denorm_fneg: val = fneg(val); break;
         case denorm_fabs: val = fabs(val); break;
         case denorm_fnegabs: val = fneg(fabs(val)); break;
         }
         val = emit_denorm_srcdest(cfg.dest, val);
         writeout(
            0, bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), val, inputs[1]));

         finish_opt_test();
      }
   }
END_TEST

BEGIN_TEST(optimizer.dpp)
   //>> v1: %a, v1: %b, s2: %c, s1: %d = p_startpgm
   if (!setup_cs("v1 v1 s2 s1", GFX10_3))
      return;

   Operand a(inputs[0]);
   Operand b(inputs[1]);
   Operand c(inputs[2]);
   Operand d(inputs[3]);

   /* basic optimization */
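   /* A v_mov_b32 carrying a DPP swizzle can be removed by moving the swizzle
    * onto its (single) user, as long as the user can encode DPP. */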
   //! v1: %res0 = v_add_f32 %a, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 0, %res0
   Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp0, b);
   writeout(0, res0);

   /* operand swapping */
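   /* DPP applies to src0 only, so the operands are swapped and v_sub becomes
    * v_subrev to keep the swizzled value in the first slot. */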
   //! v1: %res1 = v_subrev_f32 %a, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 1, %res1
   Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), b, tmp1);
   writeout(1, res1);

   //! v1: %tmp2 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
   //! v1: %res2 = v_sub_f32 %b, %tmp2 row_half_mirror bound_ctrl:1 fi
   //! p_unit_test 2, %res2
   Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), b, tmp2, dpp_row_half_mirror);
   writeout(2, res2);

   /* modifiers */
   //! v1: %res3 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 3, %res3
   auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp3->dpp16().neg[0] = true;
   Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tmp3, b);
   writeout(3, res3);

   //! v1: %res4 = v_add_f32 -%a, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 4, %res4
   Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp4, b);
   res4->valu().neg[0] = true;
   writeout(4, res4);

   //! v1: %tmp5 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
   //! v1: %res5 = v_add_f32 %tmp5, %b clamp
   //! p_unit_test 5, %res5
   Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp5, b);
   res5->valu().clamp = true;
   writeout(5, res5);

   //! v1: %res6 = v_add_f32 |%a|, %b row_mirror bound_ctrl:1 fi
   //! p_unit_test 6, %res6
   auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp6->dpp16().neg[0] = true;
   auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp6, b);
   res6->valu().abs[0] = true;
   writeout(6, res6);

   //! v1: %res7 = v_subrev_f32 %a, |%b| row_mirror bound_ctrl:1 fi
   //! p_unit_test 7, %res7
   Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), b, tmp7);
   res7->valu().abs[0] = true;
   writeout(7, res7);

   //! v1: %tmp11 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
   //! v1: %res11 = v_add_u32 %tmp11, %b
   //! p_unit_test 11, %res11
   auto tmp11 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp11->dpp16().neg[0] = true;
   Temp res11 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), tmp11, b);
   writeout(11, res11);

   //! v1: %tmp12 = v_mov_b32 -%a row_mirror bound_ctrl:1 fi
   //! v1: %res12 = v_add_f16 %tmp12, %b
   //! p_unit_test 12, %res12
   auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   tmp12->dpp16().neg[0] = true;
   Temp res12 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1), tmp12, b);
   writeout(12, res12);

   /* vcc */
   //! v1: %res8 = v_cndmask_b32 %a, %b, %c:vcc row_mirror bound_ctrl:1 fi
   //! p_unit_test 8, %res8
   Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res8 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp8, b, c);
   writeout(8, res8);

   /* sgprs */
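   /* An SGPR operand keeps the DPP mov alive here: on GFX10.3 VOP3 cannot use
    * DPP, and a VOP2 second source must be a VGPR, so neither form can absorb
    * the swizzle. */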
   //! v1: %tmp9 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
   //! v1: %res9 = v_add_f32 %tmp9, %d
   //! p_unit_test 9, %res9
   Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res9 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1), tmp9, d);
   writeout(9, res9);

   //! v1: %tmp10 = v_mov_b32 %a row_mirror bound_ctrl:1 fi
   //! v1: %res10 = v_add_f32 %d, %tmp10
   //! p_unit_test 10, %res10
   Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), d, tmp10);
   writeout(10, res10);

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.dpp_prop)
   //>> v1: %a, s1: %b = p_startpgm
   if (!setup_cs("v1 s1", GFX10))
      return;

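   /* A DPP swizzle of a uniform value (inline constant, or an SGPR copied to
    * a VGPR) is a no-op, so the value can be propagated and the DPP dropped
    * when the swizzled operand is the uniform one; literals and SGPRs cannot
    * be encoded as DPP operands directly, so the copies otherwise remain. */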
   //! v1: %one = p_parallelcopy 1
   //! v1: %res0 = v_mul_f32 1, %a
   //! p_unit_test 0, %res0
   Temp one = bld.copy(bld.def(v1), Operand::c32(1));
   writeout(0, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), one, inputs[0], dpp_row_sl(1)));

   //! v1: %res1 = v_mul_f32 %a, %one row_shl:1 bound_ctrl:1 fi
   //! p_unit_test 1, %res1
   writeout(1, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], one, dpp_row_sl(1)));

   //! v1: %res2 = v_mul_f32 0x12345678, %a
   //! p_unit_test 2, %res2
   Temp literal1 = bld.copy(bld.def(v1), Operand::c32(0x12345678u));
   writeout(2,
            bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), literal1, inputs[0], dpp_row_sl(1)));

   //! v1: %literal2 = p_parallelcopy 0x12345679
   //! v1: %res3 = v_mul_f32 %a, %literal2 row_shl:1 bound_ctrl:1 fi
   //! p_unit_test 3, %res3
   Temp literal2 = bld.copy(bld.def(v1), Operand::c32(0x12345679u));
   writeout(3,
            bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], literal2, dpp_row_sl(1)));

   //! v1: %b_v = p_parallelcopy %b
   //! v1: %res4 = v_mul_f32 %b, %a
   //! p_unit_test 4, %res4
   Temp b_v = bld.copy(bld.def(v1), inputs[1]);
   writeout(4, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), b_v, inputs[0], dpp_row_sl(1)));

   //! v1: %res5 = v_mul_f32 %a, %b_v row_shl:1 bound_ctrl:1 fi
   //! p_unit_test 5, %res5
   writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], b_v, dpp_row_sl(1)));

   //! v1: %res6 = v_rcp_f32 %b
   //! p_unit_test 6, %res6
   writeout(6, bld.vop1_dpp(aco_opcode::v_rcp_f32, bld.def(v1), b_v, dpp_row_sl(1)));

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.casts)
   //>> v1: %a, v2b: %a16 = p_startpgm
   if (!setup_cs("v1 v2b", GFX10_3))
      return;

   Temp a = inputs[0];
   Temp a16 = inputs[1];

   program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

   //! v1: %res0_tmp = v_mul_f32 -1.0, %a
   //! v2b: %res0 = v_mul_f16 %res0_tmp, %a16
   //! p_unit_test 0, %res0
   writeout(0, fmul(u2u16(fneg(a)), a16));

   //! v2b: %res1_tmp = v_mul_f16 -1.0, %a16
   //! v1: %res1 = v_mul_f32 %res1_tmp, %a
   //! p_unit_test 1, %res1
   writeout(1, fmul(bld.as_uniform(fneg(a16)), a));

   //! v1: %res2_tmp = v_mul_f32 -1.0, %a16
   //! v2b: %res2 = v_mul_f16 %res2_tmp, %a16
   //! p_unit_test 2, %res2
   writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1),
                                       Operand::c32(0xbf800000u), bld.as_uniform(a16))),
                    a16));

   //! v1: %res3_tmp = v_mul_f32 %a, %a
   //! v2b: %res3 = v_add_f16 %res3_tmp, 0 clamp
   //! p_unit_test 3, %res3
   writeout(3, fsat(u2u16(fmul(a, a))));

   //! v2b: %res4_tmp = v_mul_f16 %a16, %a16
   //! v1: %res4 = v_add_f32 %res4_tmp, 0 clamp
   //! p_unit_test 4, %res4
   writeout(4, fsat(bld.as_uniform(fmul(a16, a16))));

   //! v1: %res5_tmp = v_mul_f32 %a, %a
   //! v2b: %res5 = v_mul_f16 2.0, %res5_tmp
   //! p_unit_test 5, %res5
   writeout(5, fmul(u2u16(fmul(a, a)), bld.copy(bld.def(v2b), Operand::c16(0x4000))));

   //! v2b: %res6_tmp = v_mul_f16 %a16, %a16
   //! v1: %res6 = v_mul_f32 2.0, %res6_tmp
   //! p_unit_test 6, %res6
   writeout(6,
            fmul(bld.as_uniform(fmul(a16, a16)), bld.copy(bld.def(v1), Operand::c32(0x40000000))));

   //! v1: %res7_tmp = v_mul_f32 %a, %a
   //! v2b: %res7 = v_add_f16 %res7_tmp, %a16
   //! p_unit_test 7, %res7
   writeout(7, fadd(u2u16(fmul(a, a)), a16));

   //! v2b: %res8_tmp = v_mul_f16 %a16, %a16
   //! v1: %res8 = v_add_f32 %res8_tmp, %a
   //! p_unit_test 8, %res8
   writeout(8, fadd(bld.as_uniform(fmul(a16, a16)), a));

   //! v1: %res9_tmp = v_mul_f32 %a, %a
   //! v2b: %res9 = v_mul_f16 -1.0, %res9_tmp
   //! p_unit_test 9, %res9
   writeout(9, fneg(u2u16(fmul(a, a))));

   //! v2b: %res10_tmp = v_mul_f16 %a16, %a16
   //! v1: %res10 = v_mul_f32 -1.0, %res10_tmp
   //! p_unit_test 10, %res10
   writeout(10, bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u),
                             bld.as_uniform(fmul(a16, a16))));

   finish_opt_test();
END_TEST

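/* v_fma_mix_f32 takes 32-bit sources but can select a 16-bit half of each
 * operand (printed as lo()/hi() below), which is what lets f16<->f32
 * conversions fold into the multiply-add in the following tests. */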
1223 BEGIN_TEST(optimize.mad_mix.input_conv.basic)
1224 for (unsigned i = GFX9; i <= GFX10; i++) {
1225 //>> v1: %a, v2b: %a16 = p_startpgm
1226 if (!setup_cs("v1 v2b", (amd_gfx_level)i))
1227 continue;
1228
1229 program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
1230
1231 Temp a = inputs[0];
1232 Temp a16 = inputs[1];
1233
1234 //! v1: %res0 = v_fma_mix_f32 %a, lo(%a16), -0
1235 //! p_unit_test 0, %res0
1236 writeout(0, fmul(a, f2f32(a16)));
1237
1238 //! v1: %res1 = v_fma_mix_f32 1.0, %a, lo(%a16)
1239 //! p_unit_test 1, %res1
1240 writeout(1, fadd(a, f2f32(a16)));
1241
1242 //! v1: %res2 = v_fma_mix_f32 1.0, lo(%a16), %a
1243 //! p_unit_test 2, %res2
1244 writeout(2, fadd(f2f32(a16), a));
1245
1246 //! v1: %res3 = v_fma_mix_f32 %a, %a, lo(%a16)
1247 //! p_unit_test 3, %res3
1248 writeout(3, fma(a, a, f2f32(a16)));
1249
1250 //! v1: %res4 = v_fma_mix_f32 %a, %a, lo(%a16)
1251 //! p_unit_test 4, %res4
1252 writeout(4, fma(a, a, f2f32(a16)));
1253
1254 finish_opt_test();
1255 }
1256 END_TEST
1257
1258 BEGIN_TEST(optimize.mad_mix.input_conv.precision)
1259 for (unsigned i = GFX9; i <= GFX10; i++) {
1260 //>> v1: %a, v2b: %a16 = p_startpgm
1261 if (!setup_cs("v1 v2b", (amd_gfx_level)i))
1262 continue;
1263
1264 program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
1265
1266 Temp a = inputs[0];
1267 Temp a16 = inputs[1];
1268
1269 /* precise arithmetic */
1270 //~gfx9! v1: %res0_cvt = v_cvt_f32_f16 %a16
1271 //~gfx9! v1: (precise)%res0 = v_fma_f32 %a, %a, %res0_cvt
1272 //~gfx10! v1: (precise)%res0 = v_fma_mix_f32 %a, %a, lo(%a16)
1273 //! p_unit_test 0, %res0
1274 writeout(0, fma(a, a, f2f32(a16), bld.precise()));
1275
1276 //! v2b: %res1_cvt = v_cvt_f16_f32 %a
1277 //! v2b: (precise)%res1 = v_mul_f16 %a16, %res1_cvt
1278 //! p_unit_test 1, %res1
1279 writeout(1, fmul(a16, f2f16(a), bld.precise()));
1280
1281 //! v2b: %res2_cvt = v_cvt_f16_f32 %a
1282 //! v2b: (precise)%res2 = v_add_f16 %a16, %res2_cvt
1283 //! p_unit_test 2, %res2
1284 writeout(2, fadd(a16, f2f16(a), bld.precise()));
1285
1286 //! v2b: %res3_cvt = v_cvt_f16_f32 %a
1287 //! v2b: (precise)%res3 = v_fma_f16 %a16, %a16, %res3_cvt
1288 //! p_unit_test 3, %res3
1289 writeout(3, fma(a16, a16, f2f16(a), bld.precise()));
1290
1291 /* precise conversions */
1292 //! v2b: (precise)%res4_cvt = v_cvt_f16_f32 %a
1293 //! v2b: %res4 = v_mul_f16 %a16, %res4_cvt
1294 //! p_unit_test 4, %res4
1295 writeout(4, fmul(a16, f2f16(a, bld.precise())));
1296
1297 //! v2b: (precise)%res5_cvt = v_cvt_f16_f32 %a
1298 //! v2b: %res5 = v_add_f16 %a16, %res5_cvt
1299 //! p_unit_test 5, %res5
1300 writeout(5, fadd(a16, f2f16(a, bld.precise())));
1301
1302 //! v2b: (precise)%res6_cvt = v_cvt_f16_f32 %a
1303 //! v2b: %res6 = v_fma_f16 %a16, %a16, %res6_cvt
1304 //! p_unit_test 6, %res6
1305 writeout(6, fma(a16, a16, f2f16(a, bld.precise())));
1306
1307 finish_opt_test();
1308 }
1309 END_TEST
1310
1311 BEGIN_TEST(optimize.mad_mix.input_conv.modifiers)
1312 for (unsigned i = GFX9; i <= GFX11; i++) {
1313 if (i == GFX10_3)
1314 continue;
1315 //>> v1: %a, v2b: %a16 = p_startpgm
1316 if (!setup_cs("v1 v2b", (amd_gfx_level)i))
1317 continue;
1318
1319 program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;
1320
1321 Temp a = inputs[0];
1322 Temp a16 = inputs[1];
1323
1324 /* check whether modifiers are preserved when converting to VOP3P */
1325 //! v1: %res0 = v_fma_mix_f32 -%a, lo(%a16), -0
1326 //! p_unit_test 0, %res0
1327 writeout(0, fmul(fneg(a), f2f32(a16)));
1328
1329 //! v1: %res1 = v_fma_mix_f32 |%a|, lo(%a16), -0
1330 //! p_unit_test 1, %res1
1331 writeout(1, fmul(fabs(a), f2f32(a16)));
1332
1333 /* fneg modifiers */
1334 //! v1: %res2 = v_fma_mix_f32 %a, -lo(%a16), -0
1335 //! p_unit_test 2, %res2
1336 writeout(2, fmul(a, fneg(f2f32(a16))));
1337
1338 //! v1: %res3 = v_fma_mix_f32 %a, -lo(%a16), -0
1339 //! p_unit_test 3, %res3
1340 writeout(3, fmul(a, f2f32(fneg(a16))));
1341
1342 /* fabs modifiers */
1343 //! v1: %res4 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1344 //! p_unit_test 4, %res4
1345 writeout(4, fmul(a, fabs(f2f32(a16))));
1346
1347 //! v1: %res5 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1348 //! p_unit_test 5, %res5
1349 writeout(5, fmul(a, f2f32(fabs(a16))));
1350
1351 /* both fabs and fneg modifiers */
1352 //! v1: %res6 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
1353 //! p_unit_test 6, %res6
1354 writeout(6, fmul(a, fneg(f2f32(fabs(a16)))));
1355
1356 //! v1: %res7 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1357 //! p_unit_test 7, %res7
1358 writeout(7, fmul(a, fabs(f2f32(fabs(a16)))));
1359
1360 //! v1: %res8 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
1361 //! p_unit_test 8, %res8
1362 writeout(8, fmul(a, fneg(fabs(f2f32(fabs(a16))))));
1363
1364 //! v1: %res9 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
1365 //! p_unit_test 9, %res9
1366 writeout(9, fmul(a, f2f32(fneg(fabs(a16)))));
1367
1368 //! v1: %res10 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1369 //! p_unit_test 10, %res10
1370 writeout(10, fmul(a, fneg(f2f32(fneg(fabs(a16))))));
1371
1372 //! v1: %res11 = v_fma_mix_f32 %a, |lo(%a16)|, -0
1373 //! p_unit_test 11, %res11
1374 writeout(11, fmul(a, fabs(f2f32(fneg(fabs(a16))))));
1375
1376 //! v1: %res12 = v_fma_mix_f32 %a, -|lo(%a16)|, -0
1377 //! p_unit_test 12, %res12
1378 writeout(12, fmul(a, fneg(fabs(f2f32(fneg(fabs(a16)))))));
1379
1380 /* sdwa */
1381 //! v1: %res13 = v_fma_mix_f32 lo(%a), %a, -0
1382 //! p_unit_test 13, %res13
1383 writeout(13, fmul(f2f32(ext_ushort(a, 0)), a));
1384
1385 //! v1: %res14 = v_fma_mix_f32 hi(%a), %a, -0
1386 //! p_unit_test 14, %res14
1387 writeout(14, fmul(f2f32(ext_ushort(a, 1)), a));
1388
1389 //~gfx(9|10)! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword
1390 //~gfx11! v1: %res16_cvt1 = v_fma_mix_f32 lo(%a), 1.0, -0
1391 //~gfx11! v1: %res15_cvt = p_extract %res16_cvt1, 0, 16, 0
      //! v1: %res15 = v_mul_f32 %res15_cvt, %a
      //! p_unit_test 15, %res15
      writeout(15, fmul(ext_ushort(f2f32(a), 0), a));

      //~gfx(9|10)! v1: %res16_cvt = v_cvt_f32_f16 %a
      //~gfx(9|10)! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword
      //~gfx11! v1: %res16_cvt = v_fma_mix_f32 lo(%a), 1.0, -0
      //~gfx11! v1: %res16_ext = p_extract %res16_cvt, 1, 16, 0
      //~gfx11! v1: %res16 = v_mul_f32 %res16_ext, %a
      //! p_unit_test 16, %res16
      writeout(16, fmul(ext_ushort(f2f32(a), 1), a));

      //~gfx(9|10)! v1: %res17_cvt = v_cvt_f32_f16 %a dst_sel:dword src0_sel:ubyte2
      //~gfx(9|10)! v1: %res17 = v_mul_f32 %res17_cvt, %a
      //~gfx11! v1: %res17_ext = p_extract %a, 2, 8, 0
      //~gfx11! v1: %res17 = v_fma_mix_f32 lo(%res17_ext), %a, -0
      //! p_unit_test 17, %res17
      writeout(17, fmul(f2f32(ext_ubyte(a, 2)), a));

      finish_opt_test();
   }
END_TEST

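/* v_fma_mixlo_f16 writes its result converted to fp16 to the low half of the destination, so a
 * trailing f2f16 can fold into the FMA as well. An addition is encoded as 1.0 * a + b (res1).
 */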
BEGIN_TEST(optimize.mad_mix.output_conv.basic)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp b16 = inputs[4];

      //! v2b: %res0 = v_fma_mixlo_f16 %a, %b, -0
      //! p_unit_test 0, %res0
      writeout(0, f2f16(fmul(a, b)));

      //! v2b: %res1 = v_fma_mixlo_f16 1.0, %a, %b
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fadd(a, b)));

      //! v2b: %res2 = v_fma_mixlo_f16 %a, %b, %c
      //! p_unit_test 2, %res2
      writeout(2, f2f16(fma(a, b, c)));

      //! v2b: %res3 = v_fma_mixlo_f16 lo(%a16), %b, -0
      //! p_unit_test 3, %res3
      writeout(3, f2f16(fmul(f2f32(a16), b)));

      //! v2b: %res4 = v_fma_mixlo_f16 1.0, %a, lo(%b16)
      //! p_unit_test 4, %res4
      writeout(4, f2f16(fadd(a, f2f32(b16))));

      //! v2b: %res5 = v_fma_mixlo_f16 %a, lo(%b16), %c
      //! p_unit_test 5, %res5
      writeout(5, f2f16(fma(a, f2f32(b16), c)));

      finish_opt_test();
   }
END_TEST

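/* bld.precise() sets the instruction's precise flag, which forbids combines that could change
 * the rounding of the result: with it on either the multiply or the conversion, the two must
 * stay separate instructions.
 */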
BEGIN_TEST(optimize.mad_mix.output_conv.precision)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v2b: %a16 = p_startpgm
      if (!setup_cs("v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a16 = inputs[0];

      //! v2b: %res0_tmp = v_mul_f16 %a16, %a16
      //! v1: (precise)%res0 = v_cvt_f32_f16 %res0_tmp
      //! p_unit_test 0, %res0
      writeout(0, f2f32(fmul(a16, a16), bld.precise()));

      //! v2b: (precise)%res1_tmp = v_mul_f16 %a16, %a16
      //! v1: %res1 = v_cvt_f32_f16 %res1_tmp
      //! p_unit_test 1, %res1
      writeout(1, f2f32(fmul(a16, a16, bld.precise())));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.output_conv.modifiers)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp a16 = inputs[2];
      Temp b16 = inputs[3];

      /* fneg/fabs */
      //! v1: %res0_add = v_add_f32 %1, %2
      //! v2b: %res0 = v_cvt_f16_f32 |%res0_add|
      //! p_unit_test 0, %res0
      writeout(0, f2f16(fabs(fadd(a, b))));

      //! v1: %res1_add = v_add_f32 %1, %2
      //! v2b: %res1 = v_cvt_f16_f32 -%res1_add
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fneg(fadd(a, b))));

      //! v2b: %res2_add = v_add_f16 %3, %4
      //! v1: %res2 = v_cvt_f32_f16 |%res2_add|
      //! p_unit_test 2, %res2
      writeout(2, f2f32(fabs(fadd(a16, b16))));

      //! v2b: %res3_add = v_add_f16 %3, %4
      //! v1: %res3 = v_cvt_f32_f16 -%res3_add
      //! p_unit_test 3, %res3
      writeout(3, f2f32(fneg(fadd(a16, b16))));

      /* sdwa */
      //! v2b: %res4_add = v_fma_mixlo_f16 1.0, %a, %b
      //! v2b: %res4 = p_extract %res4_add, 0, 8, 0
      //! p_unit_test 4, %res4
      writeout(4, ext_ubyte(f2f16(fadd(a, b)), 0));

      //! v1: %res5_add = v_add_f32 %a, %b dst_sel:uword0 src0_sel:dword src1_sel:dword
      //! v2b: %res5 = v_cvt_f16_f32 %res5_add
      //! p_unit_test 5, %res5
      writeout(5, f2f16(ext_ushort(fadd(a, b), 0)));

      finish_opt_test();
   }
END_TEST

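/* A separate fmul+fadd chain combines into a single v_fma_mix_f32 when an operand needs an
 * f16->f32 conversion. v_fma_mix_f32 supports clamp but no omod, so the *2 in res2 stays on a
 * plain v_add_f32, while neg/abs input modifiers fold into the mix instruction (res3-res6).
 */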
BEGIN_TEST(optimize.mad_mix.fma.basic)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %c16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp c16 = inputs[4];

      //! v1: %res0 = v_fma_mix_f32 lo(%a16), %b, %c
      //! p_unit_test 0, %res0
      writeout(0, fadd(fmul(f2f32(a16), b), c));

      //! v1: %res1 = v_fma_mix_f32 %a, %b, lo(%c16)
      //! p_unit_test 1, %res1
      writeout(1, fadd(fmul(a, b), f2f32(c16)));

      /* omod/clamp check */
      //! v1: %res2_mul = v_fma_mix_f32 lo(%a16), %b, -0
      //! v1: %res2 = v_add_f32 %res2_mul, %c *2
      //! p_unit_test 2, %res2
      writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000),
                           fadd(fmul(f2f32(a16), b), c)));

      /* neg/abs modifiers */
      //! v1: %res3 = v_fma_mix_f32 -lo(%a16), %b, |lo(%c16)|
      //! p_unit_test 3, %res3
      writeout(3, fadd(fmul(fneg(f2f32(a16)), b), fabs(f2f32(c16))));

      //! v1: %res4 = v_fma_mix_f32 |%a|, |%b|, lo(%c16)
      //! p_unit_test 4, %res4
      writeout(4, fadd(fabs(fmul(fneg(a), fneg(b))), f2f32(c16)));

      //! v1: %res5 = v_fma_mix_f32 %a, -%b, lo(%c16)
      //! p_unit_test 5, %res5
      writeout(5, fadd(fneg(fmul(a, b)), f2f32(c16)));

      //! v1: %res6 = v_fma_mix_f32 |%a|, -|%b|, lo(%c16)
      //! p_unit_test 6, %res6
      writeout(6, fadd(fneg(fabs(fmul(fneg(a), fneg(b)))), f2f32(c16)));

      /* output conversions */
      //! v2b: %res7 = v_fma_mixlo_f16 %a, %b, %c
      //! p_unit_test 7, %res7
      writeout(7, f2f16(fadd(fmul(a, b), c)));

      finish_opt_test();
   }
END_TEST

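/* GFX9's v_mad_mix_f32 is a mad rather than a fused fma (the intermediate product is rounded
 * like a separate v_mul_f32), so combining a precise 32-bit mul/add pair is still exact there;
 * GFX10's v_fma_mix_f32 is fused, so precise mul/add must stay separate. 16-bit arithmetic is
 * never promoted to 32-bit, since the extra precision would change results.
 */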
BEGIN_TEST(optimize.mad_mix.fma.precision)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v1: %b, v1: %c, v2b: %a16, v2b: %b16 = p_startpgm
      if (!setup_cs("v1 v1 v1 v2b v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp a16 = inputs[3];
      Temp b16 = inputs[4];

      /* the optimization is precise for 32-bit on GFX9 */
      //~gfx9! v1: (precise)%res0 = v_fma_mix_f32 lo(%a16), %b, %c
      //~gfx10! v1: (precise)%res0_tmp = v_fma_mix_f32 lo(%a16), %b, -0
      //~gfx10! v1: %res0 = v_add_f32 %res0_tmp, %c
      //! p_unit_test 0, %res0
      writeout(0, fadd(fmul(f2f32(a16), b, bld.precise()), c));

      //~gfx9! v1: (precise)%res1 = v_fma_mix_f32 lo(%a16), %b, %c
      //~gfx10! v1: %res1_tmp = v_fma_mix_f32 lo(%a16), %b, -0
      //~gfx10! v1: (precise)%res1 = v_add_f32 %res1_tmp, %c
      //! p_unit_test 1, %res1
      writeout(1, fadd(fmul(f2f32(a16), b), c, bld.precise()));

      /* never promote 16-bit arithmetic to 32-bit */
      //! v2b: %res2_tmp = v_cvt_f16_f32 %a
      //! v2b: %res2 = v_add_f16 %res2_tmp, %b16
      //! p_unit_test 2, %res2
      writeout(2, fadd(f2f16(a), b16));

      //! v2b: %res3_tmp = v_cvt_f16_f32 %a
      //! v2b: %res3 = v_mul_f16 %res3_tmp, %b16
      //! p_unit_test 3, %res3
      writeout(3, fmul(f2f16(a), b16));

      //! v2b: %res4_tmp = v_mul_f16 %a16, %b16
      //! v1: %res4 = v_cvt_f32_f16 %res4_tmp
      //! p_unit_test 4, %res4
      writeout(4, f2f32(fmul(a16, b16)));

      //! v2b: %res5_tmp = v_add_f16 %a16, %b16
      //! v1: %res5 = v_cvt_f32_f16 %res5_tmp
      //! p_unit_test 5, %res5
      writeout(5, f2f32(fadd(a16, b16)));

      //! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, -0
      //! v2b: %res6 = v_add_f16 %res6_tmp, %a16
      //! p_unit_test 6, %res6
      writeout(6, fadd(f2f16(fmul(a, b)), a16));

      //! v2b: %res7_tmp = v_mul_f16 %a16, %b16
      //! v1: %res7 = v_fma_mix_f32 1.0, lo(%res7_tmp), %c
      //! p_unit_test 7, %res7
      writeout(7, fadd(f2f32(fmul(a16, b16)), c));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.clamp)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      //! v1: %res0 = v_fma_mix_f32 lo(%a16), %a, -0 clamp
      //! p_unit_test 0, %res0
      writeout(0, fsat(fmul(f2f32(a16), a)));

      //! v2b: %res1 = v_fma_mixlo_f16 %a, %a, -0 clamp
      //! p_unit_test 1, %res1
      writeout(1, f2f16(fsat(fmul(a, a))));

      //! v2b: %res2 = v_fma_mixlo_f16 %a, %a, -0 clamp
      //! p_unit_test 2, %res2
      writeout(2, fsat(f2f16(fmul(a, a))));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.mad_mix.cast)
   for (unsigned i = GFX9; i <= GFX10; i++) {
      //>> v1: %a, v2b: %a16 = p_startpgm
      if (!setup_cs("v1 v2b", (amd_gfx_level)i))
         continue;

      program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush;

      Temp a = inputs[0];
      Temp a16 = inputs[1];

      /* The optimizer copy-propagates v2b=p_extract_vector(v1, 0) and p_as_uniform, so it has
       * to check that the operands of the combined instruction are still compatible.
       */
1688
1689 //! v1: %res0_cvt = v_cvt_f32_f16 %a16
1690 //! v2b: %res0 = v_mul_f16 %res0_cvt, %a16
1691 //! p_unit_test 0, %res0
1692 writeout(0, fmul(u2u16(f2f32(a16)), a16));
1693
1694 //! v2b: %res1_cvt = v_cvt_f16_f32 %a
1695 //! v1: %res1 = v_mul_f32 %res1_cvt, %a
1696 //! p_unit_test 1, %res1
1697 writeout(1, fmul(bld.as_uniform(f2f16(a)), a));
1698
1699 //! v2b: %res2_mul = v_mul_f16 %a16, %a16
1700 //! v2b: %res2 = v_cvt_f16_f32 %res2_mul
1701 //! p_unit_test 2, %res2
1702 writeout(2, f2f16(bld.as_uniform(fmul(a16, a16))));
1703
1704 //! v1: %res3_mul = v_mul_f32 %a, %a
1705 //! v1: %res3 = v_cvt_f32_f16 %res3_mul
1706 //! p_unit_test 3, %res3
1707 writeout(3, f2f32(u2u16(fmul(a, a))));
1708
1709 //! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0
1710 //! v2b: %res4 = v_add_f16 %res4_mul, 0 clamp
1711 //! p_unit_test 4, %res4
1712 writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));
1713
1714 //! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0
1715 //! v1: %res5 = v_add_f32 %res5_mul, 0 clamp
1716 //! p_unit_test 5, %res5
1717 writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));
1718
1719 //! v1: %res6_mul = v_mul_f32 %a, %a
1720 //! v1: %res6 = v_fma_mix_f32 1.0, lo(%res6_mul), %a
1721 //! p_unit_test 6, %res6
1722 writeout(6, fadd(f2f32(u2u16(fmul(a, a))), a));
1723
1724 //! v2b: %res7_mul = v_mul_f16 %a16, %a16
1725 //! v1: %res7 = v_fma_mix_f32 1.0, %res7_mul, lo(%a16)
1726 //! p_unit_test 7, %res7
1727 writeout(7, fadd(bld.as_uniform(fmul(a16, a16)), f2f32(a16)));
1728
1729 /* opsel_hi should be obtained from the original opcode, not the operand regclass */
1730 //! v1: %res8 = v_fma_mix_f32 lo(%a16), %a16, -0
1731 //! p_unit_test 8, %res8
1732 writeout(8, fmul(f2f32(a16), a16));
1733
1734 finish_opt_test();
1735 }
1736 END_TEST
1737
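/* Helper for optimize.vop3p_constants: emits op with a swizzled 32-bit literal as src0
 * (opsel_lo/opsel_hi select the x or y half per result lane) and prints the value the operand
 * must evaluate to, which the python checks below recompute from the output via parse_op().
 */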
static void
vop3p_constant(unsigned* idx, aco_opcode op, const char* swizzle, uint32_t val)
{
   uint32_t halves[2] = {val & 0xffff, val >> 16};
   uint32_t expected = halves[swizzle[0] - 'x'] | (halves[swizzle[1] - 'x'] << 16);
   fprintf(output, "Expected for %u: 0x%.8x / %u\n", *idx, expected, expected);

   unsigned opsel_lo = swizzle[0] == 'x' ? 0x0 : 0x1;
   unsigned opsel_hi = swizzle[1] == 'x' ? 0x2 : 0x3;
   writeout((*idx)++, bld.vop3p(op, bld.def(v1), bld.copy(bld.def(v1), Operand::c32(val)),
                                inputs[0], opsel_lo, opsel_hi));
}

BEGIN_TEST(optimize.vop3p_constants)
   for (aco_opcode op : {aco_opcode::v_pk_add_f16, aco_opcode::v_pk_add_u16}) {
      for (const char* swizzle : {"xx", "yy", "xy", "yx"}) {
         char variant[16];
         strcpy(variant, op == aco_opcode::v_pk_add_f16 ? "_f16" : "_u16");
         strcat(variant, "_");
         strcat(variant, swizzle);

         //; for i in range(36):
         //;    insert_pattern('Expected for %u: $_ / #expected%u' % (i, i))

         //>> v1: %a = p_startpgm
         if (!setup_cs("v1", GFX10_3, CHIP_UNKNOWN, variant))
            continue;

         //; opcode = 'v_pk_add_u16' if 'u16' in variant else 'v_pk_add_f16'
         //; for i in range(36):
         //;    insert_pattern('v1: %%res%u = %s $got%u %%a' % (i, opcode, i))
         //;    insert_pattern('p_unit_test %u, %%res%u' % (i, i))
         //! s_endpgm

         //; def parse_op(op):
         //;    is_int = opcode == 'v_pk_add_u16'
         //;    op = op.rstrip(',')
         //;
         //;    mods = lambda v: v
         //;    if op.endswith('*[1,-1]'):
         //;       mods = lambda v: v ^ 0x80000000
         //;       assert(not is_int)
         //;    elif op.endswith('*[-1,1]'):
         //;       mods = lambda v: v ^ 0x00008000
         //;       assert(not is_int)
         //;    op = op.split('*')[0]
         //;
         //;    swizzle = lambda v: v
         //;    if op.endswith('.xx'):
         //;       swizzle = lambda v: ((v & 0xffff) | (v << 16)) & 0xffffffff
         //;    elif op.endswith('.yy'):
         //;       swizzle = lambda v: (v >> 16) | (v & 0xffff0000)
         //;    elif op.endswith('.yx'):
         //;       swizzle = lambda v: ((v >> 16) | (v << 16)) & 0xffffffff
         //;    op = op.rstrip('xy.')
         //;
         //;    val = None
         //;    if op.startswith('0x'):
         //;       val = int(op[2:], 16)
         //;    elif op == '-1.0':
         //;       val = 0xbf800000 if is_int else 0xbc00
         //;    elif op == '1.0':
         //;       val = 0x3f800000 if is_int else 0x3c00
         //;    else:
         //;       val = int(op) & 0xffffffff
         //;
         //;    return mods(swizzle(val))

         //; # Check correctness
         //; for i in range(36):
         //;    expected = globals()['expected%u' % i]
         //;    got = globals()['got%u' % i]
         //;    got_parsed = parse_op(got)
         //;    if got_parsed != expected:
         //;       raise Exception('Check %u failed: expected 0x%.8x, got 0x%.8x ("%s")' % (i, expected, got_parsed, got))

         //; # Check that all literals are ones that cannot be encoded as inline constants
         //; allowed_literals = [0x00004242, 0x0000fffe, 0x00308030, 0x0030ffff, 0x3c00ffff,
         //;                     0x42420000, 0x42424242, 0x4242c242, 0x4242ffff, 0x7ffefffe,
         //;                     0x80300030, 0xbeefdead, 0xc2424242, 0xdeadbeef, 0xfffe0000,
         //;                     0xfffe7ffe, 0xffff0030, 0xffff3c00, 0xffff4242]
         //; if opcode == 'v_pk_add_u16':
         //;    allowed_literals.extend([0x00003c00, 0x3c000000, 0x3c003c00, 0x3c00bc00, 0xbc003c00])
         //; else:
         //;    allowed_literals.extend([0x00003f80, 0x3f800000])
         //;
         //; for i in range(36):
         //;    got = globals()['got%u' % i]
         //;    if not got.startswith('0x'):
         //;       continue
         //;    got = int(got[2:].rstrip(',').split('*')[0].split('.')[0], 16)
         //;    if got not in allowed_literals:
         //;       raise Exception('Literal check %u failed: 0x%.8x not in allowed literals' % (i, got))

         unsigned idx = 0;
         for (uint32_t constant : {0x3C00, 0x0030, 0xfffe, 0x4242}) {
            vop3p_constant(&idx, op, swizzle, constant);
            vop3p_constant(&idx, op, swizzle, constant | 0xffff0000);
            vop3p_constant(&idx, op, swizzle, constant | (constant << 16));
            vop3p_constant(&idx, op, swizzle, constant << 16);
            vop3p_constant(&idx, op, swizzle, (constant << 16) | 0x0000ffff);
            vop3p_constant(&idx, op, swizzle, constant | ((constant ^ 0x8000) << 16));
            vop3p_constant(&idx, op, swizzle, (constant ^ 0x8000) | (constant << 16));
         }

         for (uint32_t constant : {0x3f800000u, 0xfffffffeu, 0x00000030u, 0xdeadbeefu}) {
            uint32_t lo = constant & 0xffff;
            uint32_t hi = constant >> 16;
            vop3p_constant(&idx, op, swizzle, constant);
            vop3p_constant(&idx, op, swizzle, hi | (lo << 16));
         }

         finish_opt_test();
      }
   }
END_TEST

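/* v_fma_mix_* can encode one 32-bit literal whose halves are read as fp16 values, so two
 * different fp32 constants can be inlined at once if both are exactly representable in fp16,
 * e.g. 1.5f -> 0x3e00 and 3.0f -> 0x4200 pack into the single literal 0x42003e00.
 */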
BEGIN_TEST(optimize.fmamix_two_literals)
   /* This test has to recreate literals sometimes because we don't combine them at all if there's
    * at least one uncombined use.
    */
   for (unsigned i = GFX10; i <= GFX10_3; i++) {
      //>> v1: %a, v1: %b = p_startpgm
      if (!setup_cs("v1 v1", (amd_gfx_level)i))
         continue;

      Temp a = inputs[0];
      Temp b = inputs[1];

      Temp c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
      Temp c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
      Temp c_denorm = bld.copy(bld.def(v1), Operand::c32(0x387fc000));

      //! v1: %res0 = v_fma_mix_f32 %a, lo(0x42003e00), hi(0x42003e00)
      //! p_unit_test 0, %res0
      writeout(0, fma(a, c15, c30));

      /* No need to use v_fma_mix_f32. */
      //! v1: %res1 = v_fmaak_f32 %a, %b, 0x40400000
      //! p_unit_test 1, %res1
      writeout(1, fma(a, b, c30));

      /* Separate mul/add can become v_fma_mix_f32 if it's not precise. */
      //! v1: %res2 = v_fma_mix_f32 %a, lo(0x42003e00), hi(0x42003e00)
      //! p_unit_test 2, %res2
      writeout(2, fadd(fmul(a, c15), c30));

      //~gfx10! v1: %c15 = p_parallelcopy 0x3fc00000
      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
      c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));

      /* v_fma_mix_f32 is a fused mul/add, so it can't be used for precise separate mul/add. */
      //~gfx10! v1: (precise)%res3 = v_madak_f32 %a, %c15, 0x40400000
      //~gfx10_3! v1: (precise)%res3_tmp = v_mul_f32 %a, 0x3fc00000
      //~gfx10_3! v1: %res3 = v_add_f32 %res3_tmp, 0x40400000
      //! p_unit_test 3, %res3
      writeout(3, fadd(bld.precise().vop2(aco_opcode::v_mul_f32, bld.def(v1), a, c15), c30));

      //~gfx10! v1: (precise)%res4 = v_madak_f32 %a, %c15, 0x40400000
      //~gfx10_3! v1: %res4_tmp = v_mul_f32 %a, 0x3fc00000
      //~gfx10_3! v1: (precise)%res4 = v_add_f32 %res4_tmp, 0x40400000
      //! p_unit_test 4, %res4
      writeout(4, bld.precise().vop2(aco_opcode::v_add_f32, bld.def(v1), fmul(a, c15), c30));

      /* Can't convert a literal to fp16 if it will be flushed as a denormal. Here, fp16 denormals
       * are kept, so the conversion is allowed; in BB1 below they are flushed, so it isn't. */
      //! v1: %res5 = v_fma_mix_f32 %1, lo(0x3ff3e00), hi(0x3ff3e00)
      //! p_unit_test 5, %res5
      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
      writeout(5, fma(a, c15, c_denorm));

      //>> BB1
      //! /* logical preds: / linear preds: / kind: uniform, */
      program->next_fp_mode.denorm16_64 = fp_denorm_flush;
      bld.reset(program->create_and_insert_block());

      //~gfx10; del c15
      //! v1: %c15 = p_parallelcopy 0x3fc00000
      //! v1: %res6 = v_fmaak_f32 %a, %c15, 0x387fc000
      //! p_unit_test 6, %res6
      c15 = bld.copy(bld.def(v1), Operand::c32(fui(1.5f)));
      c_denorm = bld.copy(bld.def(v1), Operand::c32(0x387fc000));
      writeout(6, fma(a, c15, c_denorm));

      /* Can't accept more than 2 unique fp16 literals: the single 32-bit literal only holds two. */
      //! v1: %c45 = p_parallelcopy 0x40900000
      //! v1: %res7 = v_fma_mix_f32 lo(0x42003e00), hi(0x42003e00), %c45
      //! p_unit_test 7, %res7
      Temp c45 = bld.copy(bld.def(v1), Operand::c32(fui(4.5f)));
      writeout(7, fma(c15, c30, c45));

      /* Modifiers must be preserved. */
      //! v1: %res8 = v_fma_mix_f32 -%a, lo(0x44804200), hi(0x44804200)
      //! p_unit_test 8, %res8
      writeout(8, fma(fneg(a), c30, c45));

      //! v1: %res9 = v_fma_mix_f32 lo(0x44804200), |%a|, hi(0x44804200)
      //! p_unit_test 9, %res9
      writeout(9, fma(c30, fabs(a), c45));

      //! v1: %res10 = v_fma_mix_f32 %a, lo(0x44804200), hi(0x44804200) clamp
      //! p_unit_test 10, %res10
      writeout(10, fsat(fma(a, c30, c45)));

      /* Output modifiers are not supported by v_fma_mix_f32. */
      c30 = bld.copy(bld.def(v1), Operand::c32(fui(3.0f)));
      //; del c45
      //! v1: %c45 = p_parallelcopy 0x40900000
      //! v1: %res11 = v_fma_f32 %a, 0x40400000, %c45 *0.5
      //! p_unit_test 11, %res11
      c45 = bld.copy(bld.def(v1), Operand::c32(fui(4.5f)));
      writeout(11, fmul(fma(a, c30, c45), bld.copy(bld.def(v1), Operand::c32(0x3f000000))));

      /* Has a literal which can't be represented as fp16. */
      //! v1: %c03 = p_parallelcopy 0x3e99999a
      //! v1: %res12 = v_fmaak_f32 %a, %c03, 0x40400000
      //! p_unit_test 12, %res12
      Temp c03 = bld.copy(bld.def(v1), Operand::c32(fui(0.3f)));
      writeout(12, fma(a, c03, c30));

      /* We should still use fmaak/fmamk if the two literals are identical. */
      //! v1: %res13 = v_fmaak_f32 0x40400000, %a, 0x40400000
      //! p_unit_test 13, %res13
      writeout(13, fma(a, c30, c30));

      finish_opt_test();
   }
END_TEST

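/* The VOP3 opsel bits select the high half of a 16-bit operand, so on GFX11 a p_extract_vector
 * of the upper v2b half can fold into v_fma_f16 as hi().
 */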
BEGIN_TEST(optimize.fma_opsel)
   /* TODO make these work before GFX11 using SDWA. */
   for (unsigned i = GFX11; i <= GFX11; i++) {
      //>> v2b: %a, v2b: %b, v1: %c, v1: %d, v1: %e = p_startpgm
      if (!setup_cs("v2b v2b v1 v1 v1", (amd_gfx_level)i))
         continue;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];
      Temp d = inputs[3];
      Temp e = inputs[4];
      Temp c_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), c, Operand::c32(1));
      Temp d_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), d, Operand::c32(1));
      Temp e_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), e, Operand::c32(1));

      //! v2b: %res0 = v_fma_f16 %b, hi(%c), %a
      //! p_unit_test 0, %res0
      writeout(0, fadd(fmul(b, c_hi), a));

      //! v2b: %res1 = v_fma_f16 %a, %b, hi(%d)
      //! p_unit_test 1, %res1
      writeout(1, fadd(fmul(a, b), d_hi));

      //! v2b: %res2 = v_fma_f16 %a, %b, hi(%e)
      //! p_unit_test 2, %res2
      writeout(2, fma(a, b, e_hi));

      finish_opt_test();
   }
END_TEST

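/* A v_mov_b32 with DPP (a cross-lane swizzle) can fold into its user; these checks make sure
 * the hi() half selection survives the fold for both DPP16 and DPP8 moves.
 */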
BEGIN_TEST(optimize.dpp_opsel)
   //>> v1: %a, v1: %b = p_startpgm
   if (!setup_cs("v1 v1", GFX11))
      return;

   Temp a = inputs[0];
   Temp b = inputs[1];

   Temp dpp16 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), a, dpp_row_mirror);
   Temp dpp16_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), dpp16, Operand::c32(1));
   Temp dpp8 = bld.vop1_dpp8(aco_opcode::v_mov_b32, bld.def(v1), a);
   Temp dpp8_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), dpp8, Operand::c32(1));

   Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));
   Temp b_lo = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(0));

   //! v2b: %res0 = v_add_f16 hi(%a), hi(%b) row_mirror bound_ctrl:1 fi
   //! p_unit_test 0, %res0
   writeout(0, fadd(dpp16_hi, b_hi));

   //! v2b: %res1 = v_add_f16 hi(%a), %b dpp8:[0,0,0,0,0,0,0,0] fi
   //! p_unit_test 1, %res1
   writeout(1, fadd(b_lo, dpp8_hi));

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.apply_sgpr_swap_opsel)
   //>> v1: %a, s1: %b = p_startpgm
   if (!setup_cs("v1 s1", GFX11))
      return;

   Temp a = inputs[0];
   Temp b = inputs[1];

   Temp b_vgpr = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), bld.copy(bld.def(v1), b),
                            Operand::c32(0));

   Temp res0 = bld.tmp(v2b);
   VALU_instruction& valu = bld.vop2(aco_opcode::v_sub_f16, Definition(res0), a, b_vgpr)->valu();
   valu.opsel[0] = true;

   //! v2b: %res0 = v_subrev_f16 %b, hi(%a)
   //! p_unit_test 0, %res0
   writeout(0, res0);

   finish_opt_test();
END_TEST

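/* isnan(a) || isnan(b) folds into v_cmp_u_f32 (unordered), and unordered || (a < b) folds into
 * the NaN-inclusive v_cmp_nge, but only if the compared operands (including their opsel halves)
 * match the operands of the unordered check.
 */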
BEGIN_TEST(optimize.combine_comparison_ordering)
   //>> v1: %a, v1: %b, v1: %c = p_startpgm
   if (!setup_cs("v1 v1 v1", GFX11))
      return;

   Temp a = inputs[0];
   Temp b = inputs[1];
   Temp c = inputs[2];

   Temp a_unordered = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), a, a);
   Temp b_unordered = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), b, b);
   Temp unordered =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), a_unordered, b_unordered);

   Temp a_lt_a = bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), a, a);
   Temp unordered_cmp =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), unordered, a_lt_a);

   //! s2: %res0_unordered = v_cmp_u_f32 %a, %b
   //! s2: %res0_cmp = v_cmp_lt_f32 %a, %a
   //! s2: %res0, s2: %_:scc = s_or_b64 %res0_unordered, %res0_cmp
   //! p_unit_test 0, %res0
   writeout(0, unordered_cmp);

   Temp c_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), c, Operand::c32(1));

   Temp c_unordered = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), c, c);
   Temp c_hi_unordered = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), c_hi, c_hi);
   unordered =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), c_unordered, c_hi_unordered);

   Temp c_lt_c_hi = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), c, c_hi);
   unordered_cmp =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), unordered, c_lt_c_hi);

   //! s2: %res1 = v_cmp_nge_f16 %c, hi(%c)
   //! p_unit_test 1, %res1
   writeout(1, unordered_cmp);

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.combine_comparison_ordering_opsel)
   //>> v1: %a, v2b: %b = p_startpgm
   if (!setup_cs("v1 v2b", GFX11))
      return;

   Temp a = inputs[0];
   Temp b = inputs[1];

   Temp a_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));

   Temp ahi_unordered = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), a_hi, a_hi);
   Temp b_unordered = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), b, b);
   Temp unordered =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), ahi_unordered, b_unordered);

   Temp ahi_lt_b = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), a_hi, b);
   Temp unordered_cmp =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), unordered, ahi_lt_b);

   //! s2: %res0 = v_cmp_nge_f16 hi(%a), %b
   //! p_unit_test 0, %res0
   writeout(0, unordered_cmp);

   Temp ahi_cmp_const = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), a_hi,
                                 bld.copy(bld.def(v2b), Operand::c16(0x4400)));
   Temp ahi_ucmp_const =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), ahi_unordered, ahi_cmp_const);
   //! s2: %res1 = v_cmp_nle_f16 4.0, hi(%a)
   //! p_unit_test 1, %res1
   writeout(1, ahi_ucmp_const);

   a_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));
   ahi_unordered = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), a_hi, a_hi);
   b_unordered = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), b, b);
   unordered =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), ahi_unordered, b_unordered);
   Temp alo_lt_b = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), a, b);
   Temp noopt = bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), unordered, alo_lt_b);
   //! s2: %u2 = v_cmp_u_f16 hi(%a), %b
   //! s2: %cmp2 = v_cmp_lt_f16 %a, %b
   //! s2: %res2, s2: %scc2:scc = s_or_b64 %u2, %cmp2
   //! p_unit_test 2, %res2
   writeout(2, noopt);

   Temp hi_neq_lo = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), a, a_hi);
   Temp a_unordered = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), a, a);
   noopt = bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), hi_neq_lo, a_unordered);
   //! s2: %nan31 = v_cmp_neq_f16 %a, hi(%a)
   //! s2: %nan32 = v_cmp_neq_f16 %a, %a
   //! s2: %res3, s2: %scc3:scc = s_or_b64 %nan31, %nan32
   //! p_unit_test 3, %res3
   writeout(3, noopt);

   ahi_cmp_const = bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), a_hi,
                            bld.copy(bld.def(v2b), Operand::c16(0x4400)));
   a_unordered = bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), a, a);
   noopt =
      bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(bld.lm, scc), a_unordered, ahi_cmp_const);
   //! s2: %cmp4 = v_cmp_gt_f16 4.0, hi(%a)
   //! s2: %nan4 = v_cmp_neq_f16 %a, %a
   //! s2: %res4, s2: %scc4:scc = s_or_b64 %nan4, %cmp4
   //! p_unit_test 4, %res4
   writeout(4, noopt);

   finish_opt_test();
END_TEST

BEGIN_TEST(optimize.max3_opsel)
   /* TODO make these work before GFX11 using SDWA. */
   for (unsigned i = GFX11; i <= GFX11; i++) {
      //>> v1: %a, v1: %b, v2b: %c = p_startpgm
      if (!setup_cs("v1 v1 v2b", (amd_gfx_level)i))
         continue;

      Temp a = inputs[0];
      Temp b = inputs[1];
      Temp c = inputs[2];

      Temp a_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));
      Temp b_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), b, Operand::c32(1));

      //! v2b: %res0 = v_max3_f16 hi(%a), %c, hi(%b)
      //! p_unit_test 0, %res0
      writeout(0, bld.vop2(aco_opcode::v_max_f16, bld.def(v2b),
                           bld.vop2(aco_opcode::v_max_f16, bld.def(v2b), a_hi, c), b_hi));

      finish_opt_test();
   }
END_TEST

BEGIN_TEST(optimize.neg_mul_opsel)
   //>> v1: %a, v2b: %b = p_startpgm
   if (!setup_cs("v1 v2b", GFX11))
      return;

   Temp a = inputs[0];
   Temp b = inputs[1];

   Temp a_hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), a, Operand::c32(1));

   //! v2b: %res0 = v_mul_f16 -hi(%a), %b
   //! p_unit_test 0, %res0
   writeout(0, fneg(fmul(a_hi, b)));

   //! v1: %res1 = v_fma_mix_f32 -hi(%a), lo(%b), -0
   //! p_unit_test 1, %res1
   writeout(1, fneg(fmul(f2f32(a_hi), f2f32(b))));

   finish_opt_test();
END_TEST

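/* GFX11's v_interp_*_inreg instructions accept clamp directly, but no omod: a following *2 is
 * instead applied by rewriting the f32 interpolation as the equivalent v_fma_f32 with a
 * quad_perm DPP (res1), while the f16 variant keeps a separate multiply (res3).
 */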
BEGIN_TEST(optimize.vinterp_inreg_output_modifiers)
   //>> v1: %a, v1: %b, v1: %c = p_startpgm
   if (!setup_cs("v1 v1 v1", GFX11))
      return;

   //! v1: %res0 = v_interp_p2_f32_inreg %a, %b, %c clamp
   //! p_unit_test 0, %res0
   Temp tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[0],
                                inputs[1], inputs[2]);
   writeout(0, fsat(tmp));

   //! v1: %res1 = v_fma_f32 %b, %a, %c *2 quad_perm:[2,2,2,2] fi
   //! p_unit_test 1, %res1
   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[1], inputs[0],
                           inputs[2]);
   tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
   writeout(1, tmp);

   //! v2b: %res2 = v_interp_p2_f16_f32_inreg %a, %b, %c clamp
   //! p_unit_test 2, %res2
   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0],
                           inputs[1], inputs[2]);
   writeout(2, fsat(tmp));

   //! v2b: %tmp3 = v_interp_p2_f16_f32_inreg %b, %a, %c
   //! v2b: %res3 = v_mul_f16 2.0, %tmp3
   //! p_unit_test 3, %res3
   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[1],
                           inputs[0], inputs[2]);
   tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp);
   writeout(3, tmp);

   //! v2b: %res4 = v_fma_mixlo_f16 %c, %b, %a quad_perm:[2,2,2,2] fi
   //! p_unit_test 4, %res4
   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[2], inputs[1],
                           inputs[0]);
   writeout(4, f2f16(tmp));

   finish_opt_test();
END_TEST