1 /*
2 * Copyright © 2020 Valve Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24 #include "helpers.h"
25
26 using namespace aco;
27
28 BEGIN_TEST(optimize.neg)
29 for (unsigned i = GFX9; i <= GFX10; i++) {
30 //>> v1: %a, v1: %b, s1: %c, s1: %d, s2: %_:exec = p_startpgm
31 if (!setup_cs("v1 v1 s1 s1", (chip_class)i))
32 continue;
33
34 //! v1: %res0 = v_mul_f32 %a, -%b
35 //! p_unit_test 0, %res0
36 Temp neg_b = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), inputs[1]);
37 writeout(0, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_b));
38
39 //! v1: %neg_a = v_xor_b32 0x80000000, %a
40 //~gfx[6-9]! v1: %res1 = v_mul_f32 0x123456, %neg_a
41 //~gfx10! v1: %res1 = v_mul_f32 0x123456, -%a
42 //! p_unit_test 1, %res1
43 Temp neg_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), inputs[0]);
44 writeout(1, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x123456u), neg_a));
45
46 //! v1: %res2 = v_mul_f32 %a, %b
47 //! p_unit_test 2, %res2
48 Temp neg_neg_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), neg_a);
49 writeout(2, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_neg_a, inputs[1]));
50
51 /* we could optimize this case into just an abs(), but NIR already does this */
52 //! v1: %res3 = v_mul_f32 |%neg_a|, %b
53 //! p_unit_test 3, %res3
54 Temp abs_neg_a = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), neg_a);
55 writeout(3, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), abs_neg_a, inputs[1]));
56
57 //! v1: %res4 = v_mul_f32 -|%a|, %b
58 //! p_unit_test 4, %res4
59 Temp abs_a = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), inputs[0]);
60 Temp neg_abs_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), abs_a);
61 writeout(4, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), neg_abs_a, inputs[1]));
62
63 //! v1: %res5 = v_mul_f32 -%a, %b row_shl:1 bound_ctrl:1
64 //! p_unit_test 5, %res5
65 writeout(5, bld.vop2_dpp(aco_opcode::v_mul_f32, bld.def(v1), neg_a, inputs[1], dpp_row_sl(1)));
66
67 //! v1: %res6 = v_subrev_f32 %a, %b
68 //! p_unit_test 6, %res6
69 writeout(6, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), neg_a, inputs[1]));
70
71 //! v1: %res7 = v_sub_f32 %b, %a
72 //! p_unit_test 7, %res7
73 writeout(7, bld.vop2(aco_opcode::v_add_f32, bld.def(v1), inputs[1], neg_a));
74
75 //! v1: %res8 = v_mul_f32 %a, -%c
76 //! p_unit_test 8, %res8
77 Temp neg_c = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), bld.copy(bld.def(v1), inputs[2]));
78 writeout(8, bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inputs[0], neg_c));
79
80 finish_opt_test();
81 }
82 END_TEST
83
create_subbrev_co(Operand op0,Operand op1,Operand op2)84 Temp create_subbrev_co(Operand op0, Operand op1, Operand op2)
85 {
86 return bld.vop2_e64(aco_opcode::v_subbrev_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), op0, op1, op2);
87 }
88
89 BEGIN_TEST(optimize.cndmask)
90 for (unsigned i = GFX9; i <= GFX10; i++) {
91 //>> v1: %a, s1: %b, s2: %c, s2: %_:exec = p_startpgm
92 if (!setup_cs("v1 s1 s2", (chip_class)i))
93 continue;
94
95 Temp subbrev;
96
97 //! v1: %res0 = v_cndmask_b32 0, %a, %c
98 //! p_unit_test 0, %res0
99 subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
100 writeout(0, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[0], subbrev));
101
102 //! v1: %res1 = v_cndmask_b32 0, 42, %c
103 //! p_unit_test 1, %res1
104 subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
105 writeout(1, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(42u), subbrev));
106
107 //~gfx9! v1: %subbrev, s2: %_ = v_subbrev_co_u32 0, 0, %c
108 //~gfx9! v1: %res2 = v_and_b32 %b, %subbrev
109 //~gfx10! v1: %res2 = v_cndmask_b32 0, %b, %c
110 //! p_unit_test 2, %res2
111 subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
112 writeout(2, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), inputs[1], subbrev));
113
114 //! v1: %subbrev1, s2: %_ = v_subbrev_co_u32 0, 0, %c
115 //! v1: %xor = v_xor_b32 %a, %subbrev1
116 //! v1: %res3 = v_cndmask_b32 0, %xor, %c
117 //! p_unit_test 3, %res3
118 subbrev = create_subbrev_co(Operand(0u), Operand(0u), Operand(inputs[2]));
119 Temp xor_a = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), inputs[0], subbrev);
120 writeout(3, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), xor_a, subbrev));
121
122 //! v1: %res4 = v_cndmask_b32 0, %a, %c
123 //! p_unit_test 4, %res4
124 Temp cndmask = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), Operand(inputs[2]));
125 Temp sub = bld.vsub32(bld.def(v1), Operand(0u), cndmask);
126 writeout(4, bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(inputs[0]), sub));
127
128 finish_opt_test();
129 }
130 END_TEST
131
132 BEGIN_TEST(optimize.clamp)
133 //>> v1: %a, v1: %b, v1: %c, s2: %_:exec = p_startpgm
134 if (!setup_cs("v1 v1 v1", GFX9))
135 return;
136
137 //! v1: %res0 = v_med3_f32 4.0, 0, %a
138 //! p_unit_test 0, %res0
139 writeout(0, bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u),
140 bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), inputs[0])));
141
142 //! v1: %res1 = v_med3_f32 0, 4.0, %a
143 //! p_unit_test 1, %res1
144 writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u),
145 bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), inputs[0])));
146
147 /* correct NaN behaviour with precise */
148
149 //! v1: %res2 = v_med3_f32 4.0, 0, %a
150 //! p_unit_test 2, %res2
151 Builder::Result max = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), inputs[0]);
152 max.def(0).setPrecise(true);
153 Builder::Result min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), max);
154 max.def(0).setPrecise(true);
155 writeout(2, min);
156
157 //! v1: (precise)%res3_tmp = v_min_f32 4.0, %a
158 //! v1: %res3 = v_max_f32 0, %res3_tmp
159 //! p_unit_test 3, %res3
160 min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0x40800000u), inputs[0]);
161 min.def(0).setPrecise(true);
162 writeout(3, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), min));
163
164 finish_opt_test();
165 END_TEST
166
167 BEGIN_TEST(optimize.const_comparison_ordering)
168 //>> v1: %a, v1: %b, v2: %c, v1: %d, s2: %_:exec = p_startpgm
169 if (!setup_cs("v1 v1 v2 v1", GFX9))
170 return;
171
172 /* optimize to unordered comparison */
173 //! s2: %res0 = v_cmp_nge_f32 4.0, %a
174 //! p_unit_test 0, %res0
175 writeout(0, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
176 bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
177 bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
178
179 //! s2: %res1 = v_cmp_nge_f32 4.0, %a
180 //! p_unit_test 1, %res1
181 writeout(1, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
182 bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
183 bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
184
185 //! s2: %res2 = v_cmp_nge_f32 0x40a00000, %a
186 //! p_unit_test 2, %res2
187 writeout(2, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
188 bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
189 bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), bld.copy(bld.def(v1), Operand(0x40a00000u)), inputs[0])));
190
191 /* optimize to ordered comparison */
192 //! s2: %res3 = v_cmp_lt_f32 4.0, %a
193 //! p_unit_test 3, %res3
194 writeout(3, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
195 bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
196 bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
197
198 //! s2: %res4 = v_cmp_lt_f32 4.0, %a
199 //! p_unit_test 4, %res4
200 writeout(4, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
201 bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
202 bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(0x40800000u), inputs[0])));
203
204 //! s2: %res5 = v_cmp_lt_f32 0x40a00000, %a
205 //! p_unit_test 5, %res5
206 writeout(5, bld.sop2(aco_opcode::s_and_b64, bld.def(bld.lm), bld.def(s1, scc),
207 bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
208 bld.vopc(aco_opcode::v_cmp_nge_f32, bld.def(bld.lm), bld.copy(bld.def(v1), Operand(0x40a00000u)), inputs[0])));
209
210 /* NaN */
211 uint16_t nan16 = 0x7e00;
212 uint32_t nan32 = 0x7fc00000;
213
214 //! s2: %tmp6_0 = v_cmp_lt_f16 0x7e00, %a
215 //! s2: %tmp6_1 = v_cmp_neq_f16 %a, %a
216 //! s2: %res6, s1: %_:scc = s_or_b64 %tmp6_1, %tmp6_0
217 //! p_unit_test 6, %res6
218 writeout(6, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
219 bld.vopc(aco_opcode::v_cmp_neq_f16, bld.def(bld.lm), inputs[0], inputs[0]),
220 bld.vopc(aco_opcode::v_cmp_lt_f16, bld.def(bld.lm), Operand(nan16), inputs[0])));
221
222 //! s2: %tmp7_0 = v_cmp_lt_f32 0x7fc00000, %a
223 //! s2: %tmp7_1 = v_cmp_neq_f32 %a, %a
224 //! s2: %res7, s1: %_:scc = s_or_b64 %tmp7_1, %tmp7_0
225 //! p_unit_test 7, %res7
226 writeout(7, bld.sop2(aco_opcode::s_or_b64, bld.def(bld.lm), bld.def(s1, scc),
227 bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), inputs[0], inputs[0]),
228 bld.vopc(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), Operand(nan32), inputs[0])));
229
230 finish_opt_test();
231 END_TEST
232
233 BEGIN_TEST(optimize.add3)
234 //>> v1: %a, v1: %b, v1: %c, s2: %_:exec = p_startpgm
235 if (!setup_cs("v1 v1 v1", GFX9))
236 return;
237
238 //! v1: %res0 = v_add3_u32 %a, %b, %c
239 //! p_unit_test 0, %res0
240 Builder::Result tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
241 writeout(0, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
242
243 //! v1: %tmp1 = v_add_u32 %b, %c clamp
244 //! v1: %res1 = v_add_u32 %a, %tmp1
245 //! p_unit_test 1, %res1
246 tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
247 static_cast<VOP3A_instruction *>(tmp.instr)->clamp = true;
248 writeout(1, bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp));
249
250 //! v1: %tmp2 = v_add_u32 %b, %c
251 //! v1: %res2 = v_add_u32 %a, %tmp2 clamp
252 //! p_unit_test 2, %res2
253 tmp = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), inputs[1], inputs[2]);
254 tmp = bld.vop2_e64(aco_opcode::v_add_u32, bld.def(v1), inputs[0], tmp);
255 static_cast<VOP3A_instruction *>(tmp.instr)->clamp = true;
256 writeout(2, tmp);
257
258 finish_opt_test();
259 END_TEST
260
261 BEGIN_TEST(optimize.minmax)
262 for (unsigned i = GFX8; i <= GFX10; i++) {
263 //>> v1: %a, s2: %_:exec = p_startpgm
264 if (!setup_cs("v1", (chip_class)i))
265 continue;
266
267 //! v1: %res0 = v_max3_f32 0, -0, %a
268 //! p_unit_test 0, %res0
269 Temp xor0 = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), Operand(inputs[0]));
270 Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0u), xor0);
271 Temp xor1 = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), min);
272 writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), xor1));
273
274 //! v1: %res1 = v_max3_f32 0, -0, -%a
275 //! p_unit_test 1, %res1
276 min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand(0u), Operand(inputs[0]));
277 xor1 = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), min);
278 writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), xor1));
279
280 finish_opt_test();
281 }
282 END_TEST
283