• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 #include "helpers.h"
25 
26 using namespace aco;
27 
/* Runs one program per gfx level from GFX8 through GFX10_3, each with the
 * optimistic and the pessimistic RA test policy, and checks which register the
 * result of each of the two v_cvt_f32_f16 instructions is assigned to.
 */
28 BEGIN_TEST(regalloc.subdword_alloc.reuse_16bit_operands)
29    /* Registers of operands should be "recycled" for the output. But if the
30     * input is smaller than the output, that's not generally possible. The
31     * first v_cvt_f32_f16 instruction below uses the upper 16 bits of v0
32     * while the lower 16 bits are still live, so the output must be stored in
33     * a register other than v0. For the second v_cvt_f32_f16, the original
34     * value stored in v0 is no longer used and hence it's safe to store the
35     * result in v0.
36     */
37 
38    /* TODO: is this possible to do on GFX11? */
      /* The enum has no increment operator, so cc is stepped by casting through unsigned. */
39    for (amd_gfx_level cc = GFX8; cc <= GFX10_3; cc = (amd_gfx_level)((unsigned)cc + 1)) {
40       for (bool pessimistic : {false, true}) {
            /* Passed to setup_cs() so both policy runs are set up as separate subvariants. */
41          const char* subvariant = pessimistic ? "/pessimistic" : "/optimistic";
42 
43          //>> v1: %_:v[#a] = p_startpgm
44          if (!setup_cs("v1", (amd_gfx_level)cc, CHIP_UNKNOWN, subvariant))
45             return;
46 
47          //! v2b: %_:v[#a][0:16], v2b: %res1:v[#a][16:32] = p_split_vector %_:v[#a]
48          Builder::Result tmp =
49             bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]);
50 
            /* In the expectations below, #a is the input register and #b the destination
             * of the conversion that reads the upper half; they are required to differ.
             */
51          //! v1: %_:v[#b] = v_cvt_f32_f16 %_:v[#a][16:32] dst_sel:dword src0_sel:uword1
52          //! v1: %_:v[#a] = v_cvt_f32_f16 %_:v[#a][0:16]
53          //; success = (b != a)
54          auto result1 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), tmp.def(1).getTemp());
55          auto result2 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), tmp.def(0).getTemp());
56          writeout(0, result1);
57          writeout(1, result2);
58 
59          finish_ra_test(ra_test_policy{pessimistic});
60       }
61    }
62 END_TEST
63 
/* Splits a 32-bit VGPR input into two 16-bit halves, defines a new 16-bit value
 * with v_not_b32 while the high half is still needed, and recombines both
 * halves with p_create_vector. The expected output requires the new value to
 * land in v[0][0:16] and the v_not_b32 to carry SDWA modifiers
 * (dst_sel:uword0 dst_preserve).
 */
64 BEGIN_TEST(regalloc._32bit_partial_write)
65    //>> v1: %_:v[0] = p_startpgm
66    if (!setup_cs("v1", GFX10))
67       return;
68 
69    /* ensure high 16 bits are occupied */
70    //! v2b: %_:v[0][0:16], v2b: %_:v[0][16:32] = p_split_vector %_:v[0]
71    Temp hi =
72       bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), inputs[0]).def(1).getTemp();
73 
74    /* This test checks if this instruction uses SDWA. */
75    //! v2b: %_:v[0][0:16] = v_not_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword
76    Temp lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v2b), Operand::zero());
77 
      /* Using hi here keeps the upper half live across the v_not_b32 above. */
78    //! v1: %_:v[0] = p_create_vector %_:v[0][0:16], %_:v[0][16:32]
79    bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), lo, hi);
80 
81    finish_ra_test(ra_test_policy());
82 END_TEST
83 
/* Precolored operand whose target register pair is occupied by another live
 * value. The s2 input sits in s[0-1] and a second s2 value is defined in
 * s[2-3]; the input is then used with its operand fixed to PhysReg(2) (printed
 * as s[2-3]) alongside the second value. The expected output is a single
 * p_parallelcopy that swaps the two pairs.
 */
84 BEGIN_TEST(regalloc.precolor.swap)
85    //>> s2: %op0:s[0-1] = p_startpgm
86    if (!setup_cs("s2", GFX10))
87       return;
88 
      /* NOTE(review): presumably restricts the allocator to four SGPRs so that no
       * free register is available for the swap - confirm the semantics of sgpr_limit.
       */
89    program->dev.sgpr_limit = 4;
90 
91    //! s2: %op1:s[2-3] = p_unit_test
92    Temp op1 = bld.pseudo(aco_opcode::p_unit_test, bld.def(s2));
93 
94    //! s2: %op0_2:s[2-3], s2: %op1_2:s[0-1] = p_parallelcopy %op0:s[0-1], %op1:s[2-3]
95    //! p_unit_test %op0_2:s[2-3], %op1_2:s[0-1]
96    Operand op(inputs[0]);
97    op.setFixed(PhysReg(2));
98    bld.pseudo(aco_opcode::p_unit_test, op, op1);
99 
100    finish_ra_test(ra_test_policy());
101 END_TEST
102 
/* Precolored scalar operand whose target register lies inside a live vector.
 * The s2 input occupies s[0-1] and the s1 input occupies s[2]; the s1 input is
 * used with its operand fixed to PhysReg(1) (printed as s[1]). The expected
 * p_parallelcopy moves the s1 value into s[1] and the s2 vector to s[2-3], and
 * the later use of the vector reads it from s[2-3].
 */
103 BEGIN_TEST(regalloc.precolor.blocking_vector)
104    //>> s2: %tmp0:s[0-1], s1: %tmp1:s[2] = p_startpgm
105    if (!setup_cs("s2 s1", GFX10))
106       return;
107 
108    //! s1: %tmp1_2:s[1], s2: %tmp0_2:s[2-3] = p_parallelcopy %tmp1:s[2], %tmp0:s[0-1]
109    //! p_unit_test %tmp1_2:s[1]
110    Operand op(inputs[1]);
111    op.setFixed(PhysReg(1));
112    bld.pseudo(aco_opcode::p_unit_test, op);
113 
114    //! p_unit_test %tmp0_2:s[2-3]
115    bld.pseudo(aco_opcode::p_unit_test, inputs[0]);
116 
117    finish_ra_test(ra_test_policy());
118 END_TEST
119 
/* Precolored vector operand whose target registers hold two scalar inputs.
 * Inputs are an s2 in s[0-1] and two s1 values in s[2] and s[3]; the s2 input
 * is used with its operand fixed to PhysReg(2) (printed as s[2-3]). Only the
 * third input is used afterwards, and the expected p_parallelcopy moves the
 * vector to s[2-3] and that third input to s[0]; the second input does not
 * appear in the copy.
 */
120 BEGIN_TEST(regalloc.precolor.vector.test)
121    //>> s2: %tmp0:s[0-1], s1: %tmp1:s[2], s1: %tmp2:s[3] = p_startpgm
122    if (!setup_cs("s2 s1 s1", GFX10))
123       return;
124 
125    //! s2: %tmp0_2:s[2-3], s1: %tmp2_2:s[0] = p_parallelcopy %tmp0:s[0-1], %tmp2:s[3]
126    //! p_unit_test %tmp0_2:s[2-3]
127    Operand op(inputs[0]);
128    op.setFixed(PhysReg(2));
129    bld.pseudo(aco_opcode::p_unit_test, op);
130 
131    //! p_unit_test %tmp2_2:s[0]
132    bld.pseudo(aco_opcode::p_unit_test, inputs[2]);
133 
134    finish_ra_test(ra_test_policy());
135 END_TEST
136 
/* Same setup as a precolored s2 operand fixed to PhysReg(2) (printed as
 * s[2-3]) with two s1 inputs living in s[2] and s[3], but here both scalar
 * inputs are used afterwards. The expected p_parallelcopy moves the vector to
 * s[2-3] and the two scalars to s[0] and s[1].
 */
137 BEGIN_TEST(regalloc.precolor.vector.collect)
138    //>> s2: %tmp0:s[0-1], s1: %tmp1:s[2], s1: %tmp2:s[3] = p_startpgm
139    if (!setup_cs("s2 s1 s1", GFX10))
140       return;
141 
142    //! s2: %tmp0_2:s[2-3], s1: %tmp1_2:s[0], s1: %tmp2_2:s[1] = p_parallelcopy %tmp0:s[0-1], %tmp1:s[2], %tmp2:s[3]
143    //! p_unit_test %tmp0_2:s[2-3]
144    Operand op(inputs[0]);
145    op.setFixed(PhysReg(2));
146    bld.pseudo(aco_opcode::p_unit_test, op);
147 
148    //! p_unit_test %tmp1_2:s[0], %tmp2_2:s[1]
149    bld.pseudo(aco_opcode::p_unit_test, inputs[1], inputs[2]);
150 
151    finish_ra_test(ra_test_policy());
152 END_TEST
153 
/* Precolored VGPR operand whose target register holds the other operand of the
 * same instruction. Two v1 inputs sit in v[0] and v[1]; the second one is used
 * with its operand fixed to PhysReg(256), which the expected output prints as
 * v[0]. The expected p_parallelcopy swaps the two values.
 */
154 BEGIN_TEST(regalloc.precolor.vgpr_move)
155    //>> v1: %tmp0:v[0], v1: %tmp1:v[1] = p_startpgm
156    if (!setup_cs("v1 v1", GFX10))
157       return;
158 
159    //! v1: %tmp1_2:v[0], v1: %tmp0_2:v[1] = p_parallelcopy %tmp1:v[1], %tmp0:v[0]
160    //! p_unit_test %tmp0_2:v[1], %tmp1_2:v[0]
161    bld.pseudo(aco_opcode::p_unit_test, inputs[0], Operand(inputs[1], PhysReg(256)));
162 
163    finish_ra_test(ra_test_policy());
164 END_TEST
165 
/* Several precolored operands on one instruction. Four v1 inputs sit in
 * v[0]..v[3] and are used with operands fixed to PhysReg(256 + 0..3) in the
 * order input 3, 0, 1, 2, i.e. every value has to move to a register currently
 * held by another one. The expected output is a single p_parallelcopy that
 * performs all four moves.
 */
166 BEGIN_TEST(regalloc.precolor.multiple_operands)
167    //>> v1: %tmp0:v[0], v1: %tmp1:v[1], v1: %tmp2:v[2], v1: %tmp3:v[3] = p_startpgm
168    if (!setup_cs("v1 v1 v1 v1", GFX10))
169       return;
170 
171    //! v1: %tmp3_2:v[0], v1: %tmp0_2:v[1], v1: %tmp1_2:v[2], v1: %tmp2_2:v[3] = p_parallelcopy %tmp3:v[3], %tmp0:v[0], %tmp1:v[1], %tmp2:v[2]
172    //! p_unit_test %tmp3_2:v[0], %tmp0_2:v[1], %tmp1_2:v[2], %tmp2_2:v[3]
173    bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[3], PhysReg(256 + 0)),
174               Operand(inputs[0], PhysReg(256 + 1)), Operand(inputs[1], PhysReg(256 + 2)),
175               Operand(inputs[2], PhysReg(256 + 3)));
176 
177    finish_ra_test(ra_test_policy());
178 END_TEST
179 
/* The same temporary used three times on one instruction, each operand fixed
 * to a different register (PhysReg(256 + 0..2), printed as v[0]..v[2]). The
 * expected output keeps the original value in v[0] and creates two copies of
 * it in v[1] and v[2] with one p_parallelcopy.
 */
180 BEGIN_TEST(regalloc.precolor.different_regs)
181    //>> v1: %tmp0:v[0] = p_startpgm
182    if (!setup_cs("v1", GFX10))
183       return;
184 
185    //! v1: %tmp1:v[1], v1: %tmp2:v[2] = p_parallelcopy %tmp0:v[0], %tmp0:v[0]
186    //! p_unit_test %tmp0:v[0], %tmp1:v[1], %tmp2:v[2]
187    bld.pseudo(aco_opcode::p_unit_test, Operand(inputs[0], PhysReg(256 + 0)),
188               Operand(inputs[0], PhysReg(256 + 1)), Operand(inputs[0], PhysReg(256 + 2)));
189 
190    finish_ra_test(ra_test_policy());
191 END_TEST
192 
/* On GFX7, builds a v1 with p_create_vector from an undefined v3b operand and a
 * single byte extracted from the VGPR input, while the s1 input is still used
 * afterwards. The expected output contains an s_mov_b32 of a constant into
 * s[1] feeding a v_mul_lo_u32, and the following p_unit_test still reads the
 * s1 input from s[0].
 */
193 BEGIN_TEST(regalloc.scratch_sgpr.create_vector)
194    if (!setup_cs("v1 s1", GFX7))
195       return;
196 
197    Temp tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), inputs[0], Operand::zero());
198 
199    //>> v3b: %0:v[0][0:24] = v_and_b32 0xffffff, %0:v[0][0:24]
200    //! s1: %0:s[1] = s_mov_b32 0x1000001
201    //! v1: %0:v[0] = v_mul_lo_u32 %0:s[1], %_:v[0][0:8]
202    bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), Operand(v3b), Operand(tmp));
203 
204    //! p_unit_test %_:s[0]
205    //! s_endpgm
206    bld.pseudo(aco_opcode::p_unit_test, inputs[1]);
207 
      /* NOTE(review): the second argument presumably makes the helper also lower the
       * program after RA (the expectations above list lowered instructions and
       * s_endpgm) - confirm against the helper's declaration.
       */
208    finish_ra_test(ra_test_policy(), true);
209 END_TEST
210 
/* On GFX7, builds a v2 with p_create_vector from the s1 input, an undefined v3b
 * operand and a byte extracted from the v2 input at index 4. The s1 input is
 * both an operand of the vector and used again afterwards. The expected output
 * copies s[0] into v[0], uses s[1] for the s_mov_b32 constant that feeds the
 * v_mul_lo_u32 on v[1], and the final p_unit_test still reads s[0].
 */
211 BEGIN_TEST(regalloc.scratch_sgpr.create_vector_sgpr_operand)
212    if (!setup_cs("v2 s1", GFX7))
213       return;
214 
215    Temp tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), inputs[0], Operand::c32(4u));
216 
217    //>> v1: %0:v[0] = v_mov_b32 %_:s[0]
218    //! v3b: %0:v[1][0:24] = v_and_b32 0xffffff, %0:v[1][0:24]
219    //! s1: %0:s[1] = s_mov_b32 0x1000001
220    //! v1: %0:v[1] = v_mul_lo_u32 %0:s[1], %_:v[1][0:8]
221    bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), inputs[1], Operand(v3b), Operand(tmp));
222 
223    //! p_unit_test %_:s[0]
224    //! s_endpgm
225    bld.pseudo(aco_opcode::p_unit_test, inputs[1]);
226 
      /* NOTE(review): the second argument presumably makes the helper also lower the
       * program after RA (the expectations above list lowered instructions and
       * s_endpgm) - confirm against the helper's declaration.
       */
227    finish_ra_test(ra_test_policy(), true);
228 END_TEST
229 
/* A linear VGPR temporary is defined in v[0] (PhysReg 256), then another
 * instruction has a definition fixed to the same register while the linear
 * temporary is still needed. The expected output moves the linear temporary to
 * v[1] with a p_parallelcopy before that instruction, and the later use reads
 * it from v[1].
 */
230 BEGIN_TEST(regalloc.linear_vgpr.live_range_split.fixed_def)
231    //>> p_startpgm
232    if (!setup_cs("", GFX10))
233       return;
234 
235    PhysReg reg_v0{256};
236 
237    //! lv1: %tmp1:v[0] = p_unit_test
238    Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v0));
239 
240    //! lv1: %tmp2:v[1] = p_parallelcopy %tmp1:v[0]
241    //! v1: %_:v[0] = p_unit_test
242    bld.pseudo(aco_opcode::p_unit_test, Definition(reg_v0, v1));
243 
244    //! p_unit_test %tmp2:v[1]
245    bld.pseudo(aco_opcode::p_unit_test, tmp);
246 
247    finish_ra_test(ra_test_policy());
248 END_TEST
249 
/* With vgpr_limit set to 3, a linear VGPR temporary is defined in v[1]
 * (PhysReg 257) and a v2 definition follows. The expected output places the v2
 * in v[0-1] and moves the linear temporary to v[2] with a p_parallelcopy. An
 * scc value and a value fixed to s[0] are defined before and used after the
 * copy. After register allocation the test also prints the copy's tmp_in_scc
 * and scratch_sgpr fields and expects "scc:1 scratch:s1".
 */
250 BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_impl)
251    //>> p_startpgm
252    if (!setup_cs("", GFX10))
253       return;
254 
255    program->dev.vgpr_limit = 3;
256 
257    PhysReg reg_v1{257};
258 
      /* One instruction with two definitions: the first fixed to scc, the second
       * (reusing the id of s0_tmp) fixed to PhysReg{0}.
       */
259    //! s1: %scc_tmp:scc, s1: %1:s[0] = p_unit_test
260    Temp s0_tmp = bld.tmp(s1);
261    Temp scc_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(s1, scc),
262                              Definition(s0_tmp.id(), PhysReg{0}, s1));
263 
264    //! lv1: %tmp1:v[1] = p_unit_test
265    Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v1));
266 
267    //! lv1: %tmp2:v[2] = p_parallelcopy %tmp1:v[1]
268    //! v2: %_:v[0-1] = p_unit_test
269    bld.pseudo(aco_opcode::p_unit_test, bld.def(v2));
270 
271    //! p_unit_test %tmp2:v[2], %scc_tmp:scc, %1:s[0]
272    bld.pseudo(aco_opcode::p_unit_test, tmp, scc_tmp, s0_tmp);
273 
274    finish_ra_test(ra_test_policy());
275 
      /* Instruction index 3 of block 0 is treated as the p_parallelcopy (see the
       * expected line below); it is printed again followed by its tmp_in_scc and
       * scratch_sgpr fields.
       */
276    //>> lv1: %5:v[2] = p_parallelcopy %3:v[1] scc:1 scratch:s1
277    Pseudo_instruction& parallelcopy = program->blocks[0].instructions[3]->pseudo();
278    aco_print_instr(program->gfx_level, &parallelcopy, output);
279    fprintf(output, " scc:%u scratch:s%u\n", parallelcopy.tmp_in_scc,
280            parallelcopy.scratch_sgpr.reg());
281 END_TEST
282 
/* With vgpr_limit set to 6, a linear VGPR temporary is defined in v[4]
 * (PhysReg 260) and a logical v2 in v[2-3] (PhysReg 258), then a v3 definition
 * follows while both are still needed. The expected output places the v3 in
 * v[1-3] and uses one p_parallelcopy to move the linear temporary to v[0] and
 * the v2 to v[4-5].
 */
283 BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_regs_for_copies)
284    //>> p_startpgm
285    if (!setup_cs("", GFX10))
286       return;
287 
288    program->dev.vgpr_limit = 6;
289 
290    PhysReg reg_v2{258};
291    PhysReg reg_v4{260};
292 
293    //! lv1: %lin_tmp1:v[4] = p_unit_test
294    Temp lin_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v4));
295    //! v2: %log_tmp1:v[2-3] = p_unit_test
296    Temp log_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v2, reg_v2));
297 
298    //! lv1: %lin_tmp2:v[0], v2: %log_tmp2:v[4-5] = p_parallelcopy %lin_tmp1:v[4], %log_tmp1:v[2-3]
299    //! v3: %_:v[1-3] = p_unit_test
300    bld.pseudo(aco_opcode::p_unit_test, bld.def(v3));
301 
302    //! p_unit_test %log_tmp2:v[4-5], %lin_tmp2:v[0]
303    bld.pseudo(aco_opcode::p_unit_test, log_tmp, lin_tmp);
304 
305    finish_ra_test(ra_test_policy());
306 END_TEST
307 
/* With vgpr_limit set to 4, a linear VGPR temporary is defined in v[0] and a
 * logical v1 in v[1]; a p_create_vector then builds a v2 from an undefined v1
 * operand and the logical value. The expected output places the vector in
 * v[0-1] (the logical value stays in v[1]) and moves the linear temporary to
 * v[2] with a p_parallelcopy.
 */
308 BEGIN_TEST(regalloc.linear_vgpr.live_range_split.get_reg_create_vector)
309    //>> p_startpgm
310    if (!setup_cs("", GFX10))
311       return;
312 
313    program->dev.vgpr_limit = 4;
314 
315    PhysReg reg_v0{256};
316    PhysReg reg_v1{257};
317 
318    //! lv1: %lin_tmp1:v[0] = p_unit_test
319    Temp lin_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1.as_linear(), reg_v0));
320    //! v1: %log_tmp:v[1] = p_unit_test
321    Temp log_tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(v1, reg_v1));
322 
323    //! lv1: %lin_tmp2:v[2] = p_parallelcopy %lin_tmp1:v[0]
324    //! v2: %_:v[0-1] = p_create_vector v1: undef, %log_tmp:v[1]
325    bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(v1), log_tmp);
326 
327    //! p_unit_test %lin_tmp2:v[2]
328    bld.pseudo(aco_opcode::p_unit_test, lin_tmp);
329 
330    finish_ra_test(ra_test_policy());
331 END_TEST
332 
/* Block 0 ends in a p_branch that has an s2 definition; block 1 (linear
 * predecessor: block 0) starts with an s2 p_linear_phi of the constant 0 that
 * is used afterwards. The expected output assigns s[2-3] to the branch
 * definition and s[0-1] to the phi.
 */
333 BEGIN_TEST(regalloc.branch_def_phis_at_merge_block)
334    //>> p_startpgm
335    if (!setup_cs("", GFX10))
336       return;
337 
338    //! s2: %_:s[2-3] = p_branch
339    bld.branch(aco_opcode::p_branch, bld.def(s2));
340 
341    //! BB1
342    //! /* logical preds: / linear preds: BB0, / kind: uniform, */
      /* Continue building in a freshly created block and record block 0 as its
       * linear predecessor.
       */
343    bld.reset(program->create_and_insert_block());
344    program->blocks[1].linear_preds.push_back(0);
345 
346    //! s2: %tmp:s[0-1] = p_linear_phi 0
347    Temp tmp = bld.pseudo(aco_opcode::p_linear_phi, bld.def(s2), Operand::c64(0u));
348 
349    //! p_unit_test %tmp:s[0-1]
350    bld.pseudo(aco_opcode::p_unit_test, tmp);
351 
352    finish_ra_test(ra_test_policy());
353 END_TEST
354 
/* Block 0 defines an s2 value (expected in s[0-1]) and ends in a p_cbranch_z
 * that reads scc and has its own s2 definition; the value is used again in
 * block 1. The expected output assigns s[2-3] to the branch definition, so it
 * does not overlap the value that is still needed. Blocks 1 and 2 both have
 * block 0 as linear predecessor and block 3 has blocks 1 and 2.
 */
355 BEGIN_TEST(regalloc.branch_def_phis_at_branch_block)
356    //>> p_startpgm
357    if (!setup_cs("", GFX10))
358       return;
359 
360    //! s2: %tmp:s[0-1] = p_unit_test
361    Temp tmp = bld.pseudo(aco_opcode::p_unit_test, bld.def(s2));
362 
363    //! s2: %_:s[2-3] = p_cbranch_z %0:scc
364    bld.branch(aco_opcode::p_cbranch_z, bld.def(s2), Operand(scc, s1));
365 
366    //! BB1
367    //! /* logical preds: / linear preds: BB0, / kind: */
368    bld.reset(program->create_and_insert_block());
369    program->blocks[1].linear_preds.push_back(0);
370 
371    //! p_unit_test %tmp:s[0-1]
372    bld.pseudo(aco_opcode::p_unit_test, tmp);
373    bld.branch(aco_opcode::p_branch, bld.def(s2));
374 
      /* Block 2: second successor of block 0, contains only a branch. */
375    bld.reset(program->create_and_insert_block());
376    program->blocks[2].linear_preds.push_back(0);
377 
378    bld.branch(aco_opcode::p_branch, bld.def(s2));
379 
      /* Block 3: empty block with blocks 1 and 2 as linear predecessors. */
380    bld.reset(program->create_and_insert_block());
381    program->blocks[3].linear_preds.push_back(1);
382    program->blocks[3].linear_preds.push_back(2);
383 
384    finish_ra_test(ra_test_policy());
385 END_TEST
386 
/* Register assignment for the 16-bit in-register interpolation opcodes on
 * GFX11. The first input is split into two v2b halves (expected in v[3]) that
 * feed v_interp_p10_f16_f32_inreg together with the second input; the expected
 * output prints the third operand as hi(...) and puts the result in v[1]. A
 * v_interp_p2_f16_f32_inreg with a v2b definition follows; the expected output
 * places it in v[0][16:32] with opsel_hi, and a p_create_vector of a 16-bit
 * zero and that result ends up in v[0].
 */
387 BEGIN_TEST(regalloc.vinterp_fp16)
388    //>> v1: %in0:v[0], v1: %in1:v[1], v1: %in2:v[2] = p_startpgm
389    if (!setup_cs("v1 v1 v1", GFX11))
390       return;
391 
392    //! v2b: %lo:v[3][0:16], v2b: %hi:v[3][16:32] = p_split_vector %in0:v[0]
393    Temp lo = bld.tmp(v2b);
394    Temp hi = bld.tmp(v2b);
395    bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), inputs[0]);
396 
397    //! v1: %tmp0:v[1] = v_interp_p10_f16_f32_inreg %lo:v[3][0:16], %in1:v[1], hi(%hi:v[3][16:32])
398    //! p_unit_test %tmp0:v[1]
399    Temp tmp0 =
400       bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, bld.def(v1), lo, inputs[1], hi);
401    bld.pseudo(aco_opcode::p_unit_test, tmp0);
402 
403    //! v2b: %tmp1:v[0][16:32] = v_interp_p2_f16_f32_inreg %in0:v[0], %in2:v[2], %tmp0:v[1] opsel_hi
404    //! v1: %tmp2:v[0] = p_create_vector 0, %tmp1:v[0][16:32]
405    //! p_unit_test %tmp2:v[0]
406    Temp tmp1 = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0],
407                                  inputs[2], tmp0);
408    Temp tmp2 = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), Operand::zero(2), tmp1);
409    bld.pseudo(aco_opcode::p_unit_test, tmp2);
410 
411    finish_ra_test(ra_test_policy());
412 END_TEST
413 
/* On GFX8, the fourth input is first copied into m0 and that copy is still
 * needed later. A writelane built from the second input, the third input and
 * the VGPR input follows. The expected output moves the second input into m0
 * and the earlier m0 value out to s[0] with one p_parallelcopy, the
 * v_writelane_b32_e64 then reads m0, and the final p_unit_test reads the moved
 * value from s[0].
 */
414 BEGIN_TEST(regalloc.writelane)
415    //>> v1: %in0:v[0], s1: %in1:s[0], s1: %in2:s[1], s1: %in3:s[2] = p_startpgm
416    if (!setup_cs("v1 s1 s1 s1", GFX8))
417       return;
418 
      /* The copy source is the fourth input, so the expectation refers to it by the
       * %in3 name bound on the p_startpgm line above.
       */
419    //! s1: %tmp:m0 = p_parallelcopy %in3:s[2]
420    Temp tmp = bld.copy(bld.def(s1, m0), inputs[3]);
421 
422    //! s1: %in1_2:m0,  s1: %tmp_2:s[0] = p_parallelcopy %in1:s[0], %tmp:m0
423    //! v1: %tmp2:v[0] = v_writelane_b32_e64 %in1_2:m0, %in2:s[1], %in0:v[0]
424    Temp tmp2 = bld.writelane(bld.def(v1), inputs[1], inputs[2], inputs[0]);
425 
426    //! p_unit_test %tmp_2:s[0], %tmp2:v[0]
427    bld.pseudo(aco_opcode::p_unit_test, tmp, tmp2);
428 
429    finish_ra_test(ra_test_policy());
430 END_TEST
431