• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2022 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 #include "helpers.h"
7 
8 using namespace aco;
9 
10 BEGIN_TEST(insert_waitcnt.ds_ordered_count)
11    if (!setup_cs(NULL, GFX10_3))
12       return;
13 
14    Operand def0(PhysReg(256), v1);
15    Operand def1(PhysReg(257), v1);
16    Operand def2(PhysReg(258), v1);
17    Operand gds_base(PhysReg(259), v1);
18    Operand chan_counter(PhysReg(260), v1);
19    Operand m(m0, s1);
20 
21    Instruction* ds_instr;
22    //>> ds_ordered_count %0:v[0], %0:v[3], %0:m0 offset0:3072 gds storage:gds semantics:volatile
23    //! s_waitcnt lgkmcnt(0)
24    ds_instr = bld.ds(aco_opcode::ds_ordered_count, def0, gds_base, m, 3072u, 0u, true);
25    ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
26 
27    //! ds_add_rtn_u32 %0:v[1], %0:v[3], %0:v[4], %0:m0 gds storage:gds semantics:volatile,atomic,rmw
28    ds_instr = bld.ds(aco_opcode::ds_add_rtn_u32, def1, gds_base, chan_counter, m, 0u, 0u, true);
29    ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_atomicrmw);
30 
31    //! s_waitcnt lgkmcnt(0)
32    //! ds_ordered_count %0:v[2], %0:v[3], %0:m0 offset0:3840 gds storage:gds semantics:volatile
33    ds_instr = bld.ds(aco_opcode::ds_ordered_count, def2, gds_base, m, 3840u, 0u, true);
34    ds_instr->ds().sync = memory_sync_info(storage_gds, semantic_volatile);
35 
36    finish_waitcnt_test();
37 END_TEST
38 
39 BEGIN_TEST(insert_waitcnt.clause)
40    if (!setup_cs(NULL, GFX11))
41       return;
42 
43    Definition def_v4(PhysReg(260), v1);
44    Definition def_v5(PhysReg(261), v1);
45    Definition def_v6(PhysReg(262), v1);
46    Definition def_v7(PhysReg(263), v1);
47    Operand op_v0(PhysReg(256), v1);
48    Operand op_v4(PhysReg(260), v1);
49    Operand op_v5(PhysReg(261), v1);
50    Operand op_v6(PhysReg(262), v1);
51    Operand op_v7(PhysReg(263), v1);
52    Operand desc0(PhysReg(0), s4);
53 
54    //>> p_unit_test 0
55    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
56 
57    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
58    //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[0], 0
59    //! v1: %0:v[6] = buffer_load_dword %0:s[0-3], %0:v[0], 0
60    //! v1: %0:v[7] = buffer_load_dword %0:s[0-3], %0:v[0], 0
61    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
62    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v0, Operand::zero(), 0, false);
63    bld.mubuf(aco_opcode::buffer_load_dword, def_v6, desc0, op_v0, Operand::zero(), 0, false);
64    bld.mubuf(aco_opcode::buffer_load_dword, def_v7, desc0, op_v0, Operand::zero(), 0, false);
65    //! s_waitcnt vmcnt(0)
66    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[4], 0
67    //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[5], 0
68    //! v1: %0:v[6] = buffer_load_dword %0:s[0-3], %0:v[6], 0
69    //! v1: %0:v[7] = buffer_load_dword %0:s[0-3], %0:v[7], 0
70    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v4, Operand::zero(), 0, false);
71    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v5, Operand::zero(), 0, false);
72    bld.mubuf(aco_opcode::buffer_load_dword, def_v6, desc0, op_v6, Operand::zero(), 0, false);
73    bld.mubuf(aco_opcode::buffer_load_dword, def_v7, desc0, op_v7, Operand::zero(), 0, false);
74    //! s_waitcnt vmcnt(0)
75    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[4]
76    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[5]
77    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[6]
78    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[7]
79    bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v4, 0, false);
80    bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v5, 0, false);
81    bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v6, 0, false);
82    bld.mubuf(aco_opcode::buffer_store_dword, desc0, op_v0, Operand::zero(), op_v7, 0, false);
83 
84    //>> p_unit_test 1
85    bld.reset(program->create_and_insert_block());
86    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
87 
88    //! s4: %0:s[4-7] = s_load_dwordx4 %0:s[0-1], 0
89    bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg(4), s4), Operand(PhysReg(0), s2),
90             Operand::zero());
91    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
92    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
93    //! s_waitcnt lgkmcnt(0) vmcnt(0)
94    //! v1: %0:v[5] = buffer_load_dword %0:s[4-7], %0:v[4], 0
95    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, Operand(PhysReg(4), s4), op_v4, Operand::zero(),
96              0, false);
97 
98    //>> p_unit_test 2
99    bld.reset(program->create_and_insert_block());
100    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
101 
102    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
103    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
104    //! v_nop
105    bld.vop1(aco_opcode::v_nop);
106    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
107    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
108    //! s_waitcnt vmcnt(0)
109    //! v1: %0:v[5] = buffer_load_dword %0:s[0-3], %0:v[4], 0
110    bld.mubuf(aco_opcode::buffer_load_dword, def_v5, desc0, op_v4, Operand::zero(), 0, false);
111 
112    finish_waitcnt_test();
113 END_TEST
114 
115 BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.vmem)
116    if (!setup_cs(NULL, GFX10))
117       return;
118 
119    Definition def_v4(PhysReg(260), v1);
120    Operand op_v0(PhysReg(256), v1);
121    Operand desc0(PhysReg(0), s4);
122 
123    //>> BB0
124    //! /* logical preds: / linear preds: / kind: top-level, */
125    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
126    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
127 
128    //>> BB1
129    //! /* logical preds: / linear preds: / kind: */
130    //! v1: %0:v[4] = ds_read_b32 %0:v[0]
131    bld.reset(program->create_and_insert_block());
132    bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
133 
134    bld.reset(program->create_and_insert_block());
135    program->blocks[2].linear_preds.push_back(0);
136    program->blocks[2].linear_preds.push_back(1);
137    program->blocks[2].logical_preds.push_back(0);
138    program->blocks[2].logical_preds.push_back(1);
139 
140    //>> BB2
141    //! /* logical preds: BB0, BB1, / linear preds: BB0, BB1, / kind: uniform, */
142    //! s_waitcnt lgkmcnt(0)
143    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
144    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
145 
146    finish_waitcnt_test();
147 END_TEST
148 
149 BEGIN_TEST(insert_waitcnt.waw.mixed_vmem_lds.lds)
150    if (!setup_cs(NULL, GFX10))
151       return;
152 
153    Definition def_v4(PhysReg(260), v1);
154    Operand op_v0(PhysReg(256), v1);
155    Operand desc0(PhysReg(0), s4);
156 
157    //>> BB0
158    //! /* logical preds: / linear preds: / kind: top-level, */
159    //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
160    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc0, op_v0, Operand::zero(), 0, false);
161 
162    //>> BB1
163    //! /* logical preds: / linear preds: / kind: */
164    //! v1: %0:v[4] = ds_read_b32 %0:v[0]
165    bld.reset(program->create_and_insert_block());
166    bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
167 
168    bld.reset(program->create_and_insert_block());
169    program->blocks[2].linear_preds.push_back(0);
170    program->blocks[2].linear_preds.push_back(1);
171    program->blocks[2].logical_preds.push_back(0);
172    program->blocks[2].logical_preds.push_back(1);
173 
174    //>> BB2
175    //! /* logical preds: BB0, BB1, / linear preds: BB0, BB1, / kind: uniform, */
176    //! s_waitcnt vmcnt(0)
177    //! v1: %0:v[4] = ds_read_b32 %0:v[0]
178    bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
179 
180    finish_waitcnt_test();
181 END_TEST
182 
183 BEGIN_TEST(insert_waitcnt.waw.vmem_types)
184    for (amd_gfx_level gfx : {GFX11, GFX12}) {
185       if (!setup_cs(NULL, gfx))
186          continue;
187 
188       Definition def_v4(PhysReg(260), v1);
189       Operand op_v0(PhysReg(256), v1);
190       Operand desc_s4(PhysReg(0), s4);
191       Operand desc_s8(PhysReg(8), s8);
192 
193       //>> p_unit_test 0
194       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
195       //~gfx12! s_wait_loadcnt imm:0
196       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
197       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
198       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
199       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
200 
201       //>> p_unit_test 1
202       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
203       //~gfx11! s_waitcnt vmcnt(0)
204       //~gfx12! s_wait_loadcnt imm:0
205       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
206       bld.reset(program->create_and_insert_block());
207       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
208       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
209       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
210 
211       //>> p_unit_test 2
212       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
213       //~gfx11! s_waitcnt vmcnt(0)
214       //~gfx12! s_wait_loadcnt imm:0
215       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
216       bld.reset(program->create_and_insert_block());
217       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
218       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
219       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
220                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
221 
222       //>> p_unit_test 3
223       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
224       //~gfx12! s_wait_samplecnt imm:0
225       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
226       bld.reset(program->create_and_insert_block());
227       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
228       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
229       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
230 
231       //>> p_unit_test 4
232       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
233       //~gfx11! s_waitcnt vmcnt(0)
234       //~gfx12! s_wait_samplecnt imm:0
235       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
236       bld.reset(program->create_and_insert_block());
237       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
238       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
239       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
240 
241       //>> p_unit_test 5
242       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
243       //~gfx11! s_waitcnt vmcnt(0)
244       //~gfx12! s_wait_samplecnt imm:0
245       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
246       bld.reset(program->create_and_insert_block());
247       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
248       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
249       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
250                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
251 
252       //>> p_unit_test 6
253       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
254       //~gfx12! s_wait_bvhcnt imm:0
255       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
256       bld.reset(program->create_and_insert_block());
257       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
258       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
259                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
260       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
261                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
262 
263       //>> p_unit_test 7
264       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
265       //~gfx11! s_waitcnt vmcnt(0)
266       //~gfx12! s_wait_bvhcnt imm:0
267       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
268       bld.reset(program->create_and_insert_block());
269       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
270       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
271                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
272       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
273 
274       //>> p_unit_test 8
275       //! v1: %0:v[4] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d
276       //~gfx11! s_waitcnt vmcnt(0)
277       //~gfx12! s_wait_bvhcnt imm:0
278       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
279       bld.reset(program->create_and_insert_block());
280       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
281       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v4, desc_s4, Operand(s4), Operand(v1),
282                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)));
283       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
284 
285       //>> BB9
286       //! /* logical preds: / linear preds: / kind: */
287       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
288       bld.reset(program->create_and_insert_block());
289       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
290 
291       //>> BB10
292       //! /* logical preds: / linear preds: / kind: */
293       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
294       bld.reset(program->create_and_insert_block());
295       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
296 
297       bld.reset(program->create_and_insert_block());
298       program->blocks[11].linear_preds.push_back(9);
299       program->blocks[11].linear_preds.push_back(10);
300       program->blocks[11].logical_preds.push_back(9);
301       program->blocks[11].logical_preds.push_back(10);
302 
303       //>> BB11
304       //! /* logical preds: BB9, BB10, / linear preds: BB9, BB10, / kind: uniform, */
305       //! p_unit_test 9
306       //~gfx12! s_wait_loadcnt imm:0
307       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
308       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
309       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
310 
311       //>> BB12
312       //! /* logical preds: / linear preds: / kind: */
313       //! v1: %0:v[4] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
314       bld.reset(program->create_and_insert_block());
315       bld.mimg(aco_opcode::image_sample, def_v4, desc_s8, desc_s4, Operand(v1), op_v0);
316 
317       //>> BB13
318       //! /* logical preds: / linear preds: / kind: */
319       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
320       bld.reset(program->create_and_insert_block());
321       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
322 
323       bld.reset(program->create_and_insert_block());
324       program->blocks[14].linear_preds.push_back(12);
325       program->blocks[14].linear_preds.push_back(13);
326       program->blocks[14].logical_preds.push_back(12);
327       program->blocks[14].logical_preds.push_back(13);
328 
329       //>> BB14
330       //! /* logical preds: BB12, BB13, / linear preds: BB12, BB13, / kind: uniform, */
331       //! p_unit_test 10
332       //~gfx11! s_waitcnt vmcnt(0)
333       //~gfx12! s_wait_loadcnt imm:0
334       //~gfx12! s_wait_samplecnt imm:0
335       //! v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
336       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
337       bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
338 
339       finish_waitcnt_test();
340    }
341 END_TEST
342 
343 BEGIN_TEST(insert_waitcnt.vmem)
344    if (!setup_cs(NULL, GFX12))
345       return;
346 
347    Definition def_v4(PhysReg(260), v1);
348    Definition def_v5(PhysReg(261), v1);
349    Definition def_v6(PhysReg(262), v1);
350    Definition def_v7(PhysReg(263), v1);
351    Definition def_v8(PhysReg(264), v1);
352    Definition def_v9(PhysReg(265), v1);
353    Operand op_v0(PhysReg(256), v1);
354    Operand op_v4(PhysReg(260), v1);
355    Operand op_v5(PhysReg(261), v1);
356    Operand op_v6(PhysReg(262), v1);
357    Operand op_v7(PhysReg(263), v1);
358    Operand op_v8(PhysReg(264), v1);
359    Operand op_v9(PhysReg(265), v1);
360    Operand desc_s4(PhysReg(0), s4);
361    Operand desc_s8(PhysReg(8), s8);
362 
363    //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
364    //! v1: %0:v[5] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
365    //! v1: %0:v[6] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d unrm r128
366    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
367    bld.mimg(aco_opcode::image_sample, def_v5, desc_s8, desc_s4, Operand(v1), op_v0);
368    Instruction* instr =
369       bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v6, desc_s4, Operand(s4), Operand(v1),
370                Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)))
371          .instr;
372    instr->mimg().unrm = true;
373    instr->mimg().r128 = true;
374 
375    //! v1: %0:v[7] = image_load %0:s[8-15],  s4: undef,  v1: undef, %0:v[0] 1d
376    //! v1: %0:v[8] = image_sample %0:s[8-15], %0:s[0-3],  v1: undef, %0:v[0] 1d
377    //! v1: %0:v[9] = image_bvh64_intersect_ray %0:s[0-3],  s4: undef,  v1: undef, %0:v[16-26] 1d unrm r128
378    bld.mimg(aco_opcode::image_load, def_v7, desc_s8, Operand(s4), Operand(v1), op_v0, 0x1);
379    bld.mimg(aco_opcode::image_sample, def_v8, desc_s8, desc_s4, Operand(v1), op_v0);
380    instr = bld.mimg(aco_opcode::image_bvh64_intersect_ray, def_v9, desc_s4, Operand(s4),
381                     Operand(v1), Operand(PhysReg(272), RegClass::get(RegType::vgpr, 11 * 4)))
382               .instr;
383    instr->mimg().unrm = true;
384    instr->mimg().r128 = true;
385 
386    //! s_wait_loadcnt imm:1
387    //! p_unit_test 0, %0:v[4]
388    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4);
389    //! s_wait_samplecnt imm:1
390    //! p_unit_test 1, %0:v[5]
391    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_v5);
392    //! s_wait_bvhcnt imm:1
393    //! p_unit_test 2, %0:v[6]
394    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v6);
395    //! s_wait_loadcnt imm:0
396    //! p_unit_test 3, %0:v[7]
397    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_v7);
398    //! s_wait_samplecnt imm:0
399    //! p_unit_test 4, %0:v[8]
400    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4), op_v8);
401    //! s_wait_bvhcnt imm:0
402    //! p_unit_test 5, %0:v[9]
403    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5), op_v9);
404 
405    /* Despite not using a sampler, this uses samplecnt. */
406    //! v1: %0:v[5] = image_msaa_load %0:s[8-15], s4: undef, v1: undef, %0:v[0] 1d
407    //! s_wait_samplecnt imm:0
408    //! p_unit_test 6, %0:v[5]
409    bld.mimg(aco_opcode::image_msaa_load, def_v5, desc_s8, Operand(s4), Operand(v1), op_v0);
410    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6), op_v5);
411 
412    finish_waitcnt_test();
413 END_TEST
414 
415 BEGIN_TEST(insert_waitcnt.lds_smem)
416    for (amd_gfx_level gfx : {GFX11, GFX12}) {
417       if (!setup_cs(NULL, gfx))
418          continue;
419 
420       Definition def_v4(PhysReg(260), v1);
421       Definition def_v5(PhysReg(261), v1);
422       Definition def_s4(PhysReg(4), s1);
423       Definition def_s5(PhysReg(5), s1);
424       Operand op_s0(PhysReg(0), s1);
425       Operand op_s4(PhysReg(4), s1);
426       Operand op_s5(PhysReg(5), s1);
427       Operand op_v0(PhysReg(256), v1);
428       Operand op_v4(PhysReg(260), v1);
429       Operand op_v5(PhysReg(261), v1);
430       Operand desc_s4(PhysReg(0), s4);
431 
432       //>> v1: %0:v[4] = ds_read_b32 %0:v[0]
433       //! s1: %0:s[4] = s_buffer_load_dword %0:s[0-3], %0:s[0]
434       //! v1: %0:v[5] = ds_read_b32 %0:v[0]
435       //! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0]
436       bld.ds(aco_opcode::ds_read_b32, def_v4, op_v0);
437       bld.smem(aco_opcode::s_buffer_load_dword, def_s4, desc_s4, op_s0);
438       bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0);
439       bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0);
440 
441       //~gfx11! s_waitcnt lgkmcnt(1)
442       //~gfx12! s_wait_dscnt imm:1
443       //! p_unit_test 0, %0:v[4]
444       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4);
445       //~gfx11! s_waitcnt lgkmcnt(0)
446       //~gfx12! s_wait_kmcnt imm:0
447       //! p_unit_test 1, %0:s[4]
448       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s4);
449       //~gfx12! s_wait_dscnt imm:0
450       //! p_unit_test 2, %0:v[5]
451       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_v5);
452       //! p_unit_test 3, %0:s[5]
453       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s5);
454 
455       finish_waitcnt_test();
456    }
457 END_TEST
458 
459 BEGIN_TEST(insert_waitcnt.sendmsg_smem)
460    for (amd_gfx_level gfx : {GFX11, GFX12}) {
461       if (!setup_cs(NULL, gfx))
462          continue;
463 
464       Definition def_s4(PhysReg(4), s1);
465       Definition def_s5(PhysReg(5), s1);
466       Definition def_s6(PhysReg(6), s1);
467       Definition def_s7(PhysReg(7), s1);
468       Operand op_s0(PhysReg(0), s1);
469       Operand op_s4(PhysReg(4), s1);
470       Operand op_s5(PhysReg(5), s1);
471       Operand op_s6(PhysReg(6), s1);
472       Operand op_s7(PhysReg(7), s1);
473       Operand desc_s4(PhysReg(0), s4);
474 
475       //>> s1: %0:s[4] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime)
476       //! s1: %0:s[5] = s_buffer_load_dword %0:s[0-3], %0:s[0]
477       //! s1: %0:s[6] = s_sendmsg_rtn_b32 3 sendmsg(rtn_get_realtime)
478       //! s1: %0:s[7] = s_buffer_load_dword %0:s[0-3], %0:s[0]
479       bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s4, Operand::c32(sendmsg_rtn_get_realtime));
480       bld.smem(aco_opcode::s_buffer_load_dword, def_s5, desc_s4, op_s0);
481       bld.sop1(aco_opcode::s_sendmsg_rtn_b32, def_s6, Operand::c32(sendmsg_rtn_get_realtime));
482       bld.smem(aco_opcode::s_buffer_load_dword, def_s7, desc_s4, op_s0);
483 
484       //~gfx12! s_wait_kmcnt imm:1
485       //~gfx11! s_waitcnt lgkmcnt(1)
486       //! p_unit_test 0, %0:s[4]
487       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_s4);
488       //~gfx12! s_wait_kmcnt imm:0
489       //~gfx11! s_waitcnt lgkmcnt(0)
490       //! p_unit_test 1, %0:s[5]
491       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1), op_s5);
492       //! p_unit_test 2, %0:s[6]
493       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2), op_s6);
494       //! p_unit_test 3, %0:s[7]
495       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3), op_s7);
496 
497       finish_waitcnt_test();
498    }
499 END_TEST
500 
501 BEGIN_TEST(insert_waitcnt.vmem_ds)
502    if (!setup_cs(NULL, GFX12))
503       return;
504 
505    Definition def_v4(PhysReg(260), v1);
506    Definition def_v5(PhysReg(261), v1);
507    Operand op_v0(PhysReg(256), v1);
508    Operand op_v1(PhysReg(257), v1);
509    Operand op_v4(PhysReg(260), v1);
510    Operand op_v5(PhysReg(261), v1);
511    Operand desc_s4(PhysReg(0), s4);
512 
513    program->workgroup_size = 128;
514    program->wgp_mode = true;
515 
516    //>> v1: %0:v[4] = buffer_load_dword %0:s[0-3], %0:v[0], 0
517    //! v1: %0:v[5] = ds_read_b32 %0:v[0]
518    bld.mubuf(aco_opcode::buffer_load_dword, def_v4, desc_s4, op_v0, Operand::zero(), 0, false);
519    bld.ds(aco_opcode::ds_read_b32, def_v5, op_v0);
520 
521    //! s_wait_loadcnt_dscnt dscnt(0) loadcnt(0)
522    //! p_unit_test 0, %0:v[4], %0:v[5]
523    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0), op_v4, op_v5);
524 
525    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[1] storage:buffer
526    //! v1: %0:v[5] = ds_write_b32 %0:v[0], %0:v[1] storage:shared
527    Instruction* instr =
528       bld.mubuf(aco_opcode::buffer_store_dword, desc_s4, op_v0, Operand::zero(), op_v1, 0, false)
529          .instr;
530    instr->mubuf().sync = memory_sync_info(storage_buffer);
531    instr = bld.ds(aco_opcode::ds_write_b32, def_v5, op_v0, op_v1).instr;
532    instr->ds().sync = memory_sync_info(storage_shared);
533 
534    //! s_wait_storecnt_dscnt dscnt(0) storecnt(0)
535    bld.barrier(aco_opcode::p_barrier,
536                memory_sync_info(storage_buffer | storage_shared, semantic_acqrel, scope_workgroup));
537 
538    finish_waitcnt_test();
539 END_TEST
540