/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
6 #include "common/amdgfxregs.h"
7
8 #include "helpers.h"
9
10 using namespace aco;
11
12 void
create_mubuf(unsigned offset,PhysReg dst=PhysReg (256),PhysReg vaddr=PhysReg (256))13 create_mubuf(unsigned offset, PhysReg dst = PhysReg(256), PhysReg vaddr = PhysReg(256))
14 {
15 bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4),
16 Operand(vaddr, v1), Operand::zero(), offset, true);
17 }
18
19 void
create_mubuf_store(PhysReg src=PhysReg (256))20 create_mubuf_store(PhysReg src = PhysReg(256))
21 {
22 bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), Operand(src, v1),
23 Operand::zero(), Operand(src, v1), 0, true);
24 }
25
26 void
create_mimg(bool nsa,unsigned addrs,unsigned instr_dwords)27 create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
28 {
29 aco_ptr<Instruction> mimg{
30 create_instruction(aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)};
31 mimg->definitions[0] = Definition(PhysReg(256), v1);
32 mimg->operands[0] = Operand(PhysReg(0), s8);
33 mimg->operands[1] = Operand(PhysReg(0), s4);
34 mimg->operands[2] = Operand(v1);
35 for (unsigned i = 0; i < addrs; i++)
36 mimg->operands[3 + i] = Operand(PhysReg(256 + (nsa ? i * 2 : i)), v1);
37 mimg->mimg().dmask = 0x1;
38 mimg->mimg().dim = ac_image_2d;
39
40 assert(get_mimg_nsa_dwords(mimg.get()) + 2 == instr_dwords);
41
42 bld.insert(std::move(mimg));
43 }
44
45 void
create_bvh()46 create_bvh()
47 {
48 aco_ptr<Instruction> instr{
49 create_instruction(aco_opcode::image_bvh64_intersect_ray, Format::MIMG, 8, 1)};
50 instr->definitions[0] = Definition(PhysReg(256), v4);
51 instr->operands[0] = Operand(PhysReg(0), s4);
52 instr->operands[1] = Operand(s4);
53 instr->operands[2] = Operand(v1);
54 instr->operands[3] = Operand(PhysReg(256 + 0), v2); /* node */
55 instr->operands[4] = Operand(PhysReg(256 + 2), v1); /* tmax */
56 instr->operands[5] = Operand(PhysReg(256 + 3), v3); /* origin */
57 instr->operands[6] = Operand(PhysReg(256 + 6), v3); /* dir */
58 instr->operands[7] = Operand(PhysReg(256 + 9), v3); /* inv dir */
59 instr->mimg().dmask = 0xf;
60 instr->mimg().unrm = true;
61 instr->mimg().r128 = true;
62 bld.insert(std::move(instr));
63 }
64
65 BEGIN_TEST(insert_nops.nsa_to_vmem_bug)
66 if (!setup_cs(NULL, GFX10))
67 return;
68
69 /* no nop needed because offset&6==0 */
70 //>> p_unit_test 0
71 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
72 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:8 offen
73 bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
74 create_mimg(true, 6, 4);
75 create_mubuf(8);
76
77 /* nop needed */
78 //! p_unit_test 1
79 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
80 //! s_nop
81 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
82 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
83 create_mimg(true, 6, 4);
84 create_mubuf(4);
85
86 /* no nop needed because the MIMG is not NSA */
87 //! p_unit_test 2
88 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[1], %0:v[2], %0:v[3], %0:v[4], %0:v[5] 2d
89 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
90 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
91 create_mimg(false, 6, 2);
92 create_mubuf(4);
93
94 /* no nop needed because there's already an instruction in-between */
95 //! p_unit_test 3
96 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
97 //! v_nop
98 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
99 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
100 create_mimg(true, 6, 4);
101 bld.vop1(aco_opcode::v_nop);
102 create_mubuf(4);
103
104 /* no nop needed because the NSA instruction is under 4 dwords */
105 //! p_unit_test 4
106 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
107 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
108 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
109 create_mimg(true, 2, 3);
110 create_mubuf(4);
111
112 /* NSA instruction and MUBUF/MTBUF in a different block */
113 //! p_unit_test 5
114 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
115 //! BB1
116 //! /* logical preds: / linear preds: BB0, / kind: uniform, */
117 //! s_nop
118 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
119 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
120 create_mimg(true, 6, 4);
121 bld.reset(program->create_and_insert_block());
122 create_mubuf(4);
123 program->blocks[0].linear_succs.push_back(1);
124 program->blocks[1].linear_preds.push_back(0);
125
126 finish_insert_nops_test();
127 END_TEST
128
129 BEGIN_TEST(insert_nops.writelane_to_nsa_bug)
130 if (!setup_cs(NULL, GFX10))
131 return;
132
133 /* nop needed */
134 //>> p_unit_test 0
135 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
136 //! s_nop
137 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
138 bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
139 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
140 Operand(PhysReg(511), v1));
141 create_mimg(true, 2, 3);
142
143 /* no nop needed because the MIMG is not NSA */
144 //! p_unit_test 1
145 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
146 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[1] 2d
147 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
148 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
149 Operand(PhysReg(511), v1));
150 create_mimg(false, 2, 2);
151
152 /* no nop needed because there's already an instruction in-between */
153 //! p_unit_test 2
154 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
155 //! v_nop
156 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
157 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
158 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
159 Operand(PhysReg(511), v1));
160 bld.vop1(aco_opcode::v_nop);
161 create_mimg(true, 2, 3);
162
163 /* writelane and NSA instruction in different blocks */
164 //! p_unit_test 3
165 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
166 //! BB1
167 //! /* logical preds: / linear preds: BB0, / kind: uniform, */
168 //! s_nop
169 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
170 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
171 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
172 Operand(PhysReg(511), v1));
173 bld.reset(program->create_and_insert_block());
174 create_mimg(true, 2, 3);
175 program->blocks[0].linear_succs.push_back(1);
176 program->blocks[1].linear_preds.push_back(0);
177
178 finish_insert_nops_test();
179 END_TEST
180
181 BEGIN_TEST(insert_nops.vmem_to_scalar_write)
182 if (!setup_cs(NULL, GFX10))
183 return;
184
185 /* WaR: VMEM load */
186 //>> p_unit_test 0
187 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
188 //! s_waitcnt_depctr vm_vsrc(0)
189 //! s1: %0:s[0] = s_mov_b32 0
190 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
191 create_mubuf(0);
192 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
193
194 //! p_unit_test 1
195 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
196 //! s_waitcnt_depctr vm_vsrc(0)
197 //! s2: %0:exec = s_mov_b64 -1
198 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
199 create_mubuf(0);
200 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
201
202 /* no hazard: VMEM load */
203 //! p_unit_test 2
204 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
205 //! s1: %0:s[4] = s_mov_b32 0
206 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
207 create_mubuf(0);
208 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero());
209
210 /* no hazard: VMEM load with VALU in-between */
211 //! p_unit_test 3
212 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
213 //! v_nop
214 //! s1: %0:s[0] = s_mov_b32 0
215 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
216 create_mubuf(0);
217 bld.vop1(aco_opcode::v_nop);
218 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
219
220 /* WaR: LDS */
221 //! p_unit_test 4
222 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
223 //! s_waitcnt_depctr vm_vsrc(0)
224 //! s1: %0:m0 = s_mov_b32 0
225 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
226 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
227 Operand(m0, s1));
228 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
229
230 //! p_unit_test 5
231 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
232 //! s_waitcnt_depctr vm_vsrc(0)
233 //! s2: %0:exec = s_mov_b64 -1
234 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
235 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
236 Operand(m0, s1));
237 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
238
239 /* no hazard: LDS */
240 //! p_unit_test 6
241 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
242 //! s1: %0:s[0] = s_mov_b32 0
243 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
244 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
245 Operand(m0, s1));
246 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
247
248 /* no hazard: LDS with VALU in-between */
249 //! p_unit_test 7
250 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
251 //! v_nop
252 //! s1: %0:m0 = s_mov_b32 0
253 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
254 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
255 Operand(m0, s1));
256 bld.vop1(aco_opcode::v_nop);
257 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
258
259 /* no hazard: VMEM/LDS with the correct waitcnt in-between */
260 //! p_unit_test 8
261 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
262 //! s_waitcnt vmcnt(0)
263 //! s1: %0:s[0] = s_mov_b32 0
264 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
265 create_mubuf(0);
266 bld.sopp(aco_opcode::s_waitcnt, 0x3f70);
267 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
268
269 //! p_unit_test 9
270 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
271 //! s_waitcnt_vscnt %0:null imm:0
272 //! s1: %0:s[0] = s_mov_b32 0
273 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
274 create_mubuf_store();
275 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
276 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
277
278 //! p_unit_test 10
279 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
280 //! s_waitcnt lgkmcnt(0)
281 //! s1: %0:m0 = s_mov_b32 0
282 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
283 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
284 Operand(m0, s1));
285 bld.sopp(aco_opcode::s_waitcnt, 0xc07f);
286 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
287
288 /* VMEM/LDS with the wrong waitcnt in-between */
289 //! p_unit_test 11
290 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
291 //! s_waitcnt_vscnt %0:null imm:0
292 //! s_waitcnt_depctr vm_vsrc(0)
293 //! s1: %0:s[0] = s_mov_b32 0
294 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
295 create_mubuf(0);
296 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
297 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
298
299 //! p_unit_test 12
300 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
301 //! s_waitcnt lgkmcnt(0)
302 //! s_waitcnt_depctr vm_vsrc(0)
303 //! s1: %0:s[0] = s_mov_b32 0
304 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
305 create_mubuf_store();
306 bld.sopp(aco_opcode::s_waitcnt, 0xc07f);
307 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
308
309 //! p_unit_test 13
310 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
311 //! s_waitcnt vmcnt(0)
312 //! s_waitcnt_depctr vm_vsrc(0)
313 //! s1: %0:m0 = s_mov_b32 0
314 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
315 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
316 Operand(m0, s1));
317 bld.sopp(aco_opcode::s_waitcnt, 0x3f70);
318 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
319
320 finish_insert_nops_test();
321 END_TEST
322
323 BEGIN_TEST(insert_nops.lds_direct_valu)
324 for (amd_gfx_level gfx : {GFX11, GFX12}) {
325 if (!setup_cs(NULL, gfx))
326 continue;
327
328 /* WaW */
329 //>> p_unit_test 0
330 //! v1: %0:v[0] = v_mov_b32 0
331 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
332 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
333 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
334 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
335
336 /* WaR */
337 //! p_unit_test 1
338 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
339 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
340 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
341 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
342 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
343
344 /* No hazard. */
345 //! p_unit_test 2
346 //! v1: %0:v[1] = v_mov_b32 0
347 //! v1: %0:v[0] = lds_direct_load %0:m0
348 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
349 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
350 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
351
352 /* multiples hazards, nearest should be considered */
353 //! p_unit_test 3
354 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
355 //! v1: %0:v[0] = v_mov_b32 0
356 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
357 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
358 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
359 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
360 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
361
362 /* independent VALU increase wait_vdst */
363 //! p_unit_test 4
364 //! v1: %0:v[0] = v_mov_b32 0
365 //! v_nop
366 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
367 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
368 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
369 bld.vop1(aco_opcode::v_nop);
370 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
371
372 //! p_unit_test 5
373 //! v1: %0:v[0] = v_mov_b32 0
374 //; for i in range(10): insert_pattern('v_nop')
375 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
376 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
377 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
378 for (unsigned i = 0; i < 10; i++)
379 bld.vop1(aco_opcode::v_nop);
380 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
381
382 //! p_unit_test 6
383 //! v1: %0:v[0] = v_mov_b32 0
384 //; for i in range(20): insert_pattern('v_nop')
385 //! v1: %0:v[0] = lds_direct_load %0:m0
386 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
387 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
388 for (unsigned i = 0; i < 20; i++)
389 bld.vop1(aco_opcode::v_nop);
390 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
391
392 /* transcendental requires wait_vdst=0 */
393 //! p_unit_test 7
394 //! v1: %0:v[0] = v_mov_b32 0
395 //! v_nop
396 //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
397 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
398 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
399 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
400 bld.vop1(aco_opcode::v_nop);
401 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
402 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
403
404 //! p_unit_test 8
405 //! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
406 //! v_nop
407 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
408 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
409 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
410 bld.vop1(aco_opcode::v_nop);
411 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
412
413 /* transcendental is fine if it's before the instruction */
414 //! p_unit_test 9
415 //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
416 //! v1: %0:v[0] = v_mov_b32 0
417 //! v_nop
418 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
419 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
420 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
421 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
422 bld.vop1(aco_opcode::v_nop);
423 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
424
425 /* non-VALU does not increase wait_vdst */
426 //! p_unit_test 10
427 //! v1: %0:v[0] = v_mov_b32 0
428 //! s1: %0:m0 = s_mov_b32 0
429 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
430 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
431 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
432 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
433 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
434
435 /* consider instructions which wait on vdst */
436 //! p_unit_test 11
437 //! v1: %0:v[0] = v_mov_b32 0
438 //! v_nop
439 //! s_waitcnt_depctr va_vdst(0)
440 //! v1: %0:v[0] = lds_direct_load %0:m0
441 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
442 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
443 bld.vop1(aco_opcode::v_nop);
444 bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
445 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
446
447 finish_insert_nops_test();
448 }
449 END_TEST
450
451 BEGIN_TEST(insert_nops.lds_direct_vmem)
452 for (amd_gfx_level gfx : {GFX11, GFX12}) {
453 if (!setup_cs(NULL, gfx))
454 continue;
455
456 /* WaR: VMEM */
457 //>> p_unit_test 0
458 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
459 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
460 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
461 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
462 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
463 create_mubuf(0, PhysReg(257));
464 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
465
466 /* WaW: VMEM */
467 //! p_unit_test 1
468 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
469 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
470 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
471 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
472 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
473 create_mubuf(0, PhysReg(256), PhysReg(257));
474 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
475
476 /* no hazard: VMEM */
477 //! p_unit_test 2
478 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
479 //! v1: %0:v[0] = lds_direct_load %0:m0
480 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
481 create_mubuf(0, PhysReg(257), PhysReg(257));
482 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
483
484 /* no hazard: VMEM with VALU in-between */
485 //! p_unit_test 3
486 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
487 //! v_nop
488 //! v1: %0:v[0] = lds_direct_load %0:m0
489 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
490 create_mubuf(0, PhysReg(257));
491 bld.vop1(aco_opcode::v_nop);
492 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
493
494 /* WaR: LDS */
495 //! p_unit_test 4
496 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
497 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
498 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
499 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
500 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
501 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
502 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
503
504 /* WaW: LDS */
505 //! p_unit_test 5
506 //! v1: %0:v[0] = ds_read_b32 %0:v[1]
507 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
508 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
509 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
510 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
511 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
512 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
513
514 /* no hazard: LDS */
515 //! p_unit_test 6
516 //! v1: %0:v[1] = ds_read_b32 %0:v[1]
517 //! v1: %0:v[0] = lds_direct_load %0:m0
518 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
519 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
520 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
521
522 /* no hazard: LDS with VALU in-between */
523 //! p_unit_test 7
524 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
525 //! v_nop
526 //! v1: %0:v[0] = lds_direct_load %0:m0
527 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
528 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
529 bld.vop1(aco_opcode::v_nop);
530 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
531
532 /* no hazard: VMEM/LDS with the correct waitcnt in-between */
533 //! p_unit_test 8
534 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
535 //~gfx11! s_waitcnt vmcnt(0)
536 //~gfx12! s_wait_loadcnt imm:0
537 //! v1: %0:v[0] = lds_direct_load %0:m0
538 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
539 create_mubuf(0, PhysReg(257));
540 if (gfx >= GFX12)
541 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
542 else
543 bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
544 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
545
546 //! p_unit_test 9
547 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
548 //~gfx11! s_waitcnt_vscnt %0:null imm:0
549 //~gfx12! s_wait_storecnt imm:0
550 //! v1: %0:v[0] = lds_direct_load %0:m0
551 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
552 create_mubuf_store();
553 if (gfx >= GFX12)
554 bld.sopp(aco_opcode::s_wait_storecnt, 0);
555 else
556 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
557 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
558
559 //! p_unit_test 10
560 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
561 //~gfx11! s_waitcnt lgkmcnt(0)
562 //~gfx12! s_wait_dscnt imm:0
563 //! v1: %0:v[0] = lds_direct_load %0:m0
564 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
565 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
566 if (gfx >= GFX12)
567 bld.sopp(aco_opcode::s_wait_dscnt, 0);
568 else
569 bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
570 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
571
572 if (gfx >= GFX12) {
573 //~gfx12! p_unit_test 11
574 //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
575 //~gfx12! s_wait_loadcnt imm:0
576 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
577 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
578 Instruction* instr =
579 bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
580 Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
581 .instr;
582 instr->mimg().dmask = 0x1;
583 instr->mimg().dim = ac_image_2d;
584 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
585 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
586
587 //~gfx12! p_unit_test 12
588 //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
589 //~gfx12! s_wait_samplecnt imm:0
590 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
591 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
592 instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
593 Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
594 Operand(PhysReg(256), v2))
595 .instr;
596 instr->mimg().dmask = 0x1;
597 instr->mimg().dim = ac_image_2d;
598 bld.sopp(aco_opcode::s_wait_samplecnt, 0);
599 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
600
601 //~gfx12! p_unit_test 13
602 //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
603 //~gfx12! s_wait_bvhcnt imm:0
604 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
605 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
606 create_bvh();
607 bld.sopp(aco_opcode::s_wait_bvhcnt, 0);
608 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
609 }
610
611 /* VMEM/LDS with the wrong waitcnt in-between */
612 //! p_unit_test 14
613 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
614 //~gfx11! s_waitcnt_vscnt %0:null imm:0
615 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
616 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
617 //~gfx12! s_wait_storecnt imm:0
618 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
619 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
620 create_mubuf(0, PhysReg(257));
621 if (gfx >= GFX12)
622 bld.sopp(aco_opcode::s_wait_storecnt, 0);
623 else
624 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
625 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
626
627 //! p_unit_test 15
628 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
629 //~gfx11! s_waitcnt lgkmcnt(0)
630 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
631 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
632 //~gfx12! s_wait_dscnt imm:0
633 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
634 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
635 create_mubuf_store();
636 if (gfx >= GFX12)
637 bld.sopp(aco_opcode::s_wait_dscnt, 0);
638 else
639 bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
640 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
641
642 //! p_unit_test 16
643 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
644 //~gfx11! s_waitcnt vmcnt(0)
645 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
646 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
647 //~gfx12! s_wait_loadcnt imm:0
648 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
649 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
650 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
651 if (gfx >= GFX12)
652 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
653 else
654 bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
655 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
656
657 //! p_unit_test 17
658 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
659 //~gfx11! s_waitcnt_vscnt %0:null imm:0
660 //~gfx11! s_waitcnt_depctr vm_vsrc(0)
661 //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
662 //~gfx12! s_wait_storecnt imm:0
663 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
664 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
665 create_mubuf(0, PhysReg(256), PhysReg(257));
666 if (gfx >= GFX12)
667 bld.sopp(aco_opcode::s_wait_storecnt, 0);
668 else
669 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
670 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
671
672 if (gfx >= GFX12) {
673 //~gfx12! p_unit_test 18
674 //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
675 //~gfx12! s_wait_samplecnt imm:0
676 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
677 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
678 Instruction* instr =
679 bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
680 Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
681 .instr;
682 instr->mimg().dmask = 0x1;
683 instr->mimg().dim = ac_image_2d;
684 bld.sopp(aco_opcode::s_wait_samplecnt, 0);
685 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
686
687 //~gfx12! p_unit_test 19
688 //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
689 //~gfx12! s_wait_loadcnt imm:0
690 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
691 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
692 instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
693 Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
694 Operand(PhysReg(256), v2))
695 .instr;
696 instr->mimg().dmask = 0x1;
697 instr->mimg().dim = ac_image_2d;
698 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
699 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
700
701 //~gfx12! p_unit_test 20
702 //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
703 //~gfx12! s_wait_loadcnt imm:0
704 //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
705 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
706 create_bvh();
707 bld.sopp(aco_opcode::s_wait_loadcnt, 0);
708 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
709 }
710
711 finish_insert_nops_test();
712 }
713 END_TEST
714
715 BEGIN_TEST(insert_nops.valu_trans_use)
716 if (!setup_cs(NULL, GFX11))
717 return;
718
719 //>> p_unit_test 0
720 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
721 //! s_waitcnt_depctr va_vdst(0)
722 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
723 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
724 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
725 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
726
727 /* Sufficient VALU mitigates the hazard. */
728 //! p_unit_test 1
729 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
730 //; for i in range(4): insert_pattern('v_nop')
731 //! s_waitcnt_depctr va_vdst(0)
732 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
733 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
734 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
735 for (unsigned i = 0; i < 4; i++)
736 bld.vop1(aco_opcode::v_nop);
737 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
738
739 //! p_unit_test 2
740 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
741 //; for i in range(8): insert_pattern('v_nop')
742 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
743 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
744 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
745 for (unsigned i = 0; i < 8; i++)
746 bld.vop1(aco_opcode::v_nop);
747 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
748
749 /* Sufficient transcendental VALU mitigates the hazard. */
750 //! p_unit_test 3
751 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
752 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
753 //! s_waitcnt_depctr va_vdst(0)
754 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
755 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
756 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
757 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
758 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
759
760 //! p_unit_test 4
761 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
762 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
763 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
764 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
765 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
766 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
767 for (unsigned i = 0; i < 2; i++)
768 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
769 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
770
771 /* Transcendental VALU should be counted towards VALU */
772 //! p_unit_test 5
773 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
774 //; for i in range(5): insert_pattern('v_nop')
775 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
776 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
777 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
778 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
779 for (unsigned i = 0; i < 5; i++)
780 bld.vop1(aco_opcode::v_nop);
781 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
782 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
783
784 /* non-VALU does not mitigate the hazard. */
785 //! p_unit_test 6
786 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
787 //; for i in range(8): insert_pattern('s_nop')
788 //! s_waitcnt_depctr va_vdst(0)
789 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
790 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
791 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
792 for (unsigned i = 0; i < 8; i++)
793 bld.sopp(aco_opcode::s_nop, 0);
794 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
795
796 finish_insert_nops_test();
797 END_TEST
798
799 BEGIN_TEST(insert_nops.valu_partial_forwarding.basic)
800 if (!setup_cs(NULL, GFX11))
801 return;
802
803 /* Basic case. */
804 //>> p_unit_test 0
805 //! v1: %0:v[0] = v_mov_b32 0
806 //! s2: %0:exec = s_mov_b64 -1
807 //! v1: %0:v[1] = v_mov_b32 1
808 //! s_waitcnt_depctr va_vdst(0)
809 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
810 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
811 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
812 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
813 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
814 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
815 Operand(PhysReg(257), v1));
816
817 /* We should consider both the closest and further VALU after the exec write. */
818 //! p_unit_test 1
819 //! v1: %0:v[0] = v_mov_b32 0
820 //! s2: %0:exec = s_mov_b64 -1
821 //! v1: %0:v[1] = v_mov_b32 1
822 //; for i in range(2): insert_pattern('v_nop')
823 //! v1: %0:v[2] = v_mov_b32 2
824 //! s_waitcnt_depctr va_vdst(0)
825 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
826 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
827 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
828 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
829 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
830 bld.vop1(aco_opcode::v_nop);
831 bld.vop1(aco_opcode::v_nop);
832 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
833 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
834 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
835
836 //! p_unit_test 2
837 //! v1: %0:v[0] = v_mov_b32 0
838 //! s2: %0:exec = s_mov_b64 -1
839 //! v1: %0:v[1] = v_mov_b32 1
840 //! v1: %0:v[2] = v_mov_b32 2
841 //; for i in range(4): insert_pattern('v_nop')
842 //! s_waitcnt_depctr va_vdst(0)
843 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
844 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
845 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
846 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
847 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
848 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
849 for (unsigned i = 0; i < 4; i++)
850 bld.vop1(aco_opcode::v_nop);
851 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
852 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
853
854 /* If a VALU writes a read VGPR in-between the first and second writes, it should still be
855 * counted towards the distance between the first and second writes.
856 */
857 //! p_unit_test 3
858 //! v1: %0:v[0] = v_mov_b32 0
859 //! s2: %0:exec = s_mov_b64 -1
860 //! v1: %0:v[1] = v_mov_b32 1
861 //; for i in range(2): insert_pattern('v_nop')
862 //! v1: %0:v[2] = v_mov_b32 2
863 //; for i in range(3): insert_pattern('v_nop')
864 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
865 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
866 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
867 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
868 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
869 bld.vop1(aco_opcode::v_nop);
870 bld.vop1(aco_opcode::v_nop);
871 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
872 for (unsigned i = 0; i < 3; i++)
873 bld.vop1(aco_opcode::v_nop);
874 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
875 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
876
877 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
878
879 finish_insert_nops_test();
880 END_TEST
881
882 BEGIN_TEST(insert_nops.valu_partial_forwarding.multiple_exec_writes)
883 if (!setup_cs(NULL, GFX11))
884 return;
885
886 //>> p_unit_test 0
887 //! v1: %0:v[0] = v_mov_b32 0
888 //! s2: %0:exec = s_mov_b64 0
889 //! s2: %0:exec = s_mov_b64 -1
890 //! v1: %0:v[1] = v_mov_b32 1
891 //! s_waitcnt_depctr va_vdst(0)
892 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
893 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
894 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
895 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
896 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
897 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
898 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
899 Operand(PhysReg(257), v1));
900
901 //! p_unit_test 1
902 //! v1: %0:v[0] = v_mov_b32 0
903 //! s2: %0:exec = s_mov_b64 0
904 //! v1: %0:v[1] = v_mov_b32 1
905 //! s2: %0:exec = s_mov_b64 -1
906 //! s_waitcnt_depctr va_vdst(0)
907 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
908 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
909 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
910 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
911 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
912 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
913 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
914 Operand(PhysReg(257), v1));
915
916 finish_insert_nops_test();
917 END_TEST
918
919 BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
920 if (!setup_cs(NULL, GFX11))
921 return;
922
923 /* Control flow merges: one branch shouldn't interfere with the other (clobbering VALU closer
924 * than interesting one).
925 */
926 //>> p_unit_test 0
927 //! s_cbranch_scc1 block:BB2
928 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0u));
929 bld.sopp(aco_opcode::s_cbranch_scc1, 2);
930
931 //! BB1
932 //! /* logical preds: / linear preds: BB0, / kind: */
933 //! v1: %0:v[0] = v_mov_b32 0
934 //! s2: %0:exec = s_mov_b64 -1
935 //! v_nop
936 //! s_branch block:BB3
937 bld.reset(program->create_and_insert_block());
938 program->blocks[0].linear_succs.push_back(1);
939 program->blocks[1].linear_preds.push_back(0);
940 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
941 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
942 bld.vop1(aco_opcode::v_nop);
943 bld.sopp(aco_opcode::s_branch, 3);
944
945 //! BB2
946 //! /* logical preds: / linear preds: BB0, / kind: */
947 //! v1: %0:v[0] = v_mov_b32 0
948 bld.reset(program->create_and_insert_block());
949 program->blocks[0].linear_succs.push_back(2);
950 program->blocks[2].linear_preds.push_back(0);
951 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
952
953 //! BB3
954 //! /* logical preds: / linear preds: BB1, BB2, / kind: */
955 //! v1: %0:v[1] = v_mov_b32 1
956 //! s_waitcnt_depctr va_vdst(0)
957 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
958 bld.reset(program->create_and_insert_block());
959 program->blocks[1].linear_succs.push_back(3);
960 program->blocks[2].linear_succs.push_back(3);
961 program->blocks[3].linear_preds.push_back(1);
962 program->blocks[3].linear_preds.push_back(2);
963 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
964 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
965 Operand(PhysReg(257), v1));
966
967 /* Control flow merges: one branch shouldn't interfere with the other (should consider furthest
968 * VALU writes after exec).
969 */
970 //! p_unit_test 1
971 //! s_cbranch_scc1 block:BB5
972 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
973 bld.sopp(aco_opcode::s_cbranch_scc1, 5);
974
975 //! BB4
976 //! /* logical preds: / linear preds: BB3, / kind: */
977 //! v1: %0:v[0] = v_mov_b32 0
978 //! s2: %0:exec = s_mov_b64 -1
979 //; for i in range(2): insert_pattern('v_nop')
980 //! v1: %0:v[1] = v_mov_b32 1
981 //! v_nop
982 //! s_branch block:BB6
983 bld.reset(program->create_and_insert_block());
984 program->blocks[3].linear_succs.push_back(4);
985 program->blocks[4].linear_preds.push_back(3);
986 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
987 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
988 bld.vop1(aco_opcode::v_nop);
989 bld.vop1(aco_opcode::v_nop);
990 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
991 bld.vop1(aco_opcode::v_nop);
992 bld.sopp(aco_opcode::s_branch, 6);
993
994 //! BB5
995 //! /* logical preds: / linear preds: BB3, / kind: */
996 //! v1: %0:v[1] = v_mov_b32 1
997 bld.reset(program->create_and_insert_block());
998 program->blocks[3].linear_succs.push_back(5);
999 program->blocks[5].linear_preds.push_back(3);
1000 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1001
1002 //! BB6
1003 //! /* logical preds: / linear preds: BB4, BB5, / kind: */
1004 //! s_waitcnt_depctr va_vdst(0)
1005 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
1006 bld.reset(program->create_and_insert_block());
1007 program->blocks[4].linear_succs.push_back(6);
1008 program->blocks[5].linear_succs.push_back(6);
1009 program->blocks[6].linear_preds.push_back(4);
1010 program->blocks[6].linear_preds.push_back(5);
1011 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
1012 Operand(PhysReg(257), v1));
1013
1014 /* Control flow merges: one branch shouldn't interfere with the other (should consider closest
1015 * VALU writes after exec).
1016 */
1017 //! p_unit_test 2
1018 //! s_cbranch_scc1 block:BB8
1019 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
1020 bld.sopp(aco_opcode::s_cbranch_scc1, 8);
1021
1022 //! BB7
1023 //! /* logical preds: / linear preds: BB6, / kind: */
1024 //! v1: %0:v[0] = v_mov_b32 0
1025 //! s2: %0:exec = s_mov_b64 -1
1026 //! v1: %0:v[1] = v_mov_b32 1
1027 //; for i in range(4): insert_pattern('v_nop')
1028 //! s_branch block:BB9
1029 bld.reset(program->create_and_insert_block());
1030 program->blocks[6].linear_succs.push_back(7);
1031 program->blocks[7].linear_preds.push_back(6);
1032 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1033 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
1034 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1035 for (unsigned i = 0; i < 4; i++)
1036 bld.vop1(aco_opcode::v_nop);
1037 bld.sopp(aco_opcode::s_branch, 9);
1038
1039 //! BB8
1040 //! /* logical preds: / linear preds: BB6, / kind: */
1041 //! v1: %0:v[1] = v_mov_b32 1
1042 //; for i in range(5): insert_pattern('v_nop')
1043 bld.reset(program->create_and_insert_block());
1044 program->blocks[6].linear_succs.push_back(8);
1045 program->blocks[8].linear_preds.push_back(6);
1046 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1047 for (unsigned i = 0; i < 5; i++)
1048 bld.vop1(aco_opcode::v_nop);
1049
1050 //! BB9
1051 //! /* logical preds: / linear preds: BB7, BB8, / kind: uniform, */
1052 //! s_waitcnt_depctr va_vdst(0)
1053 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
1054 bld.reset(program->create_and_insert_block());
1055 program->blocks[7].linear_succs.push_back(9);
1056 program->blocks[8].linear_succs.push_back(9);
1057 program->blocks[9].linear_preds.push_back(7);
1058 program->blocks[9].linear_preds.push_back(8);
1059 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
1060 Operand(PhysReg(257), v1));
1061
1062 finish_insert_nops_test();
1063 END_TEST
1064
1065 BEGIN_TEST(insert_nops.valu_mask_write)
1066 if (!setup_cs(NULL, GFX11))
1067 return;
1068
1069 /* Basic case. */
1070 //>> p_unit_test 0
1071 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1072 //! s1: %0:s[1] = s_mov_b32 0
1073 //! s_waitcnt_depctr sa_sdst(0)
1074 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1075 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1076 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1077 Operand::zero(), Operand(PhysReg(0), s2));
1078 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1079 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1080
1081 /* Mitigation. */
1082 //! p_unit_test 1
1083 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1084 //! v1: %0:v[1] = v_mov_b32 %0:s[1]
1085 //! s1: %0:s[1] = s_mov_b32 0
1086 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1087 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1088 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1089 Operand::zero(), Operand(PhysReg(0), s2));
1090 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
1091 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1092 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1093
1094 //! p_unit_test 2
1095 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1096 //! s1: %0:s[1] = s_mov_b32 0
1097 //! s_waitcnt_depctr sa_sdst(0)
1098 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1099 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1100 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1101 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1102 Operand::zero(), Operand(PhysReg(0), s2));
1103 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1104 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1105 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1106
1107 //! p_unit_test 3
1108 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1109 //! s1: %0:s[1] = s_mov_b32 0
1110 //! s_waitcnt_depctr sa_sdst(0)
1111 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1112 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1113 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1114 Operand::zero(), Operand(PhysReg(0), s2));
1115 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1116 bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1117 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1118
1119 /* v_cndmask_b32 is both involved in the hazard and is a mitigation. */
1120 //! p_unit_test 4
1121 //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
1122 //! s1: %0:s[1] = s_mov_b32 0
1123 //! s_waitcnt_depctr sa_sdst(0)
1124 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1125 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1126 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand(PhysReg(2), s1),
1127 Operand::zero(), Operand(PhysReg(0), s2));
1128 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1129 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1130
1131 /* VALU reading exec does not mitigate the hazard. We also don't consider literals. */
1132 //! p_unit_test 5
1133 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1134 //! v1: %0:v[1] = v_mov_b32 %0:exec_lo
1135 //! s1: %0:s[1] = s_mov_b32 0
1136 //! s_waitcnt_depctr sa_sdst(0)
1137 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1138 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1139 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1140 Operand::zero(), Operand(PhysReg(0), s2));
1141 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(exec_lo, s1));
1142 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1143 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1144
1145 //! p_unit_test 6
1146 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1147 //! v1: %0:v[1] = v_mov_b32 0x200
1148 //! s1: %0:s[1] = s_mov_b32 0
1149 //! s_waitcnt_depctr sa_sdst(0)
1150 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1151 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1152 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1153 Operand::zero(), Operand(PhysReg(0), s2));
1154 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::literal32(0x200));
1155 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1156 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1157
1158 /* Basic case: VALU. */
1159 //! p_unit_test 7
1160 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1161 //! s1: %0:s[1] = s_mov_b32 0
1162 //! s_waitcnt_depctr sa_sdst(0)
1163 //! v1: %0:v[1] = v_mov_b32 %0:s[1]
1164 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1165 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1166 Operand::zero(), Operand(PhysReg(0), s2));
1167 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1168 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
1169
1170 /* SALU which both reads and writes a lane mask SGPR. */
1171 //! p_unit_test 8
1172 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1173 //! s1: %0:s[1] = s_mov_b32 0
1174 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1175 //! s_waitcnt_depctr sa_sdst(0)
1176 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1177 //! s_waitcnt_depctr sa_sdst(0)
1178 //! s1: %0:s[4] = s_mov_b32 %0:s[2]
1179 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1180 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1181 Operand::zero(), Operand(PhysReg(0), s2));
1182 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1183 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1184 Operand::zero(), Operand(PhysReg(2), s2));
1185 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1186 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(2), s1));
1187
1188 /* When a SALU writes a lane mask, we shouldn't forget the current SGPRs used as lane masks then
1189 * written. */
1190 //! p_unit_test 9
1191 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1192 //! s1: %0:s[0] = s_mov_b32 0
1193 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1194 //! s1: %0:s[2] = s_mov_b32 0
1195 //! s_waitcnt_depctr sa_sdst(0)
1196 //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1197 //! s1: %0:s[5] = s_mov_b32 %0:s[2]
1198 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1199 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1200 Operand::zero(), Operand(PhysReg(0), s2));
1201 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1202 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1203 Operand::zero(), Operand(PhysReg(2), s2));
1204 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero());
1205 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1206 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1));
1207
1208 /* When a SALU writes a lane mask, we shouldn't forget all SGPRs used as lane masks, there might
1209 * be later problematic writes. */
1210 //! p_unit_test 10
1211 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1212 //! s1: %0:s[0] = s_mov_b32 0
1213 //! s_waitcnt_depctr sa_sdst(0)
1214 //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1215 //! s1: %0:s[1] = s_mov_b32 0
1216 //! s_waitcnt_depctr sa_sdst(0)
1217 //! s1: %0:s[5] = s_mov_b32 %0:s[1]
1218 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
1219 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1220 Operand::zero(), Operand(PhysReg(0), s2));
1221 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1222 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1223 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1224 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(1), s1));
1225
1226 //! p_unit_test 11
1227 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1228 //! s1: %0:s[0] = s_mov_b32 0
1229 //! s_waitcnt_depctr sa_sdst(0)
1230 //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1231 //! s1: %0:s[0] = s_mov_b32 0
1232 //! s_waitcnt_depctr sa_sdst(0)
1233 //! s1: %0:s[5] = s_mov_b32 %0:s[0]
1234 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
1235 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1236 Operand::zero(), Operand(PhysReg(0), s2));
1237 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1238 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1239 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1240 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(0), s1));
1241
1242 //! p_unit_test 12
1243 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
1244
1245 //! BB1
1246 //! /* logical preds: / linear preds: BB0, / kind: */
1247 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1248 bld.reset(program->create_and_insert_block());
1249 program->blocks[0].linear_succs.push_back(1);
1250 program->blocks[1].linear_preds.push_back(0);
1251 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1252 Operand::zero(), Operand(PhysReg(0), s2));
1253
1254 //! BB2
1255 //! /* logical preds: / linear preds: BB0, / kind: */
1256 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1257 bld.reset(program->create_and_insert_block());
1258 program->blocks[0].linear_succs.push_back(2);
1259 program->blocks[2].linear_preds.push_back(0);
1260 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1261 Operand::zero(), Operand(PhysReg(2), s2));
1262
1263 //! BB3
1264 //! /* logical preds: / linear preds: BB1, BB2, / kind: uniform, */
1265 //! s1: %0:s[0] = s_mov_b32 0
1266 //! s_waitcnt_depctr sa_sdst(0)
1267 //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1268 //! s1: %0:s[2] = s_mov_b32 0
1269 //! s_waitcnt_depctr sa_sdst(0)
1270 //! s1: %0:s[5] = s_mov_b32 %0:s[2]
1271 bld.reset(program->create_and_insert_block());
1272 program->blocks[1].linear_succs.push_back(3);
1273 program->blocks[2].linear_succs.push_back(3);
1274 program->blocks[3].linear_preds.push_back(1);
1275 program->blocks[3].linear_preds.push_back(2);
1276 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1277 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1278 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero());
1279 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1));
1280
1281 finish_insert_nops_test();
1282 END_TEST
1283
1284 BEGIN_TEST(insert_nops.wmma_raw)
1285 if (!setup_cs(NULL, GFX11))
1286 return;
1287
1288 /* Basic case. */
1289 //>> p_unit_test 0
1290 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1291 //! v_nop
1292 //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx
1293 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1294 Operand A(PhysReg(256 + 0), v8);
1295 Operand B(PhysReg(256 + 8), v8);
1296 Operand C(PhysReg(256 + 20), v4);
1297 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1298 0);
1299 A.setFixed(PhysReg(256 + 24));
1300 B.setFixed(PhysReg(256 + 16));
1301 C.setFixed(PhysReg(256 + 48));
1302 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1303 0);
1304
1305 /* Mitigation. */
1306 //! p_unit_test 1
1307 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1308 //! v1: %_:v[56] = v_rcp_f32 0
1309 //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx
1310 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1311 A.setFixed(PhysReg(256 + 0));
1312 B.setFixed(PhysReg(256 + 8));
1313 C.setFixed(PhysReg(256 + 20));
1314 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1315 0);
1316 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256 + 56), v1), Operand::zero());
1317 A.setFixed(PhysReg(256 + 24));
1318 B.setFixed(PhysReg(256 + 16));
1319 C.setFixed(PhysReg(256 + 48));
1320 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1321 0);
1322
1323 /* No hazard. */
1324 //>> p_unit_test 2
1325 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1326 //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[48-51].xx
1327 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1328 A.setFixed(PhysReg(256 + 0));
1329 B.setFixed(PhysReg(256 + 8));
1330 C.setFixed(PhysReg(256 + 20));
1331 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1332 0);
1333 A.setFixed(PhysReg(256 + 24));
1334 B.setFixed(PhysReg(256 + 32));
1335 C.setFixed(PhysReg(256 + 48));
1336 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1337 0);
1338
1339 //>> p_unit_test 3
1340 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1341 //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[20-23].xx
1342 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1343 A.setFixed(PhysReg(256 + 0));
1344 B.setFixed(PhysReg(256 + 8));
1345 C.setFixed(PhysReg(256 + 20));
1346 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1347 0);
1348 A.setFixed(PhysReg(256 + 24));
1349 B.setFixed(PhysReg(256 + 32));
1350 C.setFixed(PhysReg(256 + 20));
1351 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1352 0);
1353
1354 finish_insert_nops_test();
1355 END_TEST
1356
/* Bit flags combined in StageInfo::flags to describe one export-priority test variant. */
enum StageInfoFlags {
   stage_separate = 0x01,   /* sets program->info.merged_shader_compiled_separately */
   stage_has_prolog = 0x02, /* sets program->info.vs.has_prolog */
   stage_has_export = 0x04, /* the variant emits an export instead of a v_nop */
   stage_is_prolog = 0x08,  /* sets program->is_prolog */
   stage_is_epilog = 0x10,  /* sets program->is_epilog */
};
1364
1365 struct StageInfo {
1366 const char* name;
1367 Stage stage;
1368 unsigned flags;
1369 };
1370
1371 BEGIN_TEST(insert_nops.export_priority.stages)
1372 Stage geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::GS);
1373 for (StageInfo stage : (StageInfo[]){
1374 {"_fs_first_last", fragment_fs, stage_has_export},
1375 {"_fs_with_epilog_first", fragment_fs, 0},
1376 {"_fs_prolog_first", fragment_fs, stage_is_prolog},
1377 {"_fs_epilog_last", fragment_fs, stage_is_epilog | stage_has_export},
1378 {"_vs_first_last", vertex_ngg, stage_has_export},
1379 {"_vs_with_prolog_last", vertex_ngg, stage_has_export | stage_has_prolog},
1380 {"_tes_first_last", tess_eval_ngg, stage_has_export},
1381 {"_ms_first_last", mesh_ngg, stage_has_export},
1382 {"_tesgs_first_last", tess_eval_geometry_ngg, stage_has_export},
1383 {"_vsgs_first_last", vertex_geometry_ngg, stage_has_export},
1384 {"_vsgs_with_prolog_last", vertex_geometry_ngg, stage_has_export | stage_has_prolog},
1385 {"_separate_vs_first", vertex_ngg, stage_separate},
1386 {"_separate_vs_with_prolog", vertex_ngg, stage_separate | stage_has_prolog},
1387 {"_separate_tes_first", tess_eval_ngg, stage_separate},
1388 {"_separate_gs_last", geometry_ngg, stage_separate | stage_has_export}}) {
1389 if (!setup_cs(NULL, GFX11_5, CHIP_UNKNOWN, stage.name))
1390 continue;
1391
1392 program->stage = stage.stage;
1393 program->info.merged_shader_compiled_separately = stage.flags & stage_separate;
1394 program->info.vs.has_prolog = stage.flags & stage_has_prolog;
1395 program->is_prolog = stage.flags & stage_is_prolog;
1396 program->is_epilog = stage.flags & stage_is_epilog;
1397 //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1398 //~.*first.*! s_setprio imm:2
1399 if (stage.flags & stage_has_export) {
1400 //~.*last.*! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1401 //~.*last.*! s_setprio imm:0
1402 //~.*last.*! s_nop
1403 //~.*last.*! s_nop
1404 //~.*last.*! s_endpgm
1405 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1406 V_008DFC_SQ_EXP_POS, false);
1407 } else {
1408 //(?!.*last.*)! v_nop
1409 bld.vop1(aco_opcode::v_nop);
1410 }
1411
1412 finish_insert_nops_test(stage.flags & stage_has_export);
1413 }
1414 END_TEST
1415
1416 BEGIN_TEST(insert_nops.export_priority.instrs_after_export)
1417 if (!setup_cs(NULL, GFX11_5))
1418 return;
1419
1420 program->stage = vertex_ngg;
1421 //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1422 //! s_setprio imm:2
1423 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1424 //! s_setprio imm:0
1425 //! s_waitcnt_expcnt %0:null imm:0
1426 //! s_nop
1427 //! s_nop
1428 //! s_setprio imm:2
1429 //! v_nop
1430 //! s_endpgm
1431 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1432 V_008DFC_SQ_EXP_POS, false);
1433 bld.vop1(aco_opcode::v_nop);
1434
1435 finish_insert_nops_test();
1436 END_TEST
1437
1438 BEGIN_TEST(insert_nops.export_priority.fallthrough_to_endpgm)
1439 if (!setup_cs(NULL, GFX11_5))
1440 return;
1441
1442 program->stage = vertex_ngg;
1443 //>> /* logical preds: / linear preds: / kind: top-level, */
1444 //! s_setprio imm:2
1445 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1446 //! s_setprio imm:0
1447 //! s_nop
1448 //! s_nop
1449 //>> BB1
1450 //>> /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
1451 //! s_endpgm
1452 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1453 V_008DFC_SQ_EXP_POS, false);
1454
1455 bld.reset(program->create_and_insert_block());
1456 program->blocks[0].linear_succs.push_back(1);
1457 program->blocks[0].logical_succs.push_back(1);
1458 program->blocks[1].linear_preds.push_back(0);
1459 program->blocks[1].logical_preds.push_back(0);
1460
1461 finish_insert_nops_test();
1462 END_TEST
1463
1464 BEGIN_TEST(insert_nops.export_priority.multiple_exports)
1465 if (!setup_cs(NULL, GFX11_5))
1466 return;
1467
1468 program->stage = vertex_ngg;
1469 //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1470 //! s_setprio imm:2
1471 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1472 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos1
1473 //! s_setprio imm:0
1474 //! s_nop
1475 //! s_nop
1476 //! s_endpgm
1477 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1478 V_008DFC_SQ_EXP_POS, false);
1479 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1480 V_008DFC_SQ_EXP_POS + 1, false);
1481
1482 finish_insert_nops_test();
1483 END_TEST
1484
1485 BEGIN_TEST(insert_nops.export_priority.set_prio)
1486 if (!setup_cs(NULL, GFX11_5))
1487 return;
1488
1489 program->stage = vertex_ngg;
1490 //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1491 //! s_setprio imm:3
1492 //! v_nop
1493 //! s_setprio imm:2
1494 //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1495 //! s_setprio imm:0
1496 //! s_nop
1497 //! s_nop
1498 //! s_endpgm
1499 bld.sopp(aco_opcode::s_setprio, 3);
1500 bld.vop1(aco_opcode::v_nop);
1501 bld.sopp(aco_opcode::s_setprio, 1);
1502 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1503 V_008DFC_SQ_EXP_POS, false);
1504
1505 finish_insert_nops_test();
1506 END_TEST
1507
1508 BEGIN_TEST(insert_nops.valu_read_sgpr.basic)
1509 if (!setup_cs(NULL, GFX12))
1510 return;
1511
1512 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1513 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(7), s1));
1514 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(sgpr_null, s1));
1515 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(exec_lo, s1));
1516 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(m0, s1));
1517 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(scc, s1));
1518 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc, s1));
1519
1520 /* no hazard: SALU write missing */
1521 //>> p_unit_test 0
1522 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1523 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1524 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1525
1526 /* no hazard: SGPR never read by VALU */
1527 //! p_unit_test 1
1528 //! s1: %0:s[16] = s_mov_b32 0
1529 //! s1: %0:s[64] = s_mov_b32 %0:s[16]
1530 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1531 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(16), s1), Operand::zero(4));
1532 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(16), s1));
1533
1534 /* basic case: SALU read */
1535 //! p_unit_test 2
1536 //! s1: %0:s[4] = s_mov_b32 0
1537 //! s_waitcnt_depctr sa_sdst(0)
1538 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1539 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1540 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1541 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1542
1543 /* basic case again: VALU reads never expire */
1544 //! p_unit_test 3
1545 //! s1: %0:s[4] = s_mov_b32 0
1546 //! s_waitcnt_depctr sa_sdst(0)
1547 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1548 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1549 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1550 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1551
1552 /* sa_sdst(0) resolves the hazard */
1553 //! p_unit_test 4
1554 //! s1: %0:s[4] = s_mov_b32 0
1555 //! s_waitcnt_depctr sa_sdst(0)
1556 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1557 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1558 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1559 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1560 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1561 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1562
1563 //! p_unit_test 5
1564 //! s1: %0:s[4] = s_mov_b32 0
1565 //! s_waitcnt_depctr sa_sdst(0)
1566 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1567 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1568 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1569 bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1570 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1571
1572 /* basic case: VALU read */
1573 //! p_unit_test 6
1574 //! s1: %0:s[4] = s_mov_b32 0
1575 //! s_waitcnt_depctr sa_sdst(0)
1576 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1577 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1578 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1579 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1580
1581 /* the SALU write is in the same SGPR pair as the VALU read */
1582 //! p_unit_test 7
1583 //! s1: %0:s[6] = s_mov_b32 0
1584 //! s_waitcnt_depctr sa_sdst(0)
1585 //! s1: %0:s[64] = s_mov_b32 %0:s[6]
1586 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1587 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(6), s1), Operand::zero(4));
1588 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1));
1589
1590 /* no hazard: these registers are not problematic */
1591 //! p_unit_test 8
1592 //! s1: %0:null = s_mov_b32 0
1593 //! s1: %0:s[64] = s_mov_b32 %0:null
1594 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1595 bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero(4));
1596 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(sgpr_null, s1));
1597
1598 //! p_unit_test 9
1599 //! s1: %0:exec_lo = s_mov_b32 0
1600 //! s1: %0:s[64] = s_mov_b32 %0:exec_lo
1601 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1602 bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::zero(4));
1603 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(exec_lo, s1));
1604
1605 //! p_unit_test 10
1606 //! s1: %0:m0 = s_mov_b32 0
1607 //! s1: %0:s[64] = s_mov_b32 %0:m0
1608 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
1609 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero(4));
1610 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(m0, s1));
1611
1612 //! p_unit_test 11
1613 //! s1: %0:scc = s_cmp_lg_i32 0, 0
1614 //! s1: %0:s[64] = s_mov_b32 %0:scc
1615 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
1616 bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand::zero(4), Operand::zero(4));
1617 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(scc, s1));
1618
1619 /* 11 SALU between the write and a VALU read expire the hazard */
1620 //! p_unit_test 12
1621 //! s1: %0:s[4] = s_mov_b32 0
1622 //; for i in range(11): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1623 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1624 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
1625 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1626 for (unsigned i = 0; i < 11; i++)
1627 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1628 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1629
1630 //! p_unit_test 13
1631 //! s1: %0:s[4] = s_mov_b32 0
1632 //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1633 //! s_waitcnt_depctr sa_sdst(0)
1634 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1635 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
1636 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1637 for (unsigned i = 0; i < 10; i++)
1638 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1639 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1640
1641 /* 10 SALU between the write and a SALU read expire the hazard */
1642 //! p_unit_test 14
1643 //! s1: %0:s[4] = s_mov_b32 0
1644 //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1645 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1646 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
1647 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1648 for (unsigned i = 0; i < 10; i++)
1649 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1650 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1651
1652 //! p_unit_test 15
1653 //! s1: %0:s[4] = s_mov_b32 0
1654 //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1655 //! s_waitcnt_depctr sa_sdst(0)
1656 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1657 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
1658 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1659 for (unsigned i = 0; i < 9; i++)
1660 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1661 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1662
1663 /* SOPP in-between the write and the read do not count */
1664 //! p_unit_test 16
1665 //! s1: %0:s[4] = s_mov_b32 0
1666 //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1667 //! s_nop
1668 //! s_waitcnt_depctr sa_sdst(0)
1669 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1670 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
1671 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1672 for (unsigned i = 0; i < 9; i++)
1673 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1674 bld.sopp(aco_opcode::s_nop, 0);
1675 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1676
1677 /* VALU -> VALU non-VCC SGPR */
1678 //! p_unit_test 17
1679 //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
1680 //! s_waitcnt_depctr va_sdst(0)
1681 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1682 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
1683 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
1684 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1685
1686 /* VALU -> VALU VCC SGPR */
1687 //! p_unit_test 18
1688 //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1689 //! s_waitcnt_depctr va_vcc(0)
1690 //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1691 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
1692 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1693 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1694
1695 /* va_sdst=0 from SALU reading an SGPR: hazard mitigated */
1696 //! p_unit_test 19
1697 //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
1698 //! s1: %0:s[64] = s_mov_b32 %0:s[6]
1699 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1700 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
1701 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
1702 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1));
1703 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1704
1705 /* va_vcc=0 from SALU reading VCC: hazard mitigated */
1706 //! p_unit_test 20
1707 //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1708 //! s1: %0:s[64] = s_mov_b32 %0:vcc_lo
1709 //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1710 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
1711 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1712 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(vcc, s1));
1713 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1714
1715 /* VALU -> VALU read VCC and then SGPR */
1716 //! p_unit_test 21
1717 //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1718 //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
1719 //! s_waitcnt_depctr va_vcc(0)
1720 //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1721 //! s_waitcnt_depctr va_sdst(0)
1722 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1723 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(21));
1724 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1725 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
1726 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1727 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1728
1729 /* VALU -> VALU read SGPR and then VCC */
1730 //! p_unit_test 22
1731 //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1732 //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
1733 //! s_waitcnt_depctr va_sdst(0)
1734 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1735 //! s_waitcnt_depctr va_vcc(0)
1736 //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1737 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(22));
1738 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1739 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
1740 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1741 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1742
1743 /* VALU writes VCC and SALU writes SGPR */
1744 //! p_unit_test 23
1745 //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1746 //! s1: %0:s[4] = s_mov_b32 0
1747 //! s_waitcnt_depctr va_vcc(0)
1748 //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1749 //! s_waitcnt_depctr sa_sdst(0)
1750 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1751 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(23));
1752 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1753 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1754 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1755 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1756
1757 finish_insert_nops_test();
1758 END_TEST
1759
1760 BEGIN_TEST(insert_nops.valu_read_sgpr.previous_part)
1761 if (!setup_cs(NULL, GFX12))
1762 return;
1763
1764 /* Raytracing shaders have a prolog and may also be split into several parts. */
1765 program->stage = raytracing_cs;
1766
1767 /* Despite the SGPR never being read by a VALU in this shader, a sa_sdst(0) is needed. */
1768 //>> p_unit_test 0
1769 //! s1: %0:s[4] = s_mov_b32 0
1770 //! s_waitcnt_depctr sa_sdst(0)
1771 //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1772 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1773 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1774 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1775
1776 finish_insert_nops_test();
1777 END_TEST
1778
1779 BEGIN_TEST(insert_nops.setpc_gfx6)
1780 if (!setup_cs(NULL, GFX6))
1781 return;
1782
1783 /* SGPR->SMEM hazards */
1784 //>> p_unit_test 0
1785 //! s1: %0:s[0] = s_mov_b32 0
1786 //! s_nop imm:2
1787 //! s_setpc_b64 0
1788 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1789 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1790 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1791
1792 //! p_unit_test 1
1793 //! s1: %0:s[0] = s_mov_b32 0
1794 //! s_nop imm:2
1795 //! s_setpc_b64 0
1796 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1797 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1798 bld.sopp(aco_opcode::s_nop, 2);
1799 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1800
1801 finish_insert_nops_test();
1802
1803 /* This hazard can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves it. */
1804
1805 /* VINTRP->v_readlane_b32/etc */
1806 //>> p_unit_test 2
1807 //! v1: %0:v[0] = v_interp_mov_f32 2, %0:m0 attr0.x
1808 //! s_nop
1809 create_program(GFX6, compute_cs, 64, CHIP_UNKNOWN);
1810 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1811 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(PhysReg(256), v1), Operand::c32(2u),
1812 Operand(m0, s1), 0, 0);
1813 finish_insert_nops_test(false);
1814 END_TEST
1815
1816 BEGIN_TEST(insert_nops.setpc_gfx7)
1817 for (amd_gfx_level gfx : {GFX7, GFX9}) {
1818 if (!setup_cs(NULL, gfx))
1819 continue;
1820
1821 //>> p_unit_test 0
1822 //! s_setpc_b64 0
1823 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1824 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1825
1826 /* Break up SMEM clauses: resolved by the s_setpc_b64 itself */
1827 //! p_unit_test 1
1828 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1829 //! s_setpc_b64 0
1830 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1831 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1832 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1833
1834 /* SALU and GDS hazards */
1835 //! p_unit_test 2
1836 //! s_setreg_imm32_b32 0x0 imm:14337
1837 //! s_nop
1838 //! s_setpc_b64 0
1839 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1840 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(0), (7 << 11) | 1);
1841 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1842
1843 /* VALU writes vcc -> vccz/v_div_fmas */
1844 //! p_unit_test 3
1845 //! s2: %0:vcc = v_cmp_eq_u32 0, 0
1846 //! s_nop imm:3
1847 //! s_setpc_b64 0
1848 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1849 bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand::zero(), Operand::zero());
1850 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1851
1852 /* VALU writes exec -> execz/DPP */
1853 //! p_unit_test 4
1854 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1855 //! s_nop imm:3
1856 //! s_setpc_b64 0
1857 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1858 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(),
1859 Operand::zero());
1860 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1861
1862 /* VALU->DPP */
1863 //! p_unit_test 5
1864 //! v1: %0:v[0] = v_mov_b32 0
1865 //~gfx9! s_nop
1866 //! s_setpc_b64 0
1867 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1868 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1869 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1870
1871 /* VALU->v_readlane_b32/VMEM/etc */
1872 //! p_unit_test 6
1873 //! s1: %0:s[0] = v_readfirstlane_b32 %0:v[0]
1874 //! s_nop imm:3
1875 //! s_setpc_b64 0
1876 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1877 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(0), s1),
1878 Operand(PhysReg(256), v1));
1879 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1880
1881 finish_insert_nops_test();
1882
1883 /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves
1884 * them. */
1885
1886 //>> p_unit_test 7
1887 //! buffer_store_dwordx3 %0:s[0-3], %0:v[0], 0, %0:v[0-2] offen
1888 //! s_nop
1889 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1890 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1891 bld.mubuf(aco_opcode::buffer_store_dwordx3, Operand(PhysReg(0), s4),
1892 Operand(PhysReg(256), v1), Operand::zero(), Operand(PhysReg(256), v3), 0, true);
1893 finish_insert_nops_test(false);
1894
1895 //>> p_unit_test 8
1896 //! s1: %0:m0 = s_mov_b32 0
1897 //! s_nop
1898 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1899 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1900 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(m0), s1), Operand::zero());
1901 finish_insert_nops_test(false);
1902
1903 /* Break up SMEM clauses */
1904 //>> p_unit_test 9
1905 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1906 //! s_nop
1907 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1908 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1909 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1910 finish_insert_nops_test(false);
1911 }
1912 END_TEST
1913
1914 BEGIN_TEST(insert_nops.setpc_gfx10)
1915 if (!setup_cs(NULL, GFX10))
1916 return;
1917
1918 //>> p_unit_test 0
1919 //! s_setpc_b64 0
1920 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1921 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1922
1923 /* VcmpxPermlaneHazard */
1924 //! p_unit_test 1
1925 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1926 //! v1: %0:v[0] = v_mov_b32 %0:v[0]
1927 //! s_setpc_b64 0
1928 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1929 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1930 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1931
1932 /* VMEMtoScalarWriteHazard */
1933 //! p_unit_test 2
1934 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1935 //! s_waitcnt_vscnt %0:null imm:0
1936 //! s_waitcnt_depctr vm_vsrc(0)
1937 //! s_setpc_b64 0
1938 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1939 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1940 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1),
1941 0); /* reset LdsBranchVmemWARHazard */
1942 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1943
1944 /* VcmpxExecWARHazard */
1945 //! p_unit_test 3
1946 //! s1: %0:s[0] = s_mov_b32 %0:exec_hi
1947 //! s_waitcnt_depctr sa_sdst(0)
1948 //! s_setpc_b64 0
1949 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1950 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand(exec_hi, s1));
1951 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1952
1953 /* LdsBranchVmemWARHazard */
1954 //! p_unit_test 4
1955 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1956 //! v_nop
1957 //! s_branch block:BB0
1958 //! s_waitcnt_vscnt %0:null imm:0
1959 //! s_setpc_b64 0
1960 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1961 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1962 bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1963 bld.sopp(aco_opcode::s_branch, 0);
1964 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1965
1966 //! p_unit_test 5
1967 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1968 //! v_nop
1969 //! s_waitcnt_vscnt %0:null imm:0
1970 //! s_setpc_b64 0
1971 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1972 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1973 bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1974 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1975
1976 /* waNsaCannotFollowWritelane: resolved by the s_setpc_b64 */
1977 //! p_unit_test 6
1978 //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1979 //! s_setpc_b64 0
1980 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1981 bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1982 Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1983 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1984
1985 finish_insert_nops_test();
1986
1987 /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves them.
1988 */
1989
1990 /* SMEMtoVectorWriteHazard */
1991 //>> p_unit_test 7
1992 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1993 //! s1: %0:null = s_mov_b32 0
1994 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1995 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1996 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1997 finish_insert_nops_test(false);
1998
1999 /* NSAToVMEMBug is already resolved indirectly through VMEMtoScalarWriteHazard and
2000 * LdsBranchVmemWARHazard. */
2001 //>> p_unit_test 8
2002 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
2003 //! s_waitcnt_depctr vm_vsrc(0)
2004 //! s_waitcnt_vscnt %0:null imm:0
2005 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
2006 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
2007 create_mimg(true, 6, 4);
2008 finish_insert_nops_test(false);
2009
2010 /* waNsaCannotFollowWritelane */
2011 //>> p_unit_test 9
2012 //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
2013 //! s_nop
2014 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
2015 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
2016 bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
2017 Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
2018 finish_insert_nops_test(false);
2019 END_TEST
2020
2021 BEGIN_TEST(insert_nops.setpc_gfx11)
2022 if (!setup_cs(NULL, GFX11))
2023 return;
2024
2025 //>> p_unit_test 0
2026 //! s_setpc_b64 0
2027 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
2028 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2029
2030 /* LdsDirectVALUHazard */
2031 //! p_unit_test 1
2032 //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
2033 //! s_waitcnt_depctr va_vdst(0)
2034 //! s_setpc_b64 0
2035 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
2036 bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
2037 Operand::zero());
2038 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2039
2040 /* VALUPartialForwardingHazard */
2041 //! p_unit_test 2
2042 //! v1: %0:v[0] = v_mov_b32 0
2043 //! s_waitcnt_depctr va_vdst(0)
2044 //! s_setpc_b64 0
2045 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
2046 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
2047 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2048
2049 /* VcmpxPermlaneHazard */
2050 //! p_unit_test 2
2051 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
2052 //! v_nop
2053 //! s_setpc_b64 0
2054 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
2055 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
2056 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2057
2058 /* VALUTransUseHazard */
2059 //! p_unit_test 3
2060 //! v1: %0:v[0] = v_rcp_f32 0
2061 //! s_waitcnt_depctr va_vdst(0)
2062 //! s_setpc_b64 0
2063 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
2064 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand::zero());
2065 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2066
2067 /* VALUMaskWriteHazard */
2068 //! p_unit_test 4
2069 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
2070 //! s1: %0:vcc_hi = s_mov_b32 0
2071 //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
2072 //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
2073 //! s_waitcnt_depctr va_vdst(0)
2074 //! s_setpc_b64 0
2075 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
2076 bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
2077 Operand::zero(), Operand(vcc, s2));
2078 bld.sop1(aco_opcode::s_mov_b32, Definition(vcc_hi, s1), Operand::c32(0));
2079 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2080
2081 //! p_unit_test 8
2082 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
2083 //! s_waitcnt_depctr va_vdst(0)
2084 //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
2085 //! s_waitcnt_depctr va_vdst(0)
2086 //! s_setpc_b64 0
2087 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
2088 bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
2089 Operand::zero(), Operand(vcc, s2));
2090 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2091
2092 //! p_unit_test 5
2093 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
2094 //! s2: %0:vcc = s_mov_b64 0
2095 //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
2096 //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
2097 //! s_waitcnt_depctr va_vdst(0)
2098 //! s_setpc_b64 0
2099 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
2100 bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
2101 Operand::zero(), Operand(vcc, s2));
2102 bld.sop1(aco_opcode::s_mov_b64, Definition(vcc, s2), Operand::zero(8));
2103 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2104
2105 /* LdsDirectVMEMHazard */
2106 //! p_unit_test 6
2107 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
2108 //! s_waitcnt_depctr vm_vsrc(0)
2109 //! s_setpc_b64 0
2110 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
2111 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
2112 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2113
2114 /* WMMA Hazards */
2115 //! p_unit_test 7
2116 //! v4: %0:v[20-23] = v_wmma_f16_16x16x16_f16 %0:v[0-7].xx, %0:v[8-15].xx, %0:v[20-23].xx
2117 //! v_nop
2118 //! s_waitcnt_depctr va_vdst(0)
2119 //! s_setpc_b64 0
2120 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
2121 Operand A(PhysReg(256 + 0), v8);
2122 Operand B(PhysReg(256 + 8), v8);
2123 Operand C(PhysReg(256 + 20), v4);
2124 bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
2125 0);
2126 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2127
2128 finish_insert_nops_test(true);
2129 END_TEST
2130
2131 BEGIN_TEST(insert_nops.setpc_gfx12)
2132 if (!setup_cs(NULL, GFX12))
2133 return;
2134
2135 //>> p_unit_test 0
2136 //! s_setpc_b64 0
2137 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
2138 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2139
2140 /* LdsDirectVALUHazard */
2141 //! p_unit_test 1
2142 //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
2143 //! s_waitcnt_depctr va_vdst(0)
2144 //! s_setpc_b64 0
2145 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
2146 bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
2147 Operand::zero());
2148 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2149
2150 /* VcmpxPermlaneHazard */
2151 //! p_unit_test 2
2152 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
2153 //! v_nop
2154 //! s_setpc_b64 0
2155 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
2156 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
2157 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2158
2159 /* LdsDirectVMEMHazard */
2160 //! p_unit_test 3
2161 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
2162 //! s_waitcnt_depctr vm_vsrc(0)
2163 //! s_setpc_b64 0
2164 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
2165 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
2166 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2167
2168 /* VALUReadSGPRHazard */
2169 //! p_unit_test 4
2170 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2171 //! s1: %0:s[4] = s_mov_b32 0
2172 //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
2173 //! s_setpc_b64 0
2174 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
2175 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2176 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
2177 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2178
2179 //! p_unit_test 5
2180 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2181 //! s1: %0:s[4] = s_mov_b32 0
2182 //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
2183 //! s_waitcnt_depctr va_vdst(0)
2184 //! s_setpc_b64 0
2185 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
2186 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2187 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
2188 for (unsigned i = 0; i < 10; i++) /* the s_setpc_b64 counts */
2189 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
2190 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2191
2192 //! p_unit_test 6
2193 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2194 //! s1: %0:s[4] = s_mov_b32 0
2195 //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
2196 //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
2197 //! s_setpc_b64 0
2198 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
2199 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2200 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
2201 for (unsigned i = 0; i < 9; i++)
2202 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
2203 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2204
2205 //! p_unit_test 7
2206 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2207 //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
2208 //! s_waitcnt_depctr va_vdst(0) va_sdst(0)
2209 //! s_setpc_b64 0
2210 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
2211 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2212 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
2213 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2214
2215 //! p_unit_test 8
2216 //! v1: %0:v[0] = v_mov_b32 %0:vcc_lo
2217 //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[0]
2218 //! s_waitcnt_depctr va_vdst(0) va_vcc(0)
2219 //! s_setpc_b64 0
2220 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
2221 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(vcc), s1));
2222 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(256), v1));
2223 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2224
2225 //! p_unit_test 9
2226 //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2227 //! v1: %0:v[1] = v_mov_b32 %0:s[5]
2228 //! v1: %0:v[2] = v_mov_b32 %0:vcc_lo
2229 //! s1: %0:s[4] = s_mov_b32 0
2230 //! s1: %0:s[5] = v_readfirstlane_b32 %0:v[0]
2231 //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1]
2232 //! s_waitcnt_depctr va_vdst(0) va_sdst(0) va_vcc(0) sa_sdst(0)
2233 //! s_setpc_b64 0
2234 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
2235 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2236 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(5), s1));
2237 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1));
2238 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
2239 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(5), s1), Operand(PhysReg(256), v1));
2240 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1));
2241 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2242
2243 finish_insert_nops_test(true);
2244 END_TEST
2245