/*
 * Copyright © 2020 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */
24 #include "helpers.h"
25
26 using namespace aco;
27
28 void
create_mubuf(unsigned offset,PhysReg dst=PhysReg (256),PhysReg vaddr=PhysReg (256))29 create_mubuf(unsigned offset, PhysReg dst = PhysReg(256), PhysReg vaddr = PhysReg(256))
30 {
31 bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4),
32 Operand(vaddr, v1), Operand::zero(), offset, true);
33 }
34
35 void
create_mubuf_store(PhysReg src=PhysReg (256))36 create_mubuf_store(PhysReg src = PhysReg(256))
37 {
38 bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), Operand(src, v1),
39 Operand::zero(), Operand(src, v1), 0, true);
40 }
41
42 void
create_mimg(bool nsa,unsigned addrs,unsigned instr_dwords)43 create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
44 {
45 aco_ptr<MIMG_instruction> mimg{
46 create_instruction<MIMG_instruction>(aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)};
47 mimg->definitions[0] = Definition(PhysReg(256), v1);
48 mimg->operands[0] = Operand(PhysReg(0), s8);
49 mimg->operands[1] = Operand(PhysReg(0), s4);
50 mimg->operands[2] = Operand(v1);
51 for (unsigned i = 0; i < addrs; i++)
52 mimg->operands[3 + i] = Operand(PhysReg(256 + (nsa ? i * 2 : i)), v1);
53 mimg->dmask = 0x1;
54 mimg->dim = ac_image_2d;
55
56 assert(get_mimg_nsa_dwords(mimg.get()) + 2 == instr_dwords);
57
58 bld.insert(std::move(mimg));
59 }
60
61 BEGIN_TEST(insert_nops.nsa_to_vmem_bug)
62 if (!setup_cs(NULL, GFX10))
63 return;
64
65 /* no nop needed because offset&6==0 */
66 //>> p_unit_test 0
67 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
68 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:8 offen
69 bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
70 create_mimg(true, 6, 4);
71 create_mubuf(8);
72
73 /* nop needed */
74 //! p_unit_test 1
75 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
76 //! s_nop
77 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
78 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
79 create_mimg(true, 6, 4);
80 create_mubuf(4);
81
82 /* no nop needed because the MIMG is not NSA */
83 //! p_unit_test 2
84 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[1], %0:v[2], %0:v[3], %0:v[4], %0:v[5] 2d
85 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
86 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
87 create_mimg(false, 6, 2);
88 create_mubuf(4);
89
90 /* no nop needed because there's already an instruction in-between */
91 //! p_unit_test 3
92 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
93 //! v_nop
94 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
95 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
96 create_mimg(true, 6, 4);
97 bld.vop1(aco_opcode::v_nop);
98 create_mubuf(4);
99
100 /* no nop needed because the NSA instruction is under 4 dwords */
101 //! p_unit_test 4
102 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
103 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
104 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
105 create_mimg(true, 2, 3);
106 create_mubuf(4);
107
108 /* NSA instruction and MUBUF/MTBUF in a different block */
109 //! p_unit_test 5
110 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
111 //! BB1
112 //! /* logical preds: / linear preds: BB0, / kind: uniform, */
113 //! s_nop
114 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
115 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
116 create_mimg(true, 6, 4);
117 bld.reset(program->create_and_insert_block());
118 create_mubuf(4);
119 program->blocks[0].linear_succs.push_back(1);
120 program->blocks[1].linear_preds.push_back(0);
121
122 finish_insert_nops_test();
123 END_TEST
124
125 BEGIN_TEST(insert_nops.writelane_to_nsa_bug)
126 if (!setup_cs(NULL, GFX10))
127 return;
128
129 /* nop needed */
130 //>> p_unit_test 0
131 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
132 //! s_nop
133 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
134 bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
135 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
136 Operand(PhysReg(511), v1));
137 create_mimg(true, 2, 3);
138
139 /* no nop needed because the MIMG is not NSA */
140 //! p_unit_test 1
141 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
142 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[1] 2d
143 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
144 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
145 Operand(PhysReg(511), v1));
146 create_mimg(false, 2, 2);
147
148 /* no nop needed because there's already an instruction in-between */
149 //! p_unit_test 2
150 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
151 //! v_nop
152 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
153 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
154 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
155 Operand(PhysReg(511), v1));
156 bld.vop1(aco_opcode::v_nop);
157 create_mimg(true, 2, 3);
158
159 /* writelane and NSA instruction in different blocks */
160 //! p_unit_test 3
161 //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
162 //! BB1
163 //! /* logical preds: / linear preds: BB0, / kind: uniform, */
164 //! s_nop
165 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2] 2d
166 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
167 bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
168 Operand(PhysReg(511), v1));
169 bld.reset(program->create_and_insert_block());
170 create_mimg(true, 2, 3);
171 program->blocks[0].linear_succs.push_back(1);
172 program->blocks[1].linear_preds.push_back(0);
173
174 finish_insert_nops_test();
175 END_TEST
176
177 BEGIN_TEST(insert_nops.vmem_to_scalar_write)
178 if (!setup_cs(NULL, GFX10))
179 return;
180
181 /* WaR: VMEM load */
182 //>> p_unit_test 0
183 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
184 //! s_waitcnt_depctr vm_vsrc(0)
185 //! s1: %0:s[0] = s_mov_b32 0
186 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
187 create_mubuf(0);
188 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
189
190 //! p_unit_test 1
191 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
192 //! s_waitcnt_depctr vm_vsrc(0)
193 //! s2: %0:exec = s_mov_b64 -1
194 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
195 create_mubuf(0);
196 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
197
198 /* no hazard: VMEM load */
199 //! p_unit_test 2
200 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
201 //! s1: %0:s[4] = s_mov_b32 0
202 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
203 create_mubuf(0);
204 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero());
205
206 /* no hazard: VMEM load with VALU in-between */
207 //! p_unit_test 3
208 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
209 //! v_nop
210 //! s1: %0:s[0] = s_mov_b32 0
211 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
212 create_mubuf(0);
213 bld.vop1(aco_opcode::v_nop);
214 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
215
216 /* WaR: LDS */
217 //! p_unit_test 4
218 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
219 //! s_waitcnt_depctr vm_vsrc(0)
220 //! s1: %0:m0 = s_mov_b32 0
221 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
222 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
223 Operand(m0, s1));
224 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
225
226 //! p_unit_test 5
227 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
228 //! s_waitcnt_depctr vm_vsrc(0)
229 //! s2: %0:exec = s_mov_b64 -1
230 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
231 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
232 Operand(m0, s1));
233 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
234
235 /* no hazard: LDS */
236 //! p_unit_test 6
237 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
238 //! s1: %0:s[0] = s_mov_b32 0
239 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
240 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
241 Operand(m0, s1));
242 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
243
244 /* no hazard: LDS with VALU in-between */
245 //! p_unit_test 7
246 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
247 //! v_nop
248 //! s1: %0:m0 = s_mov_b32 0
249 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
250 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
251 Operand(m0, s1));
252 bld.vop1(aco_opcode::v_nop);
253 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
254
255 /* no hazard: VMEM/LDS with the correct waitcnt in-between */
256 //! p_unit_test 8
257 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
258 //! s_waitcnt vmcnt(0)
259 //! s1: %0:s[0] = s_mov_b32 0
260 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
261 create_mubuf(0);
262 bld.sopp(aco_opcode::s_waitcnt, -1, 0x3f70);
263 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
264
265 //! p_unit_test 9
266 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
267 //! s_waitcnt_vscnt %0:null imm:0
268 //! s1: %0:s[0] = s_mov_b32 0
269 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
270 create_mubuf_store();
271 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
272 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
273
274 //! p_unit_test 10
275 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
276 //! s_waitcnt lgkmcnt(0)
277 //! s1: %0:m0 = s_mov_b32 0
278 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
279 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
280 Operand(m0, s1));
281 bld.sopp(aco_opcode::s_waitcnt, -1, 0xc07f);
282 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
283
284 /* VMEM/LDS with the wrong waitcnt in-between */
285 //! p_unit_test 11
286 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
287 //! s_waitcnt_vscnt %0:null imm:0
288 //! s_waitcnt_depctr vm_vsrc(0)
289 //! s1: %0:s[0] = s_mov_b32 0
290 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
291 create_mubuf(0);
292 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
293 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
294
295 //! p_unit_test 12
296 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
297 //! s_waitcnt lgkmcnt(0)
298 //! s_waitcnt_depctr vm_vsrc(0)
299 //! s1: %0:s[0] = s_mov_b32 0
300 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
301 create_mubuf_store();
302 bld.sopp(aco_opcode::s_waitcnt, -1, 0xc07f);
303 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
304
305 //! p_unit_test 13
306 //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
307 //! s_waitcnt vmcnt(0)
308 //! s_waitcnt_depctr vm_vsrc(0)
309 //! s1: %0:m0 = s_mov_b32 0
310 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
311 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
312 Operand(m0, s1));
313 bld.sopp(aco_opcode::s_waitcnt, -1, 0x3f70);
314 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
315
316 finish_insert_nops_test();
317 END_TEST
318
319 BEGIN_TEST(insert_nops.lds_direct_valu)
320 if (!setup_cs(NULL, GFX11))
321 return;
322
323 /* WaW */
324 //>> p_unit_test 0
325 //! v1: %0:v[0] = v_mov_b32 0
326 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
327 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
328 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
329 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
330
331 /* WaR */
332 //! p_unit_test 1
333 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
334 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
335 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
336 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
337 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
338
339 /* No hazard. */
340 //! p_unit_test 2
341 //! v1: %0:v[1] = v_mov_b32 0
342 //! v1: %0:v[0] = lds_direct_load %0:m0
343 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
344 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
345 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
346
347 /* multiples hazards, nearest should be considered */
348 //! p_unit_test 3
349 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
350 //! v1: %0:v[0] = v_mov_b32 0
351 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
352 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
353 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
354 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
355 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
356
357 /* independent VALU increase wait_vdst */
358 //! p_unit_test 4
359 //! v1: %0:v[0] = v_mov_b32 0
360 //! v_nop
361 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
362 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
363 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
364 bld.vop1(aco_opcode::v_nop);
365 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
366
367 //! p_unit_test 5
368 //! v1: %0:v[0] = v_mov_b32 0
369 //; for i in range(10): insert_pattern('v_nop')
370 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
371 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
372 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
373 for (unsigned i = 0; i < 10; i++)
374 bld.vop1(aco_opcode::v_nop);
375 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
376
377 //! p_unit_test 6
378 //! v1: %0:v[0] = v_mov_b32 0
379 //; for i in range(20): insert_pattern('v_nop')
380 //! v1: %0:v[0] = lds_direct_load %0:m0
381 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
382 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
383 for (unsigned i = 0; i < 20; i++)
384 bld.vop1(aco_opcode::v_nop);
385 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
386
387 /* transcendental requires wait_vdst=0 */
388 //! p_unit_test 7
389 //! v1: %0:v[0] = v_mov_b32 0
390 //! v_nop
391 //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
392 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
393 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
394 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
395 bld.vop1(aco_opcode::v_nop);
396 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
397 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
398
399 //! p_unit_test 8
400 //! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
401 //! v_nop
402 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
403 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
404 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
405 bld.vop1(aco_opcode::v_nop);
406 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
407
408 /* transcendental is fine if it's before the instruction */
409 //! p_unit_test 9
410 //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
411 //! v1: %0:v[0] = v_mov_b32 0
412 //! v_nop
413 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
414 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
415 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
416 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
417 bld.vop1(aco_opcode::v_nop);
418 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
419
420 /* non-VALU does not increase wait_vdst */
421 //! p_unit_test 10
422 //! v1: %0:v[0] = v_mov_b32 0
423 //! s1: %0:m0 = s_mov_b32 0
424 //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
425 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
426 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
427 bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
428 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
429
430 /* consider instructions which wait on vdst */
431 //! p_unit_test 11
432 //! v1: %0:v[0] = v_mov_b32 0
433 //! v_nop
434 //! s_waitcnt_depctr va_vdst(0)
435 //! v1: %0:v[0] = lds_direct_load %0:m0
436 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
437 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
438 bld.vop1(aco_opcode::v_nop);
439 bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff);
440 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
441
442 finish_insert_nops_test();
443 END_TEST
444
445 BEGIN_TEST(insert_nops.lds_direct_vmem)
446 if (!setup_cs(NULL, GFX11))
447 return;
448
449 /* WaR: VMEM */
450 //>> p_unit_test 0
451 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
452 //! s_waitcnt_depctr vm_vsrc(0)
453 //! v1: %0:v[0] = lds_direct_load %0:m0
454 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
455 create_mubuf(0, PhysReg(257));
456 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
457
458 /* WaW: VMEM */
459 //! p_unit_test 1
460 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
461 //! s_waitcnt_depctr vm_vsrc(0)
462 //! v1: %0:v[0] = lds_direct_load %0:m0
463 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
464 create_mubuf(0, PhysReg(256), PhysReg(257));
465 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
466
467 /* no hazard: VMEM */
468 //! p_unit_test 2
469 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
470 //! v1: %0:v[0] = lds_direct_load %0:m0
471 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
472 create_mubuf(0, PhysReg(257), PhysReg(257));
473 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
474
475 /* no hazard: VMEM with VALU in-between */
476 //! p_unit_test 3
477 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
478 //! v_nop
479 //! v1: %0:v[0] = lds_direct_load %0:m0
480 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
481 create_mubuf(0, PhysReg(257));
482 bld.vop1(aco_opcode::v_nop);
483 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
484
485 /* WaR: LDS */
486 //! p_unit_test 4
487 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
488 //! s_waitcnt_depctr vm_vsrc(0)
489 //! v1: %0:v[0] = lds_direct_load %0:m0
490 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
491 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
492 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
493
494 /* WaW: LDS */
495 //! p_unit_test 5
496 //! v1: %0:v[0] = ds_read_b32 %0:v[1]
497 //! s_waitcnt_depctr vm_vsrc(0)
498 //! v1: %0:v[0] = lds_direct_load %0:m0
499 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
500 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
501 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
502
503 /* no hazard: LDS */
504 //! p_unit_test 6
505 //! v1: %0:v[1] = ds_read_b32 %0:v[1]
506 //! v1: %0:v[0] = lds_direct_load %0:m0
507 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
508 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
509 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
510
511 /* no hazard: LDS with VALU in-between */
512 //! p_unit_test 7
513 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
514 //! v_nop
515 //! v1: %0:v[0] = lds_direct_load %0:m0
516 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
517 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
518 bld.vop1(aco_opcode::v_nop);
519 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
520
521 /* no hazard: VMEM/LDS with the correct waitcnt in-between */
522 //! p_unit_test 8
523 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
524 //! s_waitcnt vmcnt(0)
525 //! v1: %0:v[0] = lds_direct_load %0:m0
526 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
527 create_mubuf(0, PhysReg(257));
528 bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff);
529 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
530
531 //! p_unit_test 9
532 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
533 //! s_waitcnt_vscnt %0:null imm:0
534 //! v1: %0:v[0] = lds_direct_load %0:m0
535 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
536 create_mubuf_store();
537 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
538 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
539
540 //! p_unit_test 10
541 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
542 //! s_waitcnt lgkmcnt(0)
543 //! v1: %0:v[0] = lds_direct_load %0:m0
544 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
545 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
546 bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f);
547 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
548
549 /* VMEM/LDS with the wrong waitcnt in-between */
550 //! p_unit_test 11
551 //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
552 //! s_waitcnt_vscnt %0:null imm:0
553 //! s_waitcnt_depctr vm_vsrc(0)
554 //! v1: %0:v[0] = lds_direct_load %0:m0
555 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
556 create_mubuf(0, PhysReg(257));
557 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
558 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
559
560 //! p_unit_test 12
561 //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
562 //! s_waitcnt lgkmcnt(0)
563 //! s_waitcnt_depctr vm_vsrc(0)
564 //! v1: %0:v[0] = lds_direct_load %0:m0
565 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
566 create_mubuf_store();
567 bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f);
568 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
569
570 //! p_unit_test 13
571 //! v1: %0:v[1] = ds_read_b32 %0:v[0]
572 //! s_waitcnt vmcnt(0)
573 //! s_waitcnt_depctr vm_vsrc(0)
574 //! v1: %0:v[0] = lds_direct_load %0:m0
575 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
576 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
577 bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff);
578 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
579
580 //! p_unit_test 14
581 //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
582 //! s_waitcnt_vscnt %0:null imm:0
583 //! s_waitcnt_depctr vm_vsrc(0)
584 //! v1: %0:v[0] = lds_direct_load %0:m0
585 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
586 create_mubuf(0, PhysReg(256), PhysReg(257));
587 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
588 bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
589
590 finish_insert_nops_test();
591 END_TEST
592
593 BEGIN_TEST(insert_nops.valu_trans_use)
594 if (!setup_cs(NULL, GFX11))
595 return;
596
597 //>> p_unit_test 0
598 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
599 //! s_waitcnt_depctr va_vdst(0)
600 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
601 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
602 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
603 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
604
605 /* Sufficient VALU mitigates the hazard. */
606 //! p_unit_test 1
607 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
608 //; for i in range(4): insert_pattern('v_nop')
609 //! s_waitcnt_depctr va_vdst(0)
610 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
611 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
612 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
613 for (unsigned i = 0; i < 4; i++)
614 bld.vop1(aco_opcode::v_nop);
615 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
616
617 //! p_unit_test 2
618 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
619 //; for i in range(8): insert_pattern('v_nop')
620 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
621 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
622 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
623 for (unsigned i = 0; i < 8; i++)
624 bld.vop1(aco_opcode::v_nop);
625 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
626
627 /* Sufficient transcendental VALU mitigates the hazard. */
628 //! p_unit_test 3
629 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
630 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
631 //! s_waitcnt_depctr va_vdst(0)
632 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
633 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
634 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
635 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
636 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
637
638 //! p_unit_test 4
639 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
640 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
641 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
642 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
643 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
644 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
645 for (unsigned i = 0; i < 2; i++)
646 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
647 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
648
649 /* Transcendental VALU should be counted towards VALU */
650 //! p_unit_test 5
651 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
652 //; for i in range(5): insert_pattern('v_nop')
653 //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
654 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
655 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
656 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
657 for (unsigned i = 0; i < 5; i++)
658 bld.vop1(aco_opcode::v_nop);
659 bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
660 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
661
662 /* non-VALU does not mitigate the hazard. */
663 //! p_unit_test 6
664 //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
665 //; for i in range(8): insert_pattern('s_nop')
666 //! s_waitcnt_depctr va_vdst(0)
667 //! v1: %0:v[1] = v_mov_b32 %0:v[0]
668 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
669 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
670 for (unsigned i = 0; i < 8; i++)
671 bld.sopp(aco_opcode::s_nop, -1, 0);
672 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
673
674 finish_insert_nops_test();
675 END_TEST
676
677 BEGIN_TEST(insert_nops.valu_partial_forwarding.basic)
678 if (!setup_cs(NULL, GFX11))
679 return;
680
681 /* Basic case. */
682 //>> p_unit_test 0
683 //! v1: %0:v[0] = v_mov_b32 0
684 //! s2: %0:exec = s_mov_b64 -1
685 //! v1: %0:v[1] = v_mov_b32 1
686 //! s_waitcnt_depctr va_vdst(0)
687 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
688 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
689 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
690 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
691 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
692 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
693 Operand(PhysReg(257), v1));
694
695 /* We should consider both the closest and further VALU after the exec write. */
696 //! p_unit_test 1
697 //! v1: %0:v[0] = v_mov_b32 0
698 //! s2: %0:exec = s_mov_b64 -1
699 //! v1: %0:v[1] = v_mov_b32 1
700 //; for i in range(2): insert_pattern('v_nop')
701 //! v1: %0:v[2] = v_mov_b32 2
702 //! s_waitcnt_depctr va_vdst(0)
703 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
704 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
705 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
706 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
707 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
708 bld.vop1(aco_opcode::v_nop);
709 bld.vop1(aco_opcode::v_nop);
710 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
711 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
712 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
713
714 //! p_unit_test 2
715 //! v1: %0:v[0] = v_mov_b32 0
716 //! s2: %0:exec = s_mov_b64 -1
717 //! v1: %0:v[1] = v_mov_b32 1
718 //! v1: %0:v[2] = v_mov_b32 2
719 //; for i in range(4): insert_pattern('v_nop')
720 //! s_waitcnt_depctr va_vdst(0)
721 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
722 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
723 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
724 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
725 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
726 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
727 for (unsigned i = 0; i < 4; i++)
728 bld.vop1(aco_opcode::v_nop);
729 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
730 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
731
732 /* If a VALU writes a read VGPR in-between the first and second writes, it should still be
733 * counted towards the distance between the first and second writes.
734 */
735 //! p_unit_test 3
736 //! v1: %0:v[0] = v_mov_b32 0
737 //! s2: %0:exec = s_mov_b64 -1
738 //! v1: %0:v[1] = v_mov_b32 1
739 //; for i in range(2): insert_pattern('v_nop')
740 //! v1: %0:v[2] = v_mov_b32 2
741 //; for i in range(3): insert_pattern('v_nop')
742 //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
743 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
744 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
745 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
746 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
747 bld.vop1(aco_opcode::v_nop);
748 bld.vop1(aco_opcode::v_nop);
749 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
750 for (unsigned i = 0; i < 3; i++)
751 bld.vop1(aco_opcode::v_nop);
752 bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
753 Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
754
755 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
756
757 finish_insert_nops_test();
758 END_TEST
759
760 BEGIN_TEST(insert_nops.valu_partial_forwarding.multiple_exec_writes)
761 if (!setup_cs(NULL, GFX11))
762 return;
763
764 //>> p_unit_test 0
765 //! v1: %0:v[0] = v_mov_b32 0
766 //! s2: %0:exec = s_mov_b64 0
767 //! s2: %0:exec = s_mov_b64 -1
768 //! v1: %0:v[1] = v_mov_b32 1
769 //! s_waitcnt_depctr va_vdst(0)
770 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
771 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
772 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
773 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
774 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
775 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
776 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
777 Operand(PhysReg(257), v1));
778
779 //! p_unit_test 1
780 //! v1: %0:v[0] = v_mov_b32 0
781 //! s2: %0:exec = s_mov_b64 0
782 //! v1: %0:v[1] = v_mov_b32 1
783 //! s2: %0:exec = s_mov_b64 -1
784 //! s_waitcnt_depctr va_vdst(0)
785 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
786 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
787 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
788 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
789 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
790 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
791 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
792 Operand(PhysReg(257), v1));
793
794 finish_insert_nops_test();
795 END_TEST
796
797 BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
798 if (!setup_cs(NULL, GFX11))
799 return;
800
801 /* Control flow merges: one branch shouldn't interfere with the other (clobbering VALU closer
802 * than interesting one).
803 */
804 //>> p_unit_test 0
805 //! s_cbranch_scc1 block:BB2
806 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0u));
807 bld.sopp(aco_opcode::s_cbranch_scc1, 2);
808
809 //! BB1
810 //! /* logical preds: / linear preds: BB0, / kind: */
811 //! v1: %0:v[0] = v_mov_b32 0
812 //! s2: %0:exec = s_mov_b64 -1
813 //! v_nop
814 //! s_branch block:BB3
815 bld.reset(program->create_and_insert_block());
816 program->blocks[0].linear_succs.push_back(1);
817 program->blocks[1].linear_preds.push_back(0);
818 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
819 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
820 bld.vop1(aco_opcode::v_nop);
821 bld.sopp(aco_opcode::s_branch, 3);
822
823 //! BB2
824 //! /* logical preds: / linear preds: BB0, / kind: */
825 //! v1: %0:v[0] = v_mov_b32 0
826 bld.reset(program->create_and_insert_block());
827 program->blocks[0].linear_succs.push_back(2);
828 program->blocks[2].linear_preds.push_back(0);
829 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
830
831 //! BB3
832 //! /* logical preds: / linear preds: BB1, BB2, / kind: */
833 //! v1: %0:v[1] = v_mov_b32 1
834 //! s_waitcnt_depctr va_vdst(0)
835 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
836 bld.reset(program->create_and_insert_block());
837 program->blocks[1].linear_succs.push_back(3);
838 program->blocks[2].linear_succs.push_back(3);
839 program->blocks[3].linear_preds.push_back(1);
840 program->blocks[3].linear_preds.push_back(2);
841 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
842 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
843 Operand(PhysReg(257), v1));
844
845 /* Control flow merges: one branch shouldn't interfere with the other (should consider furthest
846 * VALU writes after exec).
847 */
848 //! p_unit_test 1
849 //! s_cbranch_scc1 block:BB5
850 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
851 bld.sopp(aco_opcode::s_cbranch_scc1, 5);
852
853 //! BB4
854 //! /* logical preds: / linear preds: BB3, / kind: */
855 //! v1: %0:v[0] = v_mov_b32 0
856 //! s2: %0:exec = s_mov_b64 -1
857 //; for i in range(2): insert_pattern('v_nop')
858 //! v1: %0:v[1] = v_mov_b32 1
859 //! v_nop
860 //! s_branch block:BB6
861 bld.reset(program->create_and_insert_block());
862 program->blocks[3].linear_succs.push_back(4);
863 program->blocks[4].linear_preds.push_back(3);
864 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
865 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
866 bld.vop1(aco_opcode::v_nop);
867 bld.vop1(aco_opcode::v_nop);
868 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
869 bld.vop1(aco_opcode::v_nop);
870 bld.sopp(aco_opcode::s_branch, 6);
871
872 //! BB5
873 //! /* logical preds: / linear preds: BB3, / kind: */
874 //! v1: %0:v[1] = v_mov_b32 1
875 bld.reset(program->create_and_insert_block());
876 program->blocks[3].linear_succs.push_back(5);
877 program->blocks[5].linear_preds.push_back(3);
878 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
879
880 //! BB6
881 //! /* logical preds: / linear preds: BB4, BB5, / kind: */
882 //! s_waitcnt_depctr va_vdst(0)
883 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
884 bld.reset(program->create_and_insert_block());
885 program->blocks[4].linear_succs.push_back(6);
886 program->blocks[5].linear_succs.push_back(6);
887 program->blocks[6].linear_preds.push_back(4);
888 program->blocks[6].linear_preds.push_back(5);
889 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
890 Operand(PhysReg(257), v1));
891
892 /* Control flow merges: one branch shouldn't interfere with the other (should consider closest
893 * VALU writes after exec).
894 */
895 //! p_unit_test 2
896 //! s_cbranch_scc1 block:BB8
897 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
898 bld.sopp(aco_opcode::s_cbranch_scc1, 8);
899
900 //! BB7
901 //! /* logical preds: / linear preds: BB6, / kind: */
902 //! v1: %0:v[0] = v_mov_b32 0
903 //! s2: %0:exec = s_mov_b64 -1
904 //! v1: %0:v[1] = v_mov_b32 1
905 //; for i in range(4): insert_pattern('v_nop')
906 //! s_branch block:BB9
907 bld.reset(program->create_and_insert_block());
908 program->blocks[6].linear_succs.push_back(7);
909 program->blocks[7].linear_preds.push_back(6);
910 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
911 bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
912 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
913 for (unsigned i = 0; i < 4; i++)
914 bld.vop1(aco_opcode::v_nop);
915 bld.sopp(aco_opcode::s_branch, 9);
916
917 //! BB8
918 //! /* logical preds: / linear preds: BB6, / kind: */
919 //! v1: %0:v[1] = v_mov_b32 1
920 //; for i in range(5): insert_pattern('v_nop')
921 bld.reset(program->create_and_insert_block());
922 program->blocks[6].linear_succs.push_back(8);
923 program->blocks[8].linear_preds.push_back(6);
924 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
925 for (unsigned i = 0; i < 5; i++)
926 bld.vop1(aco_opcode::v_nop);
927
928 //! BB9
929 //! /* logical preds: / linear preds: BB7, BB8, / kind: uniform, */
930 //! s_waitcnt_depctr va_vdst(0)
931 //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
932 bld.reset(program->create_and_insert_block());
933 program->blocks[7].linear_succs.push_back(9);
934 program->blocks[8].linear_succs.push_back(9);
935 program->blocks[9].linear_preds.push_back(7);
936 program->blocks[9].linear_preds.push_back(8);
937 bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
938 Operand(PhysReg(257), v1));
939
940 finish_insert_nops_test();
941 END_TEST
942
943 BEGIN_TEST(insert_nops.valu_mask_write)
944 if (!setup_cs(NULL, GFX11))
945 return;
946
947 /* Basic case. */
948 //>> p_unit_test 0
949 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
950 //! s1: %0:s[1] = s_mov_b32 0
951 //! s_waitcnt_depctr sa_sdst(0)
952 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
953 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
954 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
955 Operand::zero(), Operand(PhysReg(0), s2));
956 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
957 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
958
959 /* Mitigation. */
960 //! p_unit_test 1
961 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
962 //! v1: %0:v[1] = v_mov_b32 %0:s[1]
963 //! s1: %0:s[1] = s_mov_b32 0
964 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
965 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
966 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
967 Operand::zero(), Operand(PhysReg(0), s2));
968 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
969 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
970 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
971
972 //! p_unit_test 2
973 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
974 //! s1: %0:s[1] = s_mov_b32 0
975 //! s_waitcnt_depctr sa_sdst(0)
976 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
977 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
978 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
979 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
980 Operand::zero(), Operand(PhysReg(0), s2));
981 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
982 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
983 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
984
985 //! p_unit_test 3
986 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
987 //! s1: %0:s[1] = s_mov_b32 0
988 //! s_waitcnt_depctr sa_sdst(0)
989 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
990 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
991 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
992 Operand::zero(), Operand(PhysReg(0), s2));
993 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
994 bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
995 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
996
997 /* Instruction which is both involved in the hazard and is a mitigation. */
998 //! p_unit_test 4
999 //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
1000 //! s1: %0:s[1] = s_mov_b32 0
1001 //! s_waitcnt_depctr sa_sdst(0)
1002 //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1003 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1004 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand(PhysReg(2), s1),
1005 Operand::zero(), Operand(PhysReg(0), s2));
1006 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1007 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1008
1009 finish_insert_nops_test();
1010 END_TEST
1011
1012 BEGIN_TEST(insert_nops.setpc_gfx6)
1013 if (!setup_cs(NULL, GFX6))
1014 return;
1015
1016 /* SGPR->SMEM hazards */
1017 //>> p_unit_test 0
1018 //! s1: %0:s[0] = s_mov_b32 0
1019 //! s_nop imm:2
1020 //! s_setpc_b64 0
1021 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1022 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1023 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1024
1025 //! p_unit_test 1
1026 //! s1: %0:s[0] = s_mov_b32 0
1027 //! s_nop imm:2
1028 //! s_setpc_b64 0
1029 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1030 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1031 bld.sopp(aco_opcode::s_nop, -1, 2);
1032 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1033
1034 finish_insert_nops_test();
1035
1036 /* This hazard can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves it. */
1037
1038 /* VINTRP->v_readlane_b32/etc */
1039 //>> p_unit_test 2
1040 //! v1: %0:v[0] = v_interp_mov_f32 2, %0:m0 attr0.x
1041 //! s_nop
1042 create_program(GFX6, compute_cs, 64, CHIP_UNKNOWN);
1043 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1044 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(PhysReg(256), v1), Operand::c32(2u),
1045 Operand(m0, s1), 0, 0);
1046 finish_insert_nops_test(false);
1047 END_TEST
1048
1049 BEGIN_TEST(insert_nops.setpc_gfx7)
1050 for (amd_gfx_level gfx : {GFX7, GFX9}) {
1051 if (!setup_cs(NULL, gfx))
1052 continue;
1053
1054 //>> p_unit_test 0
1055 //! s_setpc_b64 0
1056 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1057 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1058
1059 /* Break up SMEM clauses: resolved by the s_setpc_b64 itself */
1060 //! p_unit_test 1
1061 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1062 //! s_setpc_b64 0
1063 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1064 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1065 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1066
1067 /* SALU and GDS hazards */
1068 //! p_unit_test 2
1069 //! s_setreg_imm32_b32 0x0 imm:14337
1070 //! s_nop
1071 //! s_setpc_b64 0
1072 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1073 bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(0), (7 << 11) | 1);
1074 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1075
1076 /* VALU writes vcc -> vccz/v_div_fmas */
1077 //! p_unit_test 3
1078 //! s2: %0:vcc = v_cmp_eq_u32 0, 0
1079 //! s_nop imm:3
1080 //! s_setpc_b64 0
1081 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1082 bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand::zero(), Operand::zero());
1083 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1084
1085 /* VALU writes exec -> execz/DPP */
1086 //! p_unit_test 4
1087 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1088 //! s_nop imm:3
1089 //! s_setpc_b64 0
1090 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1091 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(),
1092 Operand::zero());
1093 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1094
1095 /* VALU->DPP */
1096 //! p_unit_test 5
1097 //! v1: %0:v[0] = v_mov_b32 0
1098 //~gfx9! s_nop
1099 //! s_setpc_b64 0
1100 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1101 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1102 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1103
1104 /* VALU->v_readlane_b32/VMEM/etc */
1105 //! p_unit_test 6
1106 //! s1: %0:s[0] = v_readfirstlane_b32 %0:v[0]
1107 //! s_nop imm:3
1108 //! s_setpc_b64 0
1109 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1110 bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(0), s1),
1111 Operand(PhysReg(256), v1));
1112 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1113
1114 finish_insert_nops_test();
1115
1116 /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves
1117 * them. */
1118
1119 //>> p_unit_test 7
1120 //! buffer_store_dwordx3 %0:s[0-3], %0:v[0], 0, %0:v[0-2] offen
1121 //! s_nop
1122 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1123 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1124 bld.mubuf(aco_opcode::buffer_store_dwordx3, Operand(PhysReg(0), s4),
1125 Operand(PhysReg(256), v1), Operand::zero(), Operand(PhysReg(256), v3), 0, true);
1126 finish_insert_nops_test(false);
1127
1128 //>> p_unit_test 8
1129 //! s1: %0:m0 = s_mov_b32 0
1130 //! s_nop
1131 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1132 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1133 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(m0), s1), Operand::zero());
1134 finish_insert_nops_test(false);
1135
1136 /* Break up SMEM clauses */
1137 //>> p_unit_test 9
1138 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1139 //! s_nop
1140 create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1141 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1142 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1143 finish_insert_nops_test(false);
1144 }
1145 END_TEST
1146
1147 BEGIN_TEST(insert_nops.setpc_gfx10)
1148 if (!setup_cs(NULL, GFX10))
1149 return;
1150
1151 //>> p_unit_test 0
1152 //! s_setpc_b64 0
1153 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1154 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1155
1156 /* VcmpxPermlaneHazard */
1157 //! p_unit_test 1
1158 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1159 //! v1: %0:v[0] = v_mov_b32 %0:v[0]
1160 //! s_setpc_b64 0
1161 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1162 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1163 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1164
1165 /* VMEMtoScalarWriteHazard */
1166 //! p_unit_test 2
1167 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1168 //! s_waitcnt_vscnt %0:null imm:0
1169 //! s_waitcnt_depctr vm_vsrc(0)
1170 //! s_setpc_b64 0
1171 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1172 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1173 bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1),
1174 0); /* reset LdsBranchVmemWARHazard */
1175 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1176
1177 /* VcmpxExecWARHazard */
1178 //! p_unit_test 3
1179 //! s1: %0:s[0] = s_mov_b32 %0:exec_hi
1180 //! s_waitcnt_depctr sa_sdst(0)
1181 //! s_setpc_b64 0
1182 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1183 bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand(exec_hi, s1));
1184 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1185
1186 /* LdsBranchVmemWARHazard */
1187 //! p_unit_test 4
1188 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1189 //! v_nop
1190 //! s_branch
1191 //! s_waitcnt_vscnt %0:null imm:0
1192 //! s_setpc_b64 0
1193 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1194 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1195 bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1196 bld.sopp(aco_opcode::s_branch, -1, 0);
1197 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1198
1199 //! p_unit_test 5
1200 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1201 //! v_nop
1202 //! s_waitcnt_vscnt %0:null imm:0
1203 //! s_setpc_b64 0
1204 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1205 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1206 bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1207 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1208
1209 /* waNsaCannotFollowWritelane: resolved by the s_setpc_b64 */
1210 //! p_unit_test 6
1211 //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1212 //! s_setpc_b64 0
1213 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1214 bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1215 Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1216 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1217
1218 finish_insert_nops_test();
1219
1220 /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves them.
1221 */
1222
1223 /* SMEMtoVectorWriteHazard */
1224 //>> p_unit_test 7
1225 //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1226 //! s1: %0:null = s_mov_b32 0
1227 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1228 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1229 bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1230 finish_insert_nops_test(false);
1231
1232 /* NSAToVMEMBug is already resolved indirectly through VMEMtoScalarWriteHazard and
1233 * LdsBranchVmemWARHazard. */
1234 //>> p_unit_test 8
1235 //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
1236 //! s_waitcnt_depctr vm_vsrc(0)
1237 //! s_waitcnt_vscnt %0:null imm:0
1238 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1239 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1240 create_mimg(true, 6, 4);
1241 finish_insert_nops_test(false);
1242
1243 /* waNsaCannotFollowWritelane */
1244 //>> p_unit_test 9
1245 //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1246 //! s_nop
1247 create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1248 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1249 bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1250 Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1251 finish_insert_nops_test(false);
1252 END_TEST
1253
1254 BEGIN_TEST(insert_nops.setpc_gfx11)
1255 if (!setup_cs(NULL, GFX11))
1256 return;
1257
1258 //>> p_unit_test 0
1259 //! s_setpc_b64 0
1260 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1261 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1262
1263 /* LdsDirectVALUHazard */
1264 //! p_unit_test 1
1265 //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
1266 //! s_waitcnt_depctr va_vdst(0)
1267 //! s_setpc_b64 0
1268 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1269 bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
1270 Operand::zero());
1271 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1272
1273 /* VALUPartialForwardingHazard */
1274 //! p_unit_test 2
1275 //! v1: %0:v[0] = v_mov_b32 0
1276 //! s_waitcnt_depctr va_vdst(0)
1277 //! s_setpc_b64 0
1278 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1279 bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1280 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1281
1282 /* VcmpxPermlaneHazard */
1283 //! p_unit_test 2
1284 //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1285 //! v_nop
1286 //! s_setpc_b64 0
1287 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1288 bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1289 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1290
1291 /* VALUTransUseHazard */
1292 //! p_unit_test 3
1293 //! v1: %0:v[0] = v_rcp_f32 0
1294 //! s_waitcnt_depctr va_vdst(0)
1295 //! s_setpc_b64 0
1296 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1297 bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand::zero());
1298 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1299
1300 /* VALUMaskWriteHazard */
1301 //! p_unit_test 4
1302 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1303 //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
1304 //! s_setpc_b64 0
1305 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1306 bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1307 Operand::zero(), Operand(vcc, s2));
1308 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1309
1310 //! p_unit_test 5
1311 //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1312 //! s2: %0:vcc = s_mov_b64 0
1313 //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
1314 //! s_setpc_b64 0
1315 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1316 bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1317 Operand::zero(), Operand(vcc, s2));
1318 bld.sop1(aco_opcode::s_mov_b64, Definition(vcc, s2), Operand::zero(8));
1319 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1320
1321 /* LdsDirectVMEMHazard */
1322 //! p_unit_test 6
1323 //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1324 //! s_waitcnt_depctr vm_vsrc(0)
1325 //! s_setpc_b64 0
1326 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1327 bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1328 bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1329
1330 finish_insert_nops_test(true);
1331 }
1332