• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2020 Valve Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 #include "helpers.h"
25 
26 using namespace aco;
27 
28 void
create_mubuf(unsigned offset,PhysReg dst=PhysReg (256),PhysReg vaddr=PhysReg (256))29 create_mubuf(unsigned offset, PhysReg dst = PhysReg(256), PhysReg vaddr = PhysReg(256))
30 {
31    bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4),
32              Operand(vaddr, v1), Operand::zero(), offset, true);
33 }
34 
35 void
create_mubuf_store(PhysReg src=PhysReg (256))36 create_mubuf_store(PhysReg src = PhysReg(256))
37 {
38    bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), Operand(src, v1),
39              Operand::zero(), Operand(src, v1), 0, true);
40 }
41 
42 void
create_mimg(bool nsa,unsigned addrs,unsigned instr_dwords)43 create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
44 {
45    aco_ptr<MIMG_instruction> mimg{
46       create_instruction<MIMG_instruction>(aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)};
47    mimg->definitions[0] = Definition(PhysReg(256), v1);
48    mimg->operands[0] = Operand(PhysReg(0), s8);
49    mimg->operands[1] = Operand(PhysReg(0), s4);
50    mimg->operands[2] = Operand(v1);
51    for (unsigned i = 0; i < addrs; i++)
52       mimg->operands[3 + i] = Operand(PhysReg(256 + (nsa ? i * 2 : i)), v1);
53    mimg->dmask = 0x1;
54    mimg->dim = ac_image_2d;
55 
56    assert(get_mimg_nsa_dwords(mimg.get()) + 2 == instr_dwords);
57 
58    bld.insert(std::move(mimg));
59 }
60 
/* GFX10 test: an image_sample built by create_mimg() followed by a buffer_load_dword built
 * by create_mubuf(). Every case starts with a p_unit_test marker; the "//>>" and "//!"
 * lines list the output expected once finish_insert_nops_test() has run. An s_nop between
 * the two instructions is expected only in case 1 and in case 5 (where the load sits in a
 * second block). */
61 BEGIN_TEST(insert_nops.nsa_to_vmem_bug)
62    if (!setup_cs(NULL, GFX10))
63       return;
64 
65    /* no nop needed because offset&6==0 */
66    //>> p_unit_test 0
67    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
68    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:8 offen
69    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
70    create_mimg(true, 6, 4);
71    create_mubuf(8);
72 
73    /* nop needed */
74    //! p_unit_test 1
75    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
76    //! s_nop
77    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
78    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
79    create_mimg(true, 6, 4);
80    create_mubuf(4);
81 
82    /* no nop needed because the MIMG is not NSA */
83    //! p_unit_test 2
84    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[1], %0:v[2], %0:v[3], %0:v[4], %0:v[5] 2d
85    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
86    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
87    create_mimg(false, 6, 2);
88    create_mubuf(4);
89 
90    /* no nop needed because there's already an instruction in-between */
91    //! p_unit_test 3
92    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
93    //! v_nop
94    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
95    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
96    create_mimg(true, 6, 4);
97    bld.vop1(aco_opcode::v_nop);
98    create_mubuf(4);
99 
100    /* no nop needed because the NSA instruction is under 4 dwords */
101    //! p_unit_test 4
102    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
103    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
104    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
105    create_mimg(true, 2, 3);
106    create_mubuf(4);
107 
108    /* NSA instruction and MUBUF/MTBUF in a different block */
109    //! p_unit_test 5
110    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
111    //! BB1
112    //! /* logical preds: / linear preds: BB0, / kind: uniform, */
113    //! s_nop
114    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
115    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
116    create_mimg(true, 6, 4);
117    bld.reset(program->create_and_insert_block());
118    create_mubuf(4);
   /* Register block 0 as linear predecessor of block 1 (printed as "linear preds: BB0"
    * in the expected output above). */
119    program->blocks[0].linear_succs.push_back(1);
120    program->blocks[1].linear_preds.push_back(0);
121 
122    finish_insert_nops_test();
123 END_TEST
124 
/* GFX10 test: a v_writelane_b32 to v[255] followed by an image_sample built by
 * create_mimg(). Every case starts with a p_unit_test marker; the "//>>" and "//!" lines
 * list the output expected once finish_insert_nops_test() has run. An s_nop is expected
 * only in case 0 and in case 3 (where the image_sample sits in a second block). */
125 BEGIN_TEST(insert_nops.writelane_to_nsa_bug)
126    if (!setup_cs(NULL, GFX10))
127       return;
128 
129    /* nop needed */
130    //>> p_unit_test 0
131    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
132    //! s_nop
133    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
134    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
135    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
136                  Operand(PhysReg(511), v1));
137    create_mimg(true, 2, 3);
138 
139    /* no nop needed because the MIMG is not NSA */
140    //! p_unit_test 1
141    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
142    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[1] 2d
143    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
144    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
145                  Operand(PhysReg(511), v1));
146    create_mimg(false, 2, 2);
147 
148    /* no nop needed because there's already an instruction in-between */
149    //! p_unit_test 2
150    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
151    //! v_nop
152    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
153    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
154    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
155                  Operand(PhysReg(511), v1));
156    bld.vop1(aco_opcode::v_nop);
157    create_mimg(true, 2, 3);
158 
159    /* writelane and NSA instruction in different blocks */
160    //! p_unit_test 3
161    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
162    //! BB1
163    //! /* logical preds: / linear preds: BB0, / kind: uniform, */
164    //! s_nop
165    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
166    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
167    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
168                  Operand(PhysReg(511), v1));
169    bld.reset(program->create_and_insert_block());
170    create_mimg(true, 2, 3);
   /* Register block 0 as linear predecessor of block 1 (printed as "linear preds: BB0"
    * in the expected output above). */
171    program->blocks[0].linear_succs.push_back(1);
172    program->blocks[1].linear_preds.push_back(0);
173 
174    finish_insert_nops_test();
175 END_TEST
176 
/* GFX10 test: a buffer load/store or ds_read_b32 followed by a scalar write (s_mov) to a
 * register the memory instruction uses as an operand (s[0], m0) or to exec. Every case
 * starts with a p_unit_test marker; the "//>>" and "//!" lines list the output expected
 * once finish_insert_nops_test() has run. Where a hazard is expected, the output contains
 * "s_waitcnt_depctr vm_vsrc(0)" in front of the scalar write.
 *
 * The s_waitcnt immediates used below are printed as follows in the expected output:
 * 0x3f70 -> "vmcnt(0)", 0xc07f -> "lgkmcnt(0)". */
177 BEGIN_TEST(insert_nops.vmem_to_scalar_write)
178    if (!setup_cs(NULL, GFX10))
179       return;
180 
181    /* WaR: VMEM load */
182    //>> p_unit_test 0
183    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
184    //! s_waitcnt_depctr vm_vsrc(0)
185    //! s1: %0:s[0] = s_mov_b32 0
186    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
187    create_mubuf(0);
188    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
189 
   /* same load, but the scalar write targets exec */
190    //! p_unit_test 1
191    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
192    //! s_waitcnt_depctr vm_vsrc(0)
193    //! s2: %0:exec = s_mov_b64 -1
194    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
195    create_mubuf(0);
196    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
197 
198    /* no hazard: VMEM load */
199    //! p_unit_test 2
200    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
201    //! s1: %0:s[4] = s_mov_b32 0
202    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
203    create_mubuf(0);
204    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero());
205 
206    /* no hazard: VMEM load with VALU in-between */
207    //! p_unit_test 3
208    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
209    //! v_nop
210    //! s1: %0:s[0] = s_mov_b32 0
211    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
212    create_mubuf(0);
213    bld.vop1(aco_opcode::v_nop);
214    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
215 
216    /* WaR: LDS */
217    //! p_unit_test 4
218    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
219    //! s_waitcnt_depctr vm_vsrc(0)
220    //! s1: %0:m0 = s_mov_b32 0
221    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
222    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
223           Operand(m0, s1));
224    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
225 
   /* same LDS read, but the scalar write targets exec */
226    //! p_unit_test 5
227    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
228    //! s_waitcnt_depctr vm_vsrc(0)
229    //! s2: %0:exec = s_mov_b64 -1
230    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
231    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
232           Operand(m0, s1));
233    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
234 
235    /* no hazard: LDS */
236    //! p_unit_test 6
237    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
238    //! s1: %0:s[0] = s_mov_b32 0
239    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
240    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
241           Operand(m0, s1));
242    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
243 
244    /* no hazard: LDS with VALU in-between */
245    //! p_unit_test 7
246    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
247    //! v_nop
248    //! s1: %0:m0 = s_mov_b32 0
249    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
250    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
251           Operand(m0, s1));
252    bld.vop1(aco_opcode::v_nop);
253    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
254 
255    /* no hazard: VMEM/LDS with the correct waitcnt in-between */
256    //! p_unit_test 8
257    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
258    //! s_waitcnt vmcnt(0)
259    //! s1: %0:s[0] = s_mov_b32 0
260    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
261    create_mubuf(0);
262    bld.sopp(aco_opcode::s_waitcnt, -1, 0x3f70);
263    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
264 
265    //! p_unit_test 9
266    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
267    //! s_waitcnt_vscnt %0:null imm:0
268    //! s1: %0:s[0] = s_mov_b32 0
269    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
270    create_mubuf_store();
271    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
272    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
273 
274    //! p_unit_test 10
275    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
276    //! s_waitcnt lgkmcnt(0)
277    //! s1: %0:m0 = s_mov_b32 0
278    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
279    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
280           Operand(m0, s1));
281    bld.sopp(aco_opcode::s_waitcnt, -1, 0xc07f);
282    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
283 
284    /* VMEM/LDS with the wrong waitcnt in-between */
285    //! p_unit_test 11
286    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
287    //! s_waitcnt_vscnt %0:null imm:0
288    //! s_waitcnt_depctr vm_vsrc(0)
289    //! s1: %0:s[0] = s_mov_b32 0
290    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
291    create_mubuf(0);
292    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
293    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
294 
295    //! p_unit_test 12
296    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
297    //! s_waitcnt lgkmcnt(0)
298    //! s_waitcnt_depctr vm_vsrc(0)
299    //! s1: %0:s[0] = s_mov_b32 0
300    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
301    create_mubuf_store();
302    bld.sopp(aco_opcode::s_waitcnt, -1, 0xc07f);
303    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
304 
305    //! p_unit_test 13
306    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
307    //! s_waitcnt vmcnt(0)
308    //! s_waitcnt_depctr vm_vsrc(0)
309    //! s1: %0:m0 = s_mov_b32 0
310    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
311    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
312           Operand(m0, s1));
313    bld.sopp(aco_opcode::s_waitcnt, -1, 0x3f70);
314    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
315 
316    finish_insert_nops_test();
317 END_TEST
318 
/* GFX11 test: VALU instructions followed by an lds_direct_load into v[0]. Every case
 * starts with a p_unit_test marker; the "//>>" and "//!" lines list the output expected
 * once finish_insert_nops_test() has run. No extra instruction is expected to be inserted
 * in this test; instead the expected output differs in the "wait_vdst:N" value printed on
 * the lds_direct_load (absent when no wait is expected).
 *
 * The "//;" lines expand into repeated 'v_nop' expectation lines, mirroring the C++ loops
 * that emit the same number of v_nop instructions. */
319 BEGIN_TEST(insert_nops.lds_direct_valu)
320    if (!setup_cs(NULL, GFX11))
321       return;
322 
323    /* WaW */
324    //>> p_unit_test 0
325    //! v1: %0:v[0] = v_mov_b32 0
326    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
327    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
328    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
329    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
330 
331    /* WaR */
332    //! p_unit_test 1
333    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
334    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
335    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
336    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
337    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
338 
339    /* No hazard. */
340    //! p_unit_test 2
341    //! v1: %0:v[1] = v_mov_b32 0
342    //! v1: %0:v[0] = lds_direct_load %0:m0
343    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
344    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
345    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
346 
347    /* multiples hazards, nearest should be considered */
348    //! p_unit_test 3
349    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
350    //! v1: %0:v[0] = v_mov_b32 0
351    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
352    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
353    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
354    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
355    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
356 
357    /* independent VALU increase wait_vdst */
358    //! p_unit_test 4
359    //! v1: %0:v[0] = v_mov_b32 0
360    //! v_nop
361    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
362    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
363    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
364    bld.vop1(aco_opcode::v_nop);
365    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
366 
   /* ten independent VALU instructions: wait_vdst:10 expected */
367    //! p_unit_test 5
368    //! v1: %0:v[0] = v_mov_b32 0
369    //; for i in range(10): insert_pattern('v_nop')
370    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
371    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
372    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
373    for (unsigned i = 0; i < 10; i++)
374       bld.vop1(aco_opcode::v_nop);
375    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
376 
   /* twenty independent VALU instructions: no wait_vdst expected any more */
377    //! p_unit_test 6
378    //! v1: %0:v[0] = v_mov_b32 0
379    //; for i in range(20): insert_pattern('v_nop')
380    //! v1: %0:v[0] = lds_direct_load %0:m0
381    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
382    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
383    for (unsigned i = 0; i < 20; i++)
384       bld.vop1(aco_opcode::v_nop);
385    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
386 
387    /* transcendental requires wait_vdst=0 */
388    //! p_unit_test 7
389    //! v1: %0:v[0] = v_mov_b32 0
390    //! v_nop
391    //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
392    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
393    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
394    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
395    bld.vop1(aco_opcode::v_nop);
396    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
397    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
398 
399    //! p_unit_test 8
400    //! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
401    //! v_nop
402    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
403    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
404    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
405    bld.vop1(aco_opcode::v_nop);
406    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
407 
408    /* transcendental is fine if it's before the instruction */
409    //! p_unit_test 9
410    //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
411    //! v1: %0:v[0] = v_mov_b32 0
412    //! v_nop
413    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
414    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
415    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
416    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
417    bld.vop1(aco_opcode::v_nop);
418    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
419 
420    /* non-VALU does not increase wait_vdst */
421    //! p_unit_test 10
422    //! v1: %0:v[0] = v_mov_b32 0
423    //! s1: %0:m0 = s_mov_b32 0
424    //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
425    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
426    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
427    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
428    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
429 
430    /* consider instructions which wait on vdst */
431    //! p_unit_test 11
432    //! v1: %0:v[0] = v_mov_b32 0
433    //! v_nop
434    //! s_waitcnt_depctr va_vdst(0)
435    //! v1: %0:v[0] = lds_direct_load %0:m0
436    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
437    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
438    bld.vop1(aco_opcode::v_nop);
   /* immediate 0x0fff is printed as "va_vdst(0)" in the expected output above */
439    bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0x0fff);
440    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
441 
442    finish_insert_nops_test();
443 END_TEST
444 
/* GFX11 test: a buffer load/store or ds_read_b32 followed by an lds_direct_load into v[0].
 * Every case starts with a p_unit_test marker; the "//>>" and "//!" lines list the output
 * expected once finish_insert_nops_test() has run. Where a hazard is expected, the output
 * contains "s_waitcnt_depctr vm_vsrc(0)" in front of the lds_direct_load.
 *
 * The s_waitcnt immediates used below are printed as follows in the expected output:
 * 0x3ff -> "vmcnt(0)", 0xfc0f -> "lgkmcnt(0)". */
445 BEGIN_TEST(insert_nops.lds_direct_vmem)
446    if (!setup_cs(NULL, GFX11))
447       return;
448 
449    /* WaR: VMEM */
450    //>> p_unit_test 0
451    //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
452    //! s_waitcnt_depctr vm_vsrc(0)
453    //! v1: %0:v[0] = lds_direct_load %0:m0
454    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
455    create_mubuf(0, PhysReg(257));
456    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
457 
458    /* WaW: VMEM */
459    //! p_unit_test 1
460    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
461    //! s_waitcnt_depctr vm_vsrc(0)
462    //! v1: %0:v[0] = lds_direct_load %0:m0
463    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
464    create_mubuf(0, PhysReg(256), PhysReg(257));
465    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
466 
467    /* no hazard: VMEM */
468    //! p_unit_test 2
469    //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
470    //! v1: %0:v[0] = lds_direct_load %0:m0
471    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
472    create_mubuf(0, PhysReg(257), PhysReg(257));
473    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
474 
475    /* no hazard: VMEM with VALU in-between */
476    //! p_unit_test 3
477    //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
478    //! v_nop
479    //! v1: %0:v[0] = lds_direct_load %0:m0
480    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
481    create_mubuf(0, PhysReg(257));
482    bld.vop1(aco_opcode::v_nop);
483    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
484 
485    /* WaR: LDS */
486    //! p_unit_test 4
487    //! v1: %0:v[1] = ds_read_b32 %0:v[0]
488    //! s_waitcnt_depctr vm_vsrc(0)
489    //! v1: %0:v[0] = lds_direct_load %0:m0
490    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
491    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
492    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
493 
494    /* WaW: LDS */
495    //! p_unit_test 5
496    //! v1: %0:v[0] = ds_read_b32 %0:v[1]
497    //! s_waitcnt_depctr vm_vsrc(0)
498    //! v1: %0:v[0] = lds_direct_load %0:m0
499    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
500    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
501    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
502 
503    /* no hazard: LDS */
504    //! p_unit_test 6
505    //! v1: %0:v[1] = ds_read_b32 %0:v[1]
506    //! v1: %0:v[0] = lds_direct_load %0:m0
507    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
508    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
509    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
510 
511    /* no hazard: LDS with VALU in-between */
512    //! p_unit_test 7
513    //! v1: %0:v[1] = ds_read_b32 %0:v[0]
514    //! v_nop
515    //! v1: %0:v[0] = lds_direct_load %0:m0
516    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
517    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
518    bld.vop1(aco_opcode::v_nop);
519    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
520 
521    /* no hazard: VMEM/LDS with the correct waitcnt in-between */
522    //! p_unit_test 8
523    //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
524    //! s_waitcnt vmcnt(0)
525    //! v1: %0:v[0] = lds_direct_load %0:m0
526    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
527    create_mubuf(0, PhysReg(257));
528    bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff);
529    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
530 
531    //! p_unit_test 9
532    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
533    //! s_waitcnt_vscnt %0:null imm:0
534    //! v1: %0:v[0] = lds_direct_load %0:m0
535    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
536    create_mubuf_store();
537    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
538    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
539 
540    //! p_unit_test 10
541    //! v1: %0:v[1] = ds_read_b32 %0:v[0]
542    //! s_waitcnt lgkmcnt(0)
543    //! v1: %0:v[0] = lds_direct_load %0:m0
544    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
545    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
546    bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f);
547    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
548 
549    /* VMEM/LDS with the wrong waitcnt in-between */
550    //! p_unit_test 11
551    //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
552    //! s_waitcnt_vscnt %0:null imm:0
553    //! s_waitcnt_depctr vm_vsrc(0)
554    //! v1: %0:v[0] = lds_direct_load %0:m0
555    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
556    create_mubuf(0, PhysReg(257));
557    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
558    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
559 
560    //! p_unit_test 12
561    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
562    //! s_waitcnt lgkmcnt(0)
563    //! s_waitcnt_depctr vm_vsrc(0)
564    //! v1: %0:v[0] = lds_direct_load %0:m0
565    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
566    create_mubuf_store();
567    bld.sopp(aco_opcode::s_waitcnt, -1, 0xfc0f);
568    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
569 
570    //! p_unit_test 13
571    //! v1: %0:v[1] = ds_read_b32 %0:v[0]
572    //! s_waitcnt vmcnt(0)
573    //! s_waitcnt_depctr vm_vsrc(0)
574    //! v1: %0:v[0] = lds_direct_load %0:m0
575    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
576    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
577    bld.sopp(aco_opcode::s_waitcnt, -1, 0x3ff);
578    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
579 
   /* like case 11, but the load writes v[0] (WaW) instead of reading it */
580    //! p_unit_test 14
581    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
582    //! s_waitcnt_vscnt %0:null imm:0
583    //! s_waitcnt_depctr vm_vsrc(0)
584    //! v1: %0:v[0] = lds_direct_load %0:m0
585    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
586    create_mubuf(0, PhysReg(256), PhysReg(257));
587    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
588    bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
589 
590    finish_insert_nops_test();
591 END_TEST
592 
/* GFX11 test: a v_rcp_f32 writing v[0] followed, after a varying number of other
 * instructions, by a v_mov_b32 that reads v[0]. Every case starts with a p_unit_test
 * marker; the "//>>" and "//!" lines list the output expected once
 * finish_insert_nops_test() has run. Where a hazard is expected, the output contains
 * "s_waitcnt_depctr va_vdst(0)" in front of the v_mov_b32.
 *
 * The "//;" lines expand into repeated expectation lines, mirroring the C++ loops that
 * emit the same number of instructions. */
593 BEGIN_TEST(insert_nops.valu_trans_use)
594    if (!setup_cs(NULL, GFX11))
595       return;
596 
   /* basic case: the result is consumed by the very next instruction */
597    //>> p_unit_test 0
598    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
599    //! s_waitcnt_depctr va_vdst(0)
600    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
601    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
602    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
603    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
604 
605    /* Sufficient VALU mitigates the hazard. */
   /* four v_nop: the wait is still expected */
606    //! p_unit_test 1
607    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
608    //; for i in range(4): insert_pattern('v_nop')
609    //! s_waitcnt_depctr va_vdst(0)
610    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
611    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
612    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
613    for (unsigned i = 0; i < 4; i++)
614       bld.vop1(aco_opcode::v_nop);
615    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
616 
   /* eight v_nop: no wait expected */
617    //! p_unit_test 2
618    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
619    //; for i in range(8): insert_pattern('v_nop')
620    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
621    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
622    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
623    for (unsigned i = 0; i < 8; i++)
624       bld.vop1(aco_opcode::v_nop);
625    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
626 
627    /* Sufficient transcendental VALU mitigates the hazard. */
   /* one v_sqrt_f32: the wait is still expected */
628    //! p_unit_test 3
629    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
630    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
631    //! s_waitcnt_depctr va_vdst(0)
632    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
633    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
634    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
635    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
636    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
637 
   /* two v_sqrt_f32: no wait expected */
638    //! p_unit_test 4
639    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
640    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
641    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
642    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
643    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
644    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
645    for (unsigned i = 0; i < 2; i++)
646       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
647    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
648 
649    /* Transcendental VALU should be counted towards VALU */
650    //! p_unit_test 5
651    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
652    //; for i in range(5): insert_pattern('v_nop')
653    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
654    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
655    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
656    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
657    for (unsigned i = 0; i < 5; i++)
658       bld.vop1(aco_opcode::v_nop);
659    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
660    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
661 
662    /* non-VALU does not mitigate the hazard. */
663    //! p_unit_test 6
664    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
665    //; for i in range(8): insert_pattern('s_nop')
666    //! s_waitcnt_depctr va_vdst(0)
667    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
668    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
669    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
670    for (unsigned i = 0; i < 8; i++)
671       bld.sopp(aco_opcode::s_nop, -1, 0);
672    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
673 
674    finish_insert_nops_test();
675 END_TEST
676 
677 BEGIN_TEST(insert_nops.valu_partial_forwarding.basic)
678    if (!setup_cs(NULL, GFX11))
679       return;
680 
681    /* Basic case. */
682    //>> p_unit_test 0
683    //! v1: %0:v[0] = v_mov_b32 0
684    //! s2: %0:exec = s_mov_b64 -1
685    //! v1: %0:v[1] = v_mov_b32 1
686    //! s_waitcnt_depctr va_vdst(0)
687    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
688    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
689    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
690    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
691    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
692    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
693             Operand(PhysReg(257), v1));
694 
695    /* We should consider both the closest and further VALU after the exec write. */
696    //! p_unit_test 1
697    //! v1: %0:v[0] = v_mov_b32 0
698    //! s2: %0:exec = s_mov_b64 -1
699    //! v1: %0:v[1] = v_mov_b32 1
700    //; for i in range(2): insert_pattern('v_nop')
701    //! v1: %0:v[2] = v_mov_b32 2
702    //! s_waitcnt_depctr va_vdst(0)
703    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
704    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
705    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
706    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
707    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
708    bld.vop1(aco_opcode::v_nop);
709    bld.vop1(aco_opcode::v_nop);
710    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
711    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
712             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
713 
714    //! p_unit_test 2
715    //! v1: %0:v[0] = v_mov_b32 0
716    //! s2: %0:exec = s_mov_b64 -1
717    //! v1: %0:v[1] = v_mov_b32 1
718    //! v1: %0:v[2] = v_mov_b32 2
719    //; for i in range(4): insert_pattern('v_nop')
720    //! s_waitcnt_depctr va_vdst(0)
721    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
722    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
723    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
724    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
725    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
726    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
727    for (unsigned i = 0; i < 4; i++)
728       bld.vop1(aco_opcode::v_nop);
729    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
730             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
731 
732    /* If a VALU writes a read VGPR in-between the first and second writes, it should still be
733     * counted towards the distance between the first and second writes.
734     */
735    //! p_unit_test 3
736    //! v1: %0:v[0] = v_mov_b32 0
737    //! s2: %0:exec = s_mov_b64 -1
738    //! v1: %0:v[1] = v_mov_b32 1
739    //; for i in range(2): insert_pattern('v_nop')
740    //! v1: %0:v[2] = v_mov_b32 2
741    //; for i in range(3): insert_pattern('v_nop')
742    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
743    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
744    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
745    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
746    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
747    bld.vop1(aco_opcode::v_nop);
748    bld.vop1(aco_opcode::v_nop);
749    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
750    for (unsigned i = 0; i < 3; i++)
751       bld.vop1(aco_opcode::v_nop);
752    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
753             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
754 
755    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
756 
757    finish_insert_nops_test();
758 END_TEST
759 
760 BEGIN_TEST(insert_nops.valu_partial_forwarding.multiple_exec_writes)
761    if (!setup_cs(NULL, GFX11))
762       return;
763 
764    //>> p_unit_test 0
765    //! v1: %0:v[0] = v_mov_b32 0
766    //! s2: %0:exec = s_mov_b64 0
767    //! s2: %0:exec = s_mov_b64 -1
768    //! v1: %0:v[1] = v_mov_b32 1
769    //! s_waitcnt_depctr va_vdst(0)
770    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
771    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
772    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
773    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
774    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
775    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
776    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
777             Operand(PhysReg(257), v1));
778 
779    //! p_unit_test 1
780    //! v1: %0:v[0] = v_mov_b32 0
781    //! s2: %0:exec = s_mov_b64 0
782    //! v1: %0:v[1] = v_mov_b32 1
783    //! s2: %0:exec = s_mov_b64 -1
784    //! s_waitcnt_depctr va_vdst(0)
785    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
786    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
787    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
788    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
789    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
790    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
791    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
792             Operand(PhysReg(257), v1));
793 
794    finish_insert_nops_test();
795 END_TEST
796 
797 BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
798    if (!setup_cs(NULL, GFX11))
799       return;
800 
801    /* Control flow merges: one branch shouldn't interfere with the other (clobbering VALU closer
802     * than interesting one).
803     */
804    //>> p_unit_test 0
805    //! s_cbranch_scc1 block:BB2
806    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0u));
807    bld.sopp(aco_opcode::s_cbranch_scc1, 2);
808 
809    //! BB1
810    //! /* logical preds: / linear preds: BB0, / kind: */
811    //! v1: %0:v[0] = v_mov_b32 0
812    //! s2: %0:exec = s_mov_b64 -1
813    //! v_nop
814    //! s_branch block:BB3
815    bld.reset(program->create_and_insert_block());
816    program->blocks[0].linear_succs.push_back(1);
817    program->blocks[1].linear_preds.push_back(0);
818    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
819    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
820    bld.vop1(aco_opcode::v_nop);
821    bld.sopp(aco_opcode::s_branch, 3);
822 
823    //! BB2
824    //! /* logical preds: / linear preds: BB0, / kind: */
825    //! v1: %0:v[0] = v_mov_b32 0
826    bld.reset(program->create_and_insert_block());
827    program->blocks[0].linear_succs.push_back(2);
828    program->blocks[2].linear_preds.push_back(0);
829    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
830 
831    //! BB3
832    //! /* logical preds: / linear preds: BB1, BB2, / kind: */
833    //! v1: %0:v[1] = v_mov_b32 1
834    //! s_waitcnt_depctr va_vdst(0)
835    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
836    bld.reset(program->create_and_insert_block());
837    program->blocks[1].linear_succs.push_back(3);
838    program->blocks[2].linear_succs.push_back(3);
839    program->blocks[3].linear_preds.push_back(1);
840    program->blocks[3].linear_preds.push_back(2);
841    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
842    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
843             Operand(PhysReg(257), v1));
844 
845    /* Control flow merges: one branch shouldn't interfere with the other (should consider furthest
846     * VALU writes after exec).
847     */
848    //! p_unit_test 1
849    //! s_cbranch_scc1 block:BB5
850    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
851    bld.sopp(aco_opcode::s_cbranch_scc1, 5);
852 
853    //! BB4
854    //! /* logical preds: / linear preds: BB3, / kind: */
855    //! v1: %0:v[0] = v_mov_b32 0
856    //! s2: %0:exec = s_mov_b64 -1
857    //; for i in range(2): insert_pattern('v_nop')
858    //! v1: %0:v[1] = v_mov_b32 1
859    //! v_nop
860    //! s_branch block:BB6
861    bld.reset(program->create_and_insert_block());
862    program->blocks[3].linear_succs.push_back(4);
863    program->blocks[4].linear_preds.push_back(3);
864    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
865    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
866    bld.vop1(aco_opcode::v_nop);
867    bld.vop1(aco_opcode::v_nop);
868    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
869    bld.vop1(aco_opcode::v_nop);
870    bld.sopp(aco_opcode::s_branch, 6);
871 
872    //! BB5
873    //! /* logical preds: / linear preds: BB3, / kind: */
874    //! v1: %0:v[1] = v_mov_b32 1
875    bld.reset(program->create_and_insert_block());
876    program->blocks[3].linear_succs.push_back(5);
877    program->blocks[5].linear_preds.push_back(3);
878    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
879 
880    //! BB6
881    //! /* logical preds: / linear preds: BB4, BB5, / kind: */
882    //! s_waitcnt_depctr va_vdst(0)
883    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
884    bld.reset(program->create_and_insert_block());
885    program->blocks[4].linear_succs.push_back(6);
886    program->blocks[5].linear_succs.push_back(6);
887    program->blocks[6].linear_preds.push_back(4);
888    program->blocks[6].linear_preds.push_back(5);
889    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
890             Operand(PhysReg(257), v1));
891 
892    /* Control flow merges: one branch shouldn't interfere with the other (should consider closest
893     * VALU writes after exec).
894     */
895    //! p_unit_test 2
896    //! s_cbranch_scc1 block:BB8
897    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
898    bld.sopp(aco_opcode::s_cbranch_scc1, 8);
899 
900    //! BB7
901    //! /* logical preds: / linear preds: BB6, / kind: */
902    //! v1: %0:v[0] = v_mov_b32 0
903    //! s2: %0:exec = s_mov_b64 -1
904    //! v1: %0:v[1] = v_mov_b32 1
905    //; for i in range(4): insert_pattern('v_nop')
906    //! s_branch block:BB9
907    bld.reset(program->create_and_insert_block());
908    program->blocks[6].linear_succs.push_back(7);
909    program->blocks[7].linear_preds.push_back(6);
910    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
911    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
912    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
913    for (unsigned i = 0; i < 4; i++)
914       bld.vop1(aco_opcode::v_nop);
915    bld.sopp(aco_opcode::s_branch, 9);
916 
917    //! BB8
918    //! /* logical preds: / linear preds: BB6, / kind: */
919    //! v1: %0:v[1] = v_mov_b32 1
920    //; for i in range(5): insert_pattern('v_nop')
921    bld.reset(program->create_and_insert_block());
922    program->blocks[6].linear_succs.push_back(8);
923    program->blocks[8].linear_preds.push_back(6);
924    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
925    for (unsigned i = 0; i < 5; i++)
926       bld.vop1(aco_opcode::v_nop);
927 
928    //! BB9
929    //! /* logical preds: / linear preds: BB7, BB8, / kind: uniform, */
930    //! s_waitcnt_depctr va_vdst(0)
931    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
932    bld.reset(program->create_and_insert_block());
933    program->blocks[7].linear_succs.push_back(9);
934    program->blocks[8].linear_succs.push_back(9);
935    program->blocks[9].linear_preds.push_back(7);
936    program->blocks[9].linear_preds.push_back(8);
937    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
938             Operand(PhysReg(257), v1));
939 
940    finish_insert_nops_test();
941 END_TEST
942 
943 BEGIN_TEST(insert_nops.valu_mask_write)
944    if (!setup_cs(NULL, GFX11))
945       return;
946 
947    /* Basic case. */
948    //>> p_unit_test 0
949    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
950    //! s1: %0:s[1] = s_mov_b32 0
951    //! s_waitcnt_depctr sa_sdst(0)
952    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
953    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
954    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
955                 Operand::zero(), Operand(PhysReg(0), s2));
956    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
957    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
958 
959    /* Mitigation. */
960    //! p_unit_test 1
961    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
962    //! v1: %0:v[1] = v_mov_b32 %0:s[1]
963    //! s1: %0:s[1] = s_mov_b32 0
964    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
965    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
966    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
967                 Operand::zero(), Operand(PhysReg(0), s2));
968    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
969    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
970    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
971 
972    //! p_unit_test 2
973    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
974    //! s1: %0:s[1] = s_mov_b32 0
975    //! s_waitcnt_depctr sa_sdst(0)
976    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
977    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
978    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
979    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
980                 Operand::zero(), Operand(PhysReg(0), s2));
981    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
982    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
983    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
984 
985    //! p_unit_test 3
986    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
987    //! s1: %0:s[1] = s_mov_b32 0
988    //! s_waitcnt_depctr sa_sdst(0)
989    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
990    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
991    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
992                 Operand::zero(), Operand(PhysReg(0), s2));
993    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
994    bld.sopp(aco_opcode::s_waitcnt_depctr, -1, 0xfffe);
995    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
996 
997    /* Instruction which is both involved in the hazard and is a mitigation. */
998    //! p_unit_test 4
999    //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
1000    //! s1: %0:s[1] = s_mov_b32 0
1001    //! s_waitcnt_depctr sa_sdst(0)
1002    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1003    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1004    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand(PhysReg(2), s1),
1005                 Operand::zero(), Operand(PhysReg(0), s2));
1006    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1007    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1008 
1009    finish_insert_nops_test();
1010 END_TEST
1011 
1012 BEGIN_TEST(insert_nops.setpc_gfx6)
1013    if (!setup_cs(NULL, GFX6))
1014       return;
1015 
1016    /* SGPR->SMEM hazards */
1017    //>> p_unit_test 0
1018    //! s1: %0:s[0] = s_mov_b32 0
1019    //! s_nop imm:2
1020    //! s_setpc_b64 0
1021    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1022    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1023    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1024 
1025    //! p_unit_test 1
1026    //! s1: %0:s[0] = s_mov_b32 0
1027    //! s_nop imm:2
1028    //! s_setpc_b64 0
1029    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1030    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1031    bld.sopp(aco_opcode::s_nop, -1, 2);
1032    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1033 
1034    finish_insert_nops_test();
1035 
1036    /* This hazard can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves it. */
1037 
1038    /* VINTRP->v_readlane_b32/etc */
1039    //>> p_unit_test 2
1040    //! v1: %0:v[0] = v_interp_mov_f32 2, %0:m0 attr0.x
1041    //! s_nop
1042    create_program(GFX6, compute_cs, 64, CHIP_UNKNOWN);
1043    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1044    bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(PhysReg(256), v1), Operand::c32(2u),
1045               Operand(m0, s1), 0, 0);
1046    finish_insert_nops_test(false);
1047 END_TEST
1048 
1049 BEGIN_TEST(insert_nops.setpc_gfx7)
1050    for (amd_gfx_level gfx : {GFX7, GFX9}) {
1051       if (!setup_cs(NULL, gfx))
1052          continue;
1053 
1054       //>> p_unit_test 0
1055       //! s_setpc_b64 0
1056       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1057       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1058 
1059       /* Break up SMEM clauses: resolved by the s_setpc_b64 itself */
1060       //! p_unit_test 1
1061       //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1062       //! s_setpc_b64 0
1063       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1064       bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1065       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1066 
1067       /* SALU and GDS hazards */
1068       //! p_unit_test 2
1069       //! s_setreg_imm32_b32 0x0 imm:14337
1070       //! s_nop
1071       //! s_setpc_b64 0
1072       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1073       bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(0), (7 << 11) | 1);
1074       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1075 
1076       /* VALU writes vcc -> vccz/v_div_fmas */
1077       //! p_unit_test 3
1078       //! s2: %0:vcc = v_cmp_eq_u32 0, 0
1079       //! s_nop imm:3
1080       //! s_setpc_b64 0
1081       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1082       bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand::zero(), Operand::zero());
1083       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1084 
1085       /* VALU writes exec -> execz/DPP */
1086       //! p_unit_test 4
1087       //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1088       //! s_nop imm:3
1089       //! s_setpc_b64 0
1090       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1091       bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(),
1092                    Operand::zero());
1093       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1094 
1095       /* VALU->DPP */
1096       //! p_unit_test 5
1097       //! v1: %0:v[0] = v_mov_b32 0
1098       //~gfx9! s_nop
1099       //! s_setpc_b64 0
1100       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1101       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1102       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1103 
1104       /* VALU->v_readlane_b32/VMEM/etc */
1105       //! p_unit_test 6
1106       //! s1: %0:s[0] = v_readfirstlane_b32 %0:v[0]
1107       //! s_nop imm:3
1108       //! s_setpc_b64 0
1109       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1110       bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(0), s1),
1111                Operand(PhysReg(256), v1));
1112       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1113 
1114       finish_insert_nops_test();
1115 
1116       /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves
1117        * them. */
1118 
1119       //>> p_unit_test 7
1120       //! buffer_store_dwordx3 %0:s[0-3], %0:v[0], 0, %0:v[0-2] offen
1121       //! s_nop
1122       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1123       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1124       bld.mubuf(aco_opcode::buffer_store_dwordx3, Operand(PhysReg(0), s4),
1125                 Operand(PhysReg(256), v1), Operand::zero(), Operand(PhysReg(256), v3), 0, true);
1126       finish_insert_nops_test(false);
1127 
1128       //>> p_unit_test 8
1129       //! s1: %0:m0 = s_mov_b32 0
1130       //! s_nop
1131       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1132       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1133       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(m0), s1), Operand::zero());
1134       finish_insert_nops_test(false);
1135 
1136       /* Break up SMEM clauses */
1137       //>> p_unit_test 9
1138       //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1139       //! s_nop
1140       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1141       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1142       bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1143       finish_insert_nops_test(false);
1144    }
1145 END_TEST
1146 
1147 BEGIN_TEST(insert_nops.setpc_gfx10)
1148    if (!setup_cs(NULL, GFX10))
1149       return;
1150 
1151    //>> p_unit_test 0
1152    //! s_setpc_b64 0
1153    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1154    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1155 
1156    /* VcmpxPermlaneHazard */
1157    //! p_unit_test 1
1158    //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1159    //! v1: %0:v[0] = v_mov_b32 %0:v[0]
1160    //! s_setpc_b64 0
1161    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1162    bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1163    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1164 
1165    /* VMEMtoScalarWriteHazard */
1166    //! p_unit_test 2
1167    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1168    //! s_waitcnt_vscnt %0:null imm:0
1169    //! s_waitcnt_depctr vm_vsrc(0)
1170    //! s_setpc_b64 0
1171    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1172    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1173    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1),
1174             0); /* reset LdsBranchVmemWARHazard */
1175    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1176 
1177    /* VcmpxExecWARHazard */
1178    //! p_unit_test 3
1179    //! s1: %0:s[0] = s_mov_b32 %0:exec_hi
1180    //! s_waitcnt_depctr sa_sdst(0)
1181    //! s_setpc_b64 0
1182    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1183    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand(exec_hi, s1));
1184    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1185 
1186    /* LdsBranchVmemWARHazard */
1187    //! p_unit_test 4
1188    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1189    //! v_nop
1190    //! s_branch
1191    //! s_waitcnt_vscnt %0:null imm:0
1192    //! s_setpc_b64 0
1193    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1194    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1195    bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1196    bld.sopp(aco_opcode::s_branch, -1, 0);
1197    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1198 
1199    //! p_unit_test 5
1200    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1201    //! v_nop
1202    //! s_waitcnt_vscnt %0:null imm:0
1203    //! s_setpc_b64 0
1204    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1205    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1206    bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1207    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1208 
1209    /* waNsaCannotFollowWritelane: resolved by the s_setpc_b64 */
1210    //! p_unit_test 6
1211    //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1212    //! s_setpc_b64 0
1213    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1214    bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1215             Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1216    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1217 
1218    finish_insert_nops_test();
1219 
1220    /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves them.
1221     */
1222 
1223    /* SMEMtoVectorWriteHazard */
1224    //>> p_unit_test 7
1225    //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1226    //! s1: %0:null = s_mov_b32 0
1227    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1228    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1229    bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1230    finish_insert_nops_test(false);
1231 
1232    /* NSAToVMEMBug is already resolved indirectly through VMEMtoScalarWriteHazard and
1233     * LdsBranchVmemWARHazard. */
1234    //>> p_unit_test 8
1235    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
1236    //! s_waitcnt_depctr vm_vsrc(0)
1237    //! s_waitcnt_vscnt %0:null imm:0
1238    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1239    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1240    create_mimg(true, 6, 4);
1241    finish_insert_nops_test(false);
1242 
1243    /* waNsaCannotFollowWritelane */
1244    //>> p_unit_test 9
1245    //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1246    //! s_nop
1247    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1248    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1249    bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1250             Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1251    finish_insert_nops_test(false);
1252 END_TEST
1253 
1254 BEGIN_TEST(insert_nops.setpc_gfx11)
1255    if (!setup_cs(NULL, GFX11))
1256       return;
1257 
1258    //>> p_unit_test 0
1259    //! s_setpc_b64 0
1260    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1261    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1262 
1263    /* LdsDirectVALUHazard */
1264    //! p_unit_test 1
1265    //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
1266    //! s_waitcnt_depctr va_vdst(0)
1267    //! s_setpc_b64 0
1268    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1269    bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
1270                 Operand::zero());
1271    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1272 
1273    /* VALUPartialForwardingHazard */
1274    //! p_unit_test 2
1275    //! v1: %0:v[0] = v_mov_b32 0
1276    //! s_waitcnt_depctr va_vdst(0)
1277    //! s_setpc_b64 0
1278    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1279    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1280    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1281 
1282    /* VcmpxPermlaneHazard */
1283    //! p_unit_test 2
1284    //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1285    //! v_nop
1286    //! s_setpc_b64 0
1287    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1288    bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1289    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1290 
1291    /* VALUTransUseHazard */
1292    //! p_unit_test 3
1293    //! v1: %0:v[0] = v_rcp_f32 0
1294    //! s_waitcnt_depctr va_vdst(0)
1295    //! s_setpc_b64 0
1296    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1297    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand::zero());
1298    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1299 
1300    /* VALUMaskWriteHazard */
1301    //! p_unit_test 4
1302    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1303    //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
1304    //! s_setpc_b64 0
1305    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1306    bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1307             Operand::zero(), Operand(vcc, s2));
1308    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1309 
1310    //! p_unit_test 5
1311    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1312    //! s2: %0:vcc = s_mov_b64 0
1313    //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
1314    //! s_setpc_b64 0
1315    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1316    bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1317             Operand::zero(), Operand(vcc, s2));
1318    bld.sop1(aco_opcode::s_mov_b64, Definition(vcc, s2), Operand::zero(8));
1319    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1320 
1321    /* LdsDirectVMEMHazard */
1322    //! p_unit_test 6
1323    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1324    //! s_waitcnt_depctr vm_vsrc(0)
1325    //! s_setpc_b64 0
1326    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1327    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1328    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1329 
1330    finish_insert_nops_test(true);
1331 }
1332