/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
6 #include "common/amdgfxregs.h"
7 
8 #include "helpers.h"
9 
10 using namespace aco;
11 
12 void
create_mubuf(unsigned offset,PhysReg dst=PhysReg (256),PhysReg vaddr=PhysReg (256))13 create_mubuf(unsigned offset, PhysReg dst = PhysReg(256), PhysReg vaddr = PhysReg(256))
14 {
15    bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4),
16              Operand(vaddr, v1), Operand::zero(), offset, true);
17 }
18 
19 void
create_mubuf_store(PhysReg src=PhysReg (256))20 create_mubuf_store(PhysReg src = PhysReg(256))
21 {
22    bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), Operand(src, v1),
23              Operand::zero(), Operand(src, v1), 0, true);
24 }
25 
26 void
create_mimg(bool nsa,unsigned addrs,unsigned instr_dwords)27 create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
28 {
29    aco_ptr<Instruction> mimg{
30       create_instruction(aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)};
31    mimg->definitions[0] = Definition(PhysReg(256), v1);
32    mimg->operands[0] = Operand(PhysReg(0), s8);
33    mimg->operands[1] = Operand(PhysReg(0), s4);
34    mimg->operands[2] = Operand(v1);
35    for (unsigned i = 0; i < addrs; i++)
36       mimg->operands[3 + i] = Operand(PhysReg(256 + (nsa ? i * 2 : i)), v1);
37    mimg->mimg().dmask = 0x1;
38    mimg->mimg().dim = ac_image_2d;
39 
40    assert(get_mimg_nsa_dwords(mimg.get()) + 2 == instr_dwords);
41 
42    bld.insert(std::move(mimg));
43 }
44 
45 void
create_bvh()46 create_bvh()
47 {
48    aco_ptr<Instruction> instr{
49       create_instruction(aco_opcode::image_bvh64_intersect_ray, Format::MIMG, 8, 1)};
50    instr->definitions[0] = Definition(PhysReg(256), v4);
51    instr->operands[0] = Operand(PhysReg(0), s4);
52    instr->operands[1] = Operand(s4);
53    instr->operands[2] = Operand(v1);
54    instr->operands[3] = Operand(PhysReg(256 + 0), v2); /* node */
55    instr->operands[4] = Operand(PhysReg(256 + 2), v1); /* tmax */
56    instr->operands[5] = Operand(PhysReg(256 + 3), v3); /* origin */
57    instr->operands[6] = Operand(PhysReg(256 + 6), v3); /* dir */
58    instr->operands[7] = Operand(PhysReg(256 + 9), v3); /* inv dir */
59    instr->mimg().dmask = 0xf;
60    instr->mimg().unrm = true;
61    instr->mimg().r128 = true;
62    bld.insert(std::move(instr));
63 }
64 
65 BEGIN_TEST(insert_nops.nsa_to_vmem_bug)
66    if (!setup_cs(NULL, GFX10))
67       return;
68 
69    /* no nop needed because offset&6==0 */
70    //>> p_unit_test 0
71    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
72    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:8 offen
73    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
74    create_mimg(true, 6, 4);
75    create_mubuf(8);
76 
77    /* nop needed */
78    //! p_unit_test 1
79    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
80    //! s_nop
81    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
82    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
83    create_mimg(true, 6, 4);
84    create_mubuf(4);
85 
86    /* no nop needed because the MIMG is not NSA */
87    //! p_unit_test 2
88    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[1], %0:v[2], %0:v[3], %0:v[4], %0:v[5] 2d
89    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
90    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
91    create_mimg(false, 6, 2);
92    create_mubuf(4);
93 
94    /* no nop needed because there's already an instruction in-between */
95    //! p_unit_test 3
96    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
97    //! v_nop
98    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
99    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
100    create_mimg(true, 6, 4);
101    bld.vop1(aco_opcode::v_nop);
102    create_mubuf(4);
103 
104    /* no nop needed because the NSA instruction is under 4 dwords */
105    //! p_unit_test 4
106    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
107    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
108    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
109    create_mimg(true, 2, 3);
110    create_mubuf(4);
111 
112    /* NSA instruction and MUBUF/MTBUF in a different block */
113    //! p_unit_test 5
114    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
115    //! BB1
116    //! /* logical preds: / linear preds: BB0, / kind: uniform, */
117    //! s_nop
118    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
119    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
120    create_mimg(true, 6, 4);
121    bld.reset(program->create_and_insert_block());
122    create_mubuf(4);
123    program->blocks[0].linear_succs.push_back(1);
124    program->blocks[1].linear_preds.push_back(0);
125 
126    finish_insert_nops_test();
127 END_TEST
128 
129 BEGIN_TEST(insert_nops.writelane_to_nsa_bug)
130    if (!setup_cs(NULL, GFX10))
131       return;
132 
133    /* nop needed */
134    //>> p_unit_test 0
135    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
136    //! s_nop
137    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
138    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
139    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
140                  Operand(PhysReg(511), v1));
141    create_mimg(true, 2, 3);
142 
143    /* no nop needed because the MIMG is not NSA */
144    //! p_unit_test 1
145    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
146    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[1] 2d
147    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
148    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
149                  Operand(PhysReg(511), v1));
150    create_mimg(false, 2, 2);
151 
152    /* no nop needed because there's already an instruction in-between */
153    //! p_unit_test 2
154    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
155    //! v_nop
156    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
157    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
158    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
159                  Operand(PhysReg(511), v1));
160    bld.vop1(aco_opcode::v_nop);
161    create_mimg(true, 2, 3);
162 
163    /* writelane and NSA instruction in different blocks */
164    //! p_unit_test 3
165    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
166    //! BB1
167    //! /* logical preds: / linear preds: BB0, / kind: uniform, */
168    //! s_nop
169    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
170    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
171    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
172                  Operand(PhysReg(511), v1));
173    bld.reset(program->create_and_insert_block());
174    create_mimg(true, 2, 3);
175    program->blocks[0].linear_succs.push_back(1);
176    program->blocks[1].linear_preds.push_back(0);
177 
178    finish_insert_nops_test();
179 END_TEST
180 
181 BEGIN_TEST(insert_nops.vmem_to_scalar_write)
182    if (!setup_cs(NULL, GFX10))
183       return;
184 
185    /* WaR: VMEM load */
186    //>> p_unit_test 0
187    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
188    //! s_waitcnt_depctr vm_vsrc(0)
189    //! s1: %0:s[0] = s_mov_b32 0
190    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
191    create_mubuf(0);
192    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
193 
194    //! p_unit_test 1
195    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
196    //! s_waitcnt_depctr vm_vsrc(0)
197    //! s2: %0:exec = s_mov_b64 -1
198    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
199    create_mubuf(0);
200    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
201 
202    /* no hazard: VMEM load */
203    //! p_unit_test 2
204    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
205    //! s1: %0:s[4] = s_mov_b32 0
206    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
207    create_mubuf(0);
208    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero());
209 
210    /* no hazard: VMEM load with VALU in-between */
211    //! p_unit_test 3
212    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
213    //! v_nop
214    //! s1: %0:s[0] = s_mov_b32 0
215    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
216    create_mubuf(0);
217    bld.vop1(aco_opcode::v_nop);
218    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
219 
220    /* WaR: LDS */
221    //! p_unit_test 4
222    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
223    //! s_waitcnt_depctr vm_vsrc(0)
224    //! s1: %0:m0 = s_mov_b32 0
225    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
226    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
227           Operand(m0, s1));
228    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
229 
230    //! p_unit_test 5
231    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
232    //! s_waitcnt_depctr vm_vsrc(0)
233    //! s2: %0:exec = s_mov_b64 -1
234    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
235    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
236           Operand(m0, s1));
237    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
238 
239    /* no hazard: LDS */
240    //! p_unit_test 6
241    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
242    //! s1: %0:s[0] = s_mov_b32 0
243    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
244    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
245           Operand(m0, s1));
246    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
247 
248    /* no hazard: LDS with VALU in-between */
249    //! p_unit_test 7
250    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
251    //! v_nop
252    //! s1: %0:m0 = s_mov_b32 0
253    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
254    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
255           Operand(m0, s1));
256    bld.vop1(aco_opcode::v_nop);
257    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
258 
259    /* no hazard: VMEM/LDS with the correct waitcnt in-between */
260    //! p_unit_test 8
261    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
262    //! s_waitcnt vmcnt(0)
263    //! s1: %0:s[0] = s_mov_b32 0
264    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
265    create_mubuf(0);
266    bld.sopp(aco_opcode::s_waitcnt, 0x3f70);
267    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
268 
269    //! p_unit_test 9
270    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
271    //! s_waitcnt_vscnt %0:null imm:0
272    //! s1: %0:s[0] = s_mov_b32 0
273    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
274    create_mubuf_store();
275    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
276    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
277 
278    //! p_unit_test 10
279    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
280    //! s_waitcnt lgkmcnt(0)
281    //! s1: %0:m0 = s_mov_b32 0
282    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
283    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
284           Operand(m0, s1));
285    bld.sopp(aco_opcode::s_waitcnt, 0xc07f);
286    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
287 
288    /* VMEM/LDS with the wrong waitcnt in-between */
289    //! p_unit_test 11
290    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
291    //! s_waitcnt_vscnt %0:null imm:0
292    //! s_waitcnt_depctr vm_vsrc(0)
293    //! s1: %0:s[0] = s_mov_b32 0
294    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
295    create_mubuf(0);
296    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
297    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
298 
299    //! p_unit_test 12
300    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
301    //! s_waitcnt lgkmcnt(0)
302    //! s_waitcnt_depctr vm_vsrc(0)
303    //! s1: %0:s[0] = s_mov_b32 0
304    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
305    create_mubuf_store();
306    bld.sopp(aco_opcode::s_waitcnt, 0xc07f);
307    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
308 
309    //! p_unit_test 13
310    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
311    //! s_waitcnt vmcnt(0)
312    //! s_waitcnt_depctr vm_vsrc(0)
313    //! s1: %0:m0 = s_mov_b32 0
314    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
315    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
316           Operand(m0, s1));
317    bld.sopp(aco_opcode::s_waitcnt, 0x3f70);
318    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
319 
320    finish_insert_nops_test();
321 END_TEST
322 
323 BEGIN_TEST(insert_nops.lds_direct_valu)
324    for (amd_gfx_level gfx : {GFX11, GFX12}) {
325       if (!setup_cs(NULL, gfx))
326          continue;
327 
328       /* WaW */
329       //>> p_unit_test 0
330       //! v1: %0:v[0] = v_mov_b32 0
331       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
332       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
333       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
334       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
335 
336       /* WaR */
337       //! p_unit_test 1
338       //! v1: %0:v[1] = v_mov_b32 %0:v[0]
339       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
340       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
341       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
342       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
343 
344       /* No hazard. */
345       //! p_unit_test 2
346       //! v1: %0:v[1] = v_mov_b32 0
347       //! v1: %0:v[0] = lds_direct_load %0:m0
348       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
349       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
350       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
351 
352       /* multiples hazards, nearest should be considered */
353       //! p_unit_test 3
354       //! v1: %0:v[1] = v_mov_b32 %0:v[0]
355       //! v1: %0:v[0] = v_mov_b32 0
356       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
357       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
358       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
359       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
360       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
361 
362       /* independent VALU increase wait_vdst */
363       //! p_unit_test 4
364       //! v1: %0:v[0] = v_mov_b32 0
365       //! v_nop
366       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
367       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
368       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
369       bld.vop1(aco_opcode::v_nop);
370       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
371 
372       //! p_unit_test 5
373       //! v1: %0:v[0] = v_mov_b32 0
374       //; for i in range(10): insert_pattern('v_nop')
375       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
376       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
377       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
378       for (unsigned i = 0; i < 10; i++)
379          bld.vop1(aco_opcode::v_nop);
380       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
381 
382       //! p_unit_test 6
383       //! v1: %0:v[0] = v_mov_b32 0
384       //; for i in range(20): insert_pattern('v_nop')
385       //! v1: %0:v[0] = lds_direct_load %0:m0
386       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
387       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
388       for (unsigned i = 0; i < 20; i++)
389          bld.vop1(aco_opcode::v_nop);
390       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
391 
392       /* transcendental requires wait_vdst=0 */
393       //! p_unit_test 7
394       //! v1: %0:v[0] = v_mov_b32 0
395       //! v_nop
396       //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
397       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
398       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
399       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
400       bld.vop1(aco_opcode::v_nop);
401       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
402       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
403 
404       //! p_unit_test 8
405       //! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
406       //! v_nop
407       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
408       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
409       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
410       bld.vop1(aco_opcode::v_nop);
411       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
412 
413       /* transcendental is fine if it's before the instruction */
414       //! p_unit_test 9
415       //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
416       //! v1: %0:v[0] = v_mov_b32 0
417       //! v_nop
418       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
419       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
420       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
421       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
422       bld.vop1(aco_opcode::v_nop);
423       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
424 
425       /* non-VALU does not increase wait_vdst */
426       //! p_unit_test 10
427       //! v1: %0:v[0] = v_mov_b32 0
428       //! s1: %0:m0 = s_mov_b32 0
429       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
430       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
431       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
432       bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
433       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
434 
435       /* consider instructions which wait on vdst */
436       //! p_unit_test 11
437       //! v1: %0:v[0] = v_mov_b32 0
438       //! v_nop
439       //! s_waitcnt_depctr va_vdst(0)
440       //! v1: %0:v[0] = lds_direct_load %0:m0
441       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
442       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
443       bld.vop1(aco_opcode::v_nop);
444       bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
445       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
446 
447       finish_insert_nops_test();
448    }
449 END_TEST
450 
451 BEGIN_TEST(insert_nops.lds_direct_vmem)
452    for (amd_gfx_level gfx : {GFX11, GFX12}) {
453       if (!setup_cs(NULL, gfx))
454          continue;
455 
456       /* WaR: VMEM */
457       //>> p_unit_test 0
458       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
459       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
460       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
461       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
462       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
463       create_mubuf(0, PhysReg(257));
464       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
465 
466       /* WaW: VMEM */
467       //! p_unit_test 1
468       //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
469       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
470       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
471       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
472       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
473       create_mubuf(0, PhysReg(256), PhysReg(257));
474       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
475 
476       /* no hazard: VMEM */
477       //! p_unit_test 2
478       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
479       //! v1: %0:v[0] = lds_direct_load %0:m0
480       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
481       create_mubuf(0, PhysReg(257), PhysReg(257));
482       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
483 
484       /* no hazard: VMEM with VALU in-between */
485       //! p_unit_test 3
486       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
487       //! v_nop
488       //! v1: %0:v[0] = lds_direct_load %0:m0
489       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
490       create_mubuf(0, PhysReg(257));
491       bld.vop1(aco_opcode::v_nop);
492       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
493 
494       /* WaR: LDS */
495       //! p_unit_test 4
496       //! v1: %0:v[1] = ds_read_b32 %0:v[0]
497       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
498       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
499       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
500       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
501       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
502       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
503 
504       /* WaW: LDS */
505       //! p_unit_test 5
506       //! v1: %0:v[0] = ds_read_b32 %0:v[1]
507       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
508       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
509       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
510       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
511       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
512       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
513 
514       /* no hazard: LDS */
515       //! p_unit_test 6
516       //! v1: %0:v[1] = ds_read_b32 %0:v[1]
517       //! v1: %0:v[0] = lds_direct_load %0:m0
518       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
519       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
520       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
521 
522       /* no hazard: LDS with VALU in-between */
523       //! p_unit_test 7
524       //! v1: %0:v[1] = ds_read_b32 %0:v[0]
525       //! v_nop
526       //! v1: %0:v[0] = lds_direct_load %0:m0
527       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
528       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
529       bld.vop1(aco_opcode::v_nop);
530       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
531 
532       /* no hazard: VMEM/LDS with the correct waitcnt in-between */
533       //! p_unit_test 8
534       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
535       //~gfx11! s_waitcnt vmcnt(0)
536       //~gfx12! s_wait_loadcnt imm:0
537       //! v1: %0:v[0] = lds_direct_load %0:m0
538       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
539       create_mubuf(0, PhysReg(257));
540       if (gfx >= GFX12)
541          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
542       else
543          bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
544       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
545 
546       //! p_unit_test 9
547       //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
548       //~gfx11! s_waitcnt_vscnt %0:null imm:0
549       //~gfx12! s_wait_storecnt imm:0
550       //! v1: %0:v[0] = lds_direct_load %0:m0
551       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
552       create_mubuf_store();
553       if (gfx >= GFX12)
554          bld.sopp(aco_opcode::s_wait_storecnt, 0);
555       else
556          bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
557       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
558 
559       //! p_unit_test 10
560       //! v1: %0:v[1] = ds_read_b32 %0:v[0]
561       //~gfx11! s_waitcnt lgkmcnt(0)
562       //~gfx12! s_wait_dscnt imm:0
563       //! v1: %0:v[0] = lds_direct_load %0:m0
564       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
565       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
566       if (gfx >= GFX12)
567          bld.sopp(aco_opcode::s_wait_dscnt, 0);
568       else
569          bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
570       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
571 
572       if (gfx >= GFX12) {
573          //~gfx12! p_unit_test 11
574          //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
575          //~gfx12! s_wait_loadcnt imm:0
576          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
577          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
578          Instruction* instr =
579             bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
580                      Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
581                .instr;
582          instr->mimg().dmask = 0x1;
583          instr->mimg().dim = ac_image_2d;
584          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
585          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
586 
587          //~gfx12! p_unit_test 12
588          //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
589          //~gfx12! s_wait_samplecnt imm:0
590          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
591          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
592          instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
593                           Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
594                           Operand(PhysReg(256), v2))
595                     .instr;
596          instr->mimg().dmask = 0x1;
597          instr->mimg().dim = ac_image_2d;
598          bld.sopp(aco_opcode::s_wait_samplecnt, 0);
599          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
600 
601          //~gfx12! p_unit_test 13
602          //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
603          //~gfx12! s_wait_bvhcnt imm:0
604          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
605          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
606          create_bvh();
607          bld.sopp(aco_opcode::s_wait_bvhcnt, 0);
608          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
609       }
610 
611       /* VMEM/LDS with the wrong waitcnt in-between */
612       //! p_unit_test 14
613       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
614       //~gfx11! s_waitcnt_vscnt %0:null imm:0
615       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
616       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
617       //~gfx12! s_wait_storecnt imm:0
618       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
619       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
620       create_mubuf(0, PhysReg(257));
621       if (gfx >= GFX12)
622          bld.sopp(aco_opcode::s_wait_storecnt, 0);
623       else
624          bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
625       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
626 
627       //! p_unit_test 15
628       //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
629       //~gfx11! s_waitcnt lgkmcnt(0)
630       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
631       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
632       //~gfx12! s_wait_dscnt imm:0
633       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
634       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
635       create_mubuf_store();
636       if (gfx >= GFX12)
637          bld.sopp(aco_opcode::s_wait_dscnt, 0);
638       else
639          bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
640       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
641 
642       //! p_unit_test 16
643       //! v1: %0:v[1] = ds_read_b32 %0:v[0]
644       //~gfx11! s_waitcnt vmcnt(0)
645       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
646       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
647       //~gfx12! s_wait_loadcnt imm:0
648       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
649       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
650       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
651       if (gfx >= GFX12)
652          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
653       else
654          bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
655       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
656 
657       //! p_unit_test 17
658       //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
659       //~gfx11! s_waitcnt_vscnt %0:null imm:0
660       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
661       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
662       //~gfx12! s_wait_storecnt imm:0
663       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
664       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
665       create_mubuf(0, PhysReg(256), PhysReg(257));
666       if (gfx >= GFX12)
667          bld.sopp(aco_opcode::s_wait_storecnt, 0);
668       else
669          bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
670       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
671 
672       if (gfx >= GFX12) {
673          //~gfx12! p_unit_test 18
674          //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
675          //~gfx12! s_wait_samplecnt imm:0
676          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
677          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
678          Instruction* instr =
679             bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
680                      Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
681                .instr;
682          instr->mimg().dmask = 0x1;
683          instr->mimg().dim = ac_image_2d;
684          bld.sopp(aco_opcode::s_wait_samplecnt, 0);
685          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
686 
687          //~gfx12! p_unit_test 19
688          //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
689          //~gfx12! s_wait_loadcnt imm:0
690          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
691          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
692          instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
693                           Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
694                           Operand(PhysReg(256), v2))
695                     .instr;
696          instr->mimg().dmask = 0x1;
697          instr->mimg().dim = ac_image_2d;
698          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
699          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
700 
701          //~gfx12! p_unit_test 20
702          //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
703          //~gfx12! s_wait_loadcnt imm:0
704          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
705          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
706          create_bvh();
707          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
708          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
709       }
710 
711       finish_insert_nops_test();
712    }
713 END_TEST
714 
715 BEGIN_TEST(insert_nops.valu_trans_use)
       /* Exercises a v_rcp_f32 that writes v[0] followed by a v_mov_b32 that reads v[0], with
        * different instructions placed in between. The pattern comments above each sequence give
        * the expected output, including whether "s_waitcnt_depctr va_vdst(0)" precedes the read.
        * The test returns early when setup_cs() fails for GFX11.
        */
716    if (!setup_cs(NULL, GFX11))
717       return;
718 
       /* Basic case: the read directly follows the v_rcp_f32, so a wait is expected. */
719    //>> p_unit_test 0
720    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
721    //! s_waitcnt_depctr va_vdst(0)
722    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
723    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
724    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
725    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
726 
727    /* Sufficient VALU mitigates the hazard. */
       /* Four v_nop in between still expect the wait (test 1); eight do not (test 2). */
728    //! p_unit_test 1
729    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
730    //; for i in range(4): insert_pattern('v_nop')
731    //! s_waitcnt_depctr va_vdst(0)
732    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
733    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
734    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
735    for (unsigned i = 0; i < 4; i++)
736       bld.vop1(aco_opcode::v_nop);
737    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
738 
739    //! p_unit_test 2
740    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
741    //; for i in range(8): insert_pattern('v_nop')
742    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
743    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
744    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
745    for (unsigned i = 0; i < 8; i++)
746       bld.vop1(aco_opcode::v_nop);
747    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
748 
749    /* Sufficient transcendental VALU mitigates the hazard. */
       /* A single v_sqrt_f32 in between still expects the wait (test 3); two do not (test 4). */
750    //! p_unit_test 3
751    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
752    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
753    //! s_waitcnt_depctr va_vdst(0)
754    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
755    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
756    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
757    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
758    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
759 
760    //! p_unit_test 4
761    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
762    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
763    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
764    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
765    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
766    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
767    for (unsigned i = 0; i < 2; i++)
768       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
769    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
770 
771    /* Transcendental VALU should be counted towards VALU */
       /* Five v_nop plus one v_sqrt_f32 in between: no wait is expected. */
772    //! p_unit_test 5
773    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
774    //; for i in range(5): insert_pattern('v_nop')
775    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
776    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
777    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
778    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
779    for (unsigned i = 0; i < 5; i++)
780       bld.vop1(aco_opcode::v_nop);
781    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
782    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
783 
784    /* non-VALU does not mitigate the hazard. */
       /* Eight s_nop in between: the wait is still expected. */
785    //! p_unit_test 6
786    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
787    //; for i in range(8): insert_pattern('s_nop')
788    //! s_waitcnt_depctr va_vdst(0)
789    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
790    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
791    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
792    for (unsigned i = 0; i < 8; i++)
793       bld.sopp(aco_opcode::s_nop, 0);
794    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
795 
       /* NOTE(review): finish_insert_nops_test() is defined outside this chunk; presumably it runs
        * the pass under test and emits the output matched by the patterns above -- confirm.
        */
796    finish_insert_nops_test();
797 END_TEST
798 
799 BEGIN_TEST(insert_nops.valu_partial_forwarding.basic)
       /* Each sequence writes v[0] with a VALU, writes exec with an s_mov_b64, writes further VGPRs
        * with VALU instructions and finally reads all of them in one VALU instruction. The pattern
        * comments give the expected output, including whether "s_waitcnt_depctr va_vdst(0)"
        * precedes the final read. The test returns early when setup_cs() fails for GFX11.
        */
800    if (!setup_cs(NULL, GFX11))
801       return;
802 
803    /* Basic case. */
       /* v[0] is written before the exec write and v[1] right after it; the v_max_f32 reading both
        * is expected to be preceded by a wait.
        */
804    //>> p_unit_test 0
805    //! v1: %0:v[0] = v_mov_b32 0
806    //! s2: %0:exec = s_mov_b64 -1
807    //! v1: %0:v[1] = v_mov_b32 1
808    //! s_waitcnt_depctr va_vdst(0)
809    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
810    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
811    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
812    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
813    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
814    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
815             Operand(PhysReg(257), v1));
816 
817    /* We should consider both the closest and further VALU after the exec write. */
       /* Test 1: two v_nop sit between the v[1] and v[2] writes and the read follows immediately;
        * the wait is expected. Test 2: v[1] and v[2] are written back to back and four v_nop
        * precede the read; the wait is still expected.
        */
818    //! p_unit_test 1
819    //! v1: %0:v[0] = v_mov_b32 0
820    //! s2: %0:exec = s_mov_b64 -1
821    //! v1: %0:v[1] = v_mov_b32 1
822    //; for i in range(2): insert_pattern('v_nop')
823    //! v1: %0:v[2] = v_mov_b32 2
824    //! s_waitcnt_depctr va_vdst(0)
825    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
826    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
827    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
828    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
829    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
830    bld.vop1(aco_opcode::v_nop);
831    bld.vop1(aco_opcode::v_nop);
832    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
833    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
834             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
835 
836    //! p_unit_test 2
837    //! v1: %0:v[0] = v_mov_b32 0
838    //! s2: %0:exec = s_mov_b64 -1
839    //! v1: %0:v[1] = v_mov_b32 1
840    //! v1: %0:v[2] = v_mov_b32 2
841    //; for i in range(4): insert_pattern('v_nop')
842    //! s_waitcnt_depctr va_vdst(0)
843    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
844    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
845    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
846    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
847    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
848    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
849    for (unsigned i = 0; i < 4; i++)
850       bld.vop1(aco_opcode::v_nop);
851    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
852             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
853 
854    /* If a VALU writes a read VGPR in-between the first and second writes, it should still be
855     * counted towards the distance between the first and second writes.
856     */
       /* Two v_nop before the v[2] write and three after it: no wait is expected here. */
857    //! p_unit_test 3
858    //! v1: %0:v[0] = v_mov_b32 0
859    //! s2: %0:exec = s_mov_b64 -1
860    //! v1: %0:v[1] = v_mov_b32 1
861    //; for i in range(2): insert_pattern('v_nop')
862    //! v1: %0:v[2] = v_mov_b32 2
863    //; for i in range(3): insert_pattern('v_nop')
864    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
865    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
866    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
867    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
868    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
869    bld.vop1(aco_opcode::v_nop);
870    bld.vop1(aco_opcode::v_nop);
871    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
872    for (unsigned i = 0; i < 3; i++)
873       bld.vop1(aco_opcode::v_nop);
874    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
875             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
876 
       /* NOTE(review): this marker has no expected-output pattern attached; presumably it only
        * terminates the previous sequence -- confirm it is intentional.
        */
877    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
878 
       /* NOTE(review): finish_insert_nops_test() is defined outside this chunk; presumably it runs
        * the pass under test and emits the output matched by the patterns above -- confirm.
        */
879    finish_insert_nops_test();
880 END_TEST
881 
882 BEGIN_TEST(insert_nops.valu_partial_forwarding.multiple_exec_writes)
       /* Same shape as the basic partial-forwarding sequence (VALU write of v[0], exec write, VALU
        * write of v[1], VALU read of both), but with two s_mov_b64 writes to exec. In both
        * sequences the pattern comments expect "s_waitcnt_depctr va_vdst(0)" before the read.
        * The test returns early when setup_cs() fails for GFX11.
        */
883    if (!setup_cs(NULL, GFX11))
884       return;
885 
       /* Both exec writes are back to back, between the v[0] and v[1] writes. */
886    //>> p_unit_test 0
887    //! v1: %0:v[0] = v_mov_b32 0
888    //! s2: %0:exec = s_mov_b64 0
889    //! s2: %0:exec = s_mov_b64 -1
890    //! v1: %0:v[1] = v_mov_b32 1
891    //! s_waitcnt_depctr va_vdst(0)
892    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
893    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
894    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
895    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
896    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
897    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
898    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
899             Operand(PhysReg(257), v1));
900 
       /* The v[1] write sits between the two exec writes. */
901    //! p_unit_test 1
902    //! v1: %0:v[0] = v_mov_b32 0
903    //! s2: %0:exec = s_mov_b64 0
904    //! v1: %0:v[1] = v_mov_b32 1
905    //! s2: %0:exec = s_mov_b64 -1
906    //! s_waitcnt_depctr va_vdst(0)
907    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
908    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
909    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
910    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
911    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
912    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
913    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
914             Operand(PhysReg(257), v1));
915 
       /* NOTE(review): finish_insert_nops_test() is defined outside this chunk; presumably it runs
        * the pass under test and emits the output matched by the patterns above -- confirm.
        */
916    finish_insert_nops_test();
917 END_TEST
918 
919 BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
       /* Partial-forwarding sequences split across basic blocks. Three diamonds are built by hand:
        * every bld.reset(program->create_and_insert_block()) starts emitting into a new block, and
        * the linear_succs/linear_preds vectors are filled in explicitly right after it. The block
        * indices used below (1..9) follow the order in which the blocks are created, and the
        * branch immediates match the "block:BBn" targets in the expected output.
        * The test returns early when setup_cs() fails for GFX11.
        */
920    if (!setup_cs(NULL, GFX11))
921       return;
922 
923    /* Control flow merges: one branch shouldn't interfere with the other (clobbering VALU closer
924     * than interesting one).
925     */
926    //>> p_unit_test 0
927    //! s_cbranch_scc1 block:BB2
928    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0u));
929    bld.sopp(aco_opcode::s_cbranch_scc1, 2);
930 
       /* BB1: writes v[0], then exec, then one v_nop before branching to the merge block. */
931    //! BB1
932    //! /* logical preds: / linear preds: BB0, / kind: */
933    //! v1: %0:v[0] = v_mov_b32 0
934    //! s2: %0:exec = s_mov_b64 -1
935    //! v_nop
936    //! s_branch block:BB3
937    bld.reset(program->create_and_insert_block());
938    program->blocks[0].linear_succs.push_back(1);
939    program->blocks[1].linear_preds.push_back(0);
940    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
941    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
942    bld.vop1(aco_opcode::v_nop);
943    bld.sopp(aco_opcode::s_branch, 3);
944 
       /* BB2: the other side of the diamond only writes v[0], without touching exec. */
945    //! BB2
946    //! /* logical preds: / linear preds: BB0, / kind: */
947    //! v1: %0:v[0] = v_mov_b32 0
948    bld.reset(program->create_and_insert_block());
949    program->blocks[0].linear_succs.push_back(2);
950    program->blocks[2].linear_preds.push_back(0);
951    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
952 
       /* BB3: merge block; writes v[1] and reads v[0] and v[1]. The wait is expected. */
953    //! BB3
954    //! /* logical preds: / linear preds: BB1, BB2, / kind: */
955    //! v1: %0:v[1] = v_mov_b32 1
956    //! s_waitcnt_depctr va_vdst(0)
957    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
958    bld.reset(program->create_and_insert_block());
959    program->blocks[1].linear_succs.push_back(3);
960    program->blocks[2].linear_succs.push_back(3);
961    program->blocks[3].linear_preds.push_back(1);
962    program->blocks[3].linear_preds.push_back(2);
963    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
964    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
965             Operand(PhysReg(257), v1));
966 
967    /* Control flow merges: one branch shouldn't interfere with the other (should consider furthest
968     * VALU writes after exec).
969     */
970    //! p_unit_test 1
971    //! s_cbranch_scc1 block:BB5
972    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
973    bld.sopp(aco_opcode::s_cbranch_scc1, 5);
974 
       /* BB4: writes v[0], exec, two v_nop, v[1], one more v_nop, then branches to the merge. */
975    //! BB4
976    //! /* logical preds: / linear preds: BB3, / kind: */
977    //! v1: %0:v[0] = v_mov_b32 0
978    //! s2: %0:exec = s_mov_b64 -1
979    //; for i in range(2): insert_pattern('v_nop')
980    //! v1: %0:v[1] = v_mov_b32 1
981    //! v_nop
982    //! s_branch block:BB6
983    bld.reset(program->create_and_insert_block());
984    program->blocks[3].linear_succs.push_back(4);
985    program->blocks[4].linear_preds.push_back(3);
986    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
987    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
988    bld.vop1(aco_opcode::v_nop);
989    bld.vop1(aco_opcode::v_nop);
990    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
991    bld.vop1(aco_opcode::v_nop);
992    bld.sopp(aco_opcode::s_branch, 6);
993 
       /* BB5: the other side only writes v[1]. */
994    //! BB5
995    //! /* logical preds: / linear preds: BB3, / kind: */
996    //! v1: %0:v[1] = v_mov_b32 1
997    bld.reset(program->create_and_insert_block());
998    program->blocks[3].linear_succs.push_back(5);
999    program->blocks[5].linear_preds.push_back(3);
1000    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1001 
        /* BB6: merge block; only reads v[0] and v[1]. The wait is expected. */
1002    //! BB6
1003    //! /* logical preds: / linear preds: BB4, BB5, / kind: */
1004    //! s_waitcnt_depctr va_vdst(0)
1005    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
1006    bld.reset(program->create_and_insert_block());
1007    program->blocks[4].linear_succs.push_back(6);
1008    program->blocks[5].linear_succs.push_back(6);
1009    program->blocks[6].linear_preds.push_back(4);
1010    program->blocks[6].linear_preds.push_back(5);
1011    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
1012             Operand(PhysReg(257), v1));
1013 
1014    /* Control flow merges: one branch shouldn't interfere with the other (should consider closest
1015     * VALU writes after exec).
1016     */
1017    //! p_unit_test 2
1018    //! s_cbranch_scc1 block:BB8
1019    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
1020    bld.sopp(aco_opcode::s_cbranch_scc1, 8);
1021 
        /* BB7: writes v[0], exec, v[1], then four v_nop before branching to the merge. */
1022    //! BB7
1023    //! /* logical preds: / linear preds: BB6, / kind: */
1024    //! v1: %0:v[0] = v_mov_b32 0
1025    //! s2: %0:exec = s_mov_b64 -1
1026    //! v1: %0:v[1] = v_mov_b32 1
1027    //; for i in range(4): insert_pattern('v_nop')
1028    //! s_branch block:BB9
1029    bld.reset(program->create_and_insert_block());
1030    program->blocks[6].linear_succs.push_back(7);
1031    program->blocks[7].linear_preds.push_back(6);
1032    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1033    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
1034    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1035    for (unsigned i = 0; i < 4; i++)
1036       bld.vop1(aco_opcode::v_nop);
1037    bld.sopp(aco_opcode::s_branch, 9);
1038 
        /* BB8: the other side writes v[1] followed by five v_nop. */
1039    //! BB8
1040    //! /* logical preds: / linear preds: BB6, / kind: */
1041    //! v1: %0:v[1] = v_mov_b32 1
1042    //; for i in range(5): insert_pattern('v_nop')
1043    bld.reset(program->create_and_insert_block());
1044    program->blocks[6].linear_succs.push_back(8);
1045    program->blocks[8].linear_preds.push_back(6);
1046    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1047    for (unsigned i = 0; i < 5; i++)
1048       bld.vop1(aco_opcode::v_nop);
1049 
        /* BB9: final merge block; only reads v[0] and v[1]. The wait is expected. Unlike the
         * earlier merge blocks, the expected header of this last block lists "kind: uniform,".
         */
1050    //! BB9
1051    //! /* logical preds: / linear preds: BB7, BB8, / kind: uniform, */
1052    //! s_waitcnt_depctr va_vdst(0)
1053    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
1054    bld.reset(program->create_and_insert_block());
1055    program->blocks[7].linear_succs.push_back(9);
1056    program->blocks[8].linear_succs.push_back(9);
1057    program->blocks[9].linear_preds.push_back(7);
1058    program->blocks[9].linear_preds.push_back(8);
1059    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
1060             Operand(PhysReg(257), v1));
1061 
        /* NOTE(review): finish_insert_nops_test() is defined outside this chunk; presumably it runs
         * the pass under test and emits the output matched by the patterns above -- confirm.
         */
1062    finish_insert_nops_test();
1063 END_TEST
1064 
1065 BEGIN_TEST(insert_nops.valu_mask_write)
        /* Each sequence starts with a v_cndmask_b32 whose third operand is an SGPR pair (the lane
         * mask, per the comments below), followed by SALU writes to SGPRs of that pair and later
         * reads of the written SGPR. The pattern comments give the expected output, including
         * where "s_waitcnt_depctr sa_sdst(0)" is expected before a read.
         * The test returns early when setup_cs() fails for GFX11.
         */
1066    if (!setup_cs(NULL, GFX11))
1067       return;
1068 
1069    /* Basic case. */
        /* s[0-1] is the mask, s[1] is written by s_mov_b32 and then read by another s_mov_b32. */
1070    //>> p_unit_test 0
1071    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1072    //! s1: %0:s[1] = s_mov_b32 0
1073    //! s_waitcnt_depctr sa_sdst(0)
1074    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1075    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1076    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1077                 Operand::zero(), Operand(PhysReg(0), s2));
1078    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1079    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1080 
1081    /* Mitigation. */
        /* A v_mov_b32 reading s[1] sits between the v_cndmask_b32 and the write: no wait expected. */
1082    //! p_unit_test 1
1083    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1084    //! v1: %0:v[1] = v_mov_b32 %0:s[1]
1085    //! s1: %0:s[1] = s_mov_b32 0
1086    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1087    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1088    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1089                 Operand::zero(), Operand(PhysReg(0), s2));
1090    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
1091    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1092    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1093 
        /* Two consecutive reads of s[1]: a wait is expected only before the first one. */
1094    //! p_unit_test 2
1095    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1096    //! s1: %0:s[1] = s_mov_b32 0
1097    //! s_waitcnt_depctr sa_sdst(0)
1098    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1099    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1100    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1101    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1102                 Operand::zero(), Operand(PhysReg(0), s2));
1103    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1104    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1105    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1106 
        /* The input already carries an s_waitcnt_depctr (immediate 0xfffe) between the write and
         * the read; the expected output lists a single sa_sdst(0) wait there, i.e. no extra one.
         */
1107    //! p_unit_test 3
1108    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1109    //! s1: %0:s[1] = s_mov_b32 0
1110    //! s_waitcnt_depctr sa_sdst(0)
1111    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1112    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1113    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1114                 Operand::zero(), Operand(PhysReg(0), s2));
1115    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1116    bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1117    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1118 
1119    /* v_cndmask_b32 is both involved in the hazard and is a mitigation. */
        /* Here the v_cndmask_b32 itself reads s[2] as a source; the wait is still expected. */
1120    //! p_unit_test 4
1121    //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
1122    //! s1: %0:s[1] = s_mov_b32 0
1123    //! s_waitcnt_depctr sa_sdst(0)
1124    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1125    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1126    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand(PhysReg(2), s1),
1127                 Operand::zero(), Operand(PhysReg(0), s2));
1128    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1129    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1130 
1131    /* VALU reading exec does not mitigate the hazard. We also don't consider literals. */
        /* Test 5 inserts a v_mov_b32 reading exec_lo, test 6 one reading the literal 0x200; the
         * wait is expected in both.
         */
1132    //! p_unit_test 5
1133    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1134    //! v1: %0:v[1] = v_mov_b32 %0:exec_lo
1135    //! s1: %0:s[1] = s_mov_b32 0
1136    //! s_waitcnt_depctr sa_sdst(0)
1137    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1138    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1139    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1140                 Operand::zero(), Operand(PhysReg(0), s2));
1141    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(exec_lo, s1));
1142    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1143    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1144 
1145    //! p_unit_test 6
1146    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1147    //! v1: %0:v[1] = v_mov_b32 0x200
1148    //! s1: %0:s[1] = s_mov_b32 0
1149    //! s_waitcnt_depctr sa_sdst(0)
1150    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1151    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1152    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1153                 Operand::zero(), Operand(PhysReg(0), s2));
1154    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::literal32(0x200));
1155    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1156    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1157 
1158    /* Basic case: VALU. */
        /* Same as test 0, but the reader of s[1] is a v_mov_b32 instead of an s_mov_b32. */
1159    //! p_unit_test 7
1160    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1161    //! s1: %0:s[1] = s_mov_b32 0
1162    //! s_waitcnt_depctr sa_sdst(0)
1163    //! v1: %0:v[1] = v_mov_b32 %0:s[1]
1164    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1165    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1166                 Operand::zero(), Operand(PhysReg(0), s2));
1167    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1168    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
1169 
1170    /* SALU which both reads and writes a lane mask SGPR. */
        /* Masks s[0-1] and s[2-3] are both used; the s_mov_b32 that reads s[1] also writes s[2].
         * A wait is expected before that instruction and again before the later read of s[2].
         */
1171    //! p_unit_test 8
1172    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1173    //! s1: %0:s[1] = s_mov_b32 0
1174    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1175    //! s_waitcnt_depctr sa_sdst(0)
1176    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1177    //! s_waitcnt_depctr sa_sdst(0)
1178    //! s1: %0:s[4] = s_mov_b32 %0:s[2]
1179    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1180    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1181                 Operand::zero(), Operand(PhysReg(0), s2));
1182    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1183    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1184                 Operand::zero(), Operand(PhysReg(2), s2));
1185    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1186    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1187 
1188    /* When a SALU writes a lane mask, we shouldn't forget the current SGPRs used as lane masks then
1189     * written. */
        /* s[0] and s[2] are each written after being used as a mask; a single wait is expected
         * before the first of the two reads.
         */
1190    //! p_unit_test 9
1191    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1192    //! s1: %0:s[0] = s_mov_b32 0
1193    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1194    //! s1: %0:s[2] = s_mov_b32 0
1195    //! s_waitcnt_depctr sa_sdst(0)
1196    //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1197    //! s1: %0:s[5] = s_mov_b32 %0:s[2]
1198    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1199    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1200                 Operand::zero(), Operand(PhysReg(0), s2));
1201    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1202    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1203                 Operand::zero(), Operand(PhysReg(2), s2));
1204    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero());
1205    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1206    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1));
1207 
1208    /* When a SALU writes a lane mask, we shouldn't forget all SGPRs used as lane masks, there might
1209     * be later problematic writes. */
        /* Test 10 writes and reads s[0], then writes and reads s[1]; test 11 writes and reads s[0]
         * twice. A wait is expected before each of the reads.
         */
1210    //! p_unit_test 10
1211    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1212    //! s1: %0:s[0] = s_mov_b32 0
1213    //! s_waitcnt_depctr sa_sdst(0)
1214    //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1215    //! s1: %0:s[1] = s_mov_b32 0
1216    //! s_waitcnt_depctr sa_sdst(0)
1217    //! s1: %0:s[5] = s_mov_b32 %0:s[1]
1218    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
1219    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1220                 Operand::zero(), Operand(PhysReg(0), s2));
1221    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1222    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1223    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1224    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(1), s1));
1225 
1226    //! p_unit_test 11
1227    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1228    //! s1: %0:s[0] = s_mov_b32 0
1229    //! s_waitcnt_depctr sa_sdst(0)
1230    //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1231    //! s1: %0:s[0] = s_mov_b32 0
1232    //! s_waitcnt_depctr sa_sdst(0)
1233    //! s1: %0:s[5] = s_mov_b32 %0:s[0]
1234    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
1235    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1236                 Operand::zero(), Operand(PhysReg(0), s2));
1237    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1238    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1239    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1240    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(0), s1));
1241 
        /* Control-flow variant: the two predecessor blocks each use a different mask (s[0-1] in
         * BB1, s[2-3] in BB2) and the merge block BB3 writes and reads s[0] and s[2]. Blocks are
         * created with bld.reset(program->create_and_insert_block()) and wired up by hand through
         * linear_succs/linear_preds. A wait is expected before both reads in BB3.
         */
1242    //! p_unit_test 12
1243    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
1244 
1245    //! BB1
1246    //! /* logical preds: / linear preds: BB0, / kind: */
1247    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1248    bld.reset(program->create_and_insert_block());
1249    program->blocks[0].linear_succs.push_back(1);
1250    program->blocks[1].linear_preds.push_back(0);
1251    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1252                 Operand::zero(), Operand(PhysReg(0), s2));
1253 
1254    //! BB2
1255    //! /* logical preds: / linear preds: BB0, / kind: */
1256    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1257    bld.reset(program->create_and_insert_block());
1258    program->blocks[0].linear_succs.push_back(2);
1259    program->blocks[2].linear_preds.push_back(0);
1260    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1261                 Operand::zero(), Operand(PhysReg(2), s2));
1262 
1263    //! BB3
1264    //! /* logical preds: / linear preds: BB1, BB2, / kind: uniform, */
1265    //! s1: %0:s[0] = s_mov_b32 0
1266    //! s_waitcnt_depctr sa_sdst(0)
1267    //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1268    //! s1: %0:s[2] = s_mov_b32 0
1269    //! s_waitcnt_depctr sa_sdst(0)
1270    //! s1: %0:s[5] = s_mov_b32 %0:s[2]
1271    bld.reset(program->create_and_insert_block());
1272    program->blocks[1].linear_succs.push_back(3);
1273    program->blocks[2].linear_succs.push_back(3);
1274    program->blocks[3].linear_preds.push_back(1);
1275    program->blocks[3].linear_preds.push_back(2);
1276    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1277    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1278    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero());
1279    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1));
1280 
        /* NOTE(review): finish_insert_nops_test() is defined outside this chunk; presumably it runs
         * the pass under test and emits the output matched by the patterns above -- confirm.
         */
1281    finish_insert_nops_test();
1282 END_TEST
1283 
1284 BEGIN_TEST(insert_nops.wmma_raw)
        /* Emits pairs of v_wmma_f16_16x16x16_f16 on GFX11 and checks what ends up between them.
         * Each p_unit_test marker starts one case; the check-pattern comments list the expected
         * instruction stream for that case. The operands A, B and C are reused across cases and
         * re-pointed at other VGPR ranges with setFixed(); PhysReg(256 + n) shows up as v[n] in
         * the patterns. */
1285    if (!setup_cs(NULL, GFX11))
1286       return;
1287 
1288    /* Basic case. */
        /* The second WMMA reads v[16-23] as B, which overlaps the first WMMA's result v[20-23];
         * a v_nop is expected between the two. */
1289    //>> p_unit_test 0
1290    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1291    //! v_nop
1292    //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx
1293    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1294    Operand A(PhysReg(256 + 0), v8);
1295    Operand B(PhysReg(256 + 8), v8);
1296    Operand C(PhysReg(256 + 20), v4);
1297    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1298              0);
1299    A.setFixed(PhysReg(256 + 24));
1300    B.setFixed(PhysReg(256 + 16));
1301    C.setFixed(PhysReg(256 + 48));
1302    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1303              0);
1304 
1305    /* Mitigation. */
        /* Same registers as the basic case, but a v_rcp_f32 already sits between the two WMMAs,
         * so no v_nop is expected. */
1306    //! p_unit_test 1
1307    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1308    //! v1: %_:v[56] = v_rcp_f32 0
1309    //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx
1310    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1311    A.setFixed(PhysReg(256 + 0));
1312    B.setFixed(PhysReg(256 + 8));
1313    C.setFixed(PhysReg(256 + 20));
1314    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1315              0);
1316    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256 + 56), v1), Operand::zero());
1317    A.setFixed(PhysReg(256 + 24));
1318    B.setFixed(PhysReg(256 + 16));
1319    C.setFixed(PhysReg(256 + 48));
1320    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1321              0);
1322 
1323    /* No hazard. */
        /* The second WMMA's A (v[24-31]) and B (v[32-39]) do not overlap the first result. */
1324    //>> p_unit_test 2
1325    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1326    //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[48-51].xx
1327    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1328    A.setFixed(PhysReg(256 + 0));
1329    B.setFixed(PhysReg(256 + 8));
1330    C.setFixed(PhysReg(256 + 20));
1331    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1332              0);
1333    A.setFixed(PhysReg(256 + 24));
1334    B.setFixed(PhysReg(256 + 32));
1335    C.setFixed(PhysReg(256 + 48));
1336    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1337              0);
1338 
        /* Only C and the destination of the second WMMA reuse the first result (v[20-23]);
         * no v_nop is expected for that overlap. */
1339    //>> p_unit_test 3
1340    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1341    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[20-23].xx
1342    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1343    A.setFixed(PhysReg(256 + 0));
1344    B.setFixed(PhysReg(256 + 8));
1345    C.setFixed(PhysReg(256 + 20));
1346    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1347              0);
1348    A.setFixed(PhysReg(256 + 24));
1349    B.setFixed(PhysReg(256 + 32));
1350    C.setFixed(PhysReg(256 + 20));
1351    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1352              0);
1353 
1354    finish_insert_nops_test();
1355 END_TEST
1356 
1357 enum StageInfoFlags {
        /* Bit flags stored in StageInfo::flags. The insert_nops.export_priority.stages test
         * translates them into fields of the program under test. */
1358    stage_separate = 1 << 0, /* sets program->info.merged_shader_compiled_separately */
1359    stage_has_prolog = 1 << 1, /* sets program->info.vs.has_prolog */
1360    stage_has_export = 1 << 2, /* the test emits an exp instead of a v_nop */
1361    stage_is_prolog = 1 << 3, /* sets program->is_prolog */
1362    stage_is_epilog = 1 << 4, /* sets program->is_epilog */
1363 };
1364 
1365 struct StageInfo {
        /* One configuration row for the insert_nops.export_priority.stages test. */
1366    const char* name; /* passed as the last argument of setup_cs(); presumably a variant suffix */
1367    Stage stage; /* assigned to program->stage */
1368    unsigned flags; /* bitmask of StageInfoFlags */
1369 };
1370 
1371 BEGIN_TEST(insert_nops.export_priority.stages)
        /* Runs the same tiny program on GFX11.5 for a table of shader-stage configurations.
         * Each row supplies a variant name, the Stage and a set of StageInfoFlags. The check
         * patterns select on the words "first" and "last" inside the variant name, so those
         * words in the table are significant. */
        /* geometry_ngg is constructed locally; the other Stage values used in the table are
         * not defined in this test. */
1372    Stage geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::GS);
        /* NOTE(review): the (StageInfo[]){...} compound literal is a compiler extension in C++. */
1373    for (StageInfo stage : (StageInfo[]){
1374            {"_fs_first_last", fragment_fs, stage_has_export},
1375            {"_fs_with_epilog_first", fragment_fs, 0},
1376            {"_fs_prolog_first", fragment_fs, stage_is_prolog},
1377            {"_fs_epilog_last", fragment_fs, stage_is_epilog | stage_has_export},
1378            {"_vs_first_last", vertex_ngg, stage_has_export},
1379            {"_vs_with_prolog_last", vertex_ngg, stage_has_export | stage_has_prolog},
1380            {"_tes_first_last", tess_eval_ngg, stage_has_export},
1381            {"_ms_first_last", mesh_ngg, stage_has_export},
1382            {"_tesgs_first_last", tess_eval_geometry_ngg, stage_has_export},
1383            {"_vsgs_first_last", vertex_geometry_ngg, stage_has_export},
1384            {"_vsgs_with_prolog_last", vertex_geometry_ngg, stage_has_export | stage_has_prolog},
1385            {"_separate_vs_first", vertex_ngg, stage_separate},
1386            {"_separate_vs_with_prolog", vertex_ngg, stage_separate | stage_has_prolog},
1387            {"_separate_tes_first", tess_eval_ngg, stage_separate},
1388            {"_separate_gs_last", geometry_ngg, stage_separate | stage_has_export}}) {
1389       if (!setup_cs(NULL, GFX11_5, CHIP_UNKNOWN, stage.name))
1390          continue;
1391 
           /* Copy the row's configuration into the freshly set-up program. */
1392       program->stage = stage.stage;
1393       program->info.merged_shader_compiled_separately = stage.flags & stage_separate;
1394       program->info.vs.has_prolog = stage.flags & stage_has_prolog;
1395       program->is_prolog = stage.flags & stage_is_prolog;
1396       program->is_epilog = stage.flags & stage_is_epilog;
1397       //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1398       //~.*first.*! s_setprio imm:2
           /* Rows with an export emit a single position export; all others emit a v_nop. */
1399       if (stage.flags & stage_has_export) {
1400          //~.*last.*! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1401          //~.*last.*! s_setprio imm:0
1402          //~.*last.*! s_nop
1403          //~.*last.*! s_nop
1404          //~.*last.*! s_endpgm
1405          bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1406                  V_008DFC_SQ_EXP_POS, false);
1407       } else {
1408          //(?!.*last.*)! v_nop
1409          bld.vop1(aco_opcode::v_nop);
1410       }
1411 
           /* NOTE(review): the boolean presumably tells the helper whether to terminate the
            * program (s_endpgm only appears in the patterns of exporting rows) - confirm in
            * the helper's definition. */
1412       finish_insert_nops_test(stage.flags & stage_has_export);
1413    }
1414 END_TEST
1415 
1416 BEGIN_TEST(insert_nops.export_priority.instrs_after_export)
        /* GFX11.5, vertex_ngg: a position export followed by one more instruction (v_nop).
         * Expected stream: priority 2 at the top, the export, then s_setprio 0, a wait on
         * expcnt, two s_nop and s_setprio 2 again before the trailing v_nop. */
1417    if (!setup_cs(NULL, GFX11_5))
1418       return;
1419 
1420    program->stage = vertex_ngg;
1421    //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1422    //! s_setprio imm:2
1423    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1424    //! s_setprio imm:0
1425    //! s_waitcnt_expcnt %0:null imm:0
1426    //! s_nop
1427    //! s_nop
1428    //! s_setprio imm:2
1429    //! v_nop
1430    //! s_endpgm
1431    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1432            V_008DFC_SQ_EXP_POS, false);
1433    bld.vop1(aco_opcode::v_nop);
1434 
1435    finish_insert_nops_test();
1436 END_TEST
1437 
1438 BEGIN_TEST(insert_nops.export_priority.fallthrough_to_endpgm)
        /* GFX11.5, vertex_ngg: the export is the last instruction of BB0 and an empty second
         * block is appended as its only successor. Expected: s_setprio 0 and the two s_nop are
         * placed in BB0 right after the export, and BB1 contains only s_endpgm. */
1439    if (!setup_cs(NULL, GFX11_5))
1440       return;
1441 
1442    program->stage = vertex_ngg;
1443    //>> /* logical preds: / linear preds: / kind: top-level, */
1444    //! s_setprio imm:2
1445    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1446    //! s_setprio imm:0
1447    //! s_nop
1448    //! s_nop
1449    //>> BB1
1450    //>> /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
1451    //! s_endpgm
1452    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1453            V_008DFC_SQ_EXP_POS, false);
1454 
        /* Create BB1 and wire it up by hand as the linear and logical successor of BB0. */
1455    bld.reset(program->create_and_insert_block());
1456    program->blocks[0].linear_succs.push_back(1);
1457    program->blocks[0].logical_succs.push_back(1);
1458    program->blocks[1].linear_preds.push_back(0);
1459    program->blocks[1].logical_preds.push_back(0);
1460 
1461    finish_insert_nops_test();
1462 END_TEST
1463 
1464 BEGIN_TEST(insert_nops.export_priority.multiple_exports)
        /* GFX11.5, vertex_ngg: two consecutive position exports (pos0, pos1). Expected: nothing
         * is placed between the exports; a single s_setprio 0 / s_nop / s_nop sequence follows
         * the second one. */
1465    if (!setup_cs(NULL, GFX11_5))
1466       return;
1467 
1468    program->stage = vertex_ngg;
1469    //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1470    //! s_setprio imm:2
1471    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1472    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos1
1473    //! s_setprio imm:0
1474    //! s_nop
1475    //! s_nop
1476    //! s_endpgm
1477    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1478            V_008DFC_SQ_EXP_POS, false);
1479    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1480            V_008DFC_SQ_EXP_POS + 1, false);
1481 
1482    finish_insert_nops_test();
1483 END_TEST
1484 
1485 BEGIN_TEST(insert_nops.export_priority.set_prio)
        /* GFX11.5, vertex_ngg: the input already contains s_setprio instructions. Expected:
         * the s_setprio 3 is printed unchanged, while the s_setprio 1 emitted in front of the
         * export appears as imm:2 in the output. The usual s_setprio 0 / s_nop / s_nop sequence
         * follows the export. */
1486    if (!setup_cs(NULL, GFX11_5))
1487       return;
1488 
1489    program->stage = vertex_ngg;
1490    //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1491    //! s_setprio imm:3
1492    //! v_nop
1493    //! s_setprio imm:2
1494    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1495    //! s_setprio imm:0
1496    //! s_nop
1497    //! s_nop
1498    //! s_endpgm
1499    bld.sopp(aco_opcode::s_setprio, 3);
1500    bld.vop1(aco_opcode::v_nop);
1501    bld.sopp(aco_opcode::s_setprio, 1);
1502    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1503            V_008DFC_SQ_EXP_POS, false);
1504 
1505    finish_insert_nops_test();
1506 END_TEST
1507 
1508 BEGIN_TEST(insert_nops.valu_read_sgpr.basic)
        /* GFX12: checks when an s_waitcnt_depctr is expected between a write of an SGPR and a
         * later read of it. Each p_unit_test marker starts one case and the check-pattern
         * comments list the expected instruction stream. Cases differ in who writes (SALU or
         * VALU), who reads, which register is involved and how many instructions lie between. */
1509    if (!setup_cs(NULL, GFX12))
1510       return;
1511 
        /* Preamble, ahead of all numbered cases: have a VALU read s[4], s[7], null, exec_lo,
         * m0, scc and vcc once. Later cases depend on which SGPRs were read here. */
1512    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1513    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(7), s1));
1514    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(sgpr_null, s1));
1515    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(exec_lo, s1));
1516    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(m0, s1));
1517    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(scc, s1));
1518    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc, s1));
1519 
1520    /* no hazard: SALU write missing */
1521    //>> p_unit_test 0
1522    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1523    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1524    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1525 
1526    /* no hazard: SGPR never read by VALU */
1527    //! p_unit_test 1
1528    //! s1: %0:s[16] = s_mov_b32 0
1529    //! s1: %0:s[64] = s_mov_b32 %0:s[16]
1530    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1531    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(16), s1), Operand::zero(4));
1532    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(16), s1));
1533 
1534    /* basic case: SALU read */
1535    //! p_unit_test 2
1536    //! s1: %0:s[4] = s_mov_b32 0
1537    //! s_waitcnt_depctr sa_sdst(0)
1538    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1539    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1540    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1541    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1542 
1543    /* basic case again: VALU reads never expire */
1544    //! p_unit_test 3
1545    //! s1: %0:s[4] = s_mov_b32 0
1546    //! s_waitcnt_depctr sa_sdst(0)
1547    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1548    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1549    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1550    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1551 
1552    /* sa_sdst(0) resolves the hazard */
        /* Only the first of the two reads gets a wait in front of it. */
1553    //! p_unit_test 4
1554    //! s1: %0:s[4] = s_mov_b32 0
1555    //! s_waitcnt_depctr sa_sdst(0)
1556    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1557    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1558    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1559    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1560    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1561    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1562 
        /* An s_waitcnt_depctr already present in the input (imm 0xfffe, printed as sa_sdst(0))
         * is kept and no second one is expected. */
1563    //! p_unit_test 5
1564    //! s1: %0:s[4] = s_mov_b32 0
1565    //! s_waitcnt_depctr sa_sdst(0)
1566    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1567    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1568    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1569    bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1570    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1571 
1572    /* basic case: VALU read */
1573    //! p_unit_test 6
1574    //! s1: %0:s[4] = s_mov_b32 0
1575    //! s_waitcnt_depctr sa_sdst(0)
1576    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1577    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1578    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1579    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1580 
1581    /* the SALU write is in the same SGPR pair as the VALU read */
        /* s[6] is written here; the preamble's VALU read was of s[7]. */
1582    //! p_unit_test 7
1583    //! s1: %0:s[6] = s_mov_b32 0
1584    //! s_waitcnt_depctr sa_sdst(0)
1585    //! s1: %0:s[64] = s_mov_b32 %0:s[6]
1586    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1587    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(6), s1), Operand::zero(4));
1588    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1));
1589 
1590    /* no hazard: these registers are not problematic */
        /* Cases 8-11 cover null, exec_lo, m0 and scc in turn. */
1591    //! p_unit_test 8
1592    //! s1: %0:null = s_mov_b32 0
1593    //! s1: %0:s[64] = s_mov_b32 %0:null
1594    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1595    bld.sop1(aco_opcode::s_mov_b32, Definition(sgpr_null, s1), Operand::zero(4));
1596    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(sgpr_null, s1));
1597 
1598    //! p_unit_test 9
1599    //! s1: %0:exec_lo = s_mov_b32 0
1600    //! s1: %0:s[64] = s_mov_b32 %0:exec_lo
1601    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1602    bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::zero(4));
1603    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(exec_lo, s1));
1604 
1605    //! p_unit_test 10
1606    //! s1: %0:m0 = s_mov_b32 0
1607    //! s1: %0:s[64] = s_mov_b32 %0:m0
1608    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
1609    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero(4));
1610    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(m0, s1));
1611 
1612    //! p_unit_test 11
1613    //! s1: %0:scc = s_cmp_lg_i32 0, 0
1614    //! s1: %0:s[64] = s_mov_b32 %0:scc
1615    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
1616    bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand::zero(4), Operand::zero(4));
1617    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(scc, s1));
1618 
1619    /* 11 SALU between the write and a VALU read expire the hazard */
1620    //! p_unit_test 12
1621    //! s1: %0:s[4] = s_mov_b32 0
1622    //; for i in range(11): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1623    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1624    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
1625    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1626    for (unsigned i = 0; i < 11; i++)
1627       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1628    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1629 
        /* One SALU fewer (10): the wait is still expected before the VALU read. */
1630    //! p_unit_test 13
1631    //! s1: %0:s[4] = s_mov_b32 0
1632    //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1633    //! s_waitcnt_depctr sa_sdst(0)
1634    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1635    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
1636    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1637    for (unsigned i = 0; i < 10; i++)
1638       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1639    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1640 
1641    /* 10 SALU between the write and a SALU read expire the hazard */
1642    //! p_unit_test 14
1643    //! s1: %0:s[4] = s_mov_b32 0
1644    //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1645    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1646    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
1647    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1648    for (unsigned i = 0; i < 10; i++)
1649       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1650    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1651 
        /* One SALU fewer (9): the wait is still expected before the SALU read. */
1652    //! p_unit_test 15
1653    //! s1: %0:s[4] = s_mov_b32 0
1654    //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1655    //! s_waitcnt_depctr sa_sdst(0)
1656    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1657    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
1658    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1659    for (unsigned i = 0; i < 9; i++)
1660       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1661    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1662 
1663    /* SOPP in-between the write and the read do not count */
        /* 9 s_mov_b32 plus one s_nop: the wait is still expected. */
1664    //! p_unit_test 16
1665    //! s1: %0:s[4] = s_mov_b32 0
1666    //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
1667    //! s_nop
1668    //! s_waitcnt_depctr sa_sdst(0)
1669    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1670    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
1671    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1672    for (unsigned i = 0; i < 9; i++)
1673       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
1674    bld.sopp(aco_opcode::s_nop, 0);
1675    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1676 
1677    /* VALU -> VALU non-VCC SGPR */
1678    //! p_unit_test 17
1679    //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
1680    //! s_waitcnt_depctr va_sdst(0)
1681    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1682    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
1683    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
1684    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1685 
1686    /* VALU -> VALU VCC SGPR */
1687    //! p_unit_test 18
1688    //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1689    //! s_waitcnt_depctr va_vcc(0)
1690    //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1691    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
1692    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1693    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1694 
1695    /* va_sdst=0 from SALU reading an SGPR: hazard mitigated */
1696    //! p_unit_test 19
1697    //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
1698    //! s1: %0:s[64] = s_mov_b32 %0:s[6]
1699    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1700    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
1701    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
1702    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(6), s1));
1703    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1704 
1705    /* va_vcc=0 from SALU reading VCC: hazard mitigated */
1706    //! p_unit_test 20
1707    //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1708    //! s1: %0:s[64] = s_mov_b32 %0:vcc_lo
1709    //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1710    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
1711    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1712    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(vcc, s1));
1713    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1714 
1715    /* VALU -> VALU read VCC and then SGPR */
        /* Each read gets its own wait: va_vcc(0) before the VCC read, va_sdst(0) before the
         * s[4] read. */
1716    //! p_unit_test 21
1717    //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1718    //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
1719    //! s_waitcnt_depctr va_vcc(0)
1720    //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1721    //! s_waitcnt_depctr va_sdst(0)
1722    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1723    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(21));
1724    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1725    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
1726    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1727    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1728 
1729    /* VALU -> VALU read SGPR and then VCC */
1730    //! p_unit_test 22
1731    //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1732    //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
1733    //! s_waitcnt_depctr va_sdst(0)
1734    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1735    //! s_waitcnt_depctr va_vcc(0)
1736    //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1737    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(22));
1738    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1739    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
1740    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1741    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1742 
1743    /* VALU writes VCC and SALU writes SGPR */
        /* The VALU-written VCC needs va_vcc(0); the SALU-written s[4] needs sa_sdst(0). */
1744    //! p_unit_test 23
1745    //! s1: %0:vcc_hi = v_readfirstlane_b32 %0:v[0]
1746    //! s1: %0:s[4] = s_mov_b32 0
1747    //! s_waitcnt_depctr va_vcc(0)
1748    //! v1: %0:v[0] = v_mov_b32 %0:vcc_hi
1749    //! s_waitcnt_depctr sa_sdst(0)
1750    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
1751    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(23));
1752    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc_hi, s1), Operand(PhysReg(256), v1));
1753    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1754    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(vcc_hi, s1));
1755    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
1756 
1757    finish_insert_nops_test();
1758 END_TEST
1759 
1760 BEGIN_TEST(insert_nops.valu_read_sgpr.previous_part)
        /* GFX12 with program->stage set to raytracing_cs: a SALU write of s[4] followed by a
         * SALU read of it, with no VALU read of s[4] anywhere in this program. The expected
         * stream still contains an s_waitcnt_depctr sa_sdst(0) between the two. */
1761    if (!setup_cs(NULL, GFX12))
1762       return;
1763 
1764    /* Raytracing shaders have a prolog and may also be split into several parts. */
1765    program->stage = raytracing_cs;
1766 
1767    /* Despite the SGPR never being read by a VALU in this shader, a sa_sdst(0) is needed. */
1768    //>> p_unit_test 0
1769    //! s1: %0:s[4] = s_mov_b32 0
1770    //! s_waitcnt_depctr sa_sdst(0)
1771    //! s1: %0:s[64] = s_mov_b32 %0:s[4]
1772    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1773    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
1774    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand(PhysReg(4), s1));
1775 
1776    finish_insert_nops_test();
1777 END_TEST
1778 
1779 BEGIN_TEST(insert_nops.setpc_gfx6)
        /* GFX6: checks what is expected in front of an s_setpc_b64 that follows a
         * hazard-producing instruction. A final case runs in a fresh program created with
         * create_program() and finished with finish_insert_nops_test(false); it contains no
         * s_setpc_b64. */
1780    if (!setup_cs(NULL, GFX6))
1781       return;
1782 
1783    /* SGPR->SMEM hazards */
1784    //>> p_unit_test 0
1785    //! s1: %0:s[0] = s_mov_b32 0
1786    //! s_nop imm:2
1787    //! s_setpc_b64 0
1788    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1789    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1790    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1791 
        /* Same sequence with an s_nop 2 already in the input: nothing more is expected. */
1792    //! p_unit_test 1
1793    //! s1: %0:s[0] = s_mov_b32 0
1794    //! s_nop imm:2
1795    //! s_setpc_b64 0
1796    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1797    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1798    bld.sopp(aco_opcode::s_nop, 2);
1799    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1800 
1801    finish_insert_nops_test();
1802 
1803    /* This hazard can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves it. */
1804 
1805    /* VINTRP->v_readlane_b32/etc */
        /* The program built here ends with the v_interp_mov_f32; an s_nop is expected after it. */
1806    //>> p_unit_test 2
1807    //! v1: %0:v[0] = v_interp_mov_f32 2, %0:m0 attr0.x
1808    //! s_nop
1809    create_program(GFX6, compute_cs, 64, CHIP_UNKNOWN);
1810    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1811    bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(PhysReg(256), v1), Operand::c32(2u),
1812               Operand(m0, s1), 0, 0);
1813    finish_insert_nops_test(false);
1814 END_TEST
1815 
1816 BEGIN_TEST(insert_nops.setpc_gfx7)
        /* Runs the same cases for GFX7 and GFX9. Cases 0-6 put a hazard-producing instruction
         * directly in front of s_setpc_b64 and list what is expected between them. Cases 7-9
         * each run in a fresh program (create_program + finish_insert_nops_test(false)) that
         * contains no s_setpc_b64. */
1817    for (amd_gfx_level gfx : {GFX7, GFX9}) {
1818       if (!setup_cs(NULL, gfx))
1819          continue;
1820 
           /* A lone s_setpc_b64: nothing is expected in front of it. */
1821       //>> p_unit_test 0
1822       //! s_setpc_b64 0
1823       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1824       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1825 
1826       /* Break up SMEM clauses: resolved by the s_setpc_b64 itself */
1827       //! p_unit_test 1
1828       //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1829       //! s_setpc_b64 0
1830       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1831       bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1832       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1833 
1834       /* SALU and GDS hazards */
1835       //! p_unit_test 2
1836       //! s_setreg_imm32_b32 0x0 imm:14337
1837       //! s_nop
1838       //! s_setpc_b64 0
1839       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1840       bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(0), (7 << 11) | 1);
1841       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1842 
1843       /* VALU writes vcc -> vccz/v_div_fmas */
1844       //! p_unit_test 3
1845       //! s2: %0:vcc = v_cmp_eq_u32 0, 0
1846       //! s_nop imm:3
1847       //! s_setpc_b64 0
1848       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1849       bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand::zero(), Operand::zero());
1850       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1851 
1852       /* VALU writes exec -> execz/DPP */
1853       //! p_unit_test 4
1854       //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1855       //! s_nop imm:3
1856       //! s_setpc_b64 0
1857       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1858       bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(),
1859                    Operand::zero());
1860       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1861 
1862       /* VALU->DPP */
           /* The s_nop pattern below is restricted to the gfx9 variant. */
1863       //! p_unit_test 5
1864       //! v1: %0:v[0] = v_mov_b32 0
1865       //~gfx9! s_nop
1866       //! s_setpc_b64 0
1867       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1868       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1869       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1870 
1871       /* VALU->v_readlane_b32/VMEM/etc */
1872       //! p_unit_test 6
1873       //! s1: %0:s[0] = v_readfirstlane_b32 %0:v[0]
1874       //! s_nop imm:3
1875       //! s_setpc_b64 0
1876       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1877       bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(0), s1),
1878                Operand(PhysReg(256), v1));
1879       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1880 
1881       finish_insert_nops_test();
1882 
1883       /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves
1884        * them. */
1885 
           /* buffer_store_dwordx3 as the last instruction: an s_nop is expected after it. */
1886       //>> p_unit_test 7
1887       //! buffer_store_dwordx3 %0:s[0-3], %0:v[0], 0, %0:v[0-2] offen
1888       //! s_nop
1889       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1890       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1891       bld.mubuf(aco_opcode::buffer_store_dwordx3, Operand(PhysReg(0), s4),
1892                 Operand(PhysReg(256), v1), Operand::zero(), Operand(PhysReg(256), v3), 0, true);
1893       finish_insert_nops_test(false);
1894 
           /* A write to m0 as the last instruction: an s_nop is expected after it. */
1895       //>> p_unit_test 8
1896       //! s1: %0:m0 = s_mov_b32 0
1897       //! s_nop
1898       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1899       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1900       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(m0), s1), Operand::zero());
1901       finish_insert_nops_test(false);
1902 
1903       /* Break up SMEM clauses */
1904       //>> p_unit_test 9
1905       //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1906       //! s_nop
1907       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1908       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1909       bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1910       finish_insert_nops_test(false);
1911    }
1912 END_TEST
1913 
1914 BEGIN_TEST(insert_nops.setpc_gfx10)
        /* GFX10: cases 0-6 put a hazard-producing instruction in front of s_setpc_b64 and list
         * what is expected between them; each case is labelled with the hazard it targets.
         * Cases 7-9 each run in a fresh program (create_program + finish_insert_nops_test(false))
         * that contains no s_setpc_b64. */
1915    if (!setup_cs(NULL, GFX10))
1916       return;
1917 
        /* A lone s_setpc_b64: nothing is expected in front of it. */
1918    //>> p_unit_test 0
1919    //! s_setpc_b64 0
1920    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1921    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1922 
1923    /* VcmpxPermlaneHazard */
        /* Expected: a v_mov_b32 v0, v0 between the v_cmpx and the s_setpc_b64. */
1924    //! p_unit_test 1
1925    //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1926    //! v1: %0:v[0] = v_mov_b32 %0:v[0]
1927    //! s_setpc_b64 0
1928    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1929    bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1930    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1931 
1932    /* VMEMtoScalarWriteHazard */
1933    //! p_unit_test 2
1934    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1935    //! s_waitcnt_vscnt %0:null imm:0
1936    //! s_waitcnt_depctr vm_vsrc(0)
1937    //! s_setpc_b64 0
1938    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1939    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1940    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1),
1941             0); /* reset LdsBranchVmemWARHazard */
1942    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1943 
1944    /* VcmpxExecWARHazard */
1945    //! p_unit_test 3
1946    //! s1: %0:s[0] = s_mov_b32 %0:exec_hi
1947    //! s_waitcnt_depctr sa_sdst(0)
1948    //! s_setpc_b64 0
1949    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1950    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand(exec_hi, s1));
1951    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1952 
1953    /* LdsBranchVmemWARHazard */
        /* Case 4 has an s_branch between the LDS read and the s_setpc_b64, case 5 does not;
         * an s_waitcnt_vscnt is expected directly before the s_setpc_b64 in both. */
1954    //! p_unit_test 4
1955    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1956    //! v_nop
1957    //! s_branch block:BB0
1958    //! s_waitcnt_vscnt %0:null imm:0
1959    //! s_setpc_b64 0
1960    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1961    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1962    bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1963    bld.sopp(aco_opcode::s_branch, 0);
1964    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1965 
1966    //! p_unit_test 5
1967    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1968    //! v_nop
1969    //! s_waitcnt_vscnt %0:null imm:0
1970    //! s_setpc_b64 0
1971    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1972    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1973    bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1974    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1975 
1976    /* waNsaCannotFollowWritelane: resolved by the s_setpc_b64 */
1977    //! p_unit_test 6
1978    //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1979    //! s_setpc_b64 0
1980    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1981    bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1982             Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1983    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1984 
1985    finish_insert_nops_test();
1986 
1987    /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves them.
1988     */
1989 
1990    /* SMEMtoVectorWriteHazard */
        /* Expected: an s_mov_b32 to null after the trailing s_load_dword. */
1991    //>> p_unit_test 7
1992    //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1993    //! s1: %0:null = s_mov_b32 0
1994    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1995    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1996    bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1997    finish_insert_nops_test(false);
1998 
1999    /* NSAToVMEMBug is already resolved indirectly through VMEMtoScalarWriteHazard and
2000     * LdsBranchVmemWARHazard. */
2001    //>> p_unit_test 8
2002    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
2003    //! s_waitcnt_depctr vm_vsrc(0)
2004    //! s_waitcnt_vscnt %0:null imm:0
2005    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
2006    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
        /* NSA image_sample with 6 address registers; create_mimg() asserts it is 4 dwords. */
2007    create_mimg(true, 6, 4);
2008    finish_insert_nops_test(false);
2009 
2010    /* waNsaCannotFollowWritelane */
2011    //>> p_unit_test 9
2012    //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
2013    //! s_nop
2014    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
2015    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
2016    bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
2017             Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
2018    finish_insert_nops_test(false);
2019 END_TEST
2020 
2021 BEGIN_TEST(insert_nops.setpc_gfx11)
2022    if (!setup_cs(NULL, GFX11))
2023       return;
2024 
2025    //>> p_unit_test 0
2026    //! s_setpc_b64 0
2027    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
2028    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2029 
2030    /* LdsDirectVALUHazard */
2031    //! p_unit_test 1
2032    //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
2033    //! s_waitcnt_depctr va_vdst(0)
2034    //! s_setpc_b64 0
2035    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
2036    bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
2037                 Operand::zero());
2038    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2039 
2040    /* VALUPartialForwardingHazard */
2041    //! p_unit_test 2
2042    //! v1: %0:v[0] = v_mov_b32 0
2043    //! s_waitcnt_depctr va_vdst(0)
2044    //! s_setpc_b64 0
2045    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
2046    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
2047    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2048 
2049    /* VcmpxPermlaneHazard */
2050    //! p_unit_test 2
2051    //! s2: %0:exec = v_cmpx_eq_u32 0, 0
2052    //! v_nop
2053    //! s_setpc_b64 0
2054    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
2055    bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
2056    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2057 
2058    /* VALUTransUseHazard */
2059    //! p_unit_test 3
2060    //! v1: %0:v[0] = v_rcp_f32 0
2061    //! s_waitcnt_depctr va_vdst(0)
2062    //! s_setpc_b64 0
2063    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
2064    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand::zero());
2065    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2066 
2067    /* VALUMaskWriteHazard */
2068    //! p_unit_test 4
2069    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
2070    //! s1: %0:vcc_hi = s_mov_b32 0
2071    //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
2072    //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
2073    //! s_waitcnt_depctr va_vdst(0)
2074    //! s_setpc_b64 0
2075    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
2076    bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
2077             Operand::zero(), Operand(vcc, s2));
2078    bld.sop1(aco_opcode::s_mov_b32, Definition(vcc_hi, s1), Operand::c32(0));
2079    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2080 
2081    //! p_unit_test 8
2082    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
2083    //! s_waitcnt_depctr va_vdst(0)
2084    //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
2085    //! s_waitcnt_depctr va_vdst(0)
2086    //! s_setpc_b64 0
2087    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
2088    bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
2089             Operand::zero(), Operand(vcc, s2));
2090    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2091 
2092    //! p_unit_test 5
2093    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
2094    //! s2: %0:vcc = s_mov_b64 0
2095    //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
2096    //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
2097    //! s_waitcnt_depctr va_vdst(0)
2098    //! s_setpc_b64 0
2099    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
2100    bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
2101             Operand::zero(), Operand(vcc, s2));
2102    bld.sop1(aco_opcode::s_mov_b64, Definition(vcc, s2), Operand::zero(8));
2103    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2104 
2105    /* LdsDirectVMEMHazard */
2106    //! p_unit_test 6
2107    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
2108    //! s_waitcnt_depctr vm_vsrc(0)
2109    //! s_setpc_b64 0
2110    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
2111    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
2112    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2113 
2114    /* WMMA Hazards */
2115    //! p_unit_test 7
2116    //! v4: %0:v[20-23] = v_wmma_f16_16x16x16_f16 %0:v[0-7].xx, %0:v[8-15].xx, %0:v[20-23].xx
2117    //! v_nop
2118    //! s_waitcnt_depctr va_vdst(0)
2119    //! s_setpc_b64 0
2120    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
2121    Operand A(PhysReg(256 + 0), v8);
2122    Operand B(PhysReg(256 + 8), v8);
2123    Operand C(PhysReg(256 + 20), v4);
2124    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
2125              0);
2126    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2127 
2128    finish_insert_nops_test(true);
2129 END_TEST
2130 
2131 BEGIN_TEST(insert_nops.setpc_gfx12)
2132    if (!setup_cs(NULL, GFX12))
2133       return;
2134 
2135    //>> p_unit_test 0
2136    //! s_setpc_b64 0
2137    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
2138    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2139 
2140    /* LdsDirectVALUHazard */
2141    //! p_unit_test 1
2142    //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
2143    //! s_waitcnt_depctr va_vdst(0)
2144    //! s_setpc_b64 0
2145    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
2146    bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
2147                 Operand::zero());
2148    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2149 
2150    /* VcmpxPermlaneHazard */
2151    //! p_unit_test 2
2152    //! s2: %0:exec = v_cmpx_eq_u32 0, 0
2153    //! v_nop
2154    //! s_setpc_b64 0
2155    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
2156    bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
2157    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2158 
2159    /* LdsDirectVMEMHazard */
2160    //! p_unit_test 3
2161    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
2162    //! s_waitcnt_depctr vm_vsrc(0)
2163    //! s_setpc_b64 0
2164    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
2165    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
2166    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2167 
2168    /* VALUReadSGPRHazard */
2169    //! p_unit_test 4
2170    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2171    //! s1: %0:s[4] = s_mov_b32 0
2172    //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
2173    //! s_setpc_b64 0
2174    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
2175    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2176    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
2177    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2178 
2179    //! p_unit_test 5
2180    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2181    //! s1: %0:s[4] = s_mov_b32 0
2182    //; for i in range(10): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
2183    //! s_waitcnt_depctr va_vdst(0)
2184    //! s_setpc_b64 0
2185    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
2186    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2187    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
2188    for (unsigned i = 0; i < 10; i++) /* the s_setpc_b64 counts */
2189       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
2190    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2191 
2192    //! p_unit_test 6
2193    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2194    //! s1: %0:s[4] = s_mov_b32 0
2195    //; for i in range(9): insert_pattern('s1: %0:s[64] = s_mov_b32 0')
2196    //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
2197    //! s_setpc_b64 0
2198    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
2199    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2200    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
2201    for (unsigned i = 0; i < 9; i++)
2202       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(64), s1), Operand::zero(4));
2203    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2204 
2205    //! p_unit_test 7
2206    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2207    //! s1: %0:s[4] = v_readfirstlane_b32 %0:v[0]
2208    //! s_waitcnt_depctr va_vdst(0) va_sdst(0)
2209    //! s_setpc_b64 0
2210    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
2211    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2212    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(4), s1), Operand(PhysReg(256), v1));
2213    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2214 
2215    //! p_unit_test 8
2216    //! v1: %0:v[0] = v_mov_b32 %0:vcc_lo
2217    //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[0]
2218    //! s_waitcnt_depctr va_vdst(0) va_vcc(0)
2219    //! s_setpc_b64 0
2220    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
2221    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(vcc), s1));
2222    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(256), v1));
2223    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2224 
2225    //! p_unit_test 9
2226    //! v1: %0:v[0] = v_mov_b32 %0:s[4]
2227    //! v1: %0:v[1] = v_mov_b32 %0:s[5]
2228    //! v1: %0:v[2] = v_mov_b32 %0:vcc_lo
2229    //! s1: %0:s[4] = s_mov_b32 0
2230    //! s1: %0:s[5] = v_readfirstlane_b32 %0:v[0]
2231    //! s1: %0:vcc_lo = v_readfirstlane_b32 %0:v[1]
2232    //! s_waitcnt_depctr va_vdst(0) va_sdst(0) va_vcc(0) sa_sdst(0)
2233    //! s_setpc_b64 0
2234    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
2235    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand(PhysReg(4), s1));
2236    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(5), s1));
2237    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand(PhysReg(vcc), s1));
2238    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero(4));
2239    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(5), s1), Operand(PhysReg(256), v1));
2240    bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(vcc, s1), Operand(PhysReg(257), v1));
2241    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
2242 
2243    finish_insert_nops_test(true);
2244 END_TEST
2245