/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
6 #include <llvm/Config/llvm-config.h>
7 
8 #include "helpers.h"
9 #include "sid.h"
10 
11 using namespace aco;
12 
13 static std::vector<amd_gfx_level>
filter_gfx_levels(std::vector<amd_gfx_level> src)14 filter_gfx_levels(std::vector<amd_gfx_level> src)
15 {
16    std::vector<amd_gfx_level> res;
17    for (amd_gfx_level gfx : src) {
18       if (gfx < GFX12 || LLVM_VERSION_MAJOR >= 19)
19          res.push_back(gfx);
20    }
21    return res;
22 }
23 
24 BEGIN_TEST(assembler.s_memtime)
25    for (unsigned i = GFX6; i <= GFX10; i++) {
26       if (!setup_cs(NULL, (amd_gfx_level)i))
27          continue;
28 
29       //~gfx[6-7]>> c7800000
30       //~gfx[6-7]!  bf810000
31       //~gfx[8-9]>> s_memtime s[0:1] ; c0900000 00000000
32       //~gfx10>> s_memtime s[0:1] ; f4900000 fa000000
33       bld.smem(aco_opcode::s_memtime, bld.def(s2)).def(0).setFixed(PhysReg{0});
34 
35       finish_assembler_test();
36    }
37 END_TEST
38 
39 BEGIN_TEST(assembler.branch_3f)
40    if (!setup_cs(NULL, (amd_gfx_level)GFX10))
41       return;
42 
43    //! BB0:
44    //! s_branch BB1                                                ; bf820040
45    //! s_nop 0                                                     ; bf800000
46    bld.sopp(aco_opcode::s_branch, 1);
47 
48    for (unsigned i = 0; i < 0x3f; i++)
49       bld.vop1(aco_opcode::v_nop);
50 
51    bld.reset(program->create_and_insert_block());
52 
53    program->blocks[1].linear_preds.push_back(0u);
54 
55    finish_assembler_test();
56 END_TEST
57 
58 BEGIN_TEST(assembler.long_jump.unconditional_forwards)
59    if (!setup_cs(NULL, (amd_gfx_level)GFX10))
60       return;
61 
62    //!BB0:
63    //! s_branch 16369                                              ; bf823ff1
64    bld.sopp(aco_opcode::s_branch, 2);
65 
66    bld.reset(program->create_and_insert_block());
67 
68    //! s_nop 0                                                     ; bf800000
69    //!(then repeated 16366 times)
70    for (unsigned i = 0; i < INT16_MAX + 1; i++)
71       bld.sopp(aco_opcode::s_nop, 0);
72 
73    //! s_waitcnt_vscnt null, 0x0                                   ; bbfd0000
74    //! s_branch 1                                                  ; bf820001
75    //! s_branch BB2                                                ; bf824011
76    //! s_nop 0                                                     ; bf800000
77    //! (then repeated 16400 times)
78    //! BB2:
79    //! s_endpgm                                                    ; bf810000
80    bld.reset(program->create_and_insert_block());
81 
82    program->blocks[2].linear_preds.push_back(0u);
83    program->blocks[2].linear_preds.push_back(1u);
84 
85    finish_assembler_test();
86 END_TEST
87 
88 BEGIN_TEST(assembler.long_jump.conditional_forwards)
89    for (amd_gfx_level gfx : filter_gfx_levels({GFX10, GFX12})) {
90       if (!setup_cs(NULL, gfx))
91          continue;
92 
93       //! BB0:
94       //~gfx10! s_cbranch_scc0 16369                                  ; bf843ff1
95       //~gfx12! s_cbranch_scc0 16368                                  ; bfa13ff0
96       bld.sopp(aco_opcode::s_cbranch_scc0, 2);
97 
98       bld.reset(program->create_and_insert_block());
99 
100       //! BB1:
101       //! s_nop 0                                                     ; bf800000
102       //!(then repeated 16366 times)
103       //~gfx10! s_waitcnt_vscnt null, 0x0                             ; bbfd0000
104       //! s_branch 1                                                  ; $_
105       //! s_branch BB2                                                ; $_
106       //! s_nop 0                                                     ; bf800000
107       //!(then repeated 16400 times)
108       for (unsigned i = 0; i < INT16_MAX + 1; i++)
109          bld.sopp(aco_opcode::s_nop, 0);
110 
111       //! BB2:
112       //! s_endpgm                                                    ; $_
113       bld.reset(program->create_and_insert_block());
114 
115       program->blocks[1].linear_preds.push_back(0u);
116       program->blocks[2].linear_preds.push_back(0u);
117       program->blocks[2].linear_preds.push_back(1u);
118 
119       finish_assembler_test();
120    }
121 END_TEST
122 
123 BEGIN_TEST(assembler.long_jump.unconditional_backwards)
124    if (!setup_cs(NULL, (amd_gfx_level)GFX10))
125       return;
126 
127    //!BB0:
128    //! s_nop 0                                                     ; bf800000
129    //!(then repeated 16367 times)
130    for (unsigned i = 0; i < INT16_MAX + 1; i++)
131       bld.sopp(aco_opcode::s_nop, 0);
132 
133    //! s_waitcnt_vscnt null, 0x0                                   ; bbfd0000
134    //! s_branch 1                                                  ; bf820001
135    //! s_branch BB0                                                ; bf82c00d
136    //! s_nop 0                                                     ; bf800000
137    //! (then repeated 16399 times)
138    //! s_branch 49134                                              ; bf82bfee
139    bld.sopp(aco_opcode::s_branch, 0);
140 
141    //! BB1:
142    //! s_endpgm                                                    ; bf810000
143    bld.reset(program->create_and_insert_block());
144 
145    program->blocks[0].linear_preds.push_back(0u);
146    program->blocks[1].linear_preds.push_back(0u);
147 
148    finish_assembler_test();
149 END_TEST
150 
151 BEGIN_TEST(assembler.long_jump.conditional_backwards)
152    if (!setup_cs(NULL, (amd_gfx_level)GFX10))
153       return;
154 
155    //!BB0:
156    //! s_nop 0                                                     ; bf800000
157    //!(then repeated 16367 times)
158    for (unsigned i = 0; i < INT16_MAX + 1; i++)
159       bld.sopp(aco_opcode::s_nop, 0);
160 
161    //! s_waitcnt_vscnt null, 0x0                                   ; bbfd0000
162    //! s_branch 1                                                  ; bf820001
163    //! s_branch BB0                                                ; bf82c00d
164    //! s_nop 0                                                     ; bf800000
165    //!(then repeated 16399 times)
166    //! s_cbranch_execnz 49134                                      ; bf89bfee
167    bld.sopp(aco_opcode::s_cbranch_execnz, 0);
168 
169    //! BB1:
170    //! s_endpgm                                                    ; bf810000
171    bld.reset(program->create_and_insert_block());
172 
173    program->blocks[0].linear_preds.push_back(0u);
174    program->blocks[1].linear_preds.push_back(0u);
175 
176    finish_assembler_test();
177 END_TEST
178 
179 BEGIN_TEST(assembler.long_jump.constaddr)
180    if (!setup_cs(NULL, (amd_gfx_level)GFX10))
181       return;
182 
183    //>> s_branch 16369                                              ; bf823ff1
184    bld.sopp(aco_opcode::s_branch, 2);
185 
186    bld.reset(program->create_and_insert_block());
187 
188    for (unsigned i = 0; i < INT16_MAX + 1; i++)
189       bld.sopp(aco_opcode::s_nop, 0);
190 
191    bld.reset(program->create_and_insert_block());
192 
193    //>> s_getpc_b64 s[0:1]                                          ; be801f00
194    //! s_add_u32 s0, s0, 32                                         ; 8000ff00 00000020
195    bld.sop1(aco_opcode::p_constaddr_getpc, Definition(PhysReg(0), s2), Operand::zero());
196    bld.sop2(aco_opcode::p_constaddr_addlo, Definition(PhysReg(0), s1), bld.def(s1, scc),
197             Operand(PhysReg(0), s1), Operand::zero(), Operand::zero());
198 
199    program->blocks[2].linear_preds.push_back(0u);
200    program->blocks[2].linear_preds.push_back(1u);
201 
202    finish_assembler_test();
203 END_TEST
204 
205 BEGIN_TEST(assembler.long_jump.discard_early_exit)
206    if (!setup_cs(NULL, (amd_gfx_level)GFX10))
207       return;
208 
209    //! BB0:
210    //! s_cbranch_scc0 16369                                        ; bf843ff1
211    bld.sopp(aco_opcode::s_cbranch_scc0, 2);
212 
213    bld.reset(program->create_and_insert_block());
214 
215    //! BB1:
216    //! s_nop 1                                                     ; bf800001
217    //! (then repeated 16366 times)
218    //! s_waitcnt_vscnt null, 0x0                                   ; bbfd0000
219    //! s_branch 1                                                  ; bf820001
220    //! s_branch BB2                                                ; bf824011
221    //! s_nop 1                                                     ; bf800001
222    //! (then repeated 16399 times)
223    //! s_endpgm                                                    ; bf810000
224    for (unsigned i = 0; i < INT16_MAX; i++)
225       bld.sopp(aco_opcode::s_nop, 1);
226 
227    //! BB2:
228    //! s_endpgm                                                    ; bf810000
229    bld.reset(program->create_and_insert_block());
230 
231    program->blocks[1].linear_preds.push_back(0u);
232    program->blocks[2].linear_preds.push_back(0u);
233    program->blocks[2].kind = block_kind_discard_early_exit;
234 
235    finish_assembler_test();
236 END_TEST
237 
238 BEGIN_TEST(assembler.v_add3)
239    for (unsigned i = GFX9; i <= GFX10; i++) {
240       if (!setup_cs(NULL, (amd_gfx_level)i))
241          continue;
242 
243       //~gfx9>> v_add3_u32 v0, 0, 0, 0 ; d1ff0000 02010080
244       //~gfx10>> v_add3_u32 v0, 0, 0, 0 ; d76d0000 02010080
245       aco_ptr<Instruction> add3{create_instruction(aco_opcode::v_add3_u32, Format::VOP3, 3, 1)};
246       add3->operands[0] = Operand::zero();
247       add3->operands[1] = Operand::zero();
248       add3->operands[2] = Operand::zero();
249       add3->definitions[0] = Definition(PhysReg(0), v1);
250       bld.insert(std::move(add3));
251 
252       finish_assembler_test();
253    }
254 END_TEST
255 
256 BEGIN_TEST(assembler.v_add3_clamp)
257    for (unsigned i = GFX9; i <= GFX10; i++) {
258       if (!setup_cs(NULL, (amd_gfx_level)i))
259          continue;
260 
261       //~gfx9>> integer addition + clamp ; d1ff8000 02010080
262       //~gfx10>> integer addition + clamp ; d76d8000 02010080
263       aco_ptr<Instruction> add3{create_instruction(aco_opcode::v_add3_u32, Format::VOP3, 3, 1)};
264       add3->operands[0] = Operand::zero();
265       add3->operands[1] = Operand::zero();
266       add3->operands[2] = Operand::zero();
267       add3->definitions[0] = Definition(PhysReg(0), v1);
268       add3->valu().clamp = 1;
269       bld.insert(std::move(add3));
270 
271       finish_assembler_test();
272    }
273 END_TEST
274 
275 BEGIN_TEST(assembler.smem_offset)
276    for (unsigned i = GFX9; i <= GFX10; i++) {
277       if (!setup_cs(NULL, (amd_gfx_level)i))
278          continue;
279 
280       Definition dst(PhysReg(7), s1);
281       Operand sbase(PhysReg(6), s2);
282       Operand offset(PhysReg(5), s1);
283 
284       //~gfx9>> s_load_dword s7, s[6:7], s5 ; c00001c3 00000005
285       //~gfx10>> s_load_dword s7, s[6:7], s5 ; f40001c3 0a000000
286       bld.smem(aco_opcode::s_load_dword, dst, sbase, offset);
287       //~gfx9! s_load_dword s7, s[6:7], 0x42 ; c00201c3 00000042
288       //~gfx10! s_load_dword s7, s[6:7], 0x42 ; f40001c3 fa000042
289       bld.smem(aco_opcode::s_load_dword, dst, sbase, Operand::c32(0x42));
290       if (i >= GFX9) {
291          //~gfx9! s_load_dword s7, s[6:7], s5 offset:0x42 ; c00241c3 0a000042
292          //~gfx10! s_load_dword s7, s[6:7], s5 offset:0x42 ; f40001c3 0a000042
293          bld.smem(aco_opcode::s_load_dword, dst, sbase, Operand::c32(0x42), offset);
294       }
295 
296       finish_assembler_test();
297    }
298 END_TEST
299 
300 BEGIN_TEST(assembler.p_constaddr)
301    if (!setup_cs(NULL, GFX9))
302       return;
303 
304    Definition dst0 = bld.def(s2);
305    Definition dst1 = bld.def(s2);
306    dst0.setFixed(PhysReg(0));
307    dst1.setFixed(PhysReg(2));
308 
309    //>> s_getpc_b64 s[0:1] ; be801c00
310    //! s_add_u32 s0, s0, 44 ; 8000ff00 0000002c
311    bld.pseudo(aco_opcode::p_constaddr, dst0, bld.def(s1, scc), Operand::zero());
312 
313    //! s_getpc_b64 s[2:3] ; be821c00
314    //! s_add_u32 s2, s2, 64 ; 8002ff02 00000040
315    bld.pseudo(aco_opcode::p_constaddr, dst1, bld.def(s1, scc), Operand::c32(32));
316 
317    aco::lower_to_hw_instr(program.get());
318    finish_assembler_test();
319 END_TEST
320 
321 BEGIN_TEST(assembler.vopc_sdwa)
322    for (unsigned i = GFX9; i <= GFX10; i++) {
323       if (!setup_cs(NULL, (amd_gfx_level)i))
324          continue;
325 
326       //~gfx9>> v_cmp_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 86860080
327       //~gfx10>> v_cmp_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD   ; 7d8300f9 86860080
328       bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(vcc, s2), Operand::zero(),
329                     Operand::zero());
330 
331       //~gfx9! v_cmp_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 8686ac80
332       //~gfx10! v_cmp_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 8686ac80
333       bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(PhysReg(0x2c), s2), Operand::zero(),
334                     Operand::zero());
335 
336       //~gfx9! v_cmp_lt_u32_sdwa exec, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 8686fe80
337       //~gfx10! v_cmp_lt_u32_sdwa exec, 0, 0 src0_sel:DWORD src1_sel:DWORD  ; 7d8300f9 8686fe80
338       bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(exec, s2), Operand::zero(),
339                     Operand::zero());
340 
341       if (i == GFX10) {
342          //~gfx10! v_cmpx_lt_u32_sdwa 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7da300f9 86860080
343          bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(exec, s2), Operand::zero(),
344                        Operand::zero());
345       } else {
346          //~gfx9! v_cmpx_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7db300f9 86860080
347          bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(vcc, s2), Definition(exec, s2),
348                        Operand::zero(), Operand::zero());
349 
350          //~gfx9! v_cmpx_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7db300f9 8686ac80
351          bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(PhysReg(0x2c), s2),
352                        Definition(exec, s2), Operand::zero(), Operand::zero());
353       }
354 
355       finish_assembler_test();
356    }
357 END_TEST
358 
359 BEGIN_TEST(assembler.smem)
360    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
361       if (!setup_cs(NULL, gfx))
362          continue;
363 
364       Definition dst = bld.def(s1);
365       dst.setFixed(PhysReg(4));
366 
367       Operand op_s1(bld.tmp(s1));
368       op_s1.setFixed(PhysReg(8));
369 
370       Operand op_s2(bld.tmp(s2));
371       op_s2.setFixed(PhysReg(16));
372 
373       Operand op_s4(bld.tmp(s4));
374       op_s4.setFixed(PhysReg(32));
375 
376       //~gfx11>> s_dcache_inv                                                ; f4840000 f8000000
377       //~gfx12>> s_dcache_inv                                                ; f4042000 f8000000
378       bld.smem(aco_opcode::s_dcache_inv);
379 
380       //! s_load_b32 s4, s[16:17], 0x2a                               ; f4000108 f800002a
381       bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42));
382 
383       //~gfx11! s_load_b32 s4, s[16:17], s8                                 ; f4000108 10000000
384       //~gfx12! s_load_b32 s4, s[16:17], s8 offset:0x0                      ; f4000108 10000000
385       bld.smem(aco_opcode::s_load_dword, dst, op_s2, op_s1);
386 
387       //! s_load_b32 s4, s[16:17], s8 offset:0x2a                     ; f4000108 1000002a
388       bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42), op_s1);
389 
390       ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
391       ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
392       if (gfx >= GFX12) {
393          cache_coherent.gfx12.scope = gfx12_scope_device;
394          cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
395       } else {
396          cache_coherent.value = ac_glc;
397          cache_non_temporal.value = ac_dlc;
398       }
399 
400       //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 glc                      ; f4204110 10000000
401       //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 scope:SCOPE_DEV ; f4420110 10000000
402       bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache = cache_coherent;
403 
404       //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 dlc                      ; f4202110 10000000
405       //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 th:TH_LOAD_NT ; f4820110 10000000
406       bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache =
407          cache_non_temporal;
408 
409       finish_assembler_test();
410    }
411 END_TEST
412 
413 BEGIN_TEST(assembler.mubuf)
414    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
415       if (!setup_cs(NULL, gfx))
416          continue;
417 
418       Definition dst = bld.def(v1);
419       dst.setFixed(PhysReg(256 + 42));
420 
421       Operand op_s4(bld.tmp(s4));
422       op_s4.setFixed(PhysReg(32));
423 
424       Operand op_v1(bld.tmp(v1));
425       op_v1.setFixed(PhysReg(256 + 10));
426 
427       Operand op_v2(bld.tmp(v2));
428       op_v2.setFixed(PhysReg(256 + 20));
429 
430       Operand op_s1(bld.tmp(s1));
431       op_s1.setFixed(PhysReg(30));
432 
433       Operand op_m0(bld.tmp(s1));
434       op_m0.setFixed(m0);
435 
436       //! llvm_version: #llvm_ver
437       fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
438 
439       /* Addressing */
440       //~gfx11>> buffer_load_b32 v42, off, s[32:35], s30                     ; e0500000 1e082a80
441       //~gfx12>> buffer_load_b32 v42, off, s[32:35], s30                     ; c405001e 0080402a 00000000
442       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 0, false);
443 
444       //~gfx11! buffer_load_b32 v42, off, s[32:35], 0                       ; e0500000 80082a80
445       //~gfx12! buffer_load_b32 v42, off, s[32:35], null                    ; c405007c 0080402a 00000000
446       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false);
447 
448       //~gfx11! buffer_load_b32 v42, off, s[32:35], 42                      ; e0500000 aa082a80
449       if (gfx == GFX11)
450          bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::c32(42), 0,
451                    false);
452 
453       //~gfx11! buffer_load_b32 v42, v10, s[32:35], s30 offen               ; e0500000 1e482a0a
454       //~gfx12! buffer_load_b32 v42, v10, s[32:35], s30 offen               ; c405001e 4080402a 0000000a
455       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, true);
456 
457       //~gfx11! buffer_load_b32 v42, v10, s[32:35], s30 idxen               ; e0500000 1e882a0a
458       //~gfx12! buffer_load_b32 v42, v10, s[32:35], s30 idxen               ; c405001e 8080402a 0000000a
459       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, false)->mubuf().idxen =
460          true;
461 
462       //~gfx11! buffer_load_b32 v42, v[20:21], s[32:35], s30 idxen offen    ; e0500000 1ec82a14
463       //~gfx12! buffer_load_b32 v42, v[20:21], s[32:35], s30 idxen offen    ; c405001e c080402a 00000014
464       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v2, op_s1, 0, true)->mubuf().idxen =
465          true;
466 
467       //~gfx11! buffer_load_b32 v42, off, s[32:35], s30 offset:84           ; e0500054 1e082a80
468       //~gfx12! buffer_load_b32 v42, off, s[32:35], s30 offset:84           ; c405001e 0080402a 00005400
469       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 84, false);
470 
471       /* Various flags */
472       ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
473       ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
474       ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
475       ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
476       if (gfx >= GFX12) {
477          cache_coherent.gfx12.scope = gfx12_scope_device;
478          cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
479          cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
480          cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
481       } else {
482          cache_coherent.value = ac_glc;
483          cache_sys_coherent.value = ac_slc;
484          cache_non_temporal.value = ac_dlc;
485          cache_atomic_rtn.value = ac_glc;
486       }
487 
488       //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 glc                   ; e0504000 80082a80
489       //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_DEV    ; c405007c 0088402a 00000000
490       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
491          ->mubuf()
492          .cache = cache_coherent;
493 
494       //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 dlc                   ; e0502000 80082a80
495       //~gfx12! buffer_load_b32 v42, off, s[32:35], null th:TH_LOAD_NT      ; c405007c 0090402a 00000000
496       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
497          ->mubuf()
498          .cache = cache_non_temporal;
499 
500       //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 slc                   ; e0501000 80082a80
501       //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_SYS    ; c405007c 008c402a 00000000
502       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
503          ->mubuf()
504          .cache = cache_sys_coherent;
505 
506       //; if llvm_ver >= 16 and variant == 'gfx11':
507       //;    insert_pattern('buffer_load_b32 v[42:43], off, s[32:35], 0 tfe              ; e0500000 80282a80')
508       //; elif variant == 'gfx11':
509       //;    insert_pattern('buffer_load_b32 v42, off, s[32:35], 0 tfe                   ; e0500000 80282a80')
510       //~gfx12! buffer_load_b32 v[42:43], off, s[32:35], null tfe           ; c445007c 0080402a 00000000
511       bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
512          ->mubuf()
513          .tfe = true;
514 
515       /* LDS */
516       if (gfx == GFX11) {
517          //~gfx11! buffer_load_lds_b32 off, s[32:35], 0                        ; e0c40000 80080080
518          bld.mubuf(aco_opcode::buffer_load_dword, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
519                    false)
520             ->mubuf()
521             .lds = true;
522 
523          //~gfx11! buffer_load_lds_i8 off, s[32:35], 0                         ; e0b80000 80080080
524          bld.mubuf(aco_opcode::buffer_load_sbyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
525                    false)
526             ->mubuf()
527             .lds = true;
528 
529          //~gfx11! buffer_load_lds_i16 off, s[32:35], 0                        ; e0c00000 80080080
530          bld.mubuf(aco_opcode::buffer_load_sshort, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
531                    false)
532             ->mubuf()
533             .lds = true;
534 
535          //~gfx11! buffer_load_lds_u8 off, s[32:35], 0                         ; e0b40000 80080080
536          bld.mubuf(aco_opcode::buffer_load_ubyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
537                    false)
538             ->mubuf()
539             .lds = true;
540 
541          //~gfx11! buffer_load_lds_u16 off, s[32:35], 0                        ; e0bc0000 80080080
542          bld.mubuf(aco_opcode::buffer_load_ushort, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
543                    false)
544             ->mubuf()
545             .lds = true;
546 
547          //~gfx11! buffer_load_lds_format_x off, s[32:35], 0                   ; e0c80000 80080080
548          bld.mubuf(aco_opcode::buffer_load_format_x, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
549                    false)
550             ->mubuf()
551             .lds = true;
552       }
553 
554       /* Stores */
555       //~gfx11! buffer_store_b32 v10, off, s[32:35], s30                    ; e0680000 1e080a80
556       //~gfx12! buffer_store_b32 v10, off, s[32:35], s30                    ; c406801e 0080400a 00000000
557       bld.mubuf(aco_opcode::buffer_store_dword, op_s4, Operand(v1), op_s1, op_v1, 0, false);
558 
559       //~gfx11! buffer_store_b64 v[20:21], v10, s[32:35], s30 offen         ; e06c0000 1e48140a
560       //~gfx12! buffer_store_b64 v[20:21], v10, s[32:35], s30 offen         ; c406c01e 40804014 0000000a
561       bld.mubuf(aco_opcode::buffer_store_dwordx2, op_s4, op_v1, op_s1, op_v2, 0, true);
562 
563       /* Atomic with return */
564       //~gfx11! buffer_atomic_add_u32 v10, off, s[32:35], 0 glc             ; e0d44000 80080a80
565       //~gfx12! buffer_atomic_add_u32 v10, off, s[32:35], null th:TH_ATOMIC_RETURN ; c40d407c 0090400a 00000000
566       bld.mubuf(aco_opcode::buffer_atomic_add, Definition(op_v1.physReg(), v1), op_s4, Operand(v1),
567                 Operand::zero(), op_v1, 0, false)
568          ->mubuf()
569          .cache = cache_atomic_rtn;
570 
571       finish_assembler_test();
572    }
573 END_TEST
574 
575 BEGIN_TEST(assembler.mtbuf)
576    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
577       if (!setup_cs(NULL, gfx))
578          continue;
579 
580       Definition dst = bld.def(v1);
581       dst.setFixed(PhysReg(256 + 42));
582 
583       Operand op_s4(bld.tmp(s4));
584       op_s4.setFixed(PhysReg(32));
585 
586       Operand op_v1(bld.tmp(v1));
587       op_v1.setFixed(PhysReg(256 + 10));
588 
589       Operand op_v2(bld.tmp(v2));
590       op_v2.setFixed(PhysReg(256 + 20));
591 
592       Operand op_s1(bld.tmp(s1));
593       op_s1.setFixed(PhysReg(30));
594 
595       unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_32_32;
596       unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_FLOAT;
597 
598       //! llvm_version: #llvm_ver
599       fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
600 
601       /* Addressing */
602       //~gfx11>> tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9900000 1e082a80
603       //~gfx12>> tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; c420001e 1900402a 00000080
604       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 0,
605                 false);
606 
607       //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80082a80
608       //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] ; c420007c 1900402a 00000080
609       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
610                 nfmt, 0, false);
611 
612       //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 42 format:[BUF_FMT_32_32_FLOAT] ; e9900000 aa082a80
613       if (gfx == GFX11)
614          bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::c32(42),
615                    dfmt, nfmt, 0, false);
616 
617       //~gfx11! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9900000 1e482a0a
618       //~gfx12! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; c420001e 5900402a 0000000a
619       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, true);
620 
621       //~gfx11! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen ; e9900000 1e882a0a
622       //~gfx12! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen ; c420001e 9900402a 0000000a
623       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, false)
624          ->mtbuf()
625          .idxen = true;
626 
627       //~gfx11! tbuffer_load_format_x v42, v[20:21], s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen offen ; e9900000 1ec82a14
628       //~gfx12! tbuffer_load_format_x v42, v[20:21], s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen offen ; c420001e d900402a 00000014
629       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v2, op_s1, dfmt, nfmt, 0, true)
630          ->mtbuf()
631          .idxen = true;
632 
633       //~gfx11! tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offset:84 ; e9900054 1e082a80
634       //~gfx12! tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offset:84 ; c420001e 1900402a 00005480
635       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 84,
636                 false);
637 
638       /* Various flags */
639       ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
640       ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
641       ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
642       if (gfx >= GFX12) {
643          cache_coherent.gfx12.scope = gfx12_scope_device;
644          cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
645          cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
646       } else {
647          cache_coherent.value = ac_glc;
648          cache_sys_coherent.value = ac_slc;
649          cache_non_temporal.value = ac_dlc;
650       }
651 
652       //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] glc ; e9904000 80082a80
653       //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_DEV ; c420007c 1908402a 00000080
654       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
655                 nfmt, 0, false)
656          ->mtbuf()
657          .cache = cache_coherent;
658 
659       //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] dlc ; e9902000 80082a80
660       //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] th:TH_LOAD_NT ; c420007c 1910402a 00000080
661       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
662                 nfmt, 0, false)
663          ->mtbuf()
664          .cache = cache_non_temporal;
665 
666       //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] slc ; e9901000 80082a80
667       //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c420007c 190c402a 00000080
668       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
669                 nfmt, 0, false)
670          ->mtbuf()
671          .cache = cache_sys_coherent;
672 
673       //; if llvm_ver >= 19 and variant == 'gfx11':
674       //;    insert_pattern('(invalid instruction) ; e9900000')
675       //;    insert_pattern('s_add_u32 s40, 0, s42 ; 80282a80')
676       //; elif llvm_ver >= 19 and variant == 'gfx12':
677       //;    insert_pattern('(invalid instruction) ; c460007c')
678       //;    insert_pattern('v_mul_hi_u32_u24_e32 v128, s42, v32 ; 1900402a')
679       //;    insert_pattern('(invalid instruction) ; 00000080')
680       //; elif llvm_ver >= 16 and variant == 'gfx11':
681       //;    insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80282a80')
682       //; elif variant == 'gfx11':
683       //;    insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] tfe ; e9900000 80282a80')
684       //; elif variant == 'gfx12':
685       //;    insert_pattern('tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] ; c460007c 1900402a 00000080')
686       bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
687                 nfmt, 0, false)
688          ->mtbuf()
689          .tfe = true;
690 
691       /* Stores */
692       //~gfx11! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9920000 1e080a80
693       //~gfx12! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; c421001e 1900400a 00000080
694       bld.mtbuf(aco_opcode::tbuffer_store_format_x, op_s4, Operand(v1), op_s1, op_v1, dfmt, nfmt, 0,
695                 false);
696 
697       //~gfx11! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9928000 1e48140a
698       //~gfx12! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; c421401e 59004014 0000000a
699       bld.mtbuf(aco_opcode::tbuffer_store_format_xy, op_s4, op_v1, op_s1, op_v2, dfmt, nfmt, 0,
700                 true);
701 
702       finish_assembler_test();
703    }
704 END_TEST
705 
/* Assembler test for MIMG (image) instructions on the gfx levels returned by
 * filter_gfx_levels({GFX11, GFX12}). Every builder call below is preceded by
 * gfx11 and gfx12 pattern lines giving the expected disassembly and encoding
 * for that variant.
 * NOTE(review): the pattern lines look like input for the test harness and are
 * presumably matched in emission order -- confirm before reordering statements
 * or adding line comments inside the test. */
706 BEGIN_TEST(assembler.mimg)
707    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
          /* Skip this gfx level when setup_cs() reports failure. */
708       if (!setup_cs(NULL, gfx))
709          continue;
710 
          /* Fixed destinations: PhysReg(256 + n) shows up as VGPR vn in the expected output (v42 and v[84:87]). */
711       Definition dst_v1 = bld.def(v1);
712       dst_v1.setFixed(PhysReg(256 + 42));
713 
714       Definition dst_v4 = bld.def(v4);
715       dst_v4.setFixed(PhysReg(256 + 84));
716 
          /* Scalar operands: op_s4 prints as s[32:35] and op_s8 as s[64:71] in the expected output. */
717       Operand op_s4(bld.tmp(s4));
718       op_s4.setFixed(PhysReg(32));
719 
720       Operand op_s8(bld.tmp(s8));
721       op_s8.setFixed(PhysReg(64));
722 
          /* Vector operands at v10, v[20:21] and v[30:33]. */
723       Operand op_v1(bld.tmp(v1));
724       op_v1.setFixed(PhysReg(256 + 10));
725 
726       Operand op_v2(bld.tmp(v2));
727       op_v2.setFixed(PhysReg(256 + 20));
728 
729       Operand op_v4(bld.tmp(v4));
730       op_v4.setFixed(PhysReg(256 + 30));
731 
          /* Basic sampling: 1D, then 2D with a two-register address, then a single-channel dmask. */
732       //~gfx11>> image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D ; f06c0f00 2010540a
733       //~gfx12>> image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D ; e7c6c000 10008054 0000000a
734       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1);
735 
736       //~gfx11! image_sample v[84:87], v[20:21], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; f06c0f04 20105414
737       //~gfx12! image_sample v[84:87], [v20, v21], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; e7c6c001 10008054 00001514
738       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v2)->mimg().dim =
739          ac_image_2d;
740 
741       //~gfx11! image_sample v42, v10, s[64:71], s[32:35] dmask:0x1 dim:SQ_RSRC_IMG_1D ; f06c0100 20102a0a
742       //~gfx12! image_sample v42, v10, s[64:71], s[32:35] dmask:0x1 dim:SQ_RSRC_IMG_1D ; e446c000 1000802a 0000000a
743       bld.mimg(aco_opcode::image_sample, dst_v1, op_s8, op_s4, Operand(v1), op_v1)->mimg().dmask =
744          0x1;
745 
746       /* Various flags */
747       ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
748       ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
749       ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
750       ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
          /* GFX12 fills the gfx12.scope / gfx12.temporal_hint fields; earlier levels set the ac_glc / ac_slc / ac_dlc bits in value. */
751       if (gfx >= GFX12) {
752          cache_coherent.gfx12.scope = gfx12_scope_device;
753          cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
754          cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
755          cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
756       } else {
757          cache_coherent.value = ac_glc;
758          cache_sys_coherent.value = ac_slc;
759          cache_non_temporal.value = ac_dlc;
760          cache_atomic_rtn.value = ac_glc;
761       }
762 
763       //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D dlc ; f06c2f00 2010540a
764       //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT ; e7c6c000 10108054 0000000a
765       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
766          cache_non_temporal;
767 
768       //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; f06c4f00 2010540a
769       //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_DEV ; e7c6c000 10088054 0000000a
770       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
771          cache_coherent;
772 
773       //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; f06c1f00 2010540a
774       //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; e7c6c000 100c8054 0000000a
775       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
776          cache_sys_coherent;
777 
          /* tfe: the expected destination grows to v[84:88]. */
778       //~gfx11! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; f06c0f00 2030540a
779       //~gfx12! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; e7c6c008 10008054 0000000a
780       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().tfe =
781          true;
782 
783       //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; f06c0f00 2050540a
784       //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; e7c6c000 10008154 0000000a
785       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().lwe =
786          true;
787 
788       //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D r128 ; f06c8f00 2010540a
789       //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D r128 ; e7c6c010 10008054 0000000a
790       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().r128 =
791          true;
792 
793       //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; f06d0f00 2010540a
794       //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; e7c6c040 10008054 0000000a
795       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().a16 =
796          true;
797 
          /* d16: the expected destination shrinks to v[84:85]. */
798       //~gfx11! image_sample v[84:85], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D d16 ; f06e0f00 2010540a
799       //~gfx12! image_sample v[84:85], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D d16 ; e7c6c020 10008054 0000000a
800       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().d16 =
801          true;
802 
803       /* NSA */
          /* The second address register (v40) is not contiguous with the first (v10). */
804       //~gfx11! image_sample v[84:87], [v10, v40], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; f06c0f05 2010540a 00000028
805       //~gfx12! image_sample v[84:87], [v10, v40], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; e7c6c001 10008054 0000280a
806       bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1,
807                Operand(bld.tmp(v1), PhysReg(256 + 40)))
808          ->mimg()
809          .dim = ac_image_2d;
810 
          /* Built by hand with create_instruction() (8 operands, 1 definition) instead of going through bld.mimg(). */
811       //~gfx11! image_bvh_intersect_ray v[84:87], [v40, v42, v[44:46], v[48:50], v[52:54]], s[32:35] ; f0648f81 00085428 34302c2a
812       //~gfx12! image_bvh_intersect_ray v[84:87], [v40, v42, v[44:46], v[48:50], v[52:54]], s[32:35] ; d3c64010 34004054 302c2a28
813       aco_ptr<Instruction> instr{
814          create_instruction(aco_opcode::image_bvh_intersect_ray, Format::MIMG, 8, 1)};
815       instr->definitions[0] = dst_v4;
816       instr->operands[0] = op_s4;
817       instr->operands[1] = Operand(s4);
818       instr->operands[2] = Operand(v1);
819       instr->operands[3] = Operand(PhysReg(256 + 40), v1); /* node */
820       instr->operands[4] = Operand(PhysReg(256 + 42), v1); /* tmax */
821       instr->operands[5] = Operand(PhysReg(256 + 44), v3); /* origin */
822       instr->operands[6] = Operand(PhysReg(256 + 48), v3); /* dir */
823       instr->operands[7] = Operand(PhysReg(256 + 52), v3); /* inv dir */
824       instr->mimg().dmask = 0xf;
825       instr->mimg().unrm = true;
826       instr->mimg().r128 = true;
827       bld.insert(std::move(instr));
828 
829       /* Stores */
          /* The sampler slot gets Operand(s4) here and no sampler appears in the expected output. */
830       //~gfx11! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; f0180f00 00101e0a
831       //~gfx12! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; d3c18000 0000801e 0000000a
832       bld.mimg(aco_opcode::image_store, op_s8, Operand(s4), op_v4, op_v1);
833 
          /* The definition reuses op_v1's register (v10); no cache flags are passed, so no glc / TH_ATOMIC_RETURN is expected. */
834       //~gfx11! image_atomic_add v10, v[20:21], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f0300104 00100a14
835       //~gfx12! image_atomic_add_uint v10, [v20, v21], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D ; d0430001 0000800a 00001514
836       bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
837                op_v1, op_v2, 0x1)
838          ->mimg()
839          .dim = ac_image_2d;
840 
841       /* Atomic with return */
842       //~gfx11! image_atomic_add v10, v[20:21], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D glc ; f0304104 00100a14
843       //~gfx12! image_atomic_add_uint v10, [v20, v21], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN ; d0430001 0010800a 00001514
844       bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
845                op_v1, op_v2, 0x1, false, false, false, cache_atomic_rtn)
846          ->mimg()
847          .dim = ac_image_2d;
848 
          /* Plain loads without a sampler: 2D image_load and 2D-MSAA-array image_msaa_load. */
849       //~gfx11! image_load v[84:87], v[20:21], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0000f04 00105414
850       //~gfx12! image_load v[84:87], [v20, v21], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; d3c00001 00008054 00001514
851       bld.mimg(aco_opcode::image_load, dst_v4, op_s8, Operand(s4), Operand(v1), op_v2)->mimg().dim =
852          ac_image_2d;
853 
854       //~gfx11! image_msaa_load v[84:87], v[30:33], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; f060011c 0010541e
855       //~gfx12! image_msaa_load v[84:87], [v30, v31, v32, v33], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; e4460007 00008054 21201f1e
856       bld.mimg(aco_opcode::image_msaa_load, dst_v4, op_s8, Operand(s4), Operand(v1), op_v4, 0x1)
857          ->mimg()
858          .dim = ac_image_2darraymsaa;
859 
860       finish_assembler_test();
861    }
862 END_TEST
863 
/* Assembler test for FLAT, GLOBAL and SCRATCH memory instructions on the gfx
 * levels returned by filter_gfx_levels({GFX11, GFX12}). It covers the address
 * operand combinations, immediate offsets, cache flags, a store and a
 * returning atomic. Each builder call is preceded by gfx11 and gfx12 pattern
 * lines with the expected disassembly and encoding.
 * NOTE(review): the pattern lines look like input for the test harness and are
 * presumably matched in emission order -- confirm before reordering statements
 * or adding line comments inside the test. */
864 BEGIN_TEST(assembler.flat)
865    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
          /* Skip this gfx level when setup_cs() reports failure. */
866       if (!setup_cs(NULL, gfx))
867          continue;
868 
          /* Fixed registers as they appear in the expected output: v42 (dst), s32, s[64:65], v10 and v[20:21]. */
869       Definition dst_v1 = bld.def(v1);
870       dst_v1.setFixed(PhysReg(256 + 42));
871 
872       Operand op_s1(bld.tmp(s1));
873       op_s1.setFixed(PhysReg(32));
874 
875       Operand op_s2(bld.tmp(s2));
876       op_s2.setFixed(PhysReg(64));
877 
878       Operand op_v1(bld.tmp(v1));
879       op_v1.setFixed(PhysReg(256 + 10));
880 
881       Operand op_v2(bld.tmp(v2));
882       op_v2.setFixed(PhysReg(256 + 20));
883 
884       /* Addressing */
          /* For global/scratch, Operand(s1) in the scalar slot and Operand(v1) in the vector slot print as "off" in the expected output. */
885       //~gfx11>> flat_load_b32 v42, v[20:21]                                 ; dc500000 2a7c0014
886       //~gfx12>> flat_load_b32 v42, v[20:21]                                 ; ec05007c 0000002a 00000014
887       bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1));
888 
889       //~gfx11! global_load_b32 v42, v[20:21], off                          ; dc520000 2a7c0014
890       //~gfx12! global_load_b32 v42, v[20:21], off                          ; ee05007c 0000002a 00000014
891       bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1));
892 
893       //~gfx11! global_load_b32 v42, v10, s[64:65]                          ; dc520000 2a40000a
894       //~gfx12! global_load_b32 v42, v10, s[64:65]                          ; ee050040 0000002a 0000000a
895       bld.global(aco_opcode::global_load_dword, dst_v1, op_v1, op_s2);
896 
897       //~gfx11! scratch_load_b32 v42, v10, off                              ; dc510000 2afc000a
898       //~gfx12! scratch_load_b32 v42, v10, off                              ; ed05007c 0002002a 0000000a
899       bld.scratch(aco_opcode::scratch_load_dword, dst_v1, op_v1, Operand(s1));
900 
901       //~gfx11! scratch_load_b32 v42, off, s32                              ; dc510000 2a200080
902       //~gfx12! scratch_load_b32 v42, off, s32                              ; ed050020 0000002a 00000000
903       bld.scratch(aco_opcode::scratch_load_dword, dst_v1, Operand(v1), op_s1);
904 
905       //~gfx11! scratch_load_b32 v42, v10, s32                              ; dc510000 2aa0000a
906       //~gfx12! scratch_load_b32 v42, v10, s32                              ; ed050020 0002002a 0000000a
907       bld.scratch(aco_opcode::scratch_load_dword, dst_v1, op_v1, op_s1);
908 
909       //~gfx11! scratch_load_b32 v42, off, off                              ; dc510000 2a7c0080
910       //~gfx12! scratch_load_b32 v42, off, off                              ; ed05007c 0000002a 00000000
911       bld.scratch(aco_opcode::scratch_load_dword, dst_v1, Operand(v1), Operand(s1));
912 
          /* Immediate offsets: one negative (-42) and one positive (84). */
913       //~gfx11! global_load_b32 v42, v[20:21], off offset:-42               ; dc521fd6 2a7c0014
914       //~gfx12! global_load_b32 v42, v[20:21], off offset:-42               ; ee05007c 0000002a ffffd614
915       bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), -42);
916 
917       //~gfx11! global_load_b32 v42, v[20:21], off offset:84                ; dc520054 2a7c0014
918       //~gfx12! global_load_b32 v42, v[20:21], off offset:84                ; ee05007c 0000002a 00005414
919       bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), 84);
920 
921       /* Various flags */
922       ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
923       ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
924       ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
925       ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
          /* GFX12 fills the gfx12.scope / gfx12.temporal_hint fields; earlier levels set the ac_glc / ac_slc / ac_dlc bits in value. */
926       if (gfx >= GFX12) {
927          cache_coherent.gfx12.scope = gfx12_scope_device;
928          cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
929          cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
930          cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
931       } else {
932          cache_coherent.value = ac_glc;
933          cache_sys_coherent.value = ac_slc;
934          cache_non_temporal.value = ac_dlc;
935          cache_atomic_rtn.value = ac_glc;
936       }
937 
938       //~gfx11! flat_load_b32 v42, v[20:21] slc                             ; dc508000 2a7c0014
939       //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_SYS                 ; ec05007c 000c002a 00000014
940       bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
941          cache_sys_coherent;
942 
943       //~gfx11! flat_load_b32 v42, v[20:21] glc                             ; dc504000 2a7c0014
944       //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_DEV                 ; ec05007c 0008002a 00000014
945       bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
946          cache_coherent;
947 
948       //~gfx11! flat_load_b32 v42, v[20:21] dlc                             ; dc502000 2a7c0014
949       //~gfx12! flat_load_b32 v42, v[20:21] th:TH_LOAD_NT                   ; ec05007c 0010002a 00000014
950       bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
951          cache_non_temporal;
952 
953       /* Stores */
954       //~gfx11! flat_store_b32 v[20:21], v10                                ; dc680000 007c0a14
955       //~gfx12! flat_store_b32 v[20:21], v10                                ; ec06807c 05000000 00000014
956       bld.flat(aco_opcode::flat_store_dword, op_v2, Operand(s1), op_v1);
957 
958       /* Atomic with return */
959       //~gfx11! global_atomic_add_u32 v42, v[20:21], v10, off glc           ; dcd64000 2a7c0a14
960       //~gfx12! global_atomic_add_u32 v42, v[20:21], v10, off th:TH_ATOMIC_RETURN ; ee0d407c 0510002a 00000014
961       bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().cache =
962          cache_atomic_rtn;
963 
964       finish_assembler_test();
965    }
966 END_TEST
967 
/* Assembler test for export instructions on the gfx levels returned by
 * filter_gfx_levels({GFX11, GFX12}). The expected mnemonic is "exp" on gfx11
 * and "export" on gfx12, with identical encodings in the pattern lines.
 * NOTE(review): the pattern lines look like input for the test harness and are
 * presumably matched in emission order -- confirm before reordering statements
 * or adding line comments inside the test. */
968 BEGIN_TEST(assembler.exp)
969    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
          /* Skip this gfx level when setup_cs() reports failure. */
970       if (!setup_cs(NULL, gfx))
971          continue;
972 
          /* op[i] is fixed to PhysReg(256 + i), printed as VGPR vi in the expected output. */
973       Operand op[4];
974       for (unsigned i = 0; i < 4; i++)
975          op[i] = Operand(PhysReg(256 + i), v1);
976 
          /* m0 operand; only passed to the row_en export at the end. */
977       Operand op_m0(bld.tmp(s1));
978       op_m0.setFixed(m0);
979 
          /* Mask 0xf with the value 3: all four sources are printed and the expected target is mrt3. */
980       //~gfx11>> exp mrt3 v1, v0, v3, v2                                     ; f800003f 02030001
981       //~gfx12>> export mrt3 v1, v0, v3, v2                                  ; f800003f 02030001
982       bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], 0xf, 3);
983 
          /* Mask 0x5: the two Operand(v1) slots print as "off" in the expected output. */
984       //~gfx11! exp mrt3 v1, off, v0, off                                   ; f8000035 80008001
985       //~gfx12! export mrt3 v1, off, v0, off                                ; f8000035 80008001
986       bld.exp(aco_opcode::exp, op[1], Operand(v1), op[0], Operand(v1), 0x5, 3);
987 
          /* The trailing (false, true) arguments make the expected output gain "done". */
988       //~gfx11! exp mrt3 v1, v0, v3, v2 done                                ; f800083f 02030001
989       //~gfx12! export mrt3 v1, v0, v3, v2 done                             ; f800083f 02030001
990       bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], 0xf, 3, false, true);
991 
          /* Extra m0 operand plus row_en set on the created instruction. */
992       //~gfx11! exp mrt3 v1, v0, v3, v2 row_en                              ; f800203f 02030001
993       //~gfx12! export mrt3 v1, v0, v3, v2 row_en                           ; f800203f 02030001
994       bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], op_m0, 0xf, 3)->exp().row_en = true;
995 
996       finish_assembler_test();
997    }
998 END_TEST
999 
/* Assembler test for the in-register VINTERP instructions on the gfx levels
 * returned by filter_gfx_levels({GFX11, GFX12}). The pattern lines here carry
 * no gfx variant prefix, so the same expected text applies to every level the
 * loop runs on. It covers wait_exp values, neg modifiers, op_sel bits and
 * clamp.
 * NOTE(review): the pattern lines look like input for the test harness and are
 * presumably matched in emission order -- confirm before reordering statements
 * or adding line comments inside the test. */
1000 BEGIN_TEST(assembler.vinterp)
1001    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
           /* Skip this gfx level when setup_cs() reports failure. */
1002       if (!setup_cs(NULL, gfx))
1003          continue;
1004 
           /* Fixed registers as they appear in the expected output: v42 (dst), v10, v20 and v30. */
1005       Definition dst = bld.def(v1);
1006       dst.setFixed(PhysReg(256 + 42));
1007 
1008       Operand op0(bld.tmp(v1));
1009       op0.setFixed(PhysReg(256 + 10));
1010 
1011       Operand op1(bld.tmp(v1));
1012       op1.setFixed(PhysReg(256 + 20));
1013 
1014       Operand op2(bld.tmp(v1));
1015       op2.setFixed(PhysReg(256 + 30));
1016 
           /* Write the LLVM major version into the test output; presumably the pattern below captures it as llvm_ver for the script lines further down -- confirm. */
1017       //! llvm_version: #llvm_ver
1018       fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1019 
           /* No extra arguments: the expected output shows wait_exp:7. */
1020       //>> v_interp_p10_f32 v42, v10, v20, v30 wait_exp:7              ; cd00072a 047a290a
1021       bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2);
1022 
           /* The two trailing integers show up as op_sel and wait_exp in the expected output. */
1023       //! v_interp_p10_f32 v42, v10, v20, v30 wait_exp:6              ; cd00062a 047a290a
1024       bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6);
1025 
           /* wait_exp of 0: the expected text depends on the LLVM version, the encoding does not. */
1026       //; if llvm_ver >= 18:
1027       //;    insert_pattern('v_interp_p2_f32 v42, v10, v20, v30 wait_exp:0               ; cd01002a 047a290a')
1028       //; else:
1029       //;    insert_pattern('v_interp_p2_f32 v42, v10, v20, v30                          ; cd01002a 047a290a')
1030       bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, dst, op0, op1, op2, 0, 0);
1031 
           /* neg modifier on each of the three sources in turn. */
1032       //! v_interp_p10_f32 v42, -v10, v20, v30 wait_exp:6             ; cd00062a 247a290a
1033       bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1034          ->vinterp_inreg()
1035          .neg[0] = true;
1036 
1037       //! v_interp_p10_f32 v42, v10, -v20, v30 wait_exp:6             ; cd00062a 447a290a
1038       bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1039          ->vinterp_inreg()
1040          .neg[1] = true;
1041 
1042       //! v_interp_p10_f32 v42, v10, v20, -v30 wait_exp:6             ; cd00062a 847a290a
1043       bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1044          ->vinterp_inreg()
1045          .neg[2] = true;
1046 
           /* One op_sel bit per call: 0x1, 0x2, 0x4 and 0x8 map to op_sel positions 0 to 3 in the expected output. */
1047       //! v_interp_p10_f16_f32 v42, v10, v20, v30 op_sel:[1,0,0,0] wait_exp:6 ; cd020e2a 047a290a
1048       bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, dst, op0, op1, op2, 0x1, 6);
1049 
1050       //! v_interp_p2_f16_f32 v42, v10, v20, v30 op_sel:[0,1,0,0] wait_exp:6 ; cd03162a 047a290a
1051       bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, dst, op0, op1, op2, 0x2, 6);
1052 
1053       //! v_interp_p10_rtz_f16_f32 v42, v10, v20, v30 op_sel:[0,0,1,0] wait_exp:6 ; cd04262a 047a290a
1054       bld.vinterp_inreg(aco_opcode::v_interp_p10_rtz_f16_f32_inreg, dst, op0, op1, op2, 0x4, 6);
1055 
1056       //! v_interp_p2_rtz_f16_f32 v42, v10, v20, v30 op_sel:[0,0,0,1] wait_exp:6 ; cd05462a 047a290a
1057       bld.vinterp_inreg(aco_opcode::v_interp_p2_rtz_f16_f32_inreg, dst, op0, op1, op2, 0x8, 6);
1058 
1059       //! v_interp_p10_f32 v42, v10, v20, v30 clamp wait_exp:6        ; cd00862a 047a290a
1060       bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1061          ->vinterp_inreg()
1062          .clamp = true;
1063 
1064       finish_assembler_test();
1065    }
1066 END_TEST
1067 
/* Assembler test for LDSDIR instructions (lds_direct_load / lds_param_load)
 * on the gfx levels returned by filter_gfx_levels({GFX11, GFX12}). In the
 * expected output gfx11 prints lds_* with wait_vdst, while gfx12 prints ds_*
 * with wait_va_vdst and wait_vm_vsrc.
 * NOTE(review): the pattern lines look like input for the test harness and are
 * presumably matched in emission order -- confirm before reordering statements
 * or adding line comments inside the test. */
1068 BEGIN_TEST(assembler.ldsdir)
1069    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
           /* Skip this gfx level when setup_cs() reports failure. */
1070       if (!setup_cs(NULL, gfx))
1071          continue;
1072 
           /* Destination is fixed to PhysReg(256 + 42), printed as v42 in the expected output. */
1073       Definition dst = bld.def(v1);
1074       dst.setFixed(PhysReg(256 + 42));
1075 
           /* m0 operand passed to every instruction below. */
1076       Operand op(bld.tmp(s1));
1077       op.setFixed(m0);
1078 
           /* Write the LLVM major version into the test output; presumably the pattern below captures it as llvm_ver for the script lines further down -- confirm. */
1079       //! llvm_version: #llvm_ver
1080       fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1081 
           /* lds_direct_load with wait_vdst set to 15, 6 and 0. */
1082       //~gfx11>> lds_direct_load v42 wait_vdst:15                           ; ce1f002a
1083       //~gfx12>> ds_direct_load v42 wait_va_vdst:15 wait_vm_vsrc:1          ; ce9f002a
1084       bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 15;
1085 
1086       //~gfx11! lds_direct_load v42 wait_vdst:6                             ; ce16002a
1087       //~gfx12! ds_direct_load v42 wait_va_vdst:6 wait_vm_vsrc:1            ; ce96002a
1088       bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 6;
1089 
           /* For wait_vdst of 0 the expected gfx11 text depends on the LLVM version; the encoding is the same in both branches. */
1090       //; if llvm_ver >= 18 and variant == 'gfx11':
1091       //;    insert_pattern('lds_direct_load v42 wait_vdst:0                             ; ce10002a')
1092       //; elif variant == 'gfx11':
1093       //;    insert_pattern('lds_direct_load v42                                         ; ce10002a')
1094       //~gfx12! ds_direct_load v42 wait_va_vdst:0 wait_vm_vsrc:1            ; ce90002a
1095       bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 0;
1096 
           /* lds_param_load: the two integers show up as the attribute number and channel (56, 0 prints as attr56.x). */
1097       //~gfx11! lds_param_load v42, attr56.x wait_vdst:8                    ; ce08e02a
1098       //~gfx12! ds_param_load v42, attr56.x wait_va_vdst:8 wait_vm_vsrc:1   ; ce88e02a
1099       bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0)->ldsdir().wait_vdst = 8;
1100 
1101       //; if llvm_ver >= 18 and variant == 'gfx11':
1102       //;    insert_pattern('lds_param_load v42, attr56.x wait_vdst:0                    ; ce00e02a')
1103       //; elif variant == 'gfx11':
1104       //;    insert_pattern('lds_param_load v42, attr56.x                                ; ce00e02a')
1105       //~gfx12! ds_param_load v42, attr56.x wait_va_vdst:0 wait_vm_vsrc:1   ; ce80e02a
1106       bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0)->ldsdir().wait_vdst = 0;
1107 
1108       //~gfx11! lds_param_load v42, attr34.y wait_vdst:8                    ; ce08892a
1109       //~gfx12! ds_param_load v42, attr34.y wait_va_vdst:8 wait_vm_vsrc:1   ; ce88892a
1110       bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1)->ldsdir().wait_vdst = 8;
1111 
1112       //~gfx11! lds_param_load v42, attr12.z wait_vdst:8                    ; ce08322a
1113       //~gfx12! ds_param_load v42, attr12.z wait_va_vdst:8 wait_vm_vsrc:1   ; ce88322a
1114       bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2)->ldsdir().wait_vdst = 8;
1115 
           /* wait_vsrc = 0 only changes the gfx12 expectation (wait_vm_vsrc:0); the gfx11 line equals the first pattern of this test. */
1116       //~gfx11>> lds_direct_load v42 wait_vdst:15                           ; ce1f002a
1117       //~gfx12>> ds_direct_load v42 wait_va_vdst:15 wait_vm_vsrc:0          ; ce1f002a
1118       bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vsrc = 0;
1119 
1120       finish_assembler_test();
1121    }
1122 END_TEST
1123 
/* Assembler test for 16-bit VOP1/VOP2/VOPC instructions that use VGPRs v128
 * and above, on the gfx levels returned by filter_gfx_levels({GFX11, GFX12}).
 * In the expected output only the all-low-register v_mul_f16 uses the _e32
 * form; every instruction with a v128+ destination or source is expected in
 * the _e64 form, including the DPP16 and DPP8 variants.
 * NOTE(review): the pattern lines look like input for the test harness and are
 * presumably matched in emission order -- confirm before reordering statements
 * or adding line comments inside the test. */
1124 BEGIN_TEST(assembler.vop12c_v128)
1125    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
           /* Skip this gfx level when setup_cs() reports failure. */
1126       if (!setup_cs(NULL, gfx))
1127          continue;
1128 
           /* Low (v0) and high (v128) destinations. */
1129       Definition dst_v0 = bld.def(v1);
1130       dst_v0.setFixed(PhysReg(256));
1131 
1132       Definition dst_v128 = bld.def(v1);
1133       dst_v128.setFixed(PhysReg(256 + 128));
1134 
           /* Low (v1, v2) and high (v129, v130) sources; the op_vN names refer to the register number, all are single-VGPR operands. */
1135       Operand op_v1(bld.tmp(v1));
1136       op_v1.setFixed(PhysReg(256 + 1));
1137 
1138       Operand op_v2(bld.tmp(v1));
1139       op_v2.setFixed(PhysReg(256 + 2));
1140 
1141       Operand op_v129(bld.tmp(v1));
1142       op_v129.setFixed(PhysReg(256 + 129));
1143 
1144       Operand op_v130(bld.tmp(v1));
1145       op_v130.setFixed(PhysReg(256 + 130));
1146 
           /* Write the LLVM major version into the test output; presumably the pattern below captures it as llvm_ver for the script lines further down -- confirm. */
1147       //! llvm_version: #llvm_ver
1148       fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1149 
           /* Baseline with low registers only; with LLVM 16 the expected text carries an error note, the encoding is the same. */
1150       //>> BB0:
1151       //; if llvm_ver == 16:
1152       //;    insert_pattern('v_mul_f16_e32 v0, v1, v2 ; Error: VGPR_32_Lo128: unknown register 128 ; 6a000501')
1153       //; else:
1154       //;    insert_pattern('v_mul_f16_e32 v0, v1, v2                                    ; 6a000501')
1155       bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v2);
1156 
           /* A high register as destination, first source or second source in turn. */
1157       //! v_mul_f16_e64 v128, v1, v2                                  ; d5350080 00020501
1158       bld.vop2(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
1159 
1160       //! v_mul_f16_e64 v0, v129, v2                                  ; d5350000 00020581
1161       bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
1162 
1163       //! v_mul_f16_e64 v0, v1, v130                                  ; d5350000 00030501
1164       bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
1165 
1166       //! v_rcp_f16_e64 v128, v1                                      ; d5d40080 00000101
1167       bld.vop1(aco_opcode::v_rcp_f16, dst_v128, op_v1);
1168 
1169       //! v_cmp_eq_f16_e64 vcc, v129, v2                              ; d402006a 00020581
1170       bld.vopc(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2);
1171 
           /* Same three placements with DPP16 (row_ror:1). */
1172       //! v_mul_f16_e64_dpp v128, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 000204fa ff0d2101
1173       bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1));
1174 
1175       //! v_mul_f16_e64_dpp v0, v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350000 000204fa ff0d2181
1176       bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2, dpp_row_rr(1));
1177 
1178       //! v_mul_f16_e64_dpp v0, v1, v130 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350000 000304fa ff0d2101
1179       bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130, dpp_row_rr(1));
1180 
           /* Same three placements with DPP8. */
1181       //! v_mul_f16_e64_dpp v128, v1, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1  ; d5350080 000204ea 00000001
1182       bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
1183 
1184       //! v_mul_f16_e64_dpp v0, v129, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1  ; d5350000 000204ea 00000081
1185       bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
1186 
1187       //! v_mul_f16_e64_dpp v0, v1, v130 dpp8:[0,0,0,0,0,0,0,0] fi:1  ; d5350000 000304ea 00000001
1188       bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
1189 
           /* v_fmaak_f16 / v_fmamk_f16 with a v128 destination are expected as v_fma_f16 with a 0x60 literal. */
1190       //! v_fma_f16 v128, v1, v2, 0x60                                ; d6480080 03fe0501 00000060
1191       bld.vop2(aco_opcode::v_fmaak_f16, dst_v128, op_v1, op_v2, Operand::literal32(96));
1192 
1193       //! v_fma_f16 v128, v1, 0x60, v2                                ; d6480080 0409ff01 00000060
1194       bld.vop2(aco_opcode::v_fmamk_f16, dst_v128, op_v1, op_v2, Operand::literal32(96));
1195 
           /* neg / abs set through the DPP16 instruction on source 0. */
1196       //! v_rcp_f16_e64_dpp v128, -v1 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5d40080 200000fa ff1d2101
1197       bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().neg[0] = true;
1198 
1199       //! v_rcp_f16_e64_dpp v128, |v1| row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5d40180 000000fa ff2d2101
1200       bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().abs[0] = true;
1201 
1202       //! v_mul_f16_e64_dpp v128, -v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 200204fa ff1d2101
1203       bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().neg[0] =
1204          true;
1205 
1206       //! v_mul_f16_e64_dpp v128, |v1|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350180 000204fa ff2d2101
1207       bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().abs[0] =
1208          true;
1209 
1210       //! v_cmp_eq_f16_e64_dpp vcc, -v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402006a 200204fa ff1d2181
1211       bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))
1212          ->dpp16()
1213          .neg[0] = true;
1214 
1215       //! v_cmp_eq_f16_e64_dpp vcc, |v129|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402016a 000204fa ff2d2181
1216       bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))
1217          ->dpp16()
1218          .abs[0] = true;
1219 
1220       finish_assembler_test();
1221    }
1222 END_TEST
1223 
/* Assembler test for VOP3 / VOP3P instructions combined with DPP16 and DPP8,
 * plus the e64 DPP builders for VOP1/VOP2/VOPC, on the gfx levels returned by
 * filter_gfx_levels({GFX11, GFX12}). The pattern lines carry no gfx variant
 * prefix, so the same expected text applies to every level the loop runs on.
 * NOTE(review): the pattern lines look like input for the test harness and are
 * presumably matched in emission order -- confirm before reordering statements
 * or adding line comments inside the test. */
1224 BEGIN_TEST(assembler.vop3_dpp)
1225    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
           /* Skip this gfx level when setup_cs() reports failure. */
1226       if (!setup_cs(NULL, gfx))
1227          continue;
1228 
           /* Fixed registers as they appear in the expected output: v0 (dst), s[4:5] (compare dst), v1, v2 and s1. */
1229       Definition dst_v0 = bld.def(v1);
1230       dst_v0.setFixed(PhysReg(256));
1231 
1232       Definition dst_non_vcc = bld.def(s2);
1233       dst_non_vcc.setFixed(PhysReg(4));
1234 
1235       Operand op_v1(bld.tmp(v1));
1236       op_v1.setFixed(PhysReg(256 + 1));
1237 
1238       Operand op_v2(bld.tmp(v1));
1239       op_v2.setFixed(PhysReg(256 + 2));
1240 
1241       Operand op_s1(bld.tmp(s1));
1242       op_s1.setFixed(PhysReg(1));
1243 
           /* Three-source instructions with DPP16: clamp on v_fma_f32, then abs on all sources of v_fma_mix_f32. */
1244       //>> BB0:
1245       //! v_fma_f32_e64_dpp v0, v1, v2, s1 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d6138000 000604fa ff0d2101
1246       bld.vop3_dpp(aco_opcode::v_fma_f32, dst_v0, op_v1, op_v2, op_s1, dpp_row_rr(1))->valu().clamp =
1247          true;
1248 
1249       //! v_fma_mix_f32_e64_dpp v0, |v1|, |v2|, |s1| op_sel:[1,0,0] op_sel_hi:[1,0,1] row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; cc204f00 080604fa ffad2101
1250       bld.vop3p_dpp(aco_opcode::v_fma_mix_f32, dst_v0, op_v1, op_v2, op_s1, 0x1, 0x5, dpp_row_rr(1))
1251          ->valu()
1252          .abs = 0x7;
1253 
           /* Three-source instructions with DPP8 and neg modifiers. */
1254       //! v_fma_f32_e64_dpp v0, -v1, -v2, -s1 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d6130000 e00604ea 00000001
1255       bld.vop3_dpp8(aco_opcode::v_fma_f32, dst_v0, op_v1, op_v2, op_s1)->valu().neg = 0x7;
1256 
1257       //! v_fma_mix_f32_e64_dpp v0, -v1, -v2, s1 op_sel_hi:[1,1,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; cc204000 780604ea 00000001
1258       bld.vop3p_dpp8(aco_opcode::v_fma_mix_f32, dst_v0, op_v1, op_v2, op_s1, 0x0, 0x7)->valu().neg =
1259          0x3;
1260 
           /* e64 DPP16 builders for VOP2, VOP1 and VOPC (the compare writes s[4:5] instead of vcc). */
1261       //! v_add_f32_e64_dpp v0, v1, v2 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5038000 000204fa ff0d2101
1262       bld.vop2_e64_dpp(aco_opcode::v_add_f32, dst_v0, op_v1, op_v2, dpp_row_rr(1))->valu().clamp =
1263          true;
1264 
1265       //! v_sqrt_f32_e64_dpp v0, v1 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5b38000 000000fa ff0d2101
1266       bld.vop1_e64_dpp(aco_opcode::v_sqrt_f32, dst_v0, op_v1, dpp_row_rr(1))->valu().clamp = true;
1267 
1268       //! v_cmp_lt_f32_e64_dpp s[4:5], |v1|, |v2| row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d4110304 000204fa ffad2101
1269       bld.vopc_e64_dpp(aco_opcode::v_cmp_lt_f32, dst_non_vcc, op_v1, op_v2, dpp_row_rr(1))
1270          ->valu()
1271          .abs = 0x3;
1272 
           /* e64 DPP8 builders; omod = 2 is expected to print as mul:4. */
1273       //! v_add_f32_e64_dpp v0, v1, v2 mul:4 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5030000 100204ea 00000001
1274       bld.vop2_e64_dpp8(aco_opcode::v_add_f32, dst_v0, op_v1, op_v2)->valu().omod = 2;
1275 
1276       //! v_sqrt_f32_e64_dpp v0, v1 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5b38000 000000ea 00000001
1277       bld.vop1_e64_dpp8(aco_opcode::v_sqrt_f32, dst_v0, op_v1)->valu().clamp = true;
1278 
1279       //! v_cmp_lt_f32_e64_dpp s[4:5], |v1|, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d4110104 000204ea 00000001
1280       bld.vopc_e64_dpp8(aco_opcode::v_cmp_lt_f32, dst_non_vcc, op_v1, op_v2)->valu().abs = 0x1;
1281 
1282       finish_assembler_test();
1283    }
1284 END_TEST
1285 
1286 BEGIN_TEST(assembler.vopd)
1287    for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
1288       if (!setup_cs(NULL, gfx))
1289          continue;
1290 
1291       program->wave_size = 32;
1292 
1293       Definition dst_v0 = bld.def(v1);
1294       dst_v0.setFixed(PhysReg(256));
1295 
1296       Definition dst_v1 = bld.def(v1);
1297       dst_v1.setFixed(PhysReg(256 + 1));
1298 
1299       Operand op_v0(bld.tmp(v1));
1300       op_v0.setFixed(PhysReg(256 + 0));
1301 
1302       Operand op_v1(bld.tmp(v1));
1303       op_v1.setFixed(PhysReg(256 + 1));
1304 
1305       Operand op_v2(bld.tmp(v1));
1306       op_v2.setFixed(PhysReg(256 + 2));
1307 
1308       Operand op_v3(bld.tmp(v1));
1309       op_v3.setFixed(PhysReg(256 + 3));
1310 
1311       Operand op_s0(bld.tmp(s1));
1312       op_s0.setFixed(PhysReg(0));
1313 
1314       Operand op_vcc(bld.tmp(s1));
1315       op_vcc.setFixed(vcc);
1316 
1317       //>> BB0:
1318       //! v_dual_mov_b32 v0, v0 :: v_dual_mov_b32 v1, v1 ; ca100100 00000101
1319       bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1,
1320                aco_opcode::v_dual_mov_b32);
1321 
1322       //! v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v1, s0 ; ca1000ff 00000000 00000060
1323       bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, Operand::c32(96), op_s0,
1324                aco_opcode::v_dual_mov_b32);
1325 
1326       //! v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0x60 ; ca100000 000000ff 00000060
1327       bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_s0, Operand::c32(96),
1328                aco_opcode::v_dual_mov_b32);
1329 
1330       //! v_dual_mul_f32 v0, v0, v1 :: v_dual_mov_b32 v1, v2 ; c8d00300 00000102
1331       bld.vopd(aco_opcode::v_dual_mul_f32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
1332                aco_opcode::v_dual_mov_b32);
1333 
1334       //! v_dual_fmac_f32 v0, v1, v2 :: v_dual_mov_b32 v1, v3 ; c8100501 00000103
1335       bld.vopd(aco_opcode::v_dual_fmac_f32, dst_v0, dst_v1, op_v1, op_v2, op_v0, op_v3,
1336                aco_opcode::v_dual_mov_b32);
1337 
1338       //! v_dual_mov_b32 v0, v0 :: v_dual_and_b32 v1, v1, v2 ; ca240100 00000501
1339       bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
1340                aco_opcode::v_dual_and_b32);
1341 
1342       //! v_dual_cndmask_b32 v0, v0, v1 :: v_dual_cndmask_b32 v1, v2, v3 ; ca520300 00000702
1343       bld.vopd(aco_opcode::v_dual_cndmask_b32, dst_v0, dst_v1, op_v0, op_v1, op_vcc, op_v2, op_v3,
1344                op_vcc, aco_opcode::v_dual_cndmask_b32);
1345 
1346       finish_assembler_test();
1347    }
1348 END_TEST
1349 
1350 BEGIN_TEST(assembler.pseudo_scalar_trans)
1351    if (LLVM_VERSION_MAJOR < 19 || !setup_cs(NULL, GFX12))
1352       return;
1353 
1354    //>> v_s_sqrt_f32 s5, s1                                         ; d6880005 00000001
1355    bld.vop3(aco_opcode::v_s_sqrt_f32, Definition(PhysReg(5), s1), Operand(PhysReg(1), s1));
1356 
1357    finish_assembler_test();
1358 END_TEST
1359 
1360 BEGIN_TEST(assembler.vintrp_high_16bits)
1361    for (unsigned i = GFX8; i <= GFX10; i++) {
1362       if (!setup_cs(NULL, (amd_gfx_level)i))
1363          continue;
1364 
1365       Definition dst_v0 = bld.def(v1);
1366       dst_v0.setFixed(PhysReg(256));
1367 
1368       Definition dst_v1 = bld.def(v1);
1369       dst_v1.setFixed(PhysReg(256 + 1));
1370 
1371       Operand op_v0(bld.tmp(v1));
1372       op_v0.setFixed(PhysReg(256 + 0));
1373 
1374       Operand op_v1(bld.tmp(v1));
1375       op_v1.setFixed(PhysReg(256 + 1));
1376 
1377       Operand op_v2(bld.tmp(v1));
1378       op_v2.setFixed(PhysReg(256 + 2));
1379 
1380       Operand op_m0(bld.tmp(s1));
1381       op_m0.setFixed(m0);
1382 
1383       aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
1384 
1385       if (bld.program->gfx_level == GFX8)
1386          interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
1387 
1388       //! BB0:
1389       //~gfx8! v_interp_p1ll_f16 v0, v1, attr4.y high                      ; d2740000 00020344
1390       //~gfx9! v_interp_p1ll_f16 v0, v1, attr4.y high                      ; d2740000 00020344
1391       //~gfx10! v_interp_p1ll_f16 v0, v1, attr4.y high                      ; d7420000 00020344
1392       bld.vintrp(aco_opcode::v_interp_p1ll_f16, dst_v0, op_v1, op_m0, 4, 1, true);
1393 
1394       //~gfx8! v_interp_p2_f16 v1, v2, attr4.y, v0 high                    ; d2760001 04020544
1395       //~gfx9! v_interp_p2_f16 v1, v2, attr4.y, v0 high                    ; d2770001 04020544
1396       //~gfx10! v_interp_p2_f16 v1, v2, attr4.y, v0 high                    ; d75a0001 04020544
1397       bld.vintrp(interp_p2_op, dst_v1, op_v2, op_m0, op_v0, 4, 1, true);
1398 
1399       finish_assembler_test();
1400    }
1401 END_TEST
1402