1 /*
2 * Copyright © 2020 Valve Corporation
3 *
4 * SPDX-License-Identifier: MIT
5 */
6 #include <llvm/Config/llvm-config.h>
7
8 #include "helpers.h"
9 #include "sid.h"
10
11 using namespace aco;
12
13 static std::vector<amd_gfx_level>
filter_gfx_levels(std::vector<amd_gfx_level> src)14 filter_gfx_levels(std::vector<amd_gfx_level> src)
15 {
16 std::vector<amd_gfx_level> res;
17 for (amd_gfx_level gfx : src) {
18 if (gfx < GFX12 || LLVM_VERSION_MAJOR >= 19)
19 res.push_back(gfx);
20 }
21 return res;
22 }
23
24 BEGIN_TEST(assembler.s_memtime)
25 for (unsigned i = GFX6; i <= GFX10; i++) {
26 if (!setup_cs(NULL, (amd_gfx_level)i))
27 continue;
28
29 //~gfx[6-7]>> c7800000
30 //~gfx[6-7]! bf810000
31 //~gfx[8-9]>> s_memtime s[0:1] ; c0900000 00000000
32 //~gfx10>> s_memtime s[0:1] ; f4900000 fa000000
33 bld.smem(aco_opcode::s_memtime, bld.def(s2)).def(0).setFixed(PhysReg{0});
34
35 finish_assembler_test();
36 }
37 END_TEST
38
39 BEGIN_TEST(assembler.branch_3f)
40 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
41 return;
42
43 //! BB0:
44 //! s_branch BB1 ; bf820040
45 //! s_nop 0 ; bf800000
46 bld.sopp(aco_opcode::s_branch, 1);
47
48 for (unsigned i = 0; i < 0x3f; i++)
49 bld.vop1(aco_opcode::v_nop);
50
51 bld.reset(program->create_and_insert_block());
52
53 program->blocks[1].linear_preds.push_back(0u);
54
55 finish_assembler_test();
56 END_TEST
57
58 BEGIN_TEST(assembler.long_jump.unconditional_forwards)
59 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
60 return;
61
62 //!BB0:
63 //! s_branch 16369 ; bf823ff1
64 bld.sopp(aco_opcode::s_branch, 2);
65
66 bld.reset(program->create_and_insert_block());
67
68 //! s_nop 0 ; bf800000
69 //!(then repeated 16366 times)
70 for (unsigned i = 0; i < INT16_MAX + 1; i++)
71 bld.sopp(aco_opcode::s_nop, 0);
72
73 //! s_waitcnt_vscnt null, 0x0 ; bbfd0000
74 //! s_branch 1 ; bf820001
75 //! s_branch BB2 ; bf824011
76 //! s_nop 0 ; bf800000
77 //! (then repeated 16400 times)
78 //! BB2:
79 //! s_endpgm ; bf810000
80 bld.reset(program->create_and_insert_block());
81
82 program->blocks[2].linear_preds.push_back(0u);
83 program->blocks[2].linear_preds.push_back(1u);
84
85 finish_assembler_test();
86 END_TEST
87
88 BEGIN_TEST(assembler.long_jump.conditional_forwards)
89 for (amd_gfx_level gfx : filter_gfx_levels({GFX10, GFX12})) {
90 if (!setup_cs(NULL, gfx))
91 continue;
92
93 //! BB0:
94 //~gfx10! s_cbranch_scc0 16369 ; bf843ff1
95 //~gfx12! s_cbranch_scc0 16368 ; bfa13ff0
96 bld.sopp(aco_opcode::s_cbranch_scc0, 2);
97
98 bld.reset(program->create_and_insert_block());
99
100 //! BB1:
101 //! s_nop 0 ; bf800000
102 //!(then repeated 16366 times)
103 //~gfx10! s_waitcnt_vscnt null, 0x0 ; bbfd0000
104 //! s_branch 1 ; $_
105 //! s_branch BB2 ; $_
106 //! s_nop 0 ; bf800000
107 //!(then repeated 16400 times)
108 for (unsigned i = 0; i < INT16_MAX + 1; i++)
109 bld.sopp(aco_opcode::s_nop, 0);
110
111 //! BB2:
112 //! s_endpgm ; $_
113 bld.reset(program->create_and_insert_block());
114
115 program->blocks[1].linear_preds.push_back(0u);
116 program->blocks[2].linear_preds.push_back(0u);
117 program->blocks[2].linear_preds.push_back(1u);
118
119 finish_assembler_test();
120 }
121 END_TEST
122
123 BEGIN_TEST(assembler.long_jump.unconditional_backwards)
124 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
125 return;
126
127 //!BB0:
128 //! s_nop 0 ; bf800000
129 //!(then repeated 16367 times)
130 for (unsigned i = 0; i < INT16_MAX + 1; i++)
131 bld.sopp(aco_opcode::s_nop, 0);
132
133 //! s_waitcnt_vscnt null, 0x0 ; bbfd0000
134 //! s_branch 1 ; bf820001
135 //! s_branch BB0 ; bf82c00d
136 //! s_nop 0 ; bf800000
137 //! (then repeated 16399 times)
138 //! s_branch 49134 ; bf82bfee
139 bld.sopp(aco_opcode::s_branch, 0);
140
141 //! BB1:
142 //! s_endpgm ; bf810000
143 bld.reset(program->create_and_insert_block());
144
145 program->blocks[0].linear_preds.push_back(0u);
146 program->blocks[1].linear_preds.push_back(0u);
147
148 finish_assembler_test();
149 END_TEST
150
151 BEGIN_TEST(assembler.long_jump.conditional_backwards)
152 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
153 return;
154
155 //!BB0:
156 //! s_nop 0 ; bf800000
157 //!(then repeated 16367 times)
158 for (unsigned i = 0; i < INT16_MAX + 1; i++)
159 bld.sopp(aco_opcode::s_nop, 0);
160
161 //! s_waitcnt_vscnt null, 0x0 ; bbfd0000
162 //! s_branch 1 ; bf820001
163 //! s_branch BB0 ; bf82c00d
164 //! s_nop 0 ; bf800000
165 //!(then repeated 16399 times)
166 //! s_cbranch_execnz 49134 ; bf89bfee
167 bld.sopp(aco_opcode::s_cbranch_execnz, 0);
168
169 //! BB1:
170 //! s_endpgm ; bf810000
171 bld.reset(program->create_and_insert_block());
172
173 program->blocks[0].linear_preds.push_back(0u);
174 program->blocks[1].linear_preds.push_back(0u);
175
176 finish_assembler_test();
177 END_TEST
178
179 BEGIN_TEST(assembler.long_jump.constaddr)
180 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
181 return;
182
183 //>> s_branch 16369 ; bf823ff1
184 bld.sopp(aco_opcode::s_branch, 2);
185
186 bld.reset(program->create_and_insert_block());
187
188 for (unsigned i = 0; i < INT16_MAX + 1; i++)
189 bld.sopp(aco_opcode::s_nop, 0);
190
191 bld.reset(program->create_and_insert_block());
192
193 //>> s_getpc_b64 s[0:1] ; be801f00
194 //! s_add_u32 s0, s0, 32 ; 8000ff00 00000020
195 bld.sop1(aco_opcode::p_constaddr_getpc, Definition(PhysReg(0), s2), Operand::zero());
196 bld.sop2(aco_opcode::p_constaddr_addlo, Definition(PhysReg(0), s1), bld.def(s1, scc),
197 Operand(PhysReg(0), s1), Operand::zero(), Operand::zero());
198
199 program->blocks[2].linear_preds.push_back(0u);
200 program->blocks[2].linear_preds.push_back(1u);
201
202 finish_assembler_test();
203 END_TEST
204
205 BEGIN_TEST(assembler.long_jump.discard_early_exit)
206 if (!setup_cs(NULL, (amd_gfx_level)GFX10))
207 return;
208
209 //! BB0:
210 //! s_cbranch_scc0 16369 ; bf843ff1
211 bld.sopp(aco_opcode::s_cbranch_scc0, 2);
212
213 bld.reset(program->create_and_insert_block());
214
215 //! BB1:
216 //! s_nop 1 ; bf800001
217 //! (then repeated 16366 times)
218 //! s_waitcnt_vscnt null, 0x0 ; bbfd0000
219 //! s_branch 1 ; bf820001
220 //! s_branch BB2 ; bf824011
221 //! s_nop 1 ; bf800001
222 //! (then repeated 16399 times)
223 //! s_endpgm ; bf810000
224 for (unsigned i = 0; i < INT16_MAX; i++)
225 bld.sopp(aco_opcode::s_nop, 1);
226
227 //! BB2:
228 //! s_endpgm ; bf810000
229 bld.reset(program->create_and_insert_block());
230
231 program->blocks[1].linear_preds.push_back(0u);
232 program->blocks[2].linear_preds.push_back(0u);
233 program->blocks[2].kind = block_kind_discard_early_exit;
234
235 finish_assembler_test();
236 END_TEST
237
238 BEGIN_TEST(assembler.v_add3)
239 for (unsigned i = GFX9; i <= GFX10; i++) {
240 if (!setup_cs(NULL, (amd_gfx_level)i))
241 continue;
242
243 //~gfx9>> v_add3_u32 v0, 0, 0, 0 ; d1ff0000 02010080
244 //~gfx10>> v_add3_u32 v0, 0, 0, 0 ; d76d0000 02010080
245 aco_ptr<Instruction> add3{create_instruction(aco_opcode::v_add3_u32, Format::VOP3, 3, 1)};
246 add3->operands[0] = Operand::zero();
247 add3->operands[1] = Operand::zero();
248 add3->operands[2] = Operand::zero();
249 add3->definitions[0] = Definition(PhysReg(0), v1);
250 bld.insert(std::move(add3));
251
252 finish_assembler_test();
253 }
254 END_TEST
255
256 BEGIN_TEST(assembler.v_add3_clamp)
257 for (unsigned i = GFX9; i <= GFX10; i++) {
258 if (!setup_cs(NULL, (amd_gfx_level)i))
259 continue;
260
261 //~gfx9>> integer addition + clamp ; d1ff8000 02010080
262 //~gfx10>> integer addition + clamp ; d76d8000 02010080
263 aco_ptr<Instruction> add3{create_instruction(aco_opcode::v_add3_u32, Format::VOP3, 3, 1)};
264 add3->operands[0] = Operand::zero();
265 add3->operands[1] = Operand::zero();
266 add3->operands[2] = Operand::zero();
267 add3->definitions[0] = Definition(PhysReg(0), v1);
268 add3->valu().clamp = 1;
269 bld.insert(std::move(add3));
270
271 finish_assembler_test();
272 }
273 END_TEST
274
275 BEGIN_TEST(assembler.smem_offset)
276 for (unsigned i = GFX9; i <= GFX10; i++) {
277 if (!setup_cs(NULL, (amd_gfx_level)i))
278 continue;
279
280 Definition dst(PhysReg(7), s1);
281 Operand sbase(PhysReg(6), s2);
282 Operand offset(PhysReg(5), s1);
283
284 //~gfx9>> s_load_dword s7, s[6:7], s5 ; c00001c3 00000005
285 //~gfx10>> s_load_dword s7, s[6:7], s5 ; f40001c3 0a000000
286 bld.smem(aco_opcode::s_load_dword, dst, sbase, offset);
287 //~gfx9! s_load_dword s7, s[6:7], 0x42 ; c00201c3 00000042
288 //~gfx10! s_load_dword s7, s[6:7], 0x42 ; f40001c3 fa000042
289 bld.smem(aco_opcode::s_load_dword, dst, sbase, Operand::c32(0x42));
290 if (i >= GFX9) {
291 //~gfx9! s_load_dword s7, s[6:7], s5 offset:0x42 ; c00241c3 0a000042
292 //~gfx10! s_load_dword s7, s[6:7], s5 offset:0x42 ; f40001c3 0a000042
293 bld.smem(aco_opcode::s_load_dword, dst, sbase, Operand::c32(0x42), offset);
294 }
295
296 finish_assembler_test();
297 }
298 END_TEST
299
300 BEGIN_TEST(assembler.p_constaddr)
301 if (!setup_cs(NULL, GFX9))
302 return;
303
304 Definition dst0 = bld.def(s2);
305 Definition dst1 = bld.def(s2);
306 dst0.setFixed(PhysReg(0));
307 dst1.setFixed(PhysReg(2));
308
309 //>> s_getpc_b64 s[0:1] ; be801c00
310 //! s_add_u32 s0, s0, 44 ; 8000ff00 0000002c
311 bld.pseudo(aco_opcode::p_constaddr, dst0, bld.def(s1, scc), Operand::zero());
312
313 //! s_getpc_b64 s[2:3] ; be821c00
314 //! s_add_u32 s2, s2, 64 ; 8002ff02 00000040
315 bld.pseudo(aco_opcode::p_constaddr, dst1, bld.def(s1, scc), Operand::c32(32));
316
317 aco::lower_to_hw_instr(program.get());
318 finish_assembler_test();
319 END_TEST
320
321 BEGIN_TEST(assembler.vopc_sdwa)
322 for (unsigned i = GFX9; i <= GFX10; i++) {
323 if (!setup_cs(NULL, (amd_gfx_level)i))
324 continue;
325
326 //~gfx9>> v_cmp_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 86860080
327 //~gfx10>> v_cmp_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 86860080
328 bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(vcc, s2), Operand::zero(),
329 Operand::zero());
330
331 //~gfx9! v_cmp_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 8686ac80
332 //~gfx10! v_cmp_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 8686ac80
333 bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(PhysReg(0x2c), s2), Operand::zero(),
334 Operand::zero());
335
336 //~gfx9! v_cmp_lt_u32_sdwa exec, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d9300f9 8686fe80
337 //~gfx10! v_cmp_lt_u32_sdwa exec, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7d8300f9 8686fe80
338 bld.vopc_sdwa(aco_opcode::v_cmp_lt_u32, Definition(exec, s2), Operand::zero(),
339 Operand::zero());
340
341 if (i == GFX10) {
342 //~gfx10! v_cmpx_lt_u32_sdwa 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7da300f9 86860080
343 bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(exec, s2), Operand::zero(),
344 Operand::zero());
345 } else {
346 //~gfx9! v_cmpx_lt_u32_sdwa vcc, 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7db300f9 86860080
347 bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(vcc, s2), Definition(exec, s2),
348 Operand::zero(), Operand::zero());
349
350 //~gfx9! v_cmpx_lt_u32_sdwa s[44:45], 0, 0 src0_sel:DWORD src1_sel:DWORD ; 7db300f9 8686ac80
351 bld.vopc_sdwa(aco_opcode::v_cmpx_lt_u32, Definition(PhysReg(0x2c), s2),
352 Definition(exec, s2), Operand::zero(), Operand::zero());
353 }
354
355 finish_assembler_test();
356 }
357 END_TEST
358
359 BEGIN_TEST(assembler.smem)
360 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
361 if (!setup_cs(NULL, gfx))
362 continue;
363
364 Definition dst = bld.def(s1);
365 dst.setFixed(PhysReg(4));
366
367 Operand op_s1(bld.tmp(s1));
368 op_s1.setFixed(PhysReg(8));
369
370 Operand op_s2(bld.tmp(s2));
371 op_s2.setFixed(PhysReg(16));
372
373 Operand op_s4(bld.tmp(s4));
374 op_s4.setFixed(PhysReg(32));
375
376 //~gfx11>> s_dcache_inv ; f4840000 f8000000
377 //~gfx12>> s_dcache_inv ; f4042000 f8000000
378 bld.smem(aco_opcode::s_dcache_inv);
379
380 //! s_load_b32 s4, s[16:17], 0x2a ; f4000108 f800002a
381 bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42));
382
383 //~gfx11! s_load_b32 s4, s[16:17], s8 ; f4000108 10000000
384 //~gfx12! s_load_b32 s4, s[16:17], s8 offset:0x0 ; f4000108 10000000
385 bld.smem(aco_opcode::s_load_dword, dst, op_s2, op_s1);
386
387 //! s_load_b32 s4, s[16:17], s8 offset:0x2a ; f4000108 1000002a
388 bld.smem(aco_opcode::s_load_dword, dst, op_s2, Operand::c32(42), op_s1);
389
390 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
391 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
392 if (gfx >= GFX12) {
393 cache_coherent.gfx12.scope = gfx12_scope_device;
394 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
395 } else {
396 cache_coherent.value = ac_glc;
397 cache_non_temporal.value = ac_dlc;
398 }
399
400 //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 glc ; f4204110 10000000
401 //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 scope:SCOPE_DEV ; f4420110 10000000
402 bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache = cache_coherent;
403
404 //~gfx11! s_buffer_load_b32 s4, s[32:35], s8 dlc ; f4202110 10000000
405 //~gfx12! s_buffer_load_b32 s4, s[32:35], s8 offset:0x0 th:TH_LOAD_NT ; f4820110 10000000
406 bld.smem(aco_opcode::s_buffer_load_dword, dst, op_s4, op_s1)->smem().cache =
407 cache_non_temporal;
408
409 finish_assembler_test();
410 }
411 END_TEST
412
413 BEGIN_TEST(assembler.mubuf)
414 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
415 if (!setup_cs(NULL, gfx))
416 continue;
417
418 Definition dst = bld.def(v1);
419 dst.setFixed(PhysReg(256 + 42));
420
421 Operand op_s4(bld.tmp(s4));
422 op_s4.setFixed(PhysReg(32));
423
424 Operand op_v1(bld.tmp(v1));
425 op_v1.setFixed(PhysReg(256 + 10));
426
427 Operand op_v2(bld.tmp(v2));
428 op_v2.setFixed(PhysReg(256 + 20));
429
430 Operand op_s1(bld.tmp(s1));
431 op_s1.setFixed(PhysReg(30));
432
433 Operand op_m0(bld.tmp(s1));
434 op_m0.setFixed(m0);
435
436 //! llvm_version: #llvm_ver
437 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
438
439 /* Addressing */
440 //~gfx11>> buffer_load_b32 v42, off, s[32:35], s30 ; e0500000 1e082a80
441 //~gfx12>> buffer_load_b32 v42, off, s[32:35], s30 ; c405001e 0080402a 00000000
442 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 0, false);
443
444 //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 ; e0500000 80082a80
445 //~gfx12! buffer_load_b32 v42, off, s[32:35], null ; c405007c 0080402a 00000000
446 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false);
447
448 //~gfx11! buffer_load_b32 v42, off, s[32:35], 42 ; e0500000 aa082a80
449 if (gfx == GFX11)
450 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::c32(42), 0,
451 false);
452
453 //~gfx11! buffer_load_b32 v42, v10, s[32:35], s30 offen ; e0500000 1e482a0a
454 //~gfx12! buffer_load_b32 v42, v10, s[32:35], s30 offen ; c405001e 4080402a 0000000a
455 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, true);
456
457 //~gfx11! buffer_load_b32 v42, v10, s[32:35], s30 idxen ; e0500000 1e882a0a
458 //~gfx12! buffer_load_b32 v42, v10, s[32:35], s30 idxen ; c405001e 8080402a 0000000a
459 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v1, op_s1, 0, false)->mubuf().idxen =
460 true;
461
462 //~gfx11! buffer_load_b32 v42, v[20:21], s[32:35], s30 idxen offen ; e0500000 1ec82a14
463 //~gfx12! buffer_load_b32 v42, v[20:21], s[32:35], s30 idxen offen ; c405001e c080402a 00000014
464 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, op_v2, op_s1, 0, true)->mubuf().idxen =
465 true;
466
467 //~gfx11! buffer_load_b32 v42, off, s[32:35], s30 offset:84 ; e0500054 1e082a80
468 //~gfx12! buffer_load_b32 v42, off, s[32:35], s30 offset:84 ; c405001e 0080402a 00005400
469 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), op_s1, 84, false);
470
471 /* Various flags */
472 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
473 ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
474 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
475 ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
476 if (gfx >= GFX12) {
477 cache_coherent.gfx12.scope = gfx12_scope_device;
478 cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
479 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
480 cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
481 } else {
482 cache_coherent.value = ac_glc;
483 cache_sys_coherent.value = ac_slc;
484 cache_non_temporal.value = ac_dlc;
485 cache_atomic_rtn.value = ac_glc;
486 }
487
488 //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 glc ; e0504000 80082a80
489 //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_DEV ; c405007c 0088402a 00000000
490 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
491 ->mubuf()
492 .cache = cache_coherent;
493
494 //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 dlc ; e0502000 80082a80
495 //~gfx12! buffer_load_b32 v42, off, s[32:35], null th:TH_LOAD_NT ; c405007c 0090402a 00000000
496 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
497 ->mubuf()
498 .cache = cache_non_temporal;
499
500 //~gfx11! buffer_load_b32 v42, off, s[32:35], 0 slc ; e0501000 80082a80
501 //~gfx12! buffer_load_b32 v42, off, s[32:35], null scope:SCOPE_SYS ; c405007c 008c402a 00000000
502 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
503 ->mubuf()
504 .cache = cache_sys_coherent;
505
506 //; if llvm_ver >= 16 and variant == 'gfx11':
507 //; insert_pattern('buffer_load_b32 v[42:43], off, s[32:35], 0 tfe ; e0500000 80282a80')
508 //; elif variant == 'gfx11':
509 //; insert_pattern('buffer_load_b32 v42, off, s[32:35], 0 tfe ; e0500000 80282a80')
510 //~gfx12! buffer_load_b32 v[42:43], off, s[32:35], null tfe ; c445007c 0080402a 00000000
511 bld.mubuf(aco_opcode::buffer_load_dword, dst, op_s4, Operand(v1), Operand::zero(), 0, false)
512 ->mubuf()
513 .tfe = true;
514
515 /* LDS */
516 if (gfx == GFX11) {
517 //~gfx11! buffer_load_lds_b32 off, s[32:35], 0 ; e0c40000 80080080
518 bld.mubuf(aco_opcode::buffer_load_dword, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
519 false)
520 ->mubuf()
521 .lds = true;
522
523 //~gfx11! buffer_load_lds_i8 off, s[32:35], 0 ; e0b80000 80080080
524 bld.mubuf(aco_opcode::buffer_load_sbyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
525 false)
526 ->mubuf()
527 .lds = true;
528
529 //~gfx11! buffer_load_lds_i16 off, s[32:35], 0 ; e0c00000 80080080
530 bld.mubuf(aco_opcode::buffer_load_sshort, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
531 false)
532 ->mubuf()
533 .lds = true;
534
535 //~gfx11! buffer_load_lds_u8 off, s[32:35], 0 ; e0b40000 80080080
536 bld.mubuf(aco_opcode::buffer_load_ubyte, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
537 false)
538 ->mubuf()
539 .lds = true;
540
541 //~gfx11! buffer_load_lds_u16 off, s[32:35], 0 ; e0bc0000 80080080
542 bld.mubuf(aco_opcode::buffer_load_ushort, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
543 false)
544 ->mubuf()
545 .lds = true;
546
547 //~gfx11! buffer_load_lds_format_x off, s[32:35], 0 ; e0c80000 80080080
548 bld.mubuf(aco_opcode::buffer_load_format_x, op_s4, Operand(v1), Operand::zero(), op_m0, 0,
549 false)
550 ->mubuf()
551 .lds = true;
552 }
553
554 /* Stores */
555 //~gfx11! buffer_store_b32 v10, off, s[32:35], s30 ; e0680000 1e080a80
556 //~gfx12! buffer_store_b32 v10, off, s[32:35], s30 ; c406801e 0080400a 00000000
557 bld.mubuf(aco_opcode::buffer_store_dword, op_s4, Operand(v1), op_s1, op_v1, 0, false);
558
559 //~gfx11! buffer_store_b64 v[20:21], v10, s[32:35], s30 offen ; e06c0000 1e48140a
560 //~gfx12! buffer_store_b64 v[20:21], v10, s[32:35], s30 offen ; c406c01e 40804014 0000000a
561 bld.mubuf(aco_opcode::buffer_store_dwordx2, op_s4, op_v1, op_s1, op_v2, 0, true);
562
563 /* Atomic with return */
564 //~gfx11! buffer_atomic_add_u32 v10, off, s[32:35], 0 glc ; e0d44000 80080a80
565 //~gfx12! buffer_atomic_add_u32 v10, off, s[32:35], null th:TH_ATOMIC_RETURN ; c40d407c 0090400a 00000000
566 bld.mubuf(aco_opcode::buffer_atomic_add, Definition(op_v1.physReg(), v1), op_s4, Operand(v1),
567 Operand::zero(), op_v1, 0, false)
568 ->mubuf()
569 .cache = cache_atomic_rtn;
570
571 finish_assembler_test();
572 }
573 END_TEST
574
575 BEGIN_TEST(assembler.mtbuf)
576 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
577 if (!setup_cs(NULL, gfx))
578 continue;
579
580 Definition dst = bld.def(v1);
581 dst.setFixed(PhysReg(256 + 42));
582
583 Operand op_s4(bld.tmp(s4));
584 op_s4.setFixed(PhysReg(32));
585
586 Operand op_v1(bld.tmp(v1));
587 op_v1.setFixed(PhysReg(256 + 10));
588
589 Operand op_v2(bld.tmp(v2));
590 op_v2.setFixed(PhysReg(256 + 20));
591
592 Operand op_s1(bld.tmp(s1));
593 op_s1.setFixed(PhysReg(30));
594
595 unsigned dfmt = V_008F0C_BUF_DATA_FORMAT_32_32;
596 unsigned nfmt = V_008F0C_BUF_NUM_FORMAT_FLOAT;
597
598 //! llvm_version: #llvm_ver
599 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
600
601 /* Addressing */
602 //~gfx11>> tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9900000 1e082a80
603 //~gfx12>> tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; c420001e 1900402a 00000080
604 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 0,
605 false);
606
607 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80082a80
608 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] ; c420007c 1900402a 00000080
609 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
610 nfmt, 0, false);
611
612 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 42 format:[BUF_FMT_32_32_FLOAT] ; e9900000 aa082a80
613 if (gfx == GFX11)
614 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::c32(42),
615 dfmt, nfmt, 0, false);
616
617 //~gfx11! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9900000 1e482a0a
618 //~gfx12! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; c420001e 5900402a 0000000a
619 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, true);
620
621 //~gfx11! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen ; e9900000 1e882a0a
622 //~gfx12! tbuffer_load_format_x v42, v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen ; c420001e 9900402a 0000000a
623 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v1, op_s1, dfmt, nfmt, 0, false)
624 ->mtbuf()
625 .idxen = true;
626
627 //~gfx11! tbuffer_load_format_x v42, v[20:21], s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen offen ; e9900000 1ec82a14
628 //~gfx12! tbuffer_load_format_x v42, v[20:21], s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] idxen offen ; c420001e d900402a 00000014
629 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, op_v2, op_s1, dfmt, nfmt, 0, true)
630 ->mtbuf()
631 .idxen = true;
632
633 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offset:84 ; e9900054 1e082a80
634 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offset:84 ; c420001e 1900402a 00005480
635 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), op_s1, dfmt, nfmt, 84,
636 false);
637
638 /* Various flags */
639 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
640 ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
641 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
642 if (gfx >= GFX12) {
643 cache_coherent.gfx12.scope = gfx12_scope_device;
644 cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
645 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
646 } else {
647 cache_coherent.value = ac_glc;
648 cache_sys_coherent.value = ac_slc;
649 cache_non_temporal.value = ac_dlc;
650 }
651
652 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] glc ; e9904000 80082a80
653 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_DEV ; c420007c 1908402a 00000080
654 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
655 nfmt, 0, false)
656 ->mtbuf()
657 .cache = cache_coherent;
658
659 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] dlc ; e9902000 80082a80
660 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] th:TH_LOAD_NT ; c420007c 1910402a 00000080
661 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
662 nfmt, 0, false)
663 ->mtbuf()
664 .cache = cache_non_temporal;
665
666 //~gfx11! tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] slc ; e9901000 80082a80
667 //~gfx12! tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] scope:SCOPE_SYS ; c420007c 190c402a 00000080
668 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
669 nfmt, 0, false)
670 ->mtbuf()
671 .cache = cache_sys_coherent;
672
673 //; if llvm_ver >= 19 and variant == 'gfx11':
674 //; insert_pattern('(invalid instruction) ; e9900000')
675 //; insert_pattern('s_add_u32 s40, 0, s42 ; 80282a80')
676 //; elif llvm_ver >= 19 and variant == 'gfx12':
677 //; insert_pattern('(invalid instruction) ; c460007c')
678 //; insert_pattern('v_mul_hi_u32_u24_e32 v128, s42, v32 ; 1900402a')
679 //; insert_pattern('(invalid instruction) ; 00000080')
680 //; elif llvm_ver >= 16 and variant == 'gfx11':
681 //; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] ; e9900000 80282a80')
682 //; elif variant == 'gfx11':
683 //; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], 0 format:[BUF_FMT_32_32_FLOAT] tfe ; e9900000 80282a80')
684 //; elif variant == 'gfx12':
685 //; insert_pattern('tbuffer_load_format_x v42, off, s[32:35], null format:[BUF_FMT_32_32_FLOAT] ; c460007c 1900402a 00000080')
686 bld.mtbuf(aco_opcode::tbuffer_load_format_x, dst, op_s4, Operand(v1), Operand::zero(), dfmt,
687 nfmt, 0, false)
688 ->mtbuf()
689 .tfe = true;
690
691 /* Stores */
692 //~gfx11! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; e9920000 1e080a80
693 //~gfx12! tbuffer_store_format_x v10, off, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] ; c421001e 1900400a 00000080
694 bld.mtbuf(aco_opcode::tbuffer_store_format_x, op_s4, Operand(v1), op_s1, op_v1, dfmt, nfmt, 0,
695 false);
696
697 //~gfx11! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; e9928000 1e48140a
698 //~gfx12! tbuffer_store_format_xy v[20:21], v10, s[32:35], s30 format:[BUF_FMT_32_32_FLOAT] offen ; c421401e 59004014 0000000a
699 bld.mtbuf(aco_opcode::tbuffer_store_format_xy, op_s4, op_v1, op_s1, op_v2, dfmt, nfmt, 0,
700 true);
701
702 finish_assembler_test();
703 }
704 END_TEST
705
706 BEGIN_TEST(assembler.mimg)
707 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
708 if (!setup_cs(NULL, gfx))
709 continue;
710
711 Definition dst_v1 = bld.def(v1);
712 dst_v1.setFixed(PhysReg(256 + 42));
713
714 Definition dst_v4 = bld.def(v4);
715 dst_v4.setFixed(PhysReg(256 + 84));
716
717 Operand op_s4(bld.tmp(s4));
718 op_s4.setFixed(PhysReg(32));
719
720 Operand op_s8(bld.tmp(s8));
721 op_s8.setFixed(PhysReg(64));
722
723 Operand op_v1(bld.tmp(v1));
724 op_v1.setFixed(PhysReg(256 + 10));
725
726 Operand op_v2(bld.tmp(v2));
727 op_v2.setFixed(PhysReg(256 + 20));
728
729 Operand op_v4(bld.tmp(v4));
730 op_v4.setFixed(PhysReg(256 + 30));
731
732 //~gfx11>> image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D ; f06c0f00 2010540a
733 //~gfx12>> image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D ; e7c6c000 10008054 0000000a
734 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1);
735
736 //~gfx11! image_sample v[84:87], v[20:21], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; f06c0f04 20105414
737 //~gfx12! image_sample v[84:87], [v20, v21], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; e7c6c001 10008054 00001514
738 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v2)->mimg().dim =
739 ac_image_2d;
740
741 //~gfx11! image_sample v42, v10, s[64:71], s[32:35] dmask:0x1 dim:SQ_RSRC_IMG_1D ; f06c0100 20102a0a
742 //~gfx12! image_sample v42, v10, s[64:71], s[32:35] dmask:0x1 dim:SQ_RSRC_IMG_1D ; e446c000 1000802a 0000000a
743 bld.mimg(aco_opcode::image_sample, dst_v1, op_s8, op_s4, Operand(v1), op_v1)->mimg().dmask =
744 0x1;
745
746 /* Various flags */
747 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
748 ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
749 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
750 ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
751 if (gfx >= GFX12) {
752 cache_coherent.gfx12.scope = gfx12_scope_device;
753 cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
754 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
755 cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
756 } else {
757 cache_coherent.value = ac_glc;
758 cache_sys_coherent.value = ac_slc;
759 cache_non_temporal.value = ac_dlc;
760 cache_atomic_rtn.value = ac_glc;
761 }
762
763 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D dlc ; f06c2f00 2010540a
764 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_LOAD_NT ; e7c6c000 10108054 0000000a
765 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
766 cache_non_temporal;
767
768 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D glc ; f06c4f00 2010540a
769 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_DEV ; e7c6c000 10088054 0000000a
770 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
771 cache_coherent;
772
773 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D slc ; f06c1f00 2010540a
774 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D scope:SCOPE_SYS ; e7c6c000 100c8054 0000000a
775 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().cache =
776 cache_sys_coherent;
777
778 //~gfx11! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; f06c0f00 2030540a
779 //~gfx12! image_sample v[84:88], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; e7c6c008 10008054 0000000a
780 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().tfe =
781 true;
782
783 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; f06c0f00 2050540a
784 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; e7c6c000 10008154 0000000a
785 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().lwe =
786 true;
787
788 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D r128 ; f06c8f00 2010540a
789 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D r128 ; e7c6c010 10008054 0000000a
790 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().r128 =
791 true;
792
793 //~gfx11! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; f06d0f00 2010540a
794 //~gfx12! image_sample v[84:87], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D a16 ; e7c6c040 10008054 0000000a
795 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().a16 =
796 true;
797
798 //~gfx11! image_sample v[84:85], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D d16 ; f06e0f00 2010540a
799 //~gfx12! image_sample v[84:85], v10, s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_1D d16 ; e7c6c020 10008054 0000000a
800 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1)->mimg().d16 =
801 true;
802
803 /* NSA */
804 //~gfx11! image_sample v[84:87], [v10, v40], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; f06c0f05 2010540a 00000028
805 //~gfx12! image_sample v[84:87], [v10, v40], s[64:71], s[32:35] dmask:0xf dim:SQ_RSRC_IMG_2D ; e7c6c001 10008054 0000280a
806 bld.mimg(aco_opcode::image_sample, dst_v4, op_s8, op_s4, Operand(v1), op_v1,
807 Operand(bld.tmp(v1), PhysReg(256 + 40)))
808 ->mimg()
809 .dim = ac_image_2d;
810
811 //~gfx11! image_bvh_intersect_ray v[84:87], [v40, v42, v[44:46], v[48:50], v[52:54]], s[32:35] ; f0648f81 00085428 34302c2a
812 //~gfx12! image_bvh_intersect_ray v[84:87], [v40, v42, v[44:46], v[48:50], v[52:54]], s[32:35] ; d3c64010 34004054 302c2a28
813 aco_ptr<Instruction> instr{
814 create_instruction(aco_opcode::image_bvh_intersect_ray, Format::MIMG, 8, 1)};
815 instr->definitions[0] = dst_v4;
816 instr->operands[0] = op_s4;
817 instr->operands[1] = Operand(s4);
818 instr->operands[2] = Operand(v1);
819 instr->operands[3] = Operand(PhysReg(256 + 40), v1); /* node */
820 instr->operands[4] = Operand(PhysReg(256 + 42), v1); /* tmax */
821 instr->operands[5] = Operand(PhysReg(256 + 44), v3); /* origin */
822 instr->operands[6] = Operand(PhysReg(256 + 48), v3); /* dir */
823 instr->operands[7] = Operand(PhysReg(256 + 52), v3); /* inv dir */
824 instr->mimg().dmask = 0xf;
825 instr->mimg().unrm = true;
826 instr->mimg().r128 = true;
827 bld.insert(std::move(instr));
828
829 /* Stores */
830 //~gfx11! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; f0180f00 00101e0a
831 //~gfx12! image_store v[30:33], v10, s[64:71] dmask:0xf dim:SQ_RSRC_IMG_1D ; d3c18000 0000801e 0000000a
832 bld.mimg(aco_opcode::image_store, op_s8, Operand(s4), op_v4, op_v1);
833
834 //~gfx11! image_atomic_add v10, v[20:21], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D ; f0300104 00100a14
835 //~gfx12! image_atomic_add_uint v10, [v20, v21], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D ; d0430001 0000800a 00001514
836 bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
837 op_v1, op_v2, 0x1)
838 ->mimg()
839 .dim = ac_image_2d;
840
841 /* Atomic with return */
842 //~gfx11! image_atomic_add v10, v[20:21], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D glc ; f0304104 00100a14
843 //~gfx12! image_atomic_add_uint v10, [v20, v21], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D th:TH_ATOMIC_RETURN ; d0430001 0010800a 00001514
844 bld.mimg(aco_opcode::image_atomic_add, Definition(op_v1.physReg(), v1), op_s8, Operand(s4),
845 op_v1, op_v2, 0x1, false, false, false, cache_atomic_rtn)
846 ->mimg()
847 .dim = ac_image_2d;
848
849 //~gfx11! image_load v[84:87], v[20:21], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; f0000f04 00105414
850 //~gfx12! image_load v[84:87], [v20, v21], s[64:71] dmask:0xf dim:SQ_RSRC_IMG_2D ; d3c00001 00008054 00001514
851 bld.mimg(aco_opcode::image_load, dst_v4, op_s8, Operand(s4), Operand(v1), op_v2)->mimg().dim =
852 ac_image_2d;
853
854 //~gfx11! image_msaa_load v[84:87], v[30:33], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; f060011c 0010541e
855 //~gfx12! image_msaa_load v[84:87], [v30, v31, v32, v33], s[64:71] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY ; e4460007 00008054 21201f1e
856 bld.mimg(aco_opcode::image_msaa_load, dst_v4, op_s8, Operand(s4), Operand(v1), op_v4, 0x1)
857 ->mimg()
858 .dim = ac_image_2darraymsaa;
859
860 finish_assembler_test();
861 }
862 END_TEST
863
/*
 * assembler.flat
 *
 * Emits FLAT, GLOBAL and SCRATCH memory instructions for every gfx level that
 * filter_gfx_levels({GFX11, GFX12}) returns (GFX12 is only kept when
 * LLVM_VERSION_MAJOR >= 19) and ends each iteration with
 * finish_assembler_test().
 * Each builder call is preceded by per-variant comment lines that give the
 * expected disassembly text and encoding dwords.
 * NOTE(review): those pattern comments look like input to the test harness, so
 * they are presumably behavior rather than documentation; keep them in sync
 * with the builder calls and their order.
 */
864 BEGIN_TEST(assembler.flat)
865 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
/* Skip this gfx level when setup_cs() returns false. */
866 if (!setup_cs(NULL, gfx))
867 continue;
868
/* Pin every definition and operand to a fixed register so the expected text
 * is stable: PhysReg(256 + N) shows up as vN and PhysReg(N) as sN in the
 * patterns below (v42, v10, v[20:21], s32, s[64:65]). */
869 Definition dst_v1 = bld.def(v1);
870 dst_v1.setFixed(PhysReg(256 + 42));
871
872 Operand op_s1(bld.tmp(s1));
873 op_s1.setFixed(PhysReg(32));
874
875 Operand op_s2(bld.tmp(s2));
876 op_s2.setFixed(PhysReg(64));
877
878 Operand op_v1(bld.tmp(v1));
879 op_v1.setFixed(PhysReg(256 + 10));
880
881 Operand op_v2(bld.tmp(v2));
882 op_v2.setFixed(PhysReg(256 + 20));
883
884 /* Addressing */
/* An operand built from a register class only (Operand(s1) / Operand(v1)) is
 * the one the expected text shows as "off". */
885 //~gfx11>> flat_load_b32 v42, v[20:21] ; dc500000 2a7c0014
886 //~gfx12>> flat_load_b32 v42, v[20:21] ; ec05007c 0000002a 00000014
887 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1));
888
889 //~gfx11! global_load_b32 v42, v[20:21], off ; dc520000 2a7c0014
890 //~gfx12! global_load_b32 v42, v[20:21], off ; ee05007c 0000002a 00000014
891 bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1));
892
893 //~gfx11! global_load_b32 v42, v10, s[64:65] ; dc520000 2a40000a
894 //~gfx12! global_load_b32 v42, v10, s[64:65] ; ee050040 0000002a 0000000a
895 bld.global(aco_opcode::global_load_dword, dst_v1, op_v1, op_s2);
896
897 //~gfx11! scratch_load_b32 v42, v10, off ; dc510000 2afc000a
898 //~gfx12! scratch_load_b32 v42, v10, off ; ed05007c 0002002a 0000000a
899 bld.scratch(aco_opcode::scratch_load_dword, dst_v1, op_v1, Operand(s1));
900
901 //~gfx11! scratch_load_b32 v42, off, s32 ; dc510000 2a200080
902 //~gfx12! scratch_load_b32 v42, off, s32 ; ed050020 0000002a 00000000
903 bld.scratch(aco_opcode::scratch_load_dword, dst_v1, Operand(v1), op_s1);
904
905 //~gfx11! scratch_load_b32 v42, v10, s32 ; dc510000 2aa0000a
906 //~gfx12! scratch_load_b32 v42, v10, s32 ; ed050020 0002002a 0000000a
907 bld.scratch(aco_opcode::scratch_load_dword, dst_v1, op_v1, op_s1);
908
909 //~gfx11! scratch_load_b32 v42, off, off ; dc510000 2a7c0080
910 //~gfx12! scratch_load_b32 v42, off, off ; ed05007c 0000002a 00000000
911 bld.scratch(aco_opcode::scratch_load_dword, dst_v1, Operand(v1), Operand(s1));
912
/* The trailing integer argument is the immediate offset shown as
 * "offset:-42" / "offset:84" in the expected text. */
913 //~gfx11! global_load_b32 v42, v[20:21], off offset:-42 ; dc521fd6 2a7c0014
914 //~gfx12! global_load_b32 v42, v[20:21], off offset:-42 ; ee05007c 0000002a ffffd614
915 bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), -42);
916
917 //~gfx11! global_load_b32 v42, v[20:21], off offset:84 ; dc520054 2a7c0014
918 //~gfx12! global_load_b32 v42, v[20:21], off offset:84 ; ee05007c 0000002a 00005414
919 bld.global(aco_opcode::global_load_dword, dst_v1, op_v2, Operand(s1), 84);
920
921 /* Various flags */
/* Four cache-flag values, all starting at zero. On GFX12 and later the
 * gfx12.scope / gfx12.temporal_hint fields are filled in; on older levels the
 * ac_glc / ac_slc / ac_dlc bits are stored in .value instead. */
922 ac_hw_cache_flags cache_coherent = {{0, 0, 0, 0, 0}};
923 ac_hw_cache_flags cache_sys_coherent = {{0, 0, 0, 0, 0}};
924 ac_hw_cache_flags cache_non_temporal = {{0, 0, 0, 0, 0}};
925 ac_hw_cache_flags cache_atomic_rtn = {{0, 0, 0, 0, 0}};
926 if (gfx >= GFX12) {
927 cache_coherent.gfx12.scope = gfx12_scope_device;
928 cache_sys_coherent.gfx12.scope = gfx12_scope_memory;
929 cache_non_temporal.gfx12.temporal_hint = gfx12_load_non_temporal;
930 cache_atomic_rtn.gfx12.temporal_hint = gfx12_atomic_return;
931 } else {
932 cache_coherent.value = ac_glc;
933 cache_sys_coherent.value = ac_slc;
934 cache_non_temporal.value = ac_dlc;
935 cache_atomic_rtn.value = ac_glc;
936 }
937
/* The cache flags are assigned on the instruction returned by the builder. */
938 //~gfx11! flat_load_b32 v42, v[20:21] slc ; dc508000 2a7c0014
939 //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_SYS ; ec05007c 000c002a 00000014
940 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
941 cache_sys_coherent;
942
943 //~gfx11! flat_load_b32 v42, v[20:21] glc ; dc504000 2a7c0014
944 //~gfx12! flat_load_b32 v42, v[20:21] scope:SCOPE_DEV ; ec05007c 0008002a 00000014
945 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
946 cache_coherent;
947
948 //~gfx11! flat_load_b32 v42, v[20:21] dlc ; dc502000 2a7c0014
949 //~gfx12! flat_load_b32 v42, v[20:21] th:TH_LOAD_NT ; ec05007c 0010002a 00000014
950 bld.flat(aco_opcode::flat_load_dword, dst_v1, op_v2, Operand(s1))->flat().cache =
951 cache_non_temporal;
952
953 /* Stores */
954 //~gfx11! flat_store_b32 v[20:21], v10 ; dc680000 007c0a14
955 //~gfx12! flat_store_b32 v[20:21], v10 ; ec06807c 05000000 00000014
956 bld.flat(aco_opcode::flat_store_dword, op_v2, Operand(s1), op_v1);
957
958 /* Atomic with return */
959 //~gfx11! global_atomic_add_u32 v42, v[20:21], v10, off glc ; dcd64000 2a7c0a14
960 //~gfx12! global_atomic_add_u32 v42, v[20:21], v10, off th:TH_ATOMIC_RETURN ; ee0d407c 0510002a 00000014
961 bld.global(aco_opcode::global_atomic_add, dst_v1, op_v2, Operand(s1), op_v1)->global().cache =
962 cache_atomic_rtn;
963
964 finish_assembler_test();
965 }
966 END_TEST
967
/*
 * assembler.exp
 *
 * Emits export instructions for each gfx level returned by
 * filter_gfx_levels({GFX11, GFX12}). The expected text uses the mnemonic
 * "exp" for gfx11 and "export" for gfx12; the encoding dwords listed for the
 * two variants are the same in every case.
 * Cases: all four sources enabled (mask 0xf), a partial mask (0x5) where the
 * undefined operands are shown as "off", the "done" suffix, and row_en with
 * m0 passed as an extra operand.
 */
968 BEGIN_TEST(assembler.exp)
969 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
/* Skip this gfx level when setup_cs() returns false. */
970 if (!setup_cs(NULL, gfx))
971 continue;
972
/* op[i] is fixed to PhysReg(256 + i), shown as v0..v3 in the expected text. */
973 Operand op[4];
974 for (unsigned i = 0; i < 4; i++)
975 op[i] = Operand(PhysReg(256 + i), v1);
976
977 Operand op_m0(bld.tmp(s1));
978 op_m0.setFixed(m0);
979
/* The last two arguments (0xf, 3) go with "mrt3" and four enabled sources in
 * the expected text. */
980 //~gfx11>> exp mrt3 v1, v0, v3, v2 ; f800003f 02030001
981 //~gfx12>> export mrt3 v1, v0, v3, v2 ; f800003f 02030001
982 bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], 0xf, 3);
983
/* Mask 0x5 with register-class-only operands in slots 1 and 3: those slots
 * are printed as "off". */
984 //~gfx11! exp mrt3 v1, off, v0, off ; f8000035 80008001
985 //~gfx12! export mrt3 v1, off, v0, off ; f8000035 80008001
986 bld.exp(aco_opcode::exp, op[1], Operand(v1), op[0], Operand(v1), 0x5, 3);
987
/* The trailing (false, true) arguments go with the "done" suffix in the
 * expected text; presumably compressed = false, done = true -- TODO confirm
 * against the builder signature. */
988 //~gfx11! exp mrt3 v1, v0, v3, v2 done ; f800083f 02030001
989 //~gfx12! export mrt3 v1, v0, v3, v2 done ; f800083f 02030001
990 bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], 0xf, 3, false, true);
991
/* row_en is set on the returned instruction; m0 is supplied as an additional
 * operand for this case. */
992 //~gfx11! exp mrt3 v1, v0, v3, v2 row_en ; f800203f 02030001
993 //~gfx12! export mrt3 v1, v0, v3, v2 row_en ; f800203f 02030001
994 bld.exp(aco_opcode::exp, op[1], op[0], op[3], op[2], op_m0, 0xf, 3)->exp().row_en = true;
995
996 finish_assembler_test();
997 }
998 END_TEST
999
/*
 * assembler.vinterp
 *
 * Emits the in-register interpolation opcodes (v_interp_p10 / v_interp_p2
 * *_inreg variants) for each gfx level returned by
 * filter_gfx_levels({GFX11, GFX12}). The patterns carry no gfx variant
 * prefix, i.e. the same text and encoding is listed for both levels.
 * Covers the wait_exp value, per-source neg, the op_sel bits and clamp.
 */
1000 BEGIN_TEST(assembler.vinterp)
1001 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
/* Skip this gfx level when setup_cs() returns false. */
1002 if (!setup_cs(NULL, gfx))
1003 continue;
1004
/* Destination v42 and sources v10, v20, v30 (PhysReg(256 + N) is printed as
 * vN in the expected text). */
1005 Definition dst = bld.def(v1);
1006 dst.setFixed(PhysReg(256 + 42));
1007
1008 Operand op0(bld.tmp(v1));
1009 op0.setFixed(PhysReg(256 + 10));
1010
1011 Operand op1(bld.tmp(v1));
1012 op1.setFixed(PhysReg(256 + 20));
1013
1014 Operand op2(bld.tmp(v1));
1015 op2.setFixed(PhysReg(256 + 30));
1016
/* Print the LLVM major version into the test output. The "#llvm_ver" pattern
 * on the next line presumably captures it for the version-dependent snippet
 * further down -- confirm against the harness. */
1017 //! llvm_version: #llvm_ver
1018 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1019
/* No wait_exp argument is passed here: the expected text shows wait_exp:7. */
1020 //>> v_interp_p10_f32 v42, v10, v20, v30 wait_exp:7 ; cd00072a 047a290a
1021 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2);
1022
/* Trailing arguments (0, 6): no op_sel in the expected text, wait_exp:6. */
1023 //! v_interp_p10_f32 v42, v10, v20, v30 wait_exp:6 ; cd00062a 047a290a
1024 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6);
1025
/* wait_exp 0: the expected text contains "wait_exp:0" only for
 * llvm_ver >= 18; the encoding is the same in both branches. */
1026 //; if llvm_ver >= 18:
1027 //; insert_pattern('v_interp_p2_f32 v42, v10, v20, v30 wait_exp:0 ; cd01002a 047a290a')
1028 //; else:
1029 //; insert_pattern('v_interp_p2_f32 v42, v10, v20, v30 ; cd01002a 047a290a')
1030 bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, dst, op0, op1, op2, 0, 0);
1031
/* neg[0], neg[1] and neg[2] are set one at a time on the returned
 * instruction; the matching source is printed with a leading minus. */
1032 //! v_interp_p10_f32 v42, -v10, v20, v30 wait_exp:6 ; cd00062a 247a290a
1033 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1034 ->vinterp_inreg()
1035 .neg[0] = true;
1036
1037 //! v_interp_p10_f32 v42, v10, -v20, v30 wait_exp:6 ; cd00062a 447a290a
1038 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1039 ->vinterp_inreg()
1040 .neg[1] = true;
1041
1042 //! v_interp_p10_f32 v42, v10, v20, -v30 wait_exp:6 ; cd00062a 847a290a
1043 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1044 ->vinterp_inreg()
1045 .neg[2] = true;
1046
/* The values 0x1, 0x2, 0x4 and 0x8 each select one slot of op_sel:[...] in
 * the expected text. */
1047 //! v_interp_p10_f16_f32 v42, v10, v20, v30 op_sel:[1,0,0,0] wait_exp:6 ; cd020e2a 047a290a
1048 bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, dst, op0, op1, op2, 0x1, 6);
1049
1050 //! v_interp_p2_f16_f32 v42, v10, v20, v30 op_sel:[0,1,0,0] wait_exp:6 ; cd03162a 047a290a
1051 bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, dst, op0, op1, op2, 0x2, 6);
1052
1053 //! v_interp_p10_rtz_f16_f32 v42, v10, v20, v30 op_sel:[0,0,1,0] wait_exp:6 ; cd04262a 047a290a
1054 bld.vinterp_inreg(aco_opcode::v_interp_p10_rtz_f16_f32_inreg, dst, op0, op1, op2, 0x4, 6);
1055
1056 //! v_interp_p2_rtz_f16_f32 v42, v10, v20, v30 op_sel:[0,0,0,1] wait_exp:6 ; cd05462a 047a290a
1057 bld.vinterp_inreg(aco_opcode::v_interp_p2_rtz_f16_f32_inreg, dst, op0, op1, op2, 0x8, 6);
1058
1059 //! v_interp_p10_f32 v42, v10, v20, v30 clamp wait_exp:6 ; cd00862a 047a290a
1060 bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, dst, op0, op1, op2, 0, 6)
1061 ->vinterp_inreg()
1062 .clamp = true;
1063
1064 finish_assembler_test();
1065 }
1066 END_TEST
1067
/*
 * assembler.ldsdir
 *
 * Emits lds_direct_load and lds_param_load for each gfx level returned by
 * filter_gfx_levels({GFX11, GFX12}). The gfx12 expected text prints them as
 * ds_direct_load / ds_param_load with wait_va_vdst and wait_vm_vsrc fields,
 * the gfx11 text as lds_* with wait_vdst.
 * Covers several wait_vdst values, attribute and channel selection, and
 * wait_vsrc.
 */
1068 BEGIN_TEST(assembler.ldsdir)
1069 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
/* Skip this gfx level when setup_cs() returns false. */
1070 if (!setup_cs(NULL, gfx))
1071 continue;
1072
1073 Definition dst = bld.def(v1);
1074 dst.setFixed(PhysReg(256 + 42));
1075
/* The single source operand is fixed to m0; it does not appear in the
 * expected text. */
1076 Operand op(bld.tmp(s1));
1077 op.setFixed(m0);
1078
/* Print the LLVM major version into the test output. The "#llvm_ver" pattern
 * on the next line presumably captures it for the version-dependent snippets
 * further down -- confirm against the harness. */
1079 //! llvm_version: #llvm_ver
1080 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1081
1082 //~gfx11>> lds_direct_load v42 wait_vdst:15 ; ce1f002a
1083 //~gfx12>> ds_direct_load v42 wait_va_vdst:15 wait_vm_vsrc:1 ; ce9f002a
1084 bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 15;
1085
1086 //~gfx11! lds_direct_load v42 wait_vdst:6 ; ce16002a
1087 //~gfx12! ds_direct_load v42 wait_va_vdst:6 wait_vm_vsrc:1 ; ce96002a
1088 bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 6;
1089
/* wait_vdst 0 on gfx11: the expected text contains "wait_vdst:0" only for
 * llvm_ver >= 18; the encoding is the same in both branches. */
1090 //; if llvm_ver >= 18 and variant == 'gfx11':
1091 //; insert_pattern('lds_direct_load v42 wait_vdst:0 ; ce10002a')
1092 //; elif variant == 'gfx11':
1093 //; insert_pattern('lds_direct_load v42 ; ce10002a')
1094 //~gfx12! ds_direct_load v42 wait_va_vdst:0 wait_vm_vsrc:1 ; ce90002a
1095 bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vdst = 0;
1096
/* The trailing arguments (56, 0) appear as attr56.x in the expected text;
 * (34, 1) and (12, 2) further down appear as attr34.y and attr12.z. */
1097 //~gfx11! lds_param_load v42, attr56.x wait_vdst:8 ; ce08e02a
1098 //~gfx12! ds_param_load v42, attr56.x wait_va_vdst:8 wait_vm_vsrc:1 ; ce88e02a
1099 bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0)->ldsdir().wait_vdst = 8;
1100
1101 //; if llvm_ver >= 18 and variant == 'gfx11':
1102 //; insert_pattern('lds_param_load v42, attr56.x wait_vdst:0 ; ce00e02a')
1103 //; elif variant == 'gfx11':
1104 //; insert_pattern('lds_param_load v42, attr56.x ; ce00e02a')
1105 //~gfx12! ds_param_load v42, attr56.x wait_va_vdst:0 wait_vm_vsrc:1 ; ce80e02a
1106 bld.ldsdir(aco_opcode::lds_param_load, dst, op, 56, 0)->ldsdir().wait_vdst = 0;
1107
1108 //~gfx11! lds_param_load v42, attr34.y wait_vdst:8 ; ce08892a
1109 //~gfx12! ds_param_load v42, attr34.y wait_va_vdst:8 wait_vm_vsrc:1 ; ce88892a
1110 bld.ldsdir(aco_opcode::lds_param_load, dst, op, 34, 1)->ldsdir().wait_vdst = 8;
1111
1112 //~gfx11! lds_param_load v42, attr12.z wait_vdst:8 ; ce08322a
1113 //~gfx12! ds_param_load v42, attr12.z wait_va_vdst:8 wait_vm_vsrc:1 ; ce88322a
1114 bld.ldsdir(aco_opcode::lds_param_load, dst, op, 12, 2)->ldsdir().wait_vdst = 8;
1115
/* Only wait_vsrc is changed here. The gfx12 text shows wait_vm_vsrc:0 (it is
 * 1 in every earlier case) while wait_va_vdst stays at 15; the gfx11 text and
 * encoding are the same as in the first case of this test. */
1116 //~gfx11>> lds_direct_load v42 wait_vdst:15 ; ce1f002a
1117 //~gfx12>> ds_direct_load v42 wait_va_vdst:15 wait_vm_vsrc:0 ; ce1f002a
1118 bld.ldsdir(aco_opcode::lds_direct_load, dst, op)->ldsdir().wait_vsrc = 0;
1119
1120 finish_assembler_test();
1121 }
1122 END_TEST
1123
/*
 * assembler.vop12c_v128
 *
 * Emits 16-bit VOP1 / VOP2 / VOPC instructions (v_mul_f16, v_rcp_f16,
 * v_cmp_eq_f16, v_fmaak_f16, v_fmamk_f16), plain and with DPP16 / DPP8, for
 * each gfx level returned by filter_gfx_levels({GFX11, GFX12}).
 * The first case uses only v0, v1, v2 and is expected as v_mul_f16_e32; every
 * later case has v128, v129 or v130 as destination or source and is expected
 * with an _e64 mnemonic (or as v_fma_f16 with a literal).
 * NOTE(review): presumably the e32 forms of these 16-bit opcodes cannot
 * address v128 and above -- confirm against the ISA documentation.
 */
1124 BEGIN_TEST(assembler.vop12c_v128)
1125 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
/* Skip this gfx level when setup_cs() returns false. */
1126 if (!setup_cs(NULL, gfx))
1127 continue;
1128
/* Low registers (v0, v1, v2) and their high counterparts (v128, v129, v130);
 * PhysReg(256 + N) is printed as vN in the expected text. */
1129 Definition dst_v0 = bld.def(v1);
1130 dst_v0.setFixed(PhysReg(256));
1131
1132 Definition dst_v128 = bld.def(v1);
1133 dst_v128.setFixed(PhysReg(256 + 128));
1134
1135 Operand op_v1(bld.tmp(v1));
1136 op_v1.setFixed(PhysReg(256 + 1));
1137
1138 Operand op_v2(bld.tmp(v1));
1139 op_v2.setFixed(PhysReg(256 + 2));
1140
1141 Operand op_v129(bld.tmp(v1));
1142 op_v129.setFixed(PhysReg(256 + 129));
1143
1144 Operand op_v130(bld.tmp(v1));
1145 op_v130.setFixed(PhysReg(256 + 130));
1146
/* Print the LLVM major version into the test output. The "#llvm_ver" pattern
 * on the next line presumably captures it for the version-dependent snippet
 * below -- confirm against the harness. */
1147 //! llvm_version: #llvm_ver
1148 fprintf(output, "llvm_version: %u\n", LLVM_VERSION_MAJOR);
1149
/* For llvm_ver == 16 the expected line additionally contains an
 * "Error: VGPR_32_Lo128: unknown register 128" string; the encoding dword is
 * the same in both branches. */
1150 //>> BB0:
1151 //; if llvm_ver == 16:
1152 //; insert_pattern('v_mul_f16_e32 v0, v1, v2 ; Error: VGPR_32_Lo128: unknown register 128 ; 6a000501')
1153 //; else:
1154 //; insert_pattern('v_mul_f16_e32 v0, v1, v2 ; 6a000501')
1155 bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v2);
1156
/* High register as destination, first source and second source in turn. */
1157 //! v_mul_f16_e64 v128, v1, v2 ; d5350080 00020501
1158 bld.vop2(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
1159
1160 //! v_mul_f16_e64 v0, v129, v2 ; d5350000 00020581
1161 bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
1162
1163 //! v_mul_f16_e64 v0, v1, v130 ; d5350000 00030501
1164 bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
1165
1166 //! v_rcp_f16_e64 v128, v1 ; d5d40080 00000101
1167 bld.vop1(aco_opcode::v_rcp_f16, dst_v128, op_v1);
1168
1169 //! v_cmp_eq_f16_e64 vcc, v129, v2 ; d402006a 00020581
1170 bld.vopc(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2);
1171
/* Same three register placements with DPP16; dpp_row_rr(1) is printed as
 * row_ror:1. */
1172 //! v_mul_f16_e64_dpp v128, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 000204fa ff0d2101
1173 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1));
1174
1175 //! v_mul_f16_e64_dpp v0, v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350000 000204fa ff0d2181
1176 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2, dpp_row_rr(1));
1177
1178 //! v_mul_f16_e64_dpp v0, v1, v130 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350000 000304fa ff0d2101
1179 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130, dpp_row_rr(1));
1180
/* Same three register placements with DPP8. */
1181 //! v_mul_f16_e64_dpp v128, v1, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350080 000204ea 00000001
1182 bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
1183
1184 //! v_mul_f16_e64_dpp v0, v129, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350000 000204ea 00000081
1185 bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
1186
1187 //! v_mul_f16_e64_dpp v0, v1, v130 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5350000 000304ea 00000001
1188 bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
1189
/* v_fmaak_f16 / v_fmamk_f16 with a v128 destination are expected to be
 * printed as v_fma_f16, with the literal 0x60 (96) as the third source for
 * fmaak and as the second source for fmamk. */
1190 //! v_fma_f16 v128, v1, v2, 0x60 ; d6480080 03fe0501 00000060
1191 bld.vop2(aco_opcode::v_fmaak_f16, dst_v128, op_v1, op_v2, Operand::literal32(96));
1192
1193 //! v_fma_f16 v128, v1, 0x60, v2 ; d6480080 0409ff01 00000060
1194 bld.vop2(aco_opcode::v_fmamk_f16, dst_v128, op_v1, op_v2, Operand::literal32(96));
1195
/* neg[0] / abs[0] set on the DPP16 instruction: printed as -vN and |vN| on
 * the first source. */
1196 //! v_rcp_f16_e64_dpp v128, -v1 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5d40080 200000fa ff1d2101
1197 bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().neg[0] = true;
1198
1199 //! v_rcp_f16_e64_dpp v128, |v1| row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5d40180 000000fa ff2d2101
1200 bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, dpp_row_rr(1))->dpp16().abs[0] = true;
1201
1202 //! v_mul_f16_e64_dpp v128, -v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350080 200204fa ff1d2101
1203 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().neg[0] =
1204 true;
1205
1206 //! v_mul_f16_e64_dpp v128, |v1|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5350180 000204fa ff2d2101
1207 bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1))->dpp16().abs[0] =
1208 true;
1209
1210 //! v_cmp_eq_f16_e64_dpp vcc, -v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402006a 200204fa ff1d2181
1211 bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))
1212 ->dpp16()
1213 .neg[0] = true;
1214
1215 //! v_cmp_eq_f16_e64_dpp vcc, |v129|, v2 row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d402016a 000204fa ff2d2181
1216 bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, dpp_row_rr(1))
1217 ->dpp16()
1218 .abs[0] = true;
1219
1220 finish_assembler_test();
1221 }
1222 END_TEST
1223
/*
 * assembler.vop3_dpp
 *
 * Emits VOP3 / VOP3P instructions (v_fma_f32, v_fma_mix_f32) and the e64
 * forms of VOP2 / VOP1 / VOPC instructions (v_add_f32, v_sqrt_f32,
 * v_cmp_lt_f32) combined with DPP16 and DPP8, for each gfx level returned by
 * filter_gfx_levels({GFX11, GFX12}). The patterns carry no gfx variant
 * prefix, i.e. the same text and encoding is listed for both levels.
 * Covers the clamp, abs, neg and omod modifiers and an SGPR-pair compare
 * destination.
 */
1224 BEGIN_TEST(assembler.vop3_dpp)
1225 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
/* Skip this gfx level when setup_cs() returns false. */
1226 if (!setup_cs(NULL, gfx))
1227 continue;
1228
1229 Definition dst_v0 = bld.def(v1);
1230 dst_v0.setFixed(PhysReg(256));
1231
/* Compare destination fixed to PhysReg(4) with class s2, printed as s[4:5]
 * (i.e. not vcc). */
1232 Definition dst_non_vcc = bld.def(s2);
1233 dst_non_vcc.setFixed(PhysReg(4));
1234
1235 Operand op_v1(bld.tmp(v1));
1236 op_v1.setFixed(PhysReg(256 + 1));
1237
1238 Operand op_v2(bld.tmp(v1));
1239 op_v2.setFixed(PhysReg(256 + 2));
1240
1241 Operand op_s1(bld.tmp(s1));
1242 op_s1.setFixed(PhysReg(1));
1243
/* dpp_row_rr(1) is printed as row_ror:1 in the DPP16 cases below. */
1244 //>> BB0:
1245 //! v_fma_f32_e64_dpp v0, v1, v2, s1 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d6138000 000604fa ff0d2101
1246 bld.vop3_dpp(aco_opcode::v_fma_f32, dst_v0, op_v1, op_v2, op_s1, dpp_row_rr(1))->valu().clamp =
1247 true;
1248
/* Arguments (0x1, 0x5) go with op_sel:[1,0,0] op_sel_hi:[1,0,1] in the
 * expected text; abs = 0x7 marks all three sources with |..|. */
1249 //! v_fma_mix_f32_e64_dpp v0, |v1|, |v2|, |s1| op_sel:[1,0,0] op_sel_hi:[1,0,1] row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; cc204f00 080604fa ffad2101
1250 bld.vop3p_dpp(aco_opcode::v_fma_mix_f32, dst_v0, op_v1, op_v2, op_s1, 0x1, 0x5, dpp_row_rr(1))
1251 ->valu()
1252 .abs = 0x7;
1253
/* neg = 0x7 negates all three sources; neg = 0x3 below only the first two. */
1254 //! v_fma_f32_e64_dpp v0, -v1, -v2, -s1 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d6130000 e00604ea 00000001
1255 bld.vop3_dpp8(aco_opcode::v_fma_f32, dst_v0, op_v1, op_v2, op_s1)->valu().neg = 0x7;
1256
1257 //! v_fma_mix_f32_e64_dpp v0, -v1, -v2, s1 op_sel_hi:[1,1,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; cc204000 780604ea 00000001
1258 bld.vop3p_dpp8(aco_opcode::v_fma_mix_f32, dst_v0, op_v1, op_v2, op_s1, 0x0, 0x7)->valu().neg =
1259 0x3;
1260
1261 //! v_add_f32_e64_dpp v0, v1, v2 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5038000 000204fa ff0d2101
1262 bld.vop2_e64_dpp(aco_opcode::v_add_f32, dst_v0, op_v1, op_v2, dpp_row_rr(1))->valu().clamp =
1263 true;
1264
1265 //! v_sqrt_f32_e64_dpp v0, v1 clamp row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d5b38000 000000fa ff0d2101
1266 bld.vop1_e64_dpp(aco_opcode::v_sqrt_f32, dst_v0, op_v1, dpp_row_rr(1))->valu().clamp = true;
1267
1268 //! v_cmp_lt_f32_e64_dpp s[4:5], |v1|, |v2| row_ror:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 fi:1 ; d4110304 000204fa ffad2101
1269 bld.vopc_e64_dpp(aco_opcode::v_cmp_lt_f32, dst_non_vcc, op_v1, op_v2, dpp_row_rr(1))
1270 ->valu()
1271 .abs = 0x3;
1272
/* omod = 2 is printed as "mul:4" in the expected text. */
1273 //! v_add_f32_e64_dpp v0, v1, v2 mul:4 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5030000 100204ea 00000001
1274 bld.vop2_e64_dpp8(aco_opcode::v_add_f32, dst_v0, op_v1, op_v2)->valu().omod = 2;
1275
1276 //! v_sqrt_f32_e64_dpp v0, v1 clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d5b38000 000000ea 00000001
1277 bld.vop1_e64_dpp8(aco_opcode::v_sqrt_f32, dst_v0, op_v1)->valu().clamp = true;
1278
1279 //! v_cmp_lt_f32_e64_dpp s[4:5], |v1|, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; d4110104 000204ea 00000001
1280 bld.vopc_e64_dpp8(aco_opcode::v_cmp_lt_f32, dst_non_vcc, op_v1, op_v2)->valu().abs = 0x1;
1281
1282 finish_assembler_test();
1283 }
1284 END_TEST
1285
/*
 * assembler.vopd
 *
 * Emits dual-issue (v_dual_*) instruction pairs through bld.vopd() for each
 * gfx level returned by filter_gfx_levels({GFX11, GFX12}). The patterns carry
 * no gfx variant prefix, i.e. the same text and encoding is listed for both
 * levels.
 * Cases: mov/mov with VGPR sources, a 32-bit constant paired with an SGPR
 * source (both orders), mul/mov, fmac/mov, mov/and and cndmask/cndmask.
 */
1286 BEGIN_TEST(assembler.vopd)
1287 for (amd_gfx_level gfx : filter_gfx_levels({GFX11, GFX12})) {
/* Skip this gfx level when setup_cs() returns false. */
1288 if (!setup_cs(NULL, gfx))
1289 continue;
1290
/* NOTE(review): presumably required because VOPD is only available in wave32
 * mode -- confirm against the ISA documentation. */
1291 program->wave_size = 32;
1292
1293 Definition dst_v0 = bld.def(v1);
1294 dst_v0.setFixed(PhysReg(256));
1295
1296 Definition dst_v1 = bld.def(v1);
1297 dst_v1.setFixed(PhysReg(256 + 1));
1298
1299 Operand op_v0(bld.tmp(v1));
1300 op_v0.setFixed(PhysReg(256 + 0));
1301
1302 Operand op_v1(bld.tmp(v1));
1303 op_v1.setFixed(PhysReg(256 + 1));
1304
1305 Operand op_v2(bld.tmp(v1));
1306 op_v2.setFixed(PhysReg(256 + 2));
1307
1308 Operand op_v3(bld.tmp(v1));
1309 op_v3.setFixed(PhysReg(256 + 3));
1310
1311 Operand op_s0(bld.tmp(s1));
1312 op_s0.setFixed(PhysReg(0));
1313
1314 Operand op_vcc(bld.tmp(s1));
1315 op_vcc.setFixed(vcc);
1316
/* Argument layout of each call: first opcode, both definitions, the operands
 * of the first half followed by those of the second half, second opcode. */
1317 //>> BB0:
1318 //! v_dual_mov_b32 v0, v0 :: v_dual_mov_b32 v1, v1 ; ca100100 00000101
1319 bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1,
1320 aco_opcode::v_dual_mov_b32);
1321
/* Operand::c32(96) is expected as the literal 0x60 with a trailing literal
 * dword (00000060) in the encoding. */
1322 //! v_dual_mov_b32 v0, 0x60 :: v_dual_mov_b32 v1, s0 ; ca1000ff 00000000 00000060
1323 bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, Operand::c32(96), op_s0,
1324 aco_opcode::v_dual_mov_b32);
1325
1326 //! v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0x60 ; ca100000 000000ff 00000060
1327 bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_s0, Operand::c32(96),
1328 aco_opcode::v_dual_mov_b32);
1329
1330 //! v_dual_mul_f32 v0, v0, v1 :: v_dual_mov_b32 v1, v2 ; c8d00300 00000102
1331 bld.vopd(aco_opcode::v_dual_mul_f32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
1332 aco_opcode::v_dual_mov_b32);
1333
/* fmac: op_v0 is passed as a third operand of the first half but the expected
 * text only lists v1, v2; it is the same register as the destination v0 --
 * presumably the accumulator input. */
1334 //! v_dual_fmac_f32 v0, v1, v2 :: v_dual_mov_b32 v1, v3 ; c8100501 00000103
1335 bld.vopd(aco_opcode::v_dual_fmac_f32, dst_v0, dst_v1, op_v1, op_v2, op_v0, op_v3,
1336 aco_opcode::v_dual_mov_b32);
1337
1338 //! v_dual_mov_b32 v0, v0 :: v_dual_and_b32 v1, v1, v2 ; ca240100 00000501
1339 bld.vopd(aco_opcode::v_dual_mov_b32, dst_v0, dst_v1, op_v0, op_v1, op_v2,
1340 aco_opcode::v_dual_and_b32);
1341
/* cndmask: vcc is passed as the third operand of each half and does not
 * appear in the expected text. */
1342 //! v_dual_cndmask_b32 v0, v0, v1 :: v_dual_cndmask_b32 v1, v2, v3 ; ca520300 00000702
1343 bld.vopd(aco_opcode::v_dual_cndmask_b32, dst_v0, dst_v1, op_v0, op_v1, op_vcc, op_v2, op_v3,
1344 op_vcc, aco_opcode::v_dual_cndmask_b32);
1345
1346 finish_assembler_test();
1347 }
1348 END_TEST
1349
/*
 * assembler.pseudo_scalar_trans
 *
 * Emits a single v_s_sqrt_f32 through bld.vop3() with an SGPR definition
 * (PhysReg(5), printed as s5) and an SGPR source (PhysReg(1), printed as s1)
 * and lists the expected text and encoding.
 */
1350 BEGIN_TEST(assembler.pseudo_scalar_trans)
/* Return early when LLVM is older than 19 or when setup_cs() fails for GFX12.
 * The version check comes first, so setup_cs() is not called on older LLVM. */
1351 if (LLVM_VERSION_MAJOR < 19 || !setup_cs(NULL, GFX12))
1352 return;
1353
1354 //>> v_s_sqrt_f32 s5, s1 ; d6880005 00000001
1355 bld.vop3(aco_opcode::v_s_sqrt_f32, Definition(PhysReg(5), s1), Operand(PhysReg(1), s1));
1356
1357 finish_assembler_test();
1358 END_TEST
1359
/*
 * assembler.vintrp_high_16bits
 *
 * For every gfx level value from GFX8 through GFX10, emits v_interp_p1ll_f16
 * and a p2 f16 interpolation with the trailing "true" argument, which goes
 * with the "high" suffix in the expected text.
 * On GFX8 the opcode v_interp_p2_legacy_f16 is used instead of
 * v_interp_p2_f16; the expected mnemonic is v_interp_p2_f16 for all three
 * variants while the encodings differ.
 */
1360 BEGIN_TEST(assembler.vintrp_high_16bits)
1361 for (unsigned i = GFX8; i <= GFX10; i++) {
/* Skip this gfx level when setup_cs() returns false. */
1362 if (!setup_cs(NULL, (amd_gfx_level)i))
1363 continue;
1364
1365 Definition dst_v0 = bld.def(v1);
1366 dst_v0.setFixed(PhysReg(256));
1367
1368 Definition dst_v1 = bld.def(v1);
1369 dst_v1.setFixed(PhysReg(256 + 1));
1370
1371 Operand op_v0(bld.tmp(v1));
1372 op_v0.setFixed(PhysReg(256 + 0));
1373
1374 Operand op_v1(bld.tmp(v1));
1375 op_v1.setFixed(PhysReg(256 + 1));
1376
1377 Operand op_v2(bld.tmp(v1));
1378 op_v2.setFixed(PhysReg(256 + 2));
1379
/* m0 is passed to both interpolation calls; it does not appear in the
 * expected text. */
1380 Operand op_m0(bld.tmp(s1));
1381 op_m0.setFixed(m0);
1382
/* GFX8 uses the legacy p2 opcode. The expected mnemonic is unchanged, but the
 * first encoding dword differs from gfx9 (d2760001 vs d2770001). */
1383 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
1384
1385 if (bld.program->gfx_level == GFX8)
1386 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
1387
/* The trailing arguments (4, 1, true) appear as "attr4.y high" in the
 * expected text. */
1388 //! BB0:
1389 //~gfx8! v_interp_p1ll_f16 v0, v1, attr4.y high ; d2740000 00020344
1390 //~gfx9! v_interp_p1ll_f16 v0, v1, attr4.y high ; d2740000 00020344
1391 //~gfx10! v_interp_p1ll_f16 v0, v1, attr4.y high ; d7420000 00020344
1392 bld.vintrp(aco_opcode::v_interp_p1ll_f16, dst_v0, op_v1, op_m0, 4, 1, true);
1393
/* The p2 step takes the p1 result register (v0) as an extra source, printed
 * last in the operand list. */
1394 //~gfx8! v_interp_p2_f16 v1, v2, attr4.y, v0 high ; d2760001 04020544
1395 //~gfx9! v_interp_p2_f16 v1, v2, attr4.y, v0 high ; d2770001 04020544
1396 //~gfx10! v_interp_p2_f16 v1, v2, attr4.y, v0 high ; d75a0001 04020544
1397 bld.vintrp(interp_p2_op, dst_v1, op_v2, op_m0, op_v0, 4, 1, true);
1398
1399 finish_assembler_test();
1400 }
1401 END_TEST
1402