1 /*
2 * Copyright © 2022 Collabora Ltd.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "mme_runner.h"
6 #include "mme_tu104_sim.h"
7
8 #include "nvk_clc597.h"
9
10 class mme_tu104_sim_test : public ::testing::Test, public mme_hw_runner {
11 public:
12 mme_tu104_sim_test();
13 ~mme_tu104_sim_test();
14
15 void SetUp();
16 void test_macro(const mme_builder *b,
17 const std::vector<uint32_t>& macro,
18 const std::vector<uint32_t>& params);
19 };
20
mme_tu104_sim_test()21 mme_tu104_sim_test::mme_tu104_sim_test() :
22 ::testing::Test(),
23 mme_hw_runner()
24 { }
25
~mme_tu104_sim_test()26 mme_tu104_sim_test::~mme_tu104_sim_test()
27 { }
28
29 void
SetUp()30 mme_tu104_sim_test::SetUp()
31 {
32 ASSERT_TRUE(set_up_hw(TURING_A, UINT16_MAX));
33 }
34
35 void
test_macro(const mme_builder * b,const std::vector<uint32_t> & macro,const std::vector<uint32_t> & params)36 mme_tu104_sim_test::test_macro(const mme_builder *b,
37 const std::vector<uint32_t>& macro,
38 const std::vector<uint32_t>& params)
39 {
40 const uint32_t data_dwords = DATA_BO_SIZE / sizeof(uint32_t);
41
42 std::vector<mme_tu104_inst> insts(macro.size() / 3);
43 mme_tu104_decode(&insts[0], ¯o[0], macro.size() / 3);
44
45 /* First, make a copy of the data and simulate the macro */
46 std::vector<uint32_t> sim_data(data, data + (DATA_BO_SIZE / 4));
47 mme_tu104_sim_mem sim_mem = {
48 .addr = data_addr,
49 .data = &sim_data[0],
50 .size = DATA_BO_SIZE,
51 };
52 mme_tu104_sim(insts.size(), &insts[0],
53 params.size(), ¶ms[0],
54 1, &sim_mem);
55
56 /* Now run the macro on the GPU */
57 push_macro(0, macro);
58
59 P_1INC(p, NVC597, CALL_MME_MACRO(0));
60 if (params.empty()) {
61 P_NVC597_CALL_MME_MACRO(p, 0, 0);
62 } else {
63 P_INLINE_ARRAY(p, ¶ms[0], params.size());
64 }
65
66 submit_push();
67
68 /* Check the results */
69 for (uint32_t i = 0; i < data_dwords; i++)
70 ASSERT_EQ(data[i], sim_data[i]);
71 }
72
73 static mme_tu104_reg
mme_value_as_reg(mme_value val)74 mme_value_as_reg(mme_value val)
75 {
76 assert(val.type == MME_VALUE_TYPE_REG);
77 return (mme_tu104_reg)(MME_TU104_REG_R0 + val.reg);
78 }
79
/* Smallest possible macro: store one immediate at the start of the data
 * buffer and check that simulator and hardware agree.
 */
TEST_F(mme_tu104_sim_test, sanity)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   const uint32_t canary = 0xc0ffee01;
   mme_store_imm_addr(&b, data_addr, mme_imm(canary));

   const auto macro = mme_builder_finish_vec(&b);

   /* This macro takes no parameters */
   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
94
/* A single hand-written instruction reads LOAD0 and LOAD1 on both ALUs and
 * on both outputs.  The two destination registers and the two shadow
 * scratch states are then stored so they can be compared.
 */
TEST_F(mme_tu104_sim_test, multi_param)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   const uint32_t scratch_a = NVC597_SET_MME_SHADOW_SCRATCH(12);
   const uint32_t scratch_b = NVC597_SET_MME_SHADOW_SCRATCH(35);

   mme_value reg_a = mme_alloc_reg(&b);
   mme_value reg_b = mme_alloc_reg(&b);

   mme_tu104_asm(&b, inst) {
      /* Note: ALU 0 reads LOAD1 while ALU 1 reads LOAD0 */
      inst.alu[0].dst = mme_value_as_reg(reg_a);
      inst.alu[0].src[0] = MME_TU104_REG_LOAD1;
      inst.alu[1].dst = mme_value_as_reg(reg_b);
      inst.alu[1].src[0] = MME_TU104_REG_LOAD0;

      inst.imm[0] = (1<<12) | (scratch_a >> 2);
      inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
      inst.out[0].emit = MME_TU104_OUT_OP_LOAD0;

      inst.imm[1] = (1<<12) | (scratch_b >> 2);
      inst.out[1].mthd = MME_TU104_OUT_OP_IMM1;
      inst.out[1].emit = MME_TU104_OUT_OP_LOAD1;
   }

   mme_value state_a = mme_state(&b, scratch_a);
   mme_value state_b = mme_state(&b, scratch_b);

   const mme_value results[] = { reg_a, reg_b, state_a, state_b };
   for (uint32_t r = 0; r < ARRAY_SIZE(results); r++)
      mme_store_imm_addr(&b, data_addr + r * 4, results[r]);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 2581, 3048 };
   test_macro(&b, macro, params);
}
132
/* Places a LOAD0 read inside an instruction whose predicate mode is TTTT,
 * between two ordinary loads, and runs it with four parameter sets in which
 * the first two parameters are each either zero or non-zero.
 */
TEST_F(mme_tu104_sim_test, pred_param)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value maybe_loaded = mme_mov(&b, mme_imm(240));

   mme_tu104_asm(&b, inst) {
      inst.pred_mode = MME_TU104_PRED_TTTT;
      inst.alu[0].dst = mme_value_as_reg(maybe_loaded);
      inst.alu[0].src[0] = MME_TU104_REG_LOAD0;
   }

   mme_value last = mme_load(&b);

   const mme_value results[] = { first, maybe_loaded, last };
   for (uint32_t r = 0; r < ARRAY_SIZE(results); r++)
      mme_store_imm_addr(&b, data_addr + r * 4, results[r]);

   const auto macro = mme_builder_finish_vec(&b);

   for (uint32_t combo = 0; combo < 4; combo++) {
      reset_push();

      const std::vector<uint32_t> params = {
         (combo & 1) * 2043,
         (combo & 2) * 523,
         2581,
         3048,
      };

      test_macro(&b, macro, params);
   }
}
167
/* Exercises the IMM0 output op: in each of two SET_REPORT_SEMAPHORE_A
 * sequences the third emitted dword comes from imm[0] via output 0.
 */
TEST_F(mme_tu104_sim_test, out_imm0)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Emits the address at the given offset, then imm_val through a raw
    * instruction, then the constant 0x10000000.
    */
   const auto emit_sequence = [&](uint32_t offset, uint16_t imm_val) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[0] = imm_val;
         inst.out[0].emit = MME_TU104_OUT_OP_IMM0;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_sequence(0, 0x1234);
   emit_sequence(4, 0x8765);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
196
/* Exercises the IMM1 output op: in each of two SET_REPORT_SEMAPHORE_A
 * sequences the third emitted dword comes from imm[1] via output 0.
 */
TEST_F(mme_tu104_sim_test, out_imm1)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Emits the address at the given offset, then imm_val through a raw
    * instruction, then the constant 0x10000000.
    */
   const auto emit_sequence = [&](uint32_t offset, uint16_t imm_val) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[1] = imm_val;
         inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_sequence(0, 0x1234);
   emit_sequence(4, 0x8765);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
225
/* Exercises the IMMHIGH0 output op with imm[0], once through output 0 and
 * once through output 1.
 */
TEST_F(mme_tu104_sim_test, out_immhigh0)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Emits the address at the given offset, then a raw instruction that
    * uses IMMHIGH0 on the chosen output, then the constant 0x10000000.
    */
   const auto emit_sequence = [&](uint32_t offset, uint16_t imm_val,
                                  unsigned out_idx) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[0] = imm_val;
         inst.out[out_idx].emit = MME_TU104_OUT_OP_IMMHIGH0;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_sequence(0, 0x1234, 0);
   emit_sequence(4, 0x8765, 1);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
254
/* Exercises the IMMHIGH1 output op with imm[1], once through output 0 and
 * once through output 1.
 */
TEST_F(mme_tu104_sim_test, out_immhigh1)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Emits the address at the given offset, then a raw instruction that
    * uses IMMHIGH1 on the chosen output, then the constant 0x10000000.
    */
   const auto emit_sequence = [&](uint32_t offset, uint16_t imm_val,
                                  unsigned out_idx) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[1] = imm_val;
         inst.out[out_idx].emit = MME_TU104_OUT_OP_IMMHIGH1;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_sequence(0, 0x1234, 0);
   emit_sequence(4, 0x8765, 1);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
283
/* Exercises the IMM32 output op with the same imm[0]/imm[1] pair, once
 * through output 0 and once through output 1.
 */
TEST_F(mme_tu104_sim_test, out_imm32)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Emits the address at the given offset, then a raw instruction that
    * uses IMM32 on the chosen output, then the constant 0x10000000.
    */
   const auto emit_sequence = [&](uint32_t offset, unsigned out_idx) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[0] = 0x1234;
         inst.imm[1] = 0x7654;
         inst.out[out_idx].emit = MME_TU104_OUT_OP_IMM32;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_sequence(0, 0);
   emit_sequence(4, 1);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
314
/* Uses the IMM32 pseudo-register as an ALU source: the canary is split
 * across imm[0] (low 16 bits) and imm[1] (high 16 bits) and the ADD result
 * is stored to the data buffer.
 */
TEST_F(mme_tu104_sim_test, reg_imm32)
{
   const uint32_t canary = 0xc0ffee01;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value v = mme_alloc_reg(&b);

   mme_tu104_asm(&b, i) {
      i.alu[0].dst = mme_value_as_reg(v);
      i.alu[0].op = MME_TU104_ALU_OP_ADD;
      /* Each assignment is its own statement; the original ended this
       * line with a comma operator by accident.
       */
      i.alu[0].src[0] = MME_TU104_REG_IMM32;
      i.imm[0] = (uint16_t)canary;
      i.imm[1] = (uint16_t)(canary >> 16);
   }

   mme_store_imm_addr(&b, data_addr, v);

   auto macro = mme_builder_finish_vec(&b);

   std::vector<uint32_t> params;
   test_macro(&b, macro, params);
}
339
/* For every predicate mode, builds a macro whose single predicated
 * instruction overwrites two pre-initialized registers from immediates,
 * then runs it once with a zero and once with a non-zero predicate value.
 */
TEST_F(mme_tu104_sim_test, pred_alu)
{
   static const mme_tu104_pred pred_modes[] = {
      MME_TU104_PRED_UUUU,
      MME_TU104_PRED_TTTT,
      MME_TU104_PRED_FFFF,
      MME_TU104_PRED_TTUU,
      MME_TU104_PRED_FFUU,
      MME_TU104_PRED_TFUU,
      MME_TU104_PRED_TUUU,
      MME_TU104_PRED_FUUU,
      MME_TU104_PRED_UUTT,
      MME_TU104_PRED_UUTF,
      MME_TU104_PRED_UUTU,
      MME_TU104_PRED_UUFT,
      MME_TU104_PRED_UUFF,
      MME_TU104_PRED_UUFU,
      MME_TU104_PRED_UUUT,
      MME_TU104_PRED_UUUF,
   };

   for (uint32_t m = 0; m < ARRAY_SIZE(pred_modes); m++) {
      /* Every mode gets its own value range and its own 8-byte slot */
      const uint32_t base = m * 100;
      const uint32_t slot = m * 8;

      mme_builder b;
      mme_builder_init(&b, devinfo);

      mme_value pred = mme_load(&b);
      mme_value dst0 = mme_mov(&b, mme_imm(base + 13));
      mme_value dst1 = mme_mov(&b, mme_imm(base + 62));

      mme_tu104_asm(&b, inst) {
         inst.pred = mme_value_as_reg(pred);
         inst.pred_mode = pred_modes[m];

         inst.alu[0].dst = mme_value_as_reg(dst0);
         inst.alu[0].src[0] = MME_TU104_REG_IMM;
         inst.imm[0] = base + 25;

         inst.alu[1].dst = mme_value_as_reg(dst1);
         inst.alu[1].src[0] = MME_TU104_REG_IMM;
         inst.imm[1] = base + 73;
      }

      mme_store_imm_addr(&b, data_addr + slot + 0, dst0);
      mme_store_imm_addr(&b, data_addr + slot + 4, dst1);

      const auto macro = mme_builder_finish_vec(&b);

      for (uint32_t pass = 0; pass < 2; pass++) {
         reset_push();

         const std::vector<uint32_t> params = { pass * 25894 };
         test_macro(&b, macro, params);
      }
   }
}
395
/* For every predicate mode, first writes known values to two shadow
 * scratch registers, then issues one predicated instruction with both
 * outputs targeting those registers, and finally stores both scratch
 * states.  Each macro is run with a zero and a non-zero predicate value.
 */
TEST_F(mme_tu104_sim_test, pred_out)
{
   static const mme_tu104_pred pred_modes[] = {
      MME_TU104_PRED_UUUU,
      MME_TU104_PRED_TTTT,
      MME_TU104_PRED_FFFF,
      MME_TU104_PRED_TTUU,
      MME_TU104_PRED_FFUU,
      MME_TU104_PRED_TFUU,
      MME_TU104_PRED_TUUU,
      MME_TU104_PRED_FUUU,
      MME_TU104_PRED_UUTT,
      MME_TU104_PRED_UUTF,
      MME_TU104_PRED_UUTU,
      MME_TU104_PRED_UUFT,
      MME_TU104_PRED_UUFF,
      MME_TU104_PRED_UUFU,
      MME_TU104_PRED_UUUT,
      MME_TU104_PRED_UUUF,
   };

   for (uint32_t m = 0; m < ARRAY_SIZE(pred_modes); m++) {
      /* Every mode uses its own pair of scratch registers and data slot */
      const uint32_t scratch0 = NVC597_SET_MME_SHADOW_SCRATCH(m*2 + 0);
      const uint32_t scratch1 = NVC597_SET_MME_SHADOW_SCRATCH(m*2 + 1);
      const uint32_t slot = m * 8;

      mme_builder b;
      mme_builder_init(&b, devinfo);

      mme_value pred = mme_load(&b);

      /* Unpredicated initial write to the first scratch register */
      mme_tu104_asm(&b, inst) {
         inst.imm[0] = (1<<12) | (scratch0 >> 2);
         inst.imm[1] = m * 100 + 25;
         inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
         inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
      }

      /* Unpredicated initial write to the second scratch register */
      mme_tu104_asm(&b, inst) {
         inst.imm[0] = (1<<12) | (scratch1 >> 2);
         inst.imm[1] = m * 100 + 75;
         inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
         inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
      }

      /* The predicated instruction under test, using both outputs */
      mme_tu104_asm(&b, inst) {
         inst.pred = mme_value_as_reg(pred);
         inst.pred_mode = pred_modes[m];
         inst.imm[0] = (1<<12) | (scratch0 >> 2);
         inst.imm[1] = (1<<12) | (scratch1 >> 2);
         inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
         inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
         inst.out[1].mthd = MME_TU104_OUT_OP_IMM1;
         inst.out[1].emit = MME_TU104_OUT_OP_IMM0;
      }

      mme_value state0 = mme_state(&b, scratch0);
      mme_value state1 = mme_state(&b, scratch1);

      mme_store_imm_addr(&b, data_addr + slot + 0, state0);
      mme_store_imm_addr(&b, data_addr + slot + 4, state1);

      const auto macro = mme_builder_finish_vec(&b);

      for (uint32_t pass = 0; pass < 2; pass++) {
         reset_push();

         const std::vector<uint32_t> params = { pass * 25894 };
         test_macro(&b, macro, params);
      }
   }
}
466
/* Adds two loaded parameters and stores the result. */
TEST_F(mme_tu104_sim_test, add)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_store_imm_addr(&b, data_addr, mme_add(&b, lhs, rhs));

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
485
/* Adds a loaded value and several immediates (1, 0xffffffff, 0xffff8000)
 * with the immediate on either side, plus zero + immediate, and runs the
 * macro for several input values.
 */
TEST_F(mme_tu104_sim_test, add_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Elements of a braced list are evaluated in order, so the adds are
    * emitted in the order they are written here.
    */
   const mme_value sums[] = {
      mme_add(&b, x, mme_imm(0x00000001)),
      mme_add(&b, x, mme_imm(0xffffffff)),
      mme_add(&b, x, mme_imm(0xffff8000)),
      mme_add(&b, mme_imm(0x00000001), x),
      mme_add(&b, mme_imm(0xffffffff), x),
      mme_add(&b, mme_imm(0xffff8000), x),
      mme_add(&b, mme_zero(), mme_imm(0x00000001)),
      mme_add(&b, mme_zero(), mme_imm(0xffffffff)),
      mme_add(&b, mme_zero(), mme_imm(0xffff8000)),
   };

   for (uint32_t s = 0; s < ARRAY_SIZE(sums); s++)
      mme_store_imm_addr(&b, data_addr + s * 4, sums[s]);

   const auto macro = mme_builder_finish_vec(&b);

   const uint32_t inputs[] = {
      0x0000ffff,
      0x00008000,
      0x0001ffff,
      0xffffffff,
   };

   for (const uint32_t input : inputs) {
      reset_push();

      const std::vector<uint32_t> params = { input };
      test_macro(&b, macro, params);
   }
}
531
/* 64-bit addition through mme_add64(); both operands are loaded as pairs
 * of parameters.
 */
TEST_F(mme_tu104_sim_test, addc)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value64 lhs = { mme_load(&b), mme_load(&b) };
   mme_value64 rhs = { mme_load(&b), mme_load(&b) };

   const mme_value64 total = mme_add64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, total.lo);
   mme_store_imm_addr(&b, data_addr + 4, total.hi);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = {
      0x80008650, 0x596,
      0x8000a8f6, 0x836,
   };
   test_macro(&b, macro, params);
}
555
/* ADD/ADDC pairs with immediate operands: each hand-written instruction
 * adds imm[0] to the low half on ALU 0 and imm[1] to the high half with
 * ADDC on ALU 1.  Four immediate pairs are tried against several 64-bit
 * inputs.
 */
TEST_F(mme_tu104_sim_test, addc_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x_lo = mme_load(&b);
   mme_value x_hi = mme_load(&b);

   /* Allocates a lo/hi register pair and emits one ADD + ADDC instruction
    * that writes it.
    */
   const auto emit_add_pair = [&](uint16_t imm_lo, uint16_t imm_hi) {
      mme_value64 sum;
      sum.lo = mme_alloc_reg(&b);
      sum.hi = mme_alloc_reg(&b);

      mme_tu104_asm(&b, inst) {
         inst.alu[0].dst = mme_value_as_reg(sum.lo);
         inst.alu[0].op = MME_TU104_ALU_OP_ADD;
         inst.alu[0].src[0] = mme_value_as_reg(x_lo);
         inst.alu[0].src[1] = MME_TU104_REG_IMM;
         inst.imm[0] = imm_lo;

         inst.alu[1].dst = mme_value_as_reg(sum.hi);
         inst.alu[1].op = MME_TU104_ALU_OP_ADDC;
         inst.alu[1].src[0] = mme_value_as_reg(x_hi);
         inst.alu[1].src[1] = MME_TU104_REG_IMM;
         inst.imm[1] = imm_hi;
      }

      return sum;
   };

   /* Braced-list elements are evaluated in order, which keeps the
    * instructions in the order written here.
    */
   const mme_value64 sums[] = {
      emit_add_pair(0x0001, 0x0000),
      emit_add_pair(0x0000, 0x0001),
      emit_add_pair(0x0000, 0xffff),
      emit_add_pair(0x0000, 0x8000),
   };

   for (uint32_t s = 0; s < ARRAY_SIZE(sums); s++) {
      mme_store_imm_addr(&b, data_addr + s * 8 + 0, sums[s].lo);
      mme_store_imm_addr(&b, data_addr + s * 8 + 4, sums[s].hi);
   }

   const auto macro = mme_builder_finish_vec(&b);

   const uint64_t inputs[] = {
      0x0000ffffffffffffull,
      0x0000ffffffff8000ull,
      0x0000ffff00000000ull,
      0x0000800000000000ull,
      0x00008000ffffffffull,
      0x0001ffff00000000ull,
      0xffffffff00000000ull,
      0xffffffffffffffffull,
   };

   for (const uint64_t input : inputs) {
      reset_push();

      std::vector<uint32_t> params;
      params.push_back(low32(input));
      params.push_back(high32(input));

      test_macro(&b, macro, params);
   }
}
656
/* Subtracts the second loaded parameter from the first and stores the
 * result.
 */
TEST_F(mme_tu104_sim_test, sub)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_store_imm_addr(&b, data_addr, mme_sub(&b, lhs, rhs));

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
675
/* 64-bit subtraction through mme_sub64(); both operands are loaded as
 * pairs of parameters.
 */
TEST_F(mme_tu104_sim_test, subb)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value64 lhs = { mme_load(&b), mme_load(&b) };
   mme_value64 rhs = { mme_load(&b), mme_load(&b) };

   const mme_value64 delta = mme_sub64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, delta.lo);
   mme_store_imm_addr(&b, data_addr + 4, delta.hi);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = {
      0x80008650, 0x596,
      0x8000a8f6, 0x836,
   };
   test_macro(&b, macro, params);
}
699
/* Multiplies two loaded parameters with mme_mul() and stores the result. */
TEST_F(mme_tu104_sim_test, mul)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_store_imm_addr(&b, data_addr, mme_mul(&b, lhs, rhs));

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
718
/* mme_mul() of a loaded value with immediates 1, 0xffffffff and 0xffff8000
 * on either side, run for a few positive and negative inputs.
 */
TEST_F(mme_tu104_sim_test, mul_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Elements of a braced list are evaluated in order, so the multiplies
    * are emitted in the order they are written here.
    */
   const mme_value products[] = {
      mme_mul(&b, x, mme_imm(0x00000001)),
      mme_mul(&b, x, mme_imm(0xffffffff)),
      mme_mul(&b, x, mme_imm(0xffff8000)),
      mme_mul(&b, mme_imm(0x00000001), x),
      mme_mul(&b, mme_imm(0xffffffff), x),
      mme_mul(&b, mme_imm(0xffff8000), x),
   };

   for (uint32_t p = 0; p < ARRAY_SIZE(products); p++)
      mme_store_imm_addr(&b, data_addr + p * 4, products[p]);

   const auto macro = mme_builder_finish_vec(&b);

   const int32_t inputs[] = { 1, -5, -1, 5 };

   for (const int32_t input : inputs) {
      reset_push();

      /* Negative inputs are passed as their 32-bit unsigned bit pattern */
      const std::vector<uint32_t> params(1, static_cast<uint32_t>(input));
      test_macro(&b, macro, params);
   }
}
753
/* 32x32 -> 64 multiply through mme_imul_32x32_64(); both halves of the
 * product are stored.
 */
TEST_F(mme_tu104_sim_test, mul_mulh)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);

   const mme_value64 product = mme_imul_32x32_64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, product.lo);
   mme_store_imm_addr(&b, data_addr + 4, product.hi);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 0x80008650, 0x596 };
   test_macro(&b, macro, params);
}
775
776 static inline struct mme_value
mme_mulu(struct mme_builder * b,struct mme_value x,struct mme_value y)777 mme_mulu(struct mme_builder *b, struct mme_value x, struct mme_value y)
778 {
779 return mme_alu(b, MME_ALU_OP_MULU, x, y);
780 }
781
/* Same immediate combinations as the mul_imm test, but using the MULU
 * opcode through mme_mulu().
 */
TEST_F(mme_tu104_sim_test, mulu_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Elements of a braced list are evaluated in order, so the multiplies
    * are emitted in the order they are written here.
    */
   const mme_value products[] = {
      mme_mulu(&b, x, mme_imm(0x00000001)),
      mme_mulu(&b, x, mme_imm(0xffffffff)),
      mme_mulu(&b, x, mme_imm(0xffff8000)),
      mme_mulu(&b, mme_imm(0x00000001), x),
      mme_mulu(&b, mme_imm(0xffffffff), x),
      mme_mulu(&b, mme_imm(0xffff8000), x),
   };

   for (uint32_t p = 0; p < ARRAY_SIZE(products); p++)
      mme_store_imm_addr(&b, data_addr + p * 4, products[p]);

   const auto macro = mme_builder_finish_vec(&b);

   const int32_t inputs[] = { 1, -5, -1, 5 };

   for (const int32_t input : inputs) {
      reset_push();

      /* Negative inputs are passed as their 32-bit unsigned bit pattern */
      const std::vector<uint32_t> params(1, static_cast<uint32_t>(input));
      test_macro(&b, macro, params);
   }
}
816
/* 32x32 -> 64 multiply through mme_umul_32x32_64(); both halves of the
 * product are stored.
 */
TEST_F(mme_tu104_sim_test, mulu_mulh)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);

   const mme_value64 product = mme_umul_32x32_64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, product.lo);
   mme_store_imm_addr(&b, data_addr + 4, product.hi);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 0x80008650, 0x596 };
   test_macro(&b, macro, params);
}
838
/* Applies mme_clz() to a loaded parameter and stores the result. */
TEST_F(mme_tu104_sim_test, clz)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value input = mme_load(&b);
   mme_value leading = mme_clz(&b, input);
   mme_store_imm_addr(&b, data_addr, leading);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 0x00406fe0 };
   test_macro(&b, macro, params);
}
854
855 #define SHIFT_TEST(op) \
856 TEST_F(mme_tu104_sim_test, op) \
857 { \
858 mme_builder b; \
859 mme_builder_init(&b, devinfo); \
860 \
861 mme_value val = mme_load(&b); \
862 mme_value shift1 = mme_load(&b); \
863 mme_value shift2 = mme_load(&b); \
864 mme_store_imm_addr(&b, data_addr + 0, mme_##op(&b, val, shift1)); \
865 mme_store_imm_addr(&b, data_addr + 4, mme_##op(&b, val, shift2)); \
866 \
867 auto macro = mme_builder_finish_vec(&b); \
868 \
869 std::vector<uint32_t> params; \
870 params.push_back(0x0c406fe0); \
871 params.push_back(5); \
872 params.push_back(51); \
873 \
874 test_macro(&b, macro, params); \
875 }
876
877 SHIFT_TEST(sll)
SHIFT_TEST(srl)878 SHIFT_TEST(srl)
879 SHIFT_TEST(sra)
880
881 #undef SHIFT_TEST
882
883 TEST_F(mme_tu104_sim_test, bfe)
884 {
885 const uint32_t canary = 0xc0ffee01;
886
887 mme_builder b;
888 mme_builder_init(&b, devinfo);
889
890 mme_value val = mme_load(&b);
891 mme_value pos = mme_load(&b);
892
893 mme_store_imm_addr(&b, data_addr + 0, mme_bfe(&b, val, pos, 1), true);
894 mme_store_imm_addr(&b, data_addr + 4, mme_bfe(&b, val, pos, 2), true);
895 mme_store_imm_addr(&b, data_addr + 8, mme_bfe(&b, val, pos, 5), true);
896
897 auto macro = mme_builder_finish_vec(&b);
898
899 for (unsigned i = 0; i < 31; i++) {
900 std::vector<uint32_t> params;
901 params.push_back(canary);
902 params.push_back(i);
903
904 test_macro(&b, macro, params);
905
906 ASSERT_EQ(data[0], (canary >> i) & 0x1);
907 ASSERT_EQ(data[1], (canary >> i) & 0x3);
908 ASSERT_EQ(data[2], (canary >> i) & 0x1f);
909 }
910 }
911
912 #define BITOP_TEST(op) \
913 TEST_F(mme_tu104_sim_test, op) \
914 { \
915 mme_builder b; \
916 mme_builder_init(&b, devinfo); \
917 \
918 mme_value x = mme_load(&b); \
919 mme_value y = mme_load(&b); \
920 mme_value v1 = mme_##op(&b, x, y); \
921 mme_value v2 = mme_##op(&b, x, mme_imm(0xffff8000)); \
922 mme_value v3 = mme_##op(&b, x, mme_imm(0xffffffff)); \
923 mme_store_imm_addr(&b, data_addr + 0, v1); \
924 mme_store_imm_addr(&b, data_addr + 4, v2); \
925 mme_store_imm_addr(&b, data_addr + 8, v3); \
926 \
927 auto macro = mme_builder_finish_vec(&b); \
928 \
929 std::vector<uint32_t> params; \
930 params.push_back(0x0c406fe0); \
931 params.push_back(0x00fff0c0); \
932 \
933 test_macro(&b, macro, params); \
934 }
935
936 BITOP_TEST(and)
BITOP_TEST(nand)937 BITOP_TEST(nand)
938 BITOP_TEST(or)
939 BITOP_TEST(xor)
940
941 #undef BITOP_TEST
942
943 TEST_F(mme_tu104_sim_test, merge)
944 {
945 mme_builder b;
946 mme_builder_init(&b, devinfo);
947
948 mme_value x = mme_load(&b);
949 mme_value y = mme_load(&b);
950
951 mme_value m1 = mme_merge(&b, x, y, 12, 12, 20);
952 mme_value m2 = mme_merge(&b, x, y, 12, 8, 20);
953 mme_value m3 = mme_merge(&b, x, y, 8, 12, 20);
954 mme_value m4 = mme_merge(&b, x, y, 12, 16, 8);
955 mme_value m5 = mme_merge(&b, x, y, 24, 12, 8);
956
957 mme_store_imm_addr(&b, data_addr + 0, m1);
958 mme_store_imm_addr(&b, data_addr + 4, m2);
959 mme_store_imm_addr(&b, data_addr + 8, m3);
960 mme_store_imm_addr(&b, data_addr + 12, m4);
961 mme_store_imm_addr(&b, data_addr + 16, m5);
962
963 auto macro = mme_builder_finish_vec(&b);
964
965 std::vector<uint32_t> params;
966 params.push_back(0x0c406fe0);
967 params.push_back(0x76543210u);
968
969 test_macro(&b, macro, params);
970 }
971
972 #define COMPARISON_TEST(op) \
973 TEST_F(mme_tu104_sim_test, op) \
974 { \
975 mme_builder b; \
976 mme_builder_init(&b, devinfo); \
977 \
978 mme_value x = mme_load(&b); \
979 mme_value y = mme_load(&b); \
980 mme_value z = mme_load(&b); \
981 mme_value w = mme_load(&b); \
982 \
983 mme_value v1 = mme_##op(&b, x, y); \
984 mme_value v2 = mme_##op(&b, y, x); \
985 mme_value v3 = mme_##op(&b, y, z); \
986 mme_value v4 = mme_##op(&b, z, y); \
987 mme_value v5 = mme_##op(&b, w, z); \
988 mme_value v6 = mme_##op(&b, z, w); \
989 mme_value v7 = mme_##op(&b, w, w); \
990 \
991 mme_store_imm_addr(&b, data_addr + 0, v1); \
992 mme_store_imm_addr(&b, data_addr + 4, v2); \
993 mme_store_imm_addr(&b, data_addr + 8, v3); \
994 mme_store_imm_addr(&b, data_addr + 12, v4); \
995 mme_store_imm_addr(&b, data_addr + 16, v5); \
996 mme_store_imm_addr(&b, data_addr + 20, v6); \
997 mme_store_imm_addr(&b, data_addr + 24, v7); \
998 \
999 auto macro = mme_builder_finish_vec(&b); \
1000 \
1001 std::vector<uint32_t> params; \
1002 params.push_back(-5); \
1003 params.push_back(-10); \
1004 params.push_back(5); \
1005 params.push_back(10); \
1006 \
1007 test_macro(&b, macro, params); \
1008 }
1009
1010 COMPARISON_TEST(slt)
COMPARISON_TEST(sltu)1011 COMPARISON_TEST(sltu)
1012 COMPARISON_TEST(sle)
1013 COMPARISON_TEST(sleu)
1014 COMPARISON_TEST(seq)
1015
1016 #undef COMPARISON_TEST
1017
1018 static inline void
1019 mme_inc_whole_inst(mme_builder *b, mme_value val)
1020 {
1021 mme_tu104_asm(b, i) {
1022 i.alu[0].dst = mme_value_as_reg(val);
1023 i.alu[0].op = MME_TU104_ALU_OP_ADD;
1024 i.alu[0].src[0] = mme_value_as_reg(val);
1025 i.alu[0].src[1] = MME_TU104_REG_IMM;
1026 i.imm[0] = 1;
1027 }
1028 }
1029
/* Loops `count` times adding count to x, then bumps y once after the loop.
 * The expected results are count, count * count and 1.
 */
TEST_F(mme_tu104_sim_test, loop)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value count = mme_load(&b);

   mme_value x = mme_mov(&b, mme_zero());
   mme_value y = mme_mov(&b, mme_zero());

   mme_loop(&b, count) {
      mme_tu104_asm(&b, nop) { } /* noop */
      mme_add_to(&b, x, x, count);
   }
   mme_add_to(&b, y, y, mme_imm(1));

   /* Three empty instructions after the loop */
   for (uint32_t n = 0; n < 3; n++) {
      mme_tu104_asm(&b, nop) { } /* noop */
   }

   const mme_value results[] = { count, x, y };
   for (uint32_t r = 0; r < ARRAY_SIZE(results); r++)
      mme_store_imm_addr(&b, data_addr + r * 4, results[r]);

   const auto macro = mme_builder_finish_vec(&b);

   const uint32_t counts[] = {0, 1, 5, 9};

   for (uint32_t i = 0; i < ARRAY_SIZE(counts); i++) {
      reset_push();

      const std::vector<uint32_t> params = { counts[i] };
      test_macro(&b, macro, params);

      ASSERT_EQ(data[0], counts[i]);
      ASSERT_EQ(data[1], counts[i] * counts[i]);
      ASSERT_EQ(data[2], 1);
   }
}
1069
/* Emits a JAL followed by ten single-instruction increments of x and
 * expects x to end up as 5.
 */
TEST_F(mme_tu104_sim_test, jal)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_mov(&b, mme_zero());
   mme_value y = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, inst) {
      inst.alu[0].op = MME_TU104_ALU_OP_JAL;
      inst.imm[0] = (1 << 15) | 6;
   }

   for (uint32_t remaining = 10; remaining > 0; remaining--)
      mme_inc_whole_inst(&b, x);

   /* Second half of the test, currently disabled: */
// mme_tu104_asm(&b, i) {
//    i.alu[0].op = MME_TU104_ALU_OP_JAL;
//    i.imm[0] = 6;
// }
//
// for (uint32_t j = 0; j < 10; j++)
//    mme_inc_whole_inst(&b, y);

   mme_store_imm_addr(&b, data_addr + 0, x);
   mme_store_imm_addr(&b, data_addr + 4, y);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
   ASSERT_EQ(data[0], 5);
}
1103
/* Emits a BEQ followed by ten single-instruction increments, one per
 * register, and stores all ten registers for comparison.
 */
TEST_F(mme_tu104_sim_test, bxx_fwd)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[10];
   for (mme_value &val : vals)
      val = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, inst) {
      inst.alu[0].op = MME_TU104_ALU_OP_BEQ;
      inst.imm[0] = (1 << 15) | 6;
   }

   for (const mme_value &val : vals)
      mme_inc_whole_inst(&b, val);

   for (uint32_t slot = 0; slot < ARRAY_SIZE(vals); slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, vals[slot]);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
1129
/* Backward branch: two JALs and a BEQ whose 13-bit offset field holds -8.
 *
 * Fifteen registers start at zero and each gets one increment instruction.
 * The expectations at the end require registers 0-2 to still read 0 and
 * registers 3-14 to read 1.
 */
TEST_F(mme_tu104_sim_test, bxx_bwd)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[15];
   for (mme_value &reg : regs)
      reg = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, inst) {
      inst.alu[0].op = MME_TU104_ALU_OP_JAL;
      inst.imm[0] = (1 << 15) | 12;
   }

   /* Increments for registers 0-9. */
   for (uint32_t r = 0; r < 10; r++)
      mme_inc_whole_inst(&b, regs[r]);

   mme_tu104_asm(&b, inst) {
      inst.alu[0].op = MME_TU104_ALU_OP_JAL;
      inst.imm[0] = (1 << 15) | 2;
   }

   /* Negative offset, masked down to the low 13 bits of the immediate. */
   mme_tu104_asm(&b, inst) {
      inst.alu[0].op = MME_TU104_ALU_OP_BEQ;
      inst.imm[0] = (1 << 15) | ((-8) & 0x1fff);
   }

   /* Increments for registers 10-14. */
   for (uint32_t r = 10; r < 15; r++)
      mme_inc_whole_inst(&b, regs[r]);

   for (uint32_t slot = 0; slot < 15; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, regs[slot]);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);

   /* Registers 0-2 must be untouched; the remaining twelve must be 1. */
   for (uint32_t slot = 0; slot < 15; slot++)
      ASSERT_EQ(data[slot], slot < 3 ? 0 : 1);
}
1172
/* BEQ with 0x1000 in its offset field, used as a way out of the macro.
 *
 * Ten zeros are stored first. The expectations then require that none of
 * the increments and stores emitted after the branch become visible.
 */
TEST_F(mme_tu104_sim_test, bxx_exit)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[10];
   for (mme_value &reg : regs)
      reg = mme_mov(&b, mme_zero());

   /* Known-zero baseline in the first ten dwords of the data buffer. */
   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, mme_imm(0));

   mme_tu104_asm(&b, inst) {
      inst.alu[0].op = MME_TU104_ALU_OP_BEQ;
      inst.imm[0] = (1 << 15) | 0x1000;
   }

   /* Nothing emitted from here on is expected to reach memory. */
   for (mme_value &reg : regs)
      mme_inc_whole_inst(&b, reg);

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, regs[slot]);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);

   for (uint32_t slot = 0; slot < 10; slot++)
      ASSERT_EQ(data[slot], 0);
}
1206
/* Unconditional mme_exit().
 *
 * Ten zeros are stored, the macro exits, and the code emitted afterwards
 * (loading 0..9 into the registers and storing them) must leave the zeros
 * in place.
 */
TEST_F(mme_tu104_sim_test, mme_exit)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[10];
   for (mme_value &reg : regs)
      reg = mme_mov(&b, mme_zero());

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, mme_imm(0));

   /* Abort the macro here. */
   mme_exit(&b);

   /* Nothing emitted from here on is expected to reach memory. */
   for (uint32_t slot = 0; slot < 10; slot++)
      regs[slot] = mme_mov(&b, mme_imm(slot));

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, regs[slot]);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);

   for (uint32_t slot = 0; slot < 10; slot++)
      ASSERT_EQ(data[slot], 0);
}
1239
/* Conditional mme_exit_if().
 *
 * A first exit whose condition (0 == 1) is false must be a no-op. Then
 * registers holding 0..9 are stored one by one, each store guarded by an
 * exit on "5 <= value". Slots 0-4 are expected to hold their index and
 * slots 5-9 to keep the zero written at the start.
 */
TEST_F(mme_tu104_sim_test, mme_exit_if)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[10];
   for (mme_value &reg : regs)
      reg = mme_mov(&b, mme_zero());

   for (uint32_t slot = 0; slot < 10; slot++)
      mme_store_imm_addr(&b, data_addr + slot * 4, mme_imm(0));

   /* Condition is false, so execution must simply continue. */
   mme_exit_if(&b, ieq, mme_zero(), mme_imm(1));

   for (uint32_t slot = 0; slot < 10; slot++)
      regs[slot] = mme_mov(&b, mme_imm(slot));

   for (uint32_t slot = 0; slot < 10; slot++) {
      /* Leave the macro once the value reaches 5. */
      mme_exit_if(&b, ile, mme_imm(5), regs[slot]);
      mme_store_imm_addr(&b, data_addr + slot * 4, regs[slot]);
   }

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);

   for (uint32_t slot = 0; slot < 10; slot++)
      ASSERT_EQ(data[slot], slot < 5 ? slot : 0);
}
1273
/* Host-side reference comparisons, one per condition suffix.
 *
 * The IF_TEST and WHILE_TEST macros paste the suffix onto "c_" to compute
 * the expected outcome. The i* variants take int32_t and the u* variants
 * take uint32_t, so callers passing uint32_t values to an i* helper get them
 * converted to signed first.
 */
static bool
c_ilt(const int32_t x, const int32_t y)
{
   return x < y;
}

static bool
c_ult(const uint32_t x, const uint32_t y)
{
   return x < y;
}

static bool
c_ile(const int32_t x, const int32_t y)
{
   return x <= y;
}

static bool
c_ule(const uint32_t x, const uint32_t y)
{
   return x <= y;
}

static bool
c_ieq(const int32_t x, const int32_t y)
{
   return x == y;
}

static bool
c_ige(const int32_t x, const int32_t y)
{
   return x >= y;
}

static bool
c_uge(const uint32_t x, const uint32_t y)
{
   return x >= y;
}

static bool
c_igt(const int32_t x, const int32_t y)
{
   return x > y;
}

static bool
c_ugt(const uint32_t x, const uint32_t y)
{
   return x > y;
}

static bool
c_ine(const int32_t x, const int32_t y)
{
   return x != y;
}
1284
1285 #define IF_TEST(op) \
1286 TEST_F(mme_tu104_sim_test, if_##op) \
1287 { \
1288 mme_builder b; \
1289 mme_builder_init(&b, devinfo); \
1290 \
1291 mme_value x = mme_load(&b); \
1292 mme_value y = mme_load(&b); \
1293 mme_value i = mme_mov(&b, mme_zero()); \
1294 \
1295 mme_start_if_##op(&b, x, y); \
1296 { \
1297 mme_add_to(&b, i, i, mme_imm(1)); \
1298 mme_add_to(&b, i, i, mme_imm(1)); \
1299 } \
1300 mme_end_if(&b); \
1301 mme_add_to(&b, i, i, mme_imm(1)); \
1302 mme_add_to(&b, i, i, mme_imm(1)); \
1303 mme_add_to(&b, i, i, mme_imm(1)); \
1304 \
1305 mme_store_imm_addr(&b, data_addr + 0, i); \
1306 \
1307 auto macro = mme_builder_finish_vec(&b); \
1308 \
1309 uint32_t vals[] = {23, 56, (uint32_t)-5, (uint32_t)-10, 56, 14}; \
1310 \
1311 for (uint32_t i = 0; i < ARRAY_SIZE(vals) - 1; i++) { \
1312 reset_push(); \
1313 \
1314 std::vector<uint32_t> params; \
1315 params.push_back(vals[i + 0]); \
1316 params.push_back(vals[i + 1]); \
1317 \
1318 test_macro(&b, macro, params); \
1319 \
1320 ASSERT_EQ(data[0], c_##op(params[0], params[1]) ? 5 : 3); \
1321 } \
1322 }
1323
1324 IF_TEST(ilt)
IF_TEST(ult)1325 IF_TEST(ult)
1326 IF_TEST(ile)
1327 IF_TEST(ule)
1328 IF_TEST(ieq)
1329 IF_TEST(ige)
1330 IF_TEST(uge)
1331 IF_TEST(igt)
1332 IF_TEST(ugt)
1333 IF_TEST(ine)
1334
1335 #undef IF_TEST
1336
1337 #define WHILE_TEST(op, start, step, bound) \
1338 TEST_F(mme_tu104_sim_test, while_##op) \
1339 { \
1340 mme_builder b; \
1341 mme_builder_init(&b, devinfo); \
1342 \
1343 mme_value x = mme_mov(&b, mme_zero()); \
1344 mme_value y = mme_mov(&b, mme_zero()); \
1345 mme_value z = mme_mov(&b, mme_imm(start)); \
1346 mme_value w = mme_mov(&b, mme_zero()); \
1347 mme_value v = mme_mov(&b, mme_zero()); \
1348 \
1349 for (uint32_t j = 0; j < 5; j++) \
1350 mme_inc_whole_inst(&b, x); \
1351 \
1352 mme_while(&b, op, z, mme_imm(bound)) { \
1353 for (uint32_t j = 0; j < 5; j++) \
1354 mme_inc_whole_inst(&b, y); \
1355 \
1356 mme_add_to(&b, z, z, mme_imm(step)); \
1357 \
1358 for (uint32_t j = 0; j < 5; j++) \
1359 mme_inc_whole_inst(&b, w); \
1360 } \
1361 \
1362 for (uint32_t j = 0; j < 5; j++) \
1363 mme_inc_whole_inst(&b, v); \
1364 \
1365 mme_store_imm_addr(&b, data_addr + 0, x); \
1366 mme_store_imm_addr(&b, data_addr + 4, y); \
1367 mme_store_imm_addr(&b, data_addr + 8, z); \
1368 mme_store_imm_addr(&b, data_addr + 12, w); \
1369 mme_store_imm_addr(&b, data_addr + 16, v); \
1370 \
1371 auto macro = mme_builder_finish_vec(&b); \
1372 \
1373 uint32_t end = (uint32_t)(start), count = 0; \
1374 while (c_##op(end, (bound))) { \
1375 end += (uint32_t)(step); \
1376 count++; \
1377 } \
1378 \
1379 std::vector<uint32_t> params; \
1380 test_macro(&b, macro, params); \
1381 ASSERT_EQ(data[0], 5); \
1382 ASSERT_EQ(data[1], 5 * count); \
1383 ASSERT_EQ(data[2], end); \
1384 ASSERT_EQ(data[3], 5 * count); \
1385 ASSERT_EQ(data[4], 5); \
1386 }
1387
1388 WHILE_TEST(ilt, 0, 1, 7)
1389 WHILE_TEST(ult, 0, 1, 7)
1390 WHILE_TEST(ile, -10, 2, 0)
1391 WHILE_TEST(ule, 0, 1, 7)
1392 WHILE_TEST(ieq, 0, 5, 0)
1393 WHILE_TEST(ige, 5, -1, -5)
1394 WHILE_TEST(uge, 15, -2, 2)
1395 WHILE_TEST(igt, 7, -3, -10)
1396 WHILE_TEST(ugt, 1604, -30, 1000)
1397 WHILE_TEST(ine, 0, 1, 7)
1398
1399 #undef WHILE_TEST
1400
1401 TEST_F(mme_tu104_sim_test, nested_while)
1402 {
1403 mme_builder b;
1404 mme_builder_init(&b, devinfo);
1405
1406 mme_value n = mme_load(&b);
1407 mme_value m = mme_load(&b);
1408
1409 mme_value count = mme_mov(&b, mme_zero());
1410
1411 mme_value i = mme_mov(&b, mme_zero());
1412 mme_value j = mme_mov(&b, mme_imm(0xffff));
1413 mme_while(&b, ine, i, n) {
1414 mme_mov_to(&b, j, mme_zero());
1415 mme_while(&b, ine, j, m) {
1416 mme_add_to(&b, count, count, mme_imm(1));
1417 mme_add_to(&b, j, j, mme_imm(1));
1418 }
1419
1420 mme_add_to(&b, i, i, mme_imm(1));
1421 }
1422
1423 mme_store_imm_addr(&b, data_addr + 0, i);
1424 mme_store_imm_addr(&b, data_addr + 4, j);
1425 mme_store_imm_addr(&b, data_addr + 8, count);
1426
1427 auto macro = mme_builder_finish_vec(&b);
1428
1429 std::vector<uint32_t> params;
1430 params.push_back(3);
1431 params.push_back(5);
1432
1433 test_macro(&b, macro, params);
1434 ASSERT_EQ(data[0], 3);
1435 ASSERT_EQ(data[1], 5);
1436 ASSERT_EQ(data[2], 15);
1437 }
1438
#if 0
/* Compiled out.
 *
 * This test calls test_macro() with two arguments, which does not match the
 * three-argument test_macro() declared by the fixture in this file.
 * NOTE(review): the mme_alu/mme_alu_imm/mme_branch/mme_end helpers look like
 * an earlier builder interface; port or delete once confirmed.
 */
TEST_F(mme_tu104_sim_test, do_ble)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_alu(&b, R5, ADD, LOAD0, ZERO);
   mme_alu(&b, R6, ADD, ZERO, ZERO);
   mme_alu(&b, R7, ADD, ZERO, ZERO);

   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R6, ADD, R6, IMM, 1);
   mme_branch(&b, BLE, R6, R5, -3, 2);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);

   mme_store_imm_addr(&b, data_addr + 0, MME_TU104_REG_R7);

   mme_end(&b);

   const uint32_t counts[] = {0, 1, 5, 9};

   for (const uint32_t count : counts) {
      reset_push();

      std::vector<uint32_t> params = {count};

      test_macro(&b, params);
   }
}
#endif
1474
/* Round-trips two parameters through mme_dwrite()/mme_dread().
 *
 * The values are written at indices 5 and 8, read back in the opposite
 * order and stored to the data buffer. The only check is the
 * simulator-vs-hardware comparison inside test_macro().
 */
TEST_F(mme_tu104_sim_test, dread_dwrite)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_dwrite(&b, mme_imm(5), first);
   mme_dwrite(&b, mme_imm(8), second);

   /* Read back in reverse order of the writes. */
   mme_value second_rb = mme_dread(&b, mme_imm(8));
   mme_value first_rb = mme_dread(&b, mme_imm(5));

   mme_store_imm_addr(&b, data_addr + 0, second_rb);
   mme_store_imm_addr(&b, data_addr + 4, first_rb);

   const auto macro = mme_builder_finish_vec(&b);

   /* -10 is passed as its 32-bit unsigned representation. */
   const std::vector<uint32_t> params = {static_cast<uint32_t>(-10), 5};

   test_macro(&b, macro, params);
}
1500
/* mme_dwrite() followed by an MME_DMA_WRITE out to the data buffer.
 *
 * The macro writes two canaries at indices 5 and 8. A 20-dword DMA write
 * starting at data RAM address 3 then copies them out, so they are expected
 * at dwords 2 and 5 of the data buffer with every other dword reading zero.
 * This test drives the pushbuf directly and does not go through
 * test_macro().
 */
TEST_F(mme_tu104_sim_test, dwrite_dma)
{
   const uint32_t canary5 = 0xc0ffee01;
   const uint32_t canary8 = canary5 & 0x00ffff00;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_dwrite(&b, mme_imm(5), first);
   mme_dwrite(&b, mme_imm(8), second);

   const auto macro = mme_builder_finish_vec(&b);

   push_macro(0, macro);

   /* Invoke the macro with the two canaries as its parameters. */
   P_1INC(p, NVC597, CALL_MME_MACRO(0));
   P_INLINE_DATA(p, canary5);
   P_INLINE_DATA(p, canary8);

   P_MTHD(p, NVC597, SET_MME_MEM_ADDRESS_A);
   P_NVC597_SET_MME_MEM_ADDRESS_A(p, high32(data_addr));
   P_NVC597_SET_MME_MEM_ADDRESS_B(p, low32(data_addr));
   /* Begin the copy 3 dwords into MME RAM */
   P_NVC597_SET_MME_DATA_RAM_ADDRESS(p, 3);
   P_IMMD(p, NVC597, MME_DMA_WRITE, 20);

   submit_push();

   /* Output dword i corresponds to RAM index i + 3. */
   for (uint32_t i = 0; i < 20; i++) {
      switch (i) {
      case 5 - 3:
         ASSERT_EQ(data[i], canary5);
         break;
      case 8 - 3:
         ASSERT_EQ(data[i], canary8);
         break;
      default:
         ASSERT_EQ(data[i], 0);
         break;
      }
   }
}
1542
/* Walks the whole index range [0, MME_TU104_DRAM_COUNT) in 32-entry chunks.
 *
 * For a (start, count) parameter pair the macro writes each index as its own
 * value with mme_dwrite(), then reads the same range back with mme_dread()
 * and stores it to consecutive dwords of the data buffer. Each chunk must
 * read back as start, start + 1, ... This test drives the pushbuf directly
 * and does not go through test_macro().
 */
TEST_F(mme_tu104_sim_test, dram_limit)
{
   static const uint32_t chunk_size = 32;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value start = mme_load(&b);
   mme_value count = mme_load(&b);

   /* Fill pass: entry[idx] = idx. */
   mme_value wr_idx = mme_mov(&b, start);
   mme_loop(&b, count) {
      mme_dwrite(&b, wr_idx, wr_idx);
      mme_add_to(&b, wr_idx, wr_idx, mme_imm(1));
   }

   /* Read-back pass, streaming the values out to memory. */
   mme_value rd_idx = mme_mov(&b, start);
   struct mme_value64 out_addr = mme_mov64(&b, mme_imm64(data_addr));

   mme_loop(&b, count) {
      mme_value val = mme_dread(&b, rd_idx);
      mme_store(&b, out_addr, val);
      mme_add_to(&b, rd_idx, rd_idx, mme_imm(1));
      mme_add64_to(&b, out_addr, out_addr, mme_imm64(4));
   }

   const auto macro = mme_builder_finish_vec(&b);

   for (uint32_t base = 0; base < MME_TU104_DRAM_COUNT; base += chunk_size) {
      reset_push();

      push_macro(0, macro);

      P_1INC(p, NVC597, CALL_MME_MACRO(0));
      P_INLINE_DATA(p, base);
      P_INLINE_DATA(p, chunk_size);

      submit_push();

      for (uint32_t k = 0; k < chunk_size; k++)
         ASSERT_EQ(data[k], base + k);
   }
}
1586
/* MME_DMA_READ_FIFOED issued from inside a macro.
 *
 * The macro points the DMA engine at the data buffer, requests a FIFO'd read
 * of 2, inserts a load barrier and then performs two mme_load()s whose
 * results are stored 256 bytes into the data buffer. The first 64 dwords of
 * the buffer are seeded with 1000 + index beforehand.
 * NOTE(review): presumably the two loads are then served from the DMA'd data
 * rather than from the inline parameter (7); confirm against the simulator.
 * The only check is the simulator-vs-hardware comparison in test_macro().
 */
TEST_F(mme_tu104_sim_test, dma_read_fifoed)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_mthd(&b, NVC597_SET_MME_DATA_RAM_ADDRESS);
   mme_emit(&b, mme_zero());

   /* Two emits after one mthd: address high half, then low half. */
   mme_mthd(&b, NVC597_SET_MME_MEM_ADDRESS_A);
   mme_emit(&b, mme_imm(high32(data_addr)));
   mme_emit(&b, mme_imm(low32(data_addr)));

   mme_mthd(&b, NVC597_MME_DMA_READ_FIFOED);
   mme_emit(&b, mme_imm(2));

   mme_tu104_load_barrier(&b);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_store_imm_addr(&b, data_addr + 256 + 0, first);
   mme_store_imm_addr(&b, data_addr + 256 + 4, second);

   const auto macro = mme_builder_finish_vec(&b);

   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);

   /* Seed the source region with recognisable values. */
   for (uint32_t k = 0; k < 64; k++)
      data[k] = 1000 + k;

   const std::vector<uint32_t> params = {7};

   test_macro(&b, macro, params);
}
1622
/* Walks [0, MME_TU104_SCRATCH_COUNT) of SET_MME_SHADOW_SCRATCH in 32-entry
 * chunks.
 *
 * For a (start, count) parameter pair the macro writes each scratch index as
 * its own value via mme_mthd_arr()/mme_emit(), then reads the same range
 * back with mme_state_arr() and stores it to consecutive dwords of the data
 * buffer. Each chunk must read back as start, start + 1, ... This test
 * drives the pushbuf directly and does not go through test_macro().
 */
TEST_F(mme_tu104_sim_test, scratch_limit)
{
   static const uint32_t chunk_size = 32;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value start = mme_load(&b);
   mme_value count = mme_load(&b);

   /* Fill pass: scratch[idx] = idx. */
   mme_value wr_idx = mme_mov(&b, start);
   mme_loop(&b, count) {
      mme_mthd_arr(&b, NVC597_SET_MME_SHADOW_SCRATCH(0), wr_idx);
      mme_emit(&b, wr_idx);
      mme_add_to(&b, wr_idx, wr_idx, mme_imm(1));
   }

   /* Read-back pass, streaming the values out to memory. */
   mme_value rd_idx = mme_mov(&b, start);
   struct mme_value64 out_addr = mme_mov64(&b, mme_imm64(data_addr));

   mme_loop(&b, count) {
      mme_value val = mme_state_arr(&b, NVC597_SET_MME_SHADOW_SCRATCH(0),
                                    rd_idx);
      mme_store(&b, out_addr, val);
      mme_add_to(&b, rd_idx, rd_idx, mme_imm(1));
      mme_add64_to(&b, out_addr, out_addr, mme_imm64(4));
   }

   const auto macro = mme_builder_finish_vec(&b);

   for (uint32_t base = 0; base < MME_TU104_SCRATCH_COUNT;
        base += chunk_size) {
      reset_push();

      push_macro(0, macro);

      P_1INC(p, NVC597, CALL_MME_MACRO(0));
      P_INLINE_DATA(p, base);
      P_INLINE_DATA(p, chunk_size);

      submit_push();

      for (uint32_t k = 0; k < chunk_size; k++)
         ASSERT_EQ(data[k], base + k);
   }
}
1667