• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2022 Collabora Ltd.
3  * SPDX-License-Identifier: MIT
4  */
5 #include "mme_runner.h"
6 #include "mme_tu104_sim.h"
7 
8 #include "nvk_clc597.h"
9 
10 class mme_tu104_sim_test : public ::testing::Test, public mme_hw_runner {
11 public:
12    mme_tu104_sim_test();
13    ~mme_tu104_sim_test();
14 
15    void SetUp();
16    void test_macro(const mme_builder *b,
17                    const std::vector<uint32_t>& macro,
18                    const std::vector<uint32_t>& params);
19 };
20 
mme_tu104_sim_test()21 mme_tu104_sim_test::mme_tu104_sim_test() :
22    ::testing::Test(),
23    mme_hw_runner()
24 { }
25 
~mme_tu104_sim_test()26 mme_tu104_sim_test::~mme_tu104_sim_test()
27 { }
28 
29 void
SetUp()30 mme_tu104_sim_test::SetUp()
31 {
32    ASSERT_TRUE(set_up_hw(TURING_A, UINT16_MAX));
33 }
34 
35 void
test_macro(const mme_builder * b,const std::vector<uint32_t> & macro,const std::vector<uint32_t> & params)36 mme_tu104_sim_test::test_macro(const mme_builder *b,
37                                const std::vector<uint32_t>& macro,
38                                const std::vector<uint32_t>& params)
39 {
40    const uint32_t data_dwords = DATA_BO_SIZE / sizeof(uint32_t);
41 
42    std::vector<mme_tu104_inst> insts(macro.size() / 3);
43    mme_tu104_decode(&insts[0], &macro[0], macro.size() / 3);
44 
45    /* First, make a copy of the data and simulate the macro */
46    std::vector<uint32_t> sim_data(data, data + (DATA_BO_SIZE / 4));
47    mme_tu104_sim_mem sim_mem = {
48       .addr = data_addr,
49       .data = &sim_data[0],
50       .size = DATA_BO_SIZE,
51    };
52    mme_tu104_sim(insts.size(), &insts[0],
53                  params.size(), &params[0],
54                  1, &sim_mem);
55 
56    /* Now run the macro on the GPU */
57    push_macro(0, macro);
58 
59    P_1INC(p, NVC597, CALL_MME_MACRO(0));
60    if (params.empty()) {
61       P_NVC597_CALL_MME_MACRO(p, 0, 0);
62    } else {
63       P_INLINE_ARRAY(p, &params[0], params.size());
64    }
65 
66    submit_push();
67 
68    /* Check the results */
69    for (uint32_t i = 0; i < data_dwords; i++)
70       ASSERT_EQ(data[i], sim_data[i]);
71 }
72 
73 static mme_tu104_reg
mme_value_as_reg(mme_value val)74 mme_value_as_reg(mme_value val)
75 {
76    assert(val.type == MME_VALUE_TYPE_REG);
77    return (mme_tu104_reg)(MME_TU104_REG_R0 + val.reg);
78 }
79 
/* Smallest macro in the file: store one immediate canary at the start of
 * the data buffer and let test_macro() compare GPU and simulator.
 */
TEST_F(mme_tu104_sim_test, sanity)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   const uint32_t canary = 0xc0ffee01;
   mme_store_imm_addr(&b, data_addr, mme_imm(canary));

   const auto macro = mme_builder_finish_vec(&b);

   /* This macro consumes no parameters. */
   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
94 
/* Reads two parameters in one hand-assembled instruction, in swapped order
 * (ALU 0 takes LOAD1, ALU 1 takes LOAD0), while the same instruction's two
 * outputs emit LOAD0 and LOAD1 with the shadow scratch 12 and 35 methods.
 * The two registers and both scratch values are then stored for comparison.
 */
TEST_F(mme_tu104_sim_test, multi_param)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value from_load1 = mme_alloc_reg(&b);
   mme_value from_load0 = mme_alloc_reg(&b);

   mme_tu104_asm(&b, inst) {
      inst.alu[0].dst = mme_value_as_reg(from_load1);
      inst.alu[0].src[0] = MME_TU104_REG_LOAD1;
      inst.alu[1].dst = mme_value_as_reg(from_load0);
      inst.alu[1].src[0] = MME_TU104_REG_LOAD0;
      inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(12) >> 2);
      inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
      inst.out[0].emit = MME_TU104_OUT_OP_LOAD0;
      inst.imm[1] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(35) >> 2);
      inst.out[1].mthd = MME_TU104_OUT_OP_IMM1;
      inst.out[1].emit = MME_TU104_OUT_OP_LOAD1;
   }

   /* Read back what the instruction emitted to the scratch methods */
   mme_value scratch12 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(12));
   mme_value scratch35 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(35));

   mme_store_imm_addr(&b, data_addr + 0, from_load1);
   mme_store_imm_addr(&b, data_addr + 4, from_load0);
   mme_store_imm_addr(&b, data_addr + 8, scratch12);
   mme_store_imm_addr(&b, data_addr + 12, scratch35);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 2581, 3048 };
   test_macro(&b, macro, params);
}
132 
/* A parameter load placed in an instruction with pred_mode TTTT, sitting
 * between two ordinary loads.  The macro is run four times so that the
 * first two parameters cover every zero / non-zero combination.
 */
TEST_F(mme_tu104_sim_test, pred_param)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value guarded = mme_mov(&b, mme_imm(240));

   mme_tu104_asm(&b, inst) {
      inst.pred_mode = MME_TU104_PRED_TTTT;
      inst.alu[0].dst = mme_value_as_reg(guarded);
      inst.alu[0].src[0] = MME_TU104_REG_LOAD0;
   }

   mme_value last = mme_load(&b);

   mme_store_imm_addr(&b, data_addr + 0, first);
   mme_store_imm_addr(&b, data_addr + 4, guarded);
   mme_store_imm_addr(&b, data_addr + 8, last);

   const auto macro = mme_builder_finish_vec(&b);

   for (uint32_t combo = 0; combo < 4; combo++) {
      reset_push();

      /* Bits 0 and 1 of `combo` decide whether params 0 and 1 are zero */
      const std::vector<uint32_t> params = {
         (combo & 1) * 2043,
         (combo & 2) * 523,
         2581,
         3048,
      };

      test_macro(&b, macro, params);
   }
}
167 
/* Emits imm[0] through output 0 using MME_TU104_OUT_OP_IMM0, once for each
 * of the first two dwords of the data buffer.
 */
TEST_F(mme_tu104_sim_test, out_imm0)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* One NVC597_SET_REPORT_SEMAPHORE_A sequence: address high, address low,
    * the immediate under test, then the constant 0x10000000 word.
    */
   const auto emit_report = [&](uint32_t offset, uint16_t payload) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[0] = payload;
         inst.out[0].emit = MME_TU104_OUT_OP_IMM0;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_report(0, 0x1234);
   emit_report(4, 0x8765);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
196 
/* Emits imm[1] through output 0 using MME_TU104_OUT_OP_IMM1, once for each
 * of the first two dwords of the data buffer.
 */
TEST_F(mme_tu104_sim_test, out_imm1)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* One NVC597_SET_REPORT_SEMAPHORE_A sequence: address high, address low,
    * the immediate under test, then the constant 0x10000000 word.
    */
   const auto emit_report = [&](uint32_t offset, uint16_t payload) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[1] = payload;
         inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_report(0, 0x1234);
   emit_report(4, 0x8765);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
225 
/* Emits imm[0] using MME_TU104_OUT_OP_IMMHIGH0.  The first report goes
 * through output 0 and the second through output 1.
 */
TEST_F(mme_tu104_sim_test, out_immhigh0)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* One NVC597_SET_REPORT_SEMAPHORE_A sequence: address high, address low,
    * the immediate under test on output `slot`, then the constant
    * 0x10000000 word.
    */
   const auto emit_report = [&](uint32_t offset, uint16_t payload,
                                unsigned slot) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[0] = payload;
         inst.out[slot].emit = MME_TU104_OUT_OP_IMMHIGH0;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_report(0, 0x1234, 0);
   emit_report(4, 0x8765, 1);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
254 
/* Emits imm[1] using MME_TU104_OUT_OP_IMMHIGH1.  The first report goes
 * through output 0 and the second through output 1.
 */
TEST_F(mme_tu104_sim_test, out_immhigh1)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* One NVC597_SET_REPORT_SEMAPHORE_A sequence: address high, address low,
    * the immediate under test on output `slot`, then the constant
    * 0x10000000 word.
    */
   const auto emit_report = [&](uint32_t offset, uint16_t payload,
                                unsigned slot) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[1] = payload;
         inst.out[slot].emit = MME_TU104_OUT_OP_IMMHIGH1;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_report(0, 0x1234, 0);
   emit_report(4, 0x8765, 1);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
283 
/* Emits the imm[0] / imm[1] pair using MME_TU104_OUT_OP_IMM32, first
 * through output 0 and then through output 1, with the same immediates.
 */
TEST_F(mme_tu104_sim_test, out_imm32)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* One NVC597_SET_REPORT_SEMAPHORE_A sequence: address high, address low,
    * the IMM32 emit on output `slot`, then the constant 0x10000000 word.
    */
   const auto emit_report = [&](uint32_t offset, unsigned slot) {
      mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
      mme_emit(&b, mme_imm(high32(data_addr + offset)));
      mme_emit(&b, mme_imm(low32(data_addr + offset)));
      mme_tu104_asm(&b, inst) {
         inst.imm[0] = 0x1234;
         inst.imm[1] = 0x7654;
         inst.out[slot].emit = MME_TU104_OUT_OP_IMM32;
      }
      mme_emit(&b, mme_imm(0x10000000));
   };

   emit_report(0, 0);
   emit_report(4, 1);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
314 
/* Feeds a 32-bit canary to ALU 0 through MME_TU104_REG_IMM32, with the low
 * half placed in imm[0] and the high half in imm[1], and stores the result.
 */
TEST_F(mme_tu104_sim_test, reg_imm32)
{
   const uint32_t canary = 0xc0ffee01;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value v = mme_alloc_reg(&b);

   mme_tu104_asm(&b, i) {
      i.alu[0].dst = mme_value_as_reg(v);
      i.alu[0].op = MME_TU104_ALU_OP_ADD;
      /* This statement used to end in ',' which fused it with the next
       * assignment via the comma operator; it is a separate statement now.
       */
      i.alu[0].src[0] = MME_TU104_REG_IMM32;
      i.imm[0] = (uint16_t)canary;
      i.imm[1] = (uint16_t)(canary >> 16);
   }

   mme_store_imm_addr(&b, data_addr, v);

   auto macro = mme_builder_finish_vec(&b);

   std::vector<uint32_t> params;
   test_macro(&b, macro, params);
}
339 
/* For every predicate mode in `preds`, builds a macro whose predicated
 * instruction overwrites two pre-initialised registers from immediates,
 * then runs it with the predicate register loaded as zero and as non-zero.
 */
TEST_F(mme_tu104_sim_test, pred_alu)
{
   static const enum mme_tu104_pred preds[] = {
      MME_TU104_PRED_UUUU,
      MME_TU104_PRED_TTTT,
      MME_TU104_PRED_FFFF,
      MME_TU104_PRED_TTUU,
      MME_TU104_PRED_FFUU,
      MME_TU104_PRED_TFUU,
      MME_TU104_PRED_TUUU,
      MME_TU104_PRED_FUUU,
      MME_TU104_PRED_UUTT,
      MME_TU104_PRED_UUTF,
      MME_TU104_PRED_UUTU,
      MME_TU104_PRED_UUFT,
      MME_TU104_PRED_UUFF,
      MME_TU104_PRED_UUFU,
      MME_TU104_PRED_UUUT,
      MME_TU104_PRED_UUUF,
   };

   for (uint32_t mode = 0; mode < ARRAY_SIZE(preds); mode++) {
      mme_builder b;
      mme_builder_init(&b, devinfo);

      /* Every mode gets its own value range and its own output slot */
      const uint32_t base = mode * 100;

      mme_value pred = mme_load(&b);
      mme_value v0 = mme_mov(&b, mme_imm(base + 13));
      mme_value v1 = mme_mov(&b, mme_imm(base + 62));

      mme_tu104_asm(&b, inst) {
         inst.pred = mme_value_as_reg(pred);
         inst.pred_mode = preds[mode];
         inst.alu[0].dst = mme_value_as_reg(v0);
         inst.alu[0].src[0] = MME_TU104_REG_IMM;
         inst.imm[0] = base + 25;
         inst.alu[1].dst = mme_value_as_reg(v1);
         inst.alu[1].src[0] = MME_TU104_REG_IMM;
         inst.imm[1] = base + 73;
      }

      const auto out_addr = data_addr + mode * 8;
      mme_store_imm_addr(&b, out_addr + 0, v0);
      mme_store_imm_addr(&b, out_addr + 4, v1);

      const auto macro = mme_builder_finish_vec(&b);

      /* Predicate parameter: 0 on the first run, 25894 on the second */
      for (uint32_t run = 0; run < 2; run++) {
         reset_push();

         const std::vector<uint32_t> params = { run * 25894 };
         test_macro(&b, macro, params);
      }
   }
}
395 
TEST_F(mme_tu104_sim_test,pred_out)396 TEST_F(mme_tu104_sim_test, pred_out)
397 {
398    static const enum mme_tu104_pred preds[] = {
399       MME_TU104_PRED_UUUU,
400       MME_TU104_PRED_TTTT,
401       MME_TU104_PRED_FFFF,
402       MME_TU104_PRED_TTUU,
403       MME_TU104_PRED_FFUU,
404       MME_TU104_PRED_TFUU,
405       MME_TU104_PRED_TUUU,
406       MME_TU104_PRED_FUUU,
407       MME_TU104_PRED_UUTT,
408       MME_TU104_PRED_UUTF,
409       MME_TU104_PRED_UUTU,
410       MME_TU104_PRED_UUFT,
411       MME_TU104_PRED_UUFF,
412       MME_TU104_PRED_UUFU,
413       MME_TU104_PRED_UUUT,
414       MME_TU104_PRED_UUUF,
415    };
416 
417    for (uint32_t i = 0; i < ARRAY_SIZE(preds); i++) {
418       mme_builder b;
419       mme_builder_init(&b, devinfo);
420 
421       mme_value pred = mme_load(&b);
422 
423       mme_tu104_asm(&b, inst) {
424          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0) >> 2);
425          inst.imm[1] = i * 100 + 25;
426          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
427          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
428       }
429 
430       mme_tu104_asm(&b, inst) {
431          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1) >> 2);
432          inst.imm[1] = i * 100 + 75;
433          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
434          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
435       }
436 
437       mme_tu104_asm(&b, inst) {
438          inst.pred = mme_value_as_reg(pred);
439          inst.pred_mode = preds[i];
440          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0) >> 2);
441          inst.imm[1] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1) >> 2);
442          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
443          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
444          inst.out[1].mthd = MME_TU104_OUT_OP_IMM1;
445          inst.out[1].emit = MME_TU104_OUT_OP_IMM0;
446       }
447 
448       mme_value v0 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0));
449       mme_value v1 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1));
450 
451       mme_store_imm_addr(&b, data_addr + i * 8 + 0, v0);
452       mme_store_imm_addr(&b, data_addr + i * 8 + 4, v1);
453 
454       auto macro = mme_builder_finish_vec(&b);
455 
456       for (uint32_t j = 0; j < 2; j++) {
457          reset_push();
458 
459          std::vector<uint32_t> params;
460          params.push_back(j * 25894);
461 
462          test_macro(&b, macro, params);
463       }
464    }
465 }
466 
/* Adds two loaded parameters with mme_add() and stores the result. */
TEST_F(mme_tu104_sim_test, add)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_value total = mme_add(&b, lhs, rhs);
   mme_store_imm_addr(&b, data_addr, total);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
485 
/* mme_add() with an immediate operand on either side, and with mme_zero()
 * as the other operand.  The immediates are 1, 0xffffffff and 0xffff8000;
 * the macro is run against several values of the loaded parameter.
 */
TEST_F(mme_tu104_sim_test, add_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Initialisers in a braced list are evaluated in order, so the adds are
    * built in exactly the sequence they are listed.
    */
   const mme_value sums[] = {
      mme_add(&b, x, mme_imm(0x00000001)),
      mme_add(&b, x, mme_imm(0xffffffff)),
      mme_add(&b, x, mme_imm(0xffff8000)),
      mme_add(&b, mme_imm(0x00000001), x),
      mme_add(&b, mme_imm(0xffffffff), x),
      mme_add(&b, mme_imm(0xffff8000), x),
      mme_add(&b, mme_zero(), mme_imm(0x00000001)),
      mme_add(&b, mme_zero(), mme_imm(0xffffffff)),
      mme_add(&b, mme_zero(), mme_imm(0xffff8000)),
   };

   /* Result k lands in dword k of the data buffer */
   for (uint32_t k = 0; k < ARRAY_SIZE(sums); k++)
      mme_store_imm_addr(&b, data_addr + k * 4, sums[k]);

   const auto macro = mme_builder_finish_vec(&b);

   const uint32_t vals[] = {
      0x0000ffff,
      0x00008000,
      0x0001ffff,
      0xffffffff,
   };

   for (const uint32_t val : vals) {
      reset_push();

      const std::vector<uint32_t> params = { val };
      test_macro(&b, macro, params);
   }
}
531 
/* 64-bit addition through mme_add64().  Both operands are loaded as two
 * 32-bit parameters each; the first parameter of each pair has bit 31 set,
 * so adding those two halves overflows 32 bits.
 */
TEST_F(mme_tu104_sim_test, addc)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Four loads, in parameter order: x pair first, then y pair */
   mme_value x_first = mme_load(&b);
   mme_value x_second = mme_load(&b);
   mme_value y_first = mme_load(&b);
   mme_value y_second = mme_load(&b);

   struct mme_value64 x = { x_first, x_second };
   struct mme_value64 y = { y_first, y_second };

   struct mme_value64 total = mme_add64(&b, x, y);

   mme_store_imm_addr(&b, data_addr + 0, total.lo);
   mme_store_imm_addr(&b, data_addr + 4, total.hi);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = {
      0x80008650, 0x596,
      0x8000a8f6, 0x836,
   };
   test_macro(&b, macro, params);
}
555 
TEST_F(mme_tu104_sim_test,addc_imm)556 TEST_F(mme_tu104_sim_test, addc_imm)
557 {
558    mme_builder b;
559    mme_builder_init(&b, devinfo);
560 
561    mme_value x_lo = mme_load(&b);
562    mme_value x_hi = mme_load(&b);
563 
564    mme_value v1_lo = mme_alloc_reg(&b);
565    mme_value v1_hi = mme_alloc_reg(&b);
566    mme_tu104_asm(&b, i) {
567       i.alu[0].dst = mme_value_as_reg(v1_lo);
568       i.alu[0].op = MME_TU104_ALU_OP_ADD;
569       i.alu[0].src[0] = mme_value_as_reg(x_lo);
570       i.alu[0].src[1] = MME_TU104_REG_IMM;
571       i.imm[0] = 0x0001;
572       i.alu[1].dst = mme_value_as_reg(v1_hi);
573       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
574       i.alu[1].src[0] = mme_value_as_reg(x_hi);
575       i.alu[1].src[1] = MME_TU104_REG_IMM;
576       i.imm[1] = 0x0000;
577    }
578 
579    mme_value v2_lo = mme_alloc_reg(&b);
580    mme_value v2_hi = mme_alloc_reg(&b);
581    mme_tu104_asm(&b, i) {
582       i.alu[0].dst = mme_value_as_reg(v2_lo);
583       i.alu[0].op = MME_TU104_ALU_OP_ADD;
584       i.alu[0].src[0] = mme_value_as_reg(x_lo);
585       i.alu[0].src[1] = MME_TU104_REG_IMM;
586       i.imm[0] = 0x0000;
587       i.alu[1].dst = mme_value_as_reg(v2_hi);
588       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
589       i.alu[1].src[0] = mme_value_as_reg(x_hi);
590       i.alu[1].src[1] = MME_TU104_REG_IMM;
591       i.imm[1] = 0x0001;
592    }
593 
594    mme_value v3_lo = mme_alloc_reg(&b);
595    mme_value v3_hi = mme_alloc_reg(&b);
596    mme_tu104_asm(&b, i) {
597       i.alu[0].dst = mme_value_as_reg(v3_lo);
598       i.alu[0].op = MME_TU104_ALU_OP_ADD;
599       i.alu[0].src[0] = mme_value_as_reg(x_lo);
600       i.alu[0].src[1] = MME_TU104_REG_IMM;
601       i.imm[0] = 0x0000;
602       i.alu[1].dst = mme_value_as_reg(v3_hi);
603       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
604       i.alu[1].src[0] = mme_value_as_reg(x_hi);
605       i.alu[1].src[1] = MME_TU104_REG_IMM;
606       i.imm[1] = 0xffff;
607    }
608 
609    mme_value v4_lo = mme_alloc_reg(&b);
610    mme_value v4_hi = mme_alloc_reg(&b);
611    mme_tu104_asm(&b, i) {
612       i.alu[0].dst = mme_value_as_reg(v4_lo);
613       i.alu[0].op = MME_TU104_ALU_OP_ADD;
614       i.alu[0].src[0] = mme_value_as_reg(x_lo);
615       i.alu[0].src[1] = MME_TU104_REG_IMM;
616       i.imm[0] = 0x0000;
617       i.alu[1].dst = mme_value_as_reg(v4_hi);
618       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
619       i.alu[1].src[0] = mme_value_as_reg(x_hi);
620       i.alu[1].src[1] = MME_TU104_REG_IMM;
621       i.imm[1] = 0x8000;
622    }
623 
624    mme_store_imm_addr(&b, data_addr + 0,  v1_lo);
625    mme_store_imm_addr(&b, data_addr + 4,  v1_hi);
626    mme_store_imm_addr(&b, data_addr + 8,  v2_lo);
627    mme_store_imm_addr(&b, data_addr + 12, v2_hi);
628    mme_store_imm_addr(&b, data_addr + 16, v3_lo);
629    mme_store_imm_addr(&b, data_addr + 20, v3_hi);
630    mme_store_imm_addr(&b, data_addr + 24, v4_lo);
631    mme_store_imm_addr(&b, data_addr + 28, v4_hi);
632 
633    auto macro = mme_builder_finish_vec(&b);
634 
635    uint64_t vals[] = {
636       0x0000ffffffffffffull,
637       0x0000ffffffff8000ull,
638       0x0000ffff00000000ull,
639       0x0000800000000000ull,
640       0x00008000ffffffffull,
641       0x0001ffff00000000ull,
642       0xffffffff00000000ull,
643       0xffffffffffffffffull,
644    };
645 
646    for (uint32_t i = 0; i < ARRAY_SIZE(vals); i++) {
647       reset_push();
648 
649       std::vector<uint32_t> params;
650       params.push_back(low32(vals[i]));
651       params.push_back(high32(vals[i]));
652 
653       test_macro(&b, macro, params);
654    }
655 }
656 
/* Subtracts the second loaded parameter from the first with mme_sub().
 * With the inputs used here (25 and 138) the second operand is the larger.
 */
TEST_F(mme_tu104_sim_test, sub)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_value difference = mme_sub(&b, lhs, rhs);
   mme_store_imm_addr(&b, data_addr, difference);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
675 
/* 64-bit subtraction through mme_sub64(), with each operand loaded as two
 * 32-bit parameters.
 */
TEST_F(mme_tu104_sim_test, subb)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Four loads, in parameter order: x pair first, then y pair */
   mme_value x_first = mme_load(&b);
   mme_value x_second = mme_load(&b);
   mme_value y_first = mme_load(&b);
   mme_value y_second = mme_load(&b);

   struct mme_value64 x = { x_first, x_second };
   struct mme_value64 y = { y_first, y_second };

   struct mme_value64 difference = mme_sub64(&b, x, y);

   mme_store_imm_addr(&b, data_addr + 0, difference.lo);
   mme_store_imm_addr(&b, data_addr + 4, difference.hi);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = {
      0x80008650, 0x596,
      0x8000a8f6, 0x836,
   };
   test_macro(&b, macro, params);
}
699 
/* Multiplies two loaded parameters with mme_mul() and stores the result. */
TEST_F(mme_tu104_sim_test, mul)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_value product = mme_mul(&b, lhs, rhs);
   mme_store_imm_addr(&b, data_addr, product);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
718 
/* mme_mul() with an immediate on either side.  The immediates are 1,
 * 0xffffffff and 0xffff8000; the loaded operand takes small positive and
 * negative values.
 */
TEST_F(mme_tu104_sim_test, mul_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Braced-list initialisers run in order, so the multiplies are built in
    * the order listed.
    */
   const mme_value products[] = {
      mme_mul(&b, x, mme_imm(0x00000001)),
      mme_mul(&b, x, mme_imm(0xffffffff)),
      mme_mul(&b, x, mme_imm(0xffff8000)),
      mme_mul(&b, mme_imm(0x00000001), x),
      mme_mul(&b, mme_imm(0xffffffff), x),
      mme_mul(&b, mme_imm(0xffff8000), x),
   };

   /* Result k lands in dword k of the data buffer */
   for (uint32_t k = 0; k < ARRAY_SIZE(products); k++)
      mme_store_imm_addr(&b, data_addr + k * 4, products[k]);

   const auto macro = mme_builder_finish_vec(&b);

   const int32_t vals[] = { 1, -5, -1, 5 };

   for (const int32_t val : vals) {
      reset_push();

      /* Negative inputs are passed as their 32-bit unsigned conversion */
      const std::vector<uint32_t> params = { static_cast<uint32_t>(val) };
      test_macro(&b, macro, params);
   }
}
753 
/* 32x32 -> 64 multiply through mme_imul_32x32_64(); both halves of the
 * product are stored.
 */
TEST_F(mme_tu104_sim_test, mul_mulh)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);

   struct mme_value64 product = mme_imul_32x32_64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, product.lo);
   mme_store_imm_addr(&b, data_addr + 4, product.hi);

   const auto macro = mme_builder_finish_vec(&b);

   /* The first operand has bit 31 set */
   const std::vector<uint32_t> params = { 0x80008650, 0x596 };
   test_macro(&b, macro, params);
}
775 
776 static inline struct mme_value
mme_mulu(struct mme_builder * b,struct mme_value x,struct mme_value y)777 mme_mulu(struct mme_builder *b, struct mme_value x, struct mme_value y)
778 {
779    return mme_alu(b, MME_ALU_OP_MULU, x, y);
780 }
781 
/* Same shape as the mul_imm test, but using the MULU op through the local
 * mme_mulu() helper.
 */
TEST_F(mme_tu104_sim_test, mulu_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Braced-list initialisers run in order, so the multiplies are built in
    * the order listed.
    */
   const mme_value products[] = {
      mme_mulu(&b, x, mme_imm(0x00000001)),
      mme_mulu(&b, x, mme_imm(0xffffffff)),
      mme_mulu(&b, x, mme_imm(0xffff8000)),
      mme_mulu(&b, mme_imm(0x00000001), x),
      mme_mulu(&b, mme_imm(0xffffffff), x),
      mme_mulu(&b, mme_imm(0xffff8000), x),
   };

   /* Result k lands in dword k of the data buffer */
   for (uint32_t k = 0; k < ARRAY_SIZE(products); k++)
      mme_store_imm_addr(&b, data_addr + k * 4, products[k]);

   const auto macro = mme_builder_finish_vec(&b);

   const int32_t vals[] = { 1, -5, -1, 5 };

   for (const int32_t val : vals) {
      reset_push();

      /* Negative inputs are passed as their 32-bit unsigned conversion */
      const std::vector<uint32_t> params = { static_cast<uint32_t>(val) };
      test_macro(&b, macro, params);
   }
}
816 
/* 32x32 -> 64 multiply through mme_umul_32x32_64(); both halves of the
 * product are stored.
 */
TEST_F(mme_tu104_sim_test, mulu_mulh)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);

   struct mme_value64 product = mme_umul_32x32_64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, product.lo);
   mme_store_imm_addr(&b, data_addr + 4, product.hi);

   const auto macro = mme_builder_finish_vec(&b);

   /* The first operand has bit 31 set */
   const std::vector<uint32_t> params = { 0x80008650, 0x596 };
   test_macro(&b, macro, params);
}
838 
/* Applies mme_clz() to a single loaded parameter and stores the result. */
TEST_F(mme_tu104_sim_test, clz)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value input = mme_load(&b);
   mme_value bits = mme_clz(&b, input);
   mme_store_imm_addr(&b, data_addr, bits);

   const auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 0x00406fe0 };
   test_macro(&b, macro, params);
}
854 
855 #define SHIFT_TEST(op)                                               \
856 TEST_F(mme_tu104_sim_test, op)                                       \
857 {                                                                    \
858    mme_builder b;                                                    \
859    mme_builder_init(&b, devinfo);                                 \
860                                                                      \
861    mme_value val = mme_load(&b);                                     \
862    mme_value shift1 = mme_load(&b);                                  \
863    mme_value shift2 = mme_load(&b);                                  \
864    mme_store_imm_addr(&b, data_addr + 0, mme_##op(&b, val, shift1)); \
865    mme_store_imm_addr(&b, data_addr + 4, mme_##op(&b, val, shift2)); \
866                                                                      \
867    auto macro = mme_builder_finish_vec(&b);                          \
868                                                                      \
869    std::vector<uint32_t> params;                                     \
870    params.push_back(0x0c406fe0);                                     \
871    params.push_back(5);                                              \
872    params.push_back(51);                                             \
873                                                                      \
874    test_macro(&b, macro, params);                                    \
875 }
876 
877 SHIFT_TEST(sll)
SHIFT_TEST(srl)878 SHIFT_TEST(srl)
879 SHIFT_TEST(sra)
880 
881 #undef SHIFT_TEST
882 
883 TEST_F(mme_tu104_sim_test, bfe)
884 {
885    const uint32_t canary = 0xc0ffee01;
886 
887    mme_builder b;
888    mme_builder_init(&b, devinfo);
889 
890    mme_value val = mme_load(&b);
891    mme_value pos = mme_load(&b);
892 
893    mme_store_imm_addr(&b, data_addr + 0, mme_bfe(&b, val, pos, 1), true);
894    mme_store_imm_addr(&b, data_addr + 4, mme_bfe(&b, val, pos, 2), true);
895    mme_store_imm_addr(&b, data_addr + 8, mme_bfe(&b, val, pos, 5), true);
896 
897    auto macro = mme_builder_finish_vec(&b);
898 
899    for (unsigned i = 0; i < 31; i++) {
900       std::vector<uint32_t> params;
901       params.push_back(canary);
902       params.push_back(i);
903 
904       test_macro(&b, macro, params);
905 
906       ASSERT_EQ(data[0], (canary >> i) & 0x1);
907       ASSERT_EQ(data[1], (canary >> i) & 0x3);
908       ASSERT_EQ(data[2], (canary >> i) & 0x1f);
909    }
910 }
911 
912 #define BITOP_TEST(op)                                               \
913 TEST_F(mme_tu104_sim_test, op)                                       \
914 {                                                                    \
915    mme_builder b;                                                    \
916    mme_builder_init(&b, devinfo);                                 \
917                                                                      \
918    mme_value x = mme_load(&b);                                       \
919    mme_value y = mme_load(&b);                                       \
920    mme_value v1 = mme_##op(&b, x, y);                                \
921    mme_value v2 = mme_##op(&b, x, mme_imm(0xffff8000));              \
922    mme_value v3 = mme_##op(&b, x, mme_imm(0xffffffff));              \
923    mme_store_imm_addr(&b, data_addr + 0, v1);                        \
924    mme_store_imm_addr(&b, data_addr + 4, v2);                        \
925    mme_store_imm_addr(&b, data_addr + 8, v3);                        \
926                                                                      \
927    auto macro = mme_builder_finish_vec(&b);                          \
928                                                                      \
929    std::vector<uint32_t> params;                                     \
930    params.push_back(0x0c406fe0);                                     \
931    params.push_back(0x00fff0c0);                                     \
932                                                                      \
933    test_macro(&b, macro, params);                                    \
934 }
935 
936 BITOP_TEST(and)
BITOP_TEST(nand)937 BITOP_TEST(nand)
938 BITOP_TEST(or)
939 BITOP_TEST(xor)
940 
941 #undef BITOP_TEST
942 
943 TEST_F(mme_tu104_sim_test, merge)
944 {
945    mme_builder b;
946    mme_builder_init(&b, devinfo);
947 
948    mme_value x = mme_load(&b);
949    mme_value y = mme_load(&b);
950 
951    mme_value m1 = mme_merge(&b, x, y, 12, 12, 20);
952    mme_value m2 = mme_merge(&b, x, y, 12, 8,  20);
953    mme_value m3 = mme_merge(&b, x, y, 8,  12, 20);
954    mme_value m4 = mme_merge(&b, x, y, 12, 16, 8);
955    mme_value m5 = mme_merge(&b, x, y, 24, 12, 8);
956 
957    mme_store_imm_addr(&b, data_addr + 0,  m1);
958    mme_store_imm_addr(&b, data_addr + 4,  m2);
959    mme_store_imm_addr(&b, data_addr + 8,  m3);
960    mme_store_imm_addr(&b, data_addr + 12, m4);
961    mme_store_imm_addr(&b, data_addr + 16, m5);
962 
963    auto macro = mme_builder_finish_vec(&b);
964 
965    std::vector<uint32_t> params;
966    params.push_back(0x0c406fe0);
967    params.push_back(0x76543210u);
968 
969    test_macro(&b, macro, params);
970 }
971 
972 #define COMPARISON_TEST(op)                     \
973 TEST_F(mme_tu104_sim_test, op)                  \
974 {                                               \
975    mme_builder b;                               \
976    mme_builder_init(&b, devinfo);            \
977                                                 \
978    mme_value x = mme_load(&b);                  \
979    mme_value y = mme_load(&b);                  \
980    mme_value z = mme_load(&b);                  \
981    mme_value w = mme_load(&b);                  \
982                                                 \
983    mme_value v1 = mme_##op(&b, x, y);           \
984    mme_value v2 = mme_##op(&b, y, x);           \
985    mme_value v3 = mme_##op(&b, y, z);           \
986    mme_value v4 = mme_##op(&b, z, y);           \
987    mme_value v5 = mme_##op(&b, w, z);           \
988    mme_value v6 = mme_##op(&b, z, w);           \
989    mme_value v7 = mme_##op(&b, w, w);           \
990                                                 \
991    mme_store_imm_addr(&b, data_addr + 0,  v1);  \
992    mme_store_imm_addr(&b, data_addr + 4,  v2);  \
993    mme_store_imm_addr(&b, data_addr + 8,  v3);  \
994    mme_store_imm_addr(&b, data_addr + 12, v4);  \
995    mme_store_imm_addr(&b, data_addr + 16, v5);  \
996    mme_store_imm_addr(&b, data_addr + 20, v6);  \
997    mme_store_imm_addr(&b, data_addr + 24, v7);  \
998                                                 \
999    auto macro = mme_builder_finish_vec(&b);     \
1000                                                 \
1001    std::vector<uint32_t> params;                \
1002    params.push_back(-5);                        \
1003    params.push_back(-10);                       \
1004    params.push_back(5);                         \
1005    params.push_back(10);                        \
1006                                                 \
1007    test_macro(&b, macro, params);               \
1008 }
1009 
1010 COMPARISON_TEST(slt)
COMPARISON_TEST(sltu)1011 COMPARISON_TEST(sltu)
1012 COMPARISON_TEST(sle)
1013 COMPARISON_TEST(sleu)
1014 COMPARISON_TEST(seq)
1015 
1016 #undef COMPARISON_TEST
1017 
1018 static inline void
1019 mme_inc_whole_inst(mme_builder *b, mme_value val)
1020 {
1021    mme_tu104_asm(b, i) {
1022       i.alu[0].dst = mme_value_as_reg(val);
1023       i.alu[0].op = MME_TU104_ALU_OP_ADD;
1024       i.alu[0].src[0] = mme_value_as_reg(val);
1025       i.alu[0].src[1] = MME_TU104_REG_IMM;
1026       i.imm[0] = 1;
1027    }
1028 }
1029 
/* Checks mme_loop() for several trip counts.
 *
 * The macro loads a count, adds that count to an accumulator once per loop
 * iteration, then adds 1 to a second register after the loop.  For each
 * tested count n the data buffer must hold n, n * n and 1.
 */
TEST_F(mme_tu104_sim_test, loop)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value trips = mme_load(&b);

   mme_value sum = mme_mov(&b, mme_zero());
   mme_value after = mme_mov(&b, mme_zero());

   mme_loop(&b, trips) {
      mme_tu104_asm(&b, nop) { } /* noop */
      mme_add_to(&b, sum, sum, trips);
   }
   mme_add_to(&b, after, after, mme_imm(1));

   /* Three empty instructions follow the post-loop add */
   for (uint32_t k = 0; k < 3; k++) {
      mme_tu104_asm(&b, nop) { } /* noop */
   }

   mme_store_imm_addr(&b, data_addr + 0,  trips);
   mme_store_imm_addr(&b, data_addr + 4,  sum);
   mme_store_imm_addr(&b, data_addr + 8,  after);

   auto macro = mme_builder_finish_vec(&b);

   const uint32_t trip_counts[] = {0, 1, 5, 9};

   for (const uint32_t n : trip_counts) {
      reset_push();

      const std::vector<uint32_t> params = { n };

      test_macro(&b, macro, params);
      ASSERT_EQ(data[0], n);
      ASSERT_EQ(data[1], n * n);
      ASSERT_EQ(data[2], 1);
   }
}
1069 
/* Emits a JAL instruction with immediate (1 << 15) | 6 followed by ten
 * single-instruction increments of one register, then stores that register
 * and a second, never-incremented one.  After running the macro the first
 * stored dword is expected to be 5.
 */
TEST_F(mme_tu104_sim_test, jal)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value counter = mme_mov(&b, mme_zero());
   mme_value spare = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, jump) {
      jump.imm[0] = (1 << 15) | 6;
      jump.alu[0].op = MME_TU104_ALU_OP_JAL;
   }

   for (uint32_t n = 10; n > 0; n--)
      mme_inc_whole_inst(&b, counter);

   /* A second jump over increments of `spare` is currently disabled:
    *
    *   mme_tu104_asm(&b, i) {
    *      i.alu[0].op = MME_TU104_ALU_OP_JAL;
    *      i.imm[0] = 6;
    *   }
    *
    *   for (uint32_t j = 0; j < 10; j++)
    *      mme_inc_whole_inst(&b, y);
    */

   mme_store_imm_addr(&b, data_addr + 0, counter);
   mme_store_imm_addr(&b, data_addr + 4, spare);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
   ASSERT_EQ(data[0], 5);
}
1103 
/* Emits a BEQ instruction with immediate (1 << 15) | 6 ahead of ten
 * single-instruction increments, one per zero-initialised register, and
 * then stores all ten registers.  The macro is run through test_macro()
 * without parameters; this test adds no assertions of its own.
 */
TEST_F(mme_tu104_sim_test, bxx_fwd)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[10];
   for (mme_value &reg : regs)
      reg = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, branch) {
      branch.imm[0] = (1 << 15) | 6;
      branch.alu[0].op = MME_TU104_ALU_OP_BEQ;
   }

   for (const mme_value &reg : regs)
      mme_inc_whole_inst(&b, reg);

   for (uint32_t k = 0; k < ARRAY_SIZE(regs); k++)
      mme_store_imm_addr(&b, data_addr + k * 4, regs[k]);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
1129 
/* Combines two JAL instructions with a BEQ whose immediate encodes -8 in
 * its low 13 bits, interleaved with single-instruction increments of
 * fifteen zero-initialised registers.  After the run the first three stored
 * dwords must be 0 and the remaining twelve must be 1.
 */
TEST_F(mme_tu104_sim_test, bxx_bwd)
{
   const uint32_t num_regs = 15;
   const uint32_t first_group = 10;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[num_regs];
   for (uint32_t k = 0; k < num_regs; k++)
      regs[k] = mme_mov(&b, mme_zero());

   mme_tu104_asm(&b, jump) {
      jump.imm[0] = (1 << 15) | 12;
      jump.alu[0].op = MME_TU104_ALU_OP_JAL;
   }

   for (uint32_t k = 0; k < first_group; k++)
      mme_inc_whole_inst(&b, regs[k]);

   mme_tu104_asm(&b, jump) {
      jump.imm[0] = (1 << 15) | 2;
      jump.alu[0].op = MME_TU104_ALU_OP_JAL;
   }

   mme_tu104_asm(&b, branch) {
      branch.imm[0] = (1 << 15) | ((-8) & 0x1fff);
      branch.alu[0].op = MME_TU104_ALU_OP_BEQ;
   }

   for (uint32_t k = first_group; k < num_regs; k++)
      mme_inc_whole_inst(&b, regs[k]);

   for (uint32_t k = 0; k < num_regs; k++)
      mme_store_imm_addr(&b, data_addr + k * 4, regs[k]);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);

   uint32_t k = 0;
   for (; k < 3; k++)
      ASSERT_EQ(data[k], 0);
   for (; k < num_regs; k++)
      ASSERT_EQ(data[k], 1);
}
1172 
/* Zeroes the first ten dwords of the data buffer from the macro, emits a
 * BEQ with immediate (1 << 15) | 0x1000, and follows it with increments
 * and stores of ten registers.  The test expects none of those later
 * stores to land: all ten dwords must still read 0.
 */
TEST_F(mme_tu104_sim_test, bxx_exit)
{
   const uint32_t num_regs = 10;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[num_regs];
   for (uint32_t k = 0; k < num_regs; k++)
      regs[k] = mme_mov(&b, mme_zero());

   for (uint32_t k = 0; k < num_regs; k++)
      mme_store_imm_addr(&b, data_addr + k * 4, mme_imm(0));

   mme_tu104_asm(&b, branch) {
      branch.imm[0] = (1 << 15) | 0x1000;
      branch.alu[0].op = MME_TU104_ALU_OP_BEQ;
   }

   /* those writes won't be visible */
   for (uint32_t k = 0; k < num_regs; k++)
      mme_inc_whole_inst(&b, regs[k]);

   for (uint32_t k = 0; k < num_regs; k++)
      mme_store_imm_addr(&b, data_addr + k * 4, regs[k]);

   const std::vector<uint32_t> no_params;

   auto macro = mme_builder_finish_vec(&b);
   test_macro(&b, macro, no_params);

   for (uint32_t k = 0; k < num_regs; k++)
      ASSERT_EQ(data[k], 0);
}
1206 
/* Zeroes the first ten dwords of the data buffer from the macro, calls
 * mme_exit(), and then emits moves and stores of the values 0..9.  The test
 * expects the stores after mme_exit() to have no effect: all ten dwords
 * must read 0.
 */
TEST_F(mme_tu104_sim_test, mme_exit)
{
   const uint32_t num_regs = 10;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[num_regs];
   for (uint32_t k = 0; k < num_regs; k++)
      regs[k] = mme_mov(&b, mme_zero());

   for (uint32_t k = 0; k < num_regs; k++)
      mme_store_imm_addr(&b, data_addr + k * 4, mme_imm(0));

   /* abort */
   mme_exit(&b);

   /* those writes won't be visible */
   for (uint32_t k = 0; k < num_regs; k++)
      regs[k] = mme_mov(&b, mme_imm(k));

   for (uint32_t k = 0; k < num_regs; k++)
      mme_store_imm_addr(&b, data_addr + k * 4, regs[k]);

   const std::vector<uint32_t> no_params;

   auto macro = mme_builder_finish_vec(&b);
   test_macro(&b, macro, no_params);

   for (uint32_t k = 0; k < num_regs; k++)
      ASSERT_EQ(data[k], 0);
}
1239 
/* Exercises mme_exit_if().
 *
 * After zeroing ten dwords, the macro first issues an exit conditioned on
 * "0 ieq 1", then loads the values 0..9 into registers.  Each store is
 * preceded by an exit conditioned on "5 ile value".  The test expects
 * dwords 0..4 to hold their own index and dwords 5..9 to remain 0.
 */
TEST_F(mme_tu104_sim_test, mme_exit_if)
{
   const uint32_t num_regs = 10;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[num_regs];
   for (uint32_t k = 0; k < num_regs; k++)
      regs[k] = mme_mov(&b, mme_zero());

   for (uint32_t k = 0; k < num_regs; k++)
      mme_store_imm_addr(&b, data_addr + k * 4, mme_imm(0));

   /* shouldn't do anything */
   mme_exit_if(&b, ieq, mme_zero(), mme_imm(1));

   for (uint32_t k = 0; k < num_regs; k++)
      regs[k] = mme_mov(&b, mme_imm(k));

   for (uint32_t k = 0; k < num_regs; k++) {
      /* abort on reaching 5 */
      mme_exit_if(&b, ile, mme_imm(5), regs[k]);
      mme_store_imm_addr(&b, data_addr + k * 4, regs[k]);
   }

   const std::vector<uint32_t> no_params;

   auto macro = mme_builder_finish_vec(&b);
   test_macro(&b, macro, no_params);

   for (uint32_t k = 0; k < num_regs; k++) {
      const uint32_t expected = (k < 5) ? k : 0;
      ASSERT_EQ(data[k], expected);
   }
}
1273 
/* Host-side reference predicates, one per comparison suffix used by the
 * IF_TEST and WHILE_TEST macros (which call them as c_<op>).
 *
 * The helpers taking int32_t compare their arguments as signed values; the
 * ones taking uint32_t compare them as unsigned values.  Callers passing
 * uint32_t to a signed helper rely on the implicit conversion to int32_t.
 */
static bool c_ilt(int32_t x, int32_t y) { return x < y; }
static bool c_ult(uint32_t x, uint32_t y) { return x < y; }
static bool c_ile(int32_t x, int32_t y) { return x <= y; }
static bool c_ule(uint32_t x, uint32_t y) { return x <= y; }
static bool c_ieq(int32_t x, int32_t y) { return x == y; }
static bool c_ige(int32_t x, int32_t y) { return x >= y; }
static bool c_uge(uint32_t x, uint32_t y) { return x >= y; }
static bool c_igt(int32_t x, int32_t y) { return x > y; }
static bool c_ugt(uint32_t x, uint32_t y) { return x > y; }
static bool c_ine(int32_t x, int32_t y) { return x != y; }
1284 
1285 #define IF_TEST(op)                                                  \
1286 TEST_F(mme_tu104_sim_test, if_##op)                                  \
1287 {                                                                    \
1288    mme_builder b;                                                    \
1289    mme_builder_init(&b, devinfo);                                 \
1290                                                                      \
1291    mme_value x = mme_load(&b);                                       \
1292    mme_value y = mme_load(&b);                                       \
1293    mme_value i = mme_mov(&b, mme_zero());                            \
1294                                                                      \
1295    mme_start_if_##op(&b, x, y);                                      \
1296    {                                                                 \
1297       mme_add_to(&b, i, i, mme_imm(1));                              \
1298       mme_add_to(&b, i, i, mme_imm(1));                              \
1299    }                                                                 \
1300    mme_end_if(&b);                                                   \
1301    mme_add_to(&b, i, i, mme_imm(1));                                 \
1302    mme_add_to(&b, i, i, mme_imm(1));                                 \
1303    mme_add_to(&b, i, i, mme_imm(1));                                 \
1304                                                                      \
1305    mme_store_imm_addr(&b, data_addr + 0, i);                         \
1306                                                                      \
1307    auto macro = mme_builder_finish_vec(&b);                          \
1308                                                                      \
1309    uint32_t vals[] = {23, 56, (uint32_t)-5, (uint32_t)-10, 56, 14};  \
1310                                                                      \
1311    for (uint32_t i = 0; i < ARRAY_SIZE(vals) - 1; i++) {             \
1312       reset_push();                                                  \
1313                                                                      \
1314       std::vector<uint32_t> params;                                  \
1315       params.push_back(vals[i + 0]);                                 \
1316       params.push_back(vals[i + 1]);                                 \
1317                                                                      \
1318       test_macro(&b, macro, params);                                 \
1319                                                                      \
1320       ASSERT_EQ(data[0], c_##op(params[0], params[1]) ? 5 : 3);      \
1321    }                                                                 \
1322 }
1323 
1324 IF_TEST(ilt)
IF_TEST(ult)1325 IF_TEST(ult)
1326 IF_TEST(ile)
1327 IF_TEST(ule)
1328 IF_TEST(ieq)
1329 IF_TEST(ige)
1330 IF_TEST(uge)
1331 IF_TEST(igt)
1332 IF_TEST(ugt)
1333 IF_TEST(ine)
1334 
1335 #undef IF_TEST
1336 
1337 #define WHILE_TEST(op, start, step, bound)            \
1338 TEST_F(mme_tu104_sim_test, while_##op)                \
1339 {                                                     \
1340    mme_builder b;                                     \
1341    mme_builder_init(&b, devinfo);                  \
1342                                                       \
1343    mme_value x = mme_mov(&b, mme_zero());             \
1344    mme_value y = mme_mov(&b, mme_zero());             \
1345    mme_value z = mme_mov(&b, mme_imm(start));         \
1346    mme_value w = mme_mov(&b, mme_zero());             \
1347    mme_value v = mme_mov(&b, mme_zero());             \
1348                                                       \
1349    for (uint32_t j = 0; j < 5; j++)                   \
1350       mme_inc_whole_inst(&b, x);                      \
1351                                                       \
1352    mme_while(&b, op, z, mme_imm(bound)) {             \
1353       for (uint32_t j = 0; j < 5; j++)                \
1354          mme_inc_whole_inst(&b, y);                   \
1355                                                       \
1356       mme_add_to(&b, z, z, mme_imm(step));            \
1357                                                       \
1358       for (uint32_t j = 0; j < 5; j++)                \
1359          mme_inc_whole_inst(&b, w);                   \
1360    }                                                  \
1361                                                       \
1362    for (uint32_t j = 0; j < 5; j++)                   \
1363       mme_inc_whole_inst(&b, v);                      \
1364                                                       \
1365    mme_store_imm_addr(&b, data_addr + 0, x);          \
1366    mme_store_imm_addr(&b, data_addr + 4, y);          \
1367    mme_store_imm_addr(&b, data_addr + 8, z);          \
1368    mme_store_imm_addr(&b, data_addr + 12, w);         \
1369    mme_store_imm_addr(&b, data_addr + 16, v);         \
1370                                                       \
1371    auto macro = mme_builder_finish_vec(&b);           \
1372                                                       \
1373    uint32_t end = (uint32_t)(start), count = 0;       \
1374    while (c_##op(end, (bound))) {                     \
1375       end += (uint32_t)(step);                        \
1376       count++;                                        \
1377    }                                                  \
1378                                                       \
1379    std::vector<uint32_t> params;                      \
1380    test_macro(&b, macro, params);                     \
1381    ASSERT_EQ(data[0], 5);                             \
1382    ASSERT_EQ(data[1], 5 * count);                     \
1383    ASSERT_EQ(data[2], end);                           \
1384    ASSERT_EQ(data[3], 5 * count);                     \
1385    ASSERT_EQ(data[4], 5);                             \
1386 }
1387 
1388 WHILE_TEST(ilt, 0, 1, 7)
1389 WHILE_TEST(ult, 0, 1, 7)
1390 WHILE_TEST(ile, -10, 2, 0)
1391 WHILE_TEST(ule, 0, 1, 7)
1392 WHILE_TEST(ieq, 0, 5, 0)
1393 WHILE_TEST(ige, 5, -1, -5)
1394 WHILE_TEST(uge, 15, -2, 2)
1395 WHILE_TEST(igt, 7, -3, -10)
1396 WHILE_TEST(ugt, 1604, -30, 1000)
1397 WHILE_TEST(ine, 0, 1, 7)
1398 
1399 #undef WHILE_TEST
1400 
1401 TEST_F(mme_tu104_sim_test, nested_while)
1402 {
1403    mme_builder b;
1404    mme_builder_init(&b, devinfo);
1405 
1406    mme_value n = mme_load(&b);
1407    mme_value m = mme_load(&b);
1408 
1409    mme_value count = mme_mov(&b, mme_zero());
1410 
1411    mme_value i = mme_mov(&b, mme_zero());
1412    mme_value j = mme_mov(&b, mme_imm(0xffff));
1413    mme_while(&b, ine, i, n) {
1414       mme_mov_to(&b, j, mme_zero());
1415       mme_while(&b, ine, j, m) {
1416          mme_add_to(&b, count, count, mme_imm(1));
1417          mme_add_to(&b, j, j, mme_imm(1));
1418       }
1419 
1420       mme_add_to(&b, i, i, mme_imm(1));
1421    }
1422 
1423    mme_store_imm_addr(&b, data_addr + 0, i);
1424    mme_store_imm_addr(&b, data_addr + 4, j);
1425    mme_store_imm_addr(&b, data_addr + 8, count);
1426 
1427    auto macro = mme_builder_finish_vec(&b);
1428 
1429    std::vector<uint32_t> params;
1430    params.push_back(3);
1431    params.push_back(5);
1432 
1433    test_macro(&b, macro, params);
1434    ASSERT_EQ(data[0], 3);
1435    ASSERT_EQ(data[1], 5);
1436    ASSERT_EQ(data[2], 15);
1437 }
1438 
#if 0
/* Compiled out.  This test is written against mme_alu(), mme_alu_imm(),
 * mme_branch() and mme_end(), and calls test_macro() with two arguments,
 * whereas the fixture's test_macro() takes a builder, a macro vector and a
 * parameter vector.
 */
TEST_F(mme_tu104_sim_test, do_ble)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_alu(&b, R5, ADD, LOAD0, ZERO);
   mme_alu(&b, R6, ADD, ZERO, ZERO);
   mme_alu(&b, R7, ADD, ZERO, ZERO);

   for (uint32_t k = 0; k < 4; k++)
      mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R6, ADD, R6, IMM, 1);
   mme_branch(&b, BLE, R6, R5, -3, 2);
   for (uint32_t k = 0; k < 2; k++)
      mme_alu_imm(&b, R7, ADD, R7, IMM, 1);

   mme_store_imm_addr(&b, data_addr + 0,  MME_TU104_REG_R7);

   mme_end(&b);

   const uint32_t trip_counts[] = {0, 1, 5, 9};

   for (const uint32_t n : trip_counts) {
      reset_push();

      const std::vector<uint32_t> params = { n };

      test_macro(&b, params);
   }
}
#endif
1474 
/* Writes two loaded parameters with mme_dwrite() at indices 5 and 8, reads
 * them back with mme_dread() in the opposite order, and stores the values
 * read to the first two dwords of the data buffer.  The macro is run
 * through test_macro() with the parameters -10 and 5.
 */
TEST_F(mme_tu104_sim_test, dread_dwrite)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_dwrite(&b, mme_imm(5), first);
   mme_dwrite(&b, mme_imm(8), second);

   mme_value second_back = mme_dread(&b, mme_imm(8));
   mme_value first_back = mme_dread(&b, mme_imm(5));

   mme_store_imm_addr(&b, data_addr + 0, second_back);
   mme_store_imm_addr(&b, data_addr + 4, first_back);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { (uint32_t)-10, 5 };

   test_macro(&b, macro, params);
}
1500 
/* Writes two canary values with mme_dwrite() at indices 5 and 8, then
 * issues MME_DMA_WRITE for 20 dwords with the data RAM address set to 3 and
 * the memory address set to the data buffer.
 *
 * This test pushes and submits directly instead of going through
 * test_macro().  Afterwards dwords 5 - 3 and 8 - 3 of the data buffer must
 * hold the canaries and every other dword in the first 20 must be 0.
 */
TEST_F(mme_tu104_sim_test, dwrite_dma)
{
   const uint32_t canary5 = 0xc0ffee01;
   const uint32_t canary8 = canary5 & 0x00ffff00;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_dwrite(&b, mme_imm(5), first);
   mme_dwrite(&b, mme_imm(8), second);

   auto macro = mme_builder_finish_vec(&b);

   push_macro(0, macro);

   P_1INC(p, NVC597, CALL_MME_MACRO(0));
   P_INLINE_DATA(p, canary5);
   P_INLINE_DATA(p, canary8);

   P_MTHD(p, NVC597, SET_MME_MEM_ADDRESS_A);
   P_NVC597_SET_MME_MEM_ADDRESS_A(p, high32(data_addr));
   P_NVC597_SET_MME_MEM_ADDRESS_B(p, low32(data_addr));
   /* Start 3 dwords into MME RAM */
   P_NVC597_SET_MME_DATA_RAM_ADDRESS(p, 3);
   P_IMMD(p, NVC597, MME_DMA_WRITE, 20);

   submit_push();

   for (uint32_t dw = 0; dw < 20; dw++) {
      /* Buffer indices are offset by the RAM start address of 3 */
      uint32_t expected = 0;
      if (dw == 5 - 3)
         expected = canary5;
      else if (dw == 8 - 3)
         expected = canary8;

      ASSERT_EQ(data[dw], expected);
   }
}
1542 
/* Walks the index range [0, MME_TU104_DRAM_COUNT) in chunks of 32.
 *
 * The macro takes a start index and a count.  It first mme_dwrite()s each
 * index in the range with the index itself as the value, then mme_dread()s
 * the same indices back and stores them to consecutive dwords of the data
 * buffer.  For every chunk the test pushes and submits directly and checks
 * that dword k of the buffer equals start + k.
 */
TEST_F(mme_tu104_sim_test, dram_limit)
{
   static const uint32_t chunk_size = 32;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value start = mme_load(&b);
   mme_value count = mme_load(&b);

   mme_value wr_idx = mme_mov(&b, start);
   mme_loop(&b, count) {
      mme_dwrite(&b, wr_idx, wr_idx);
      mme_add_to(&b, wr_idx, wr_idx, mme_imm(1));
   }

   mme_value rd_idx = mme_mov(&b, start);
   struct mme_value64 dst = mme_mov64(&b, mme_imm64(data_addr));

   mme_loop(&b, count) {
      mme_value readback = mme_dread(&b, rd_idx);
      mme_store(&b, dst, readback);
      mme_add_to(&b, rd_idx, rd_idx, mme_imm(1));
      mme_add64_to(&b, dst, dst, mme_imm64(4));
   }

   auto macro = mme_builder_finish_vec(&b);

   for (uint32_t base = 0; base < MME_TU104_DRAM_COUNT; base += chunk_size) {
      reset_push();

      push_macro(0, macro);

      P_1INC(p, NVC597, CALL_MME_MACRO(0));
      P_INLINE_DATA(p, base);
      P_INLINE_DATA(p, chunk_size);

      submit_push();

      for (uint32_t k = 0; k < chunk_size; k++)
         ASSERT_EQ(data[k], base + k);
   }
}
1586 
/* Has the macro itself emit SET_MME_DATA_RAM_ADDRESS (0), the memory
 * address of the data buffer, and MME_DMA_READ_FIFOED (2).  After a load
 * barrier it loads two values and stores them 256 bytes into the data
 * buffer.
 *
 * Before running, the test pushes SET_MME_DATA_FIFO_CONFIG with
 * FIFO_SIZE_SIZE_4KB and fills the first 64 dwords of the buffer with
 * 1000 + index.  The macro is run through test_macro() with the single
 * parameter 7.
 */
TEST_F(mme_tu104_sim_test, dma_read_fifoed)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_mthd(&b, NVC597_SET_MME_DATA_RAM_ADDRESS);
   mme_emit(&b, mme_zero());

   mme_mthd(&b, NVC597_SET_MME_MEM_ADDRESS_A);
   mme_emit(&b, mme_imm(high32(data_addr)));
   mme_emit(&b, mme_imm(low32(data_addr)));

   mme_mthd(&b, NVC597_MME_DMA_READ_FIFOED);
   mme_emit(&b, mme_imm(2));

   mme_tu104_load_barrier(&b);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_store_imm_addr(&b, data_addr + 256 + 0, first);
   mme_store_imm_addr(&b, data_addr + 256 + 4, second);

   auto macro = mme_builder_finish_vec(&b);

   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);

   /* Seed the buffer with recognisable values */
   for (uint32_t dw = 0; dw < 64; dw++)
      data[dw] = 1000 + dw;

   const std::vector<uint32_t> params = { 7 };

   test_macro(&b, macro, params);
}
1622 
/* Walks the index range [0, MME_TU104_SCRATCH_COUNT) in chunks of 32.
 *
 * The macro takes a start index and a count.  It first emits each index as
 * the value of SET_MME_SHADOW_SCRATCH at that array index, then reads the
 * same entries back with mme_state_arr() and stores them to consecutive
 * dwords of the data buffer.  For every chunk the test pushes and submits
 * directly and checks that dword k of the buffer equals start + k.
 */
TEST_F(mme_tu104_sim_test, scratch_limit)
{
   static const uint32_t chunk_size = 32;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value start = mme_load(&b);
   mme_value count = mme_load(&b);

   mme_value wr_idx = mme_mov(&b, start);
   mme_loop(&b, count) {
      mme_mthd_arr(&b, NVC597_SET_MME_SHADOW_SCRATCH(0), wr_idx);
      mme_emit(&b, wr_idx);
      mme_add_to(&b, wr_idx, wr_idx, mme_imm(1));
   }

   mme_value rd_idx = mme_mov(&b, start);
   struct mme_value64 dst = mme_mov64(&b, mme_imm64(data_addr));

   mme_loop(&b, count) {
      mme_value readback =
         mme_state_arr(&b, NVC597_SET_MME_SHADOW_SCRATCH(0), rd_idx);
      mme_store(&b, dst, readback);
      mme_add_to(&b, rd_idx, rd_idx, mme_imm(1));
      mme_add64_to(&b, dst, dst, mme_imm64(4));
   }

   auto macro = mme_builder_finish_vec(&b);

   for (uint32_t base = 0; base < MME_TU104_SCRATCH_COUNT; base += chunk_size) {
      reset_push();

      push_macro(0, macro);

      P_1INC(p, NVC597, CALL_MME_MACRO(0));
      P_INLINE_DATA(p, base);
      P_INLINE_DATA(p, chunk_size);

      submit_push();

      for (uint32_t k = 0; k < chunk_size; k++)
         ASSERT_EQ(data[k], base + k);
   }
}
1667