• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2022 Collabora Ltd.
3  * SPDX-License-Identifier: MIT
4  */
5 #include "mme_runner.h"
6 #include "mme_tu104_sim.h"
7 
8 #include "nv_push_clc597.h"
9 
10 class mme_tu104_sim_test : public ::testing::Test, public mme_hw_runner {
11 public:
12    mme_tu104_sim_test();
13    ~mme_tu104_sim_test();
14 
15    void SetUp();
16    void test_macro(const mme_builder *b,
17                    const std::vector<uint32_t>& macro,
18                    const std::vector<uint32_t>& params);
19 };
20 
mme_tu104_sim_test()21 mme_tu104_sim_test::mme_tu104_sim_test() :
22    ::testing::Test(),
23    mme_hw_runner()
24 { }
25 
~mme_tu104_sim_test()26 mme_tu104_sim_test::~mme_tu104_sim_test()
27 { }
28 
29 void
SetUp()30 mme_tu104_sim_test::SetUp()
31 {
32    ASSERT_TRUE(set_up_hw(TURING_A, UINT16_MAX));
33 }
34 
35 void
test_macro(const mme_builder * b,const std::vector<uint32_t> & macro,const std::vector<uint32_t> & params)36 mme_tu104_sim_test::test_macro(const mme_builder *b,
37                                const std::vector<uint32_t>& macro,
38                                const std::vector<uint32_t>& params)
39 {
40    const uint32_t data_dwords = DATA_BO_SIZE / sizeof(uint32_t);
41 
42    std::vector<mme_tu104_inst> insts(macro.size() / 3);
43    mme_tu104_decode(&insts[0], &macro[0], macro.size() / 3);
44 
45    /* First, make a copy of the data and simulate the macro */
46    std::vector<uint32_t> sim_data(data, data + (DATA_BO_SIZE / 4));
47    mme_tu104_sim_mem sim_mem = {
48       .addr = data_addr,
49       .data = &sim_data[0],
50       .size = DATA_BO_SIZE,
51    };
52    mme_tu104_sim(insts.size(), &insts[0],
53                  params.size(), params.size() ? &params[0] : NULL,
54                  1, &sim_mem);
55 
56    /* Now run the macro on the GPU */
57    push_macro(0, macro);
58 
59    P_1INC(p, NVC597, CALL_MME_MACRO(0));
60    if (params.empty()) {
61       P_NVC597_CALL_MME_MACRO(p, 0, 0);
62    } else {
63       P_INLINE_ARRAY(p, &params[0], params.size());
64    }
65 
66    submit_push();
67 
68    /* Check the results */
69    for (uint32_t i = 0; i < data_dwords; i++)
70       ASSERT_EQ(data[i], sim_data[i]);
71 }
72 
73 static mme_tu104_reg
mme_value_as_reg(mme_value val)74 mme_value_as_reg(mme_value val)
75 {
76    assert(val.type == MME_VALUE_TYPE_REG);
77    return (mme_tu104_reg)(MME_TU104_REG_R0 + val.reg);
78 }
79 
/* Smallest possible macro: store one immediate canary at data_addr. */
TEST_F(mme_tu104_sim_test, sanity)
{
   const uint32_t canary = 0xc0ffee01;

   mme_builder b;
   mme_builder_init(&b, devinfo);
   mme_store_imm_addr(&b, data_addr, mme_imm(canary));
   auto macro = mme_builder_finish_vec(&b);

   /* The macro takes no parameters */
   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
94 
/*
 * A single instruction reads LOAD1 and LOAD0 on its two ALUs and also uses
 * LOAD0 and LOAD1 as emit sources for two scratch-register methods.  All
 * four results are then stored to the data buffer.
 */
TEST_F(mme_tu104_sim_test, multi_param)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value from_load1 = mme_alloc_reg(&b);
   mme_value from_load0 = mme_alloc_reg(&b);

   mme_tu104_asm(&b, inst) {
      /* ALU 0 takes LOAD1, ALU 1 takes LOAD0 */
      inst.alu[0].dst = mme_value_as_reg(from_load1);
      inst.alu[0].src[0] = MME_TU104_REG_LOAD1;
      inst.alu[1].dst = mme_value_as_reg(from_load0);
      inst.alu[1].src[0] = MME_TU104_REG_LOAD0;

      /* Output 0: method from IMM0, data from LOAD0 */
      inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(12) >> 2);
      inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
      inst.out[0].emit = MME_TU104_OUT_OP_LOAD0;

      /* Output 1: method from IMM1, data from LOAD1 */
      inst.imm[1] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(35) >> 2);
      inst.out[1].mthd = MME_TU104_OUT_OP_IMM1;
      inst.out[1].emit = MME_TU104_OUT_OP_LOAD1;
   }

   mme_value scratch12 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(12));
   mme_value scratch35 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(35));

   mme_store_imm_addr(&b, data_addr + 0, from_load1);
   mme_store_imm_addr(&b, data_addr + 4, from_load0);
   mme_store_imm_addr(&b, data_addr + 8, scratch12);
   mme_store_imm_addr(&b, data_addr + 12, scratch35);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 2581, 3048 };
   test_macro(&b, macro, params);
}
132 
/*
 * Places a LOAD0 read inside an instruction with pred_mode TTTT, between
 * two ordinary loads, and runs the macro with the first two parameters
 * taking each zero/non-zero combination.
 */
TEST_F(mme_tu104_sim_test, pred_param)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value maybe_loaded = mme_mov(&b, mme_imm(240));

   mme_tu104_asm(&b, inst) {
      inst.pred_mode = MME_TU104_PRED_TTTT;
      inst.alu[0].dst = mme_value_as_reg(maybe_loaded);
      inst.alu[0].src[0] = MME_TU104_REG_LOAD0;
   }

   mme_value last = mme_load(&b);

   mme_store_imm_addr(&b, data_addr + 0, first);
   mme_store_imm_addr(&b, data_addr + 4, maybe_loaded);
   mme_store_imm_addr(&b, data_addr + 8, last);

   auto macro = mme_builder_finish_vec(&b);

   for (uint32_t combo = 0; combo < 4; combo++) {
      reset_push();

      /* Bits 0 and 1 of combo decide whether params 0 and 1 are zero */
      const std::vector<uint32_t> params = {
         (combo & 1) * 2043,
         (combo & 2) * 523,
         2581,
         3048,
      };

      test_macro(&b, macro, params);
   }
}
167 
/*
 * Emits IMM0 through output 0 as part of two SET_REPORT_SEMAPHORE_A method
 * sequences, one targeting data_addr + 0 and one targeting data_addr + 4.
 */
TEST_F(mme_tu104_sim_test, out_imm0)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* First sequence, address data_addr + 0 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 0)));
   mme_emit(&b, mme_imm(low32(data_addr + 0)));
   mme_tu104_asm(&b, inst) {
      inst.out[0].emit = MME_TU104_OUT_OP_IMM0;
      inst.imm[0] = 0x1234;
   }
   mme_emit(&b, mme_imm(0x10000000));

   /* Second sequence, address data_addr + 4 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 4)));
   mme_emit(&b, mme_imm(low32(data_addr + 4)));
   mme_tu104_asm(&b, inst) {
      inst.out[0].emit = MME_TU104_OUT_OP_IMM0;
      inst.imm[0] = 0x8765;
   }
   mme_emit(&b, mme_imm(0x10000000));

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
196 
/*
 * Same shape as out_imm0, but the emitted value comes from IMM1 (still
 * through output 0).
 */
TEST_F(mme_tu104_sim_test, out_imm1)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* First sequence, address data_addr + 0 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 0)));
   mme_emit(&b, mme_imm(low32(data_addr + 0)));
   mme_tu104_asm(&b, inst) {
      inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
      inst.imm[1] = 0x1234;
   }
   mme_emit(&b, mme_imm(0x10000000));

   /* Second sequence, address data_addr + 4 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 4)));
   mme_emit(&b, mme_imm(low32(data_addr + 4)));
   mme_tu104_asm(&b, inst) {
      inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
      inst.imm[1] = 0x8765;
   }
   mme_emit(&b, mme_imm(0x10000000));

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
225 
/*
 * Emits with the IMMHIGH0 output op, once through output 0 and once
 * through output 1, each inside a SET_REPORT_SEMAPHORE_A sequence.
 */
TEST_F(mme_tu104_sim_test, out_immhigh0)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* First sequence: output 0, address data_addr + 0 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 0)));
   mme_emit(&b, mme_imm(low32(data_addr + 0)));
   mme_tu104_asm(&b, inst) {
      inst.out[0].emit = MME_TU104_OUT_OP_IMMHIGH0;
      inst.imm[0] = 0x1234;
   }
   mme_emit(&b, mme_imm(0x10000000));

   /* Second sequence: output 1, address data_addr + 4 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 4)));
   mme_emit(&b, mme_imm(low32(data_addr + 4)));
   mme_tu104_asm(&b, inst) {
      inst.out[1].emit = MME_TU104_OUT_OP_IMMHIGH0;
      inst.imm[0] = 0x8765;
   }
   mme_emit(&b, mme_imm(0x10000000));

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
254 
/*
 * Emits with the IMMHIGH1 output op, once through output 0 and once
 * through output 1, each inside a SET_REPORT_SEMAPHORE_A sequence.
 */
TEST_F(mme_tu104_sim_test, out_immhigh1)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* First sequence: output 0, address data_addr + 0 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 0)));
   mme_emit(&b, mme_imm(low32(data_addr + 0)));
   mme_tu104_asm(&b, inst) {
      inst.out[0].emit = MME_TU104_OUT_OP_IMMHIGH1;
      inst.imm[1] = 0x1234;
   }
   mme_emit(&b, mme_imm(0x10000000));

   /* Second sequence: output 1, address data_addr + 4 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 4)));
   mme_emit(&b, mme_imm(low32(data_addr + 4)));
   mme_tu104_asm(&b, inst) {
      inst.out[1].emit = MME_TU104_OUT_OP_IMMHIGH1;
      inst.imm[1] = 0x8765;
   }
   mme_emit(&b, mme_imm(0x10000000));

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
283 
/*
 * Emits with the IMM32 output op (both imm[0] and imm[1] are set), once
 * through output 0 and once through output 1.
 */
TEST_F(mme_tu104_sim_test, out_imm32)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* First sequence: output 0, address data_addr + 0 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 0)));
   mme_emit(&b, mme_imm(low32(data_addr + 0)));
   mme_tu104_asm(&b, inst) {
      inst.out[0].emit = MME_TU104_OUT_OP_IMM32;
      inst.imm[0] = 0x1234;
      inst.imm[1] = 0x7654;
   }
   mme_emit(&b, mme_imm(0x10000000));

   /* Second sequence: output 1, address data_addr + 4 */
   mme_mthd(&b, NVC597_SET_REPORT_SEMAPHORE_A);
   mme_emit(&b, mme_imm(high32(data_addr + 4)));
   mme_emit(&b, mme_imm(low32(data_addr + 4)));
   mme_tu104_asm(&b, inst) {
      inst.out[1].emit = MME_TU104_OUT_OP_IMM32;
      inst.imm[0] = 0x1234;
      inst.imm[1] = 0x7654;
   }
   mme_emit(&b, mme_imm(0x10000000));

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> no_params;
   test_macro(&b, macro, no_params);
}
314 
/*
 * Uses IMM32 as an ALU source: the 32-bit canary is split into its low and
 * high 16-bit halves in imm[0] and imm[1], moved into a register via an
 * ADD, and stored at data_addr.
 */
TEST_F(mme_tu104_sim_test, reg_imm32)
{
   const uint32_t canary = 0xc0ffee01;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value v = mme_alloc_reg(&b);

   mme_tu104_asm(&b, i) {
      i.alu[0].dst = mme_value_as_reg(v);
      i.alu[0].op = MME_TU104_ALU_OP_ADD;
      /* This statement used to end in ',' and only worked by accident
       * through the comma operator; it is a plain statement now.
       */
      i.alu[0].src[0] = MME_TU104_REG_IMM32;
      i.imm[0] = (uint16_t)canary;
      i.imm[1] = (uint16_t)(canary >> 16);
   }

   mme_store_imm_addr(&b, data_addr, v);

   auto macro = mme_builder_finish_vec(&b);

   std::vector<uint32_t> params;
   test_macro(&b, macro, params);
}
339 
/*
 * For every predicate mode in the table, builds a macro in which one
 * predicated instruction may overwrite two pre-initialized registers with
 * different immediates.  Each macro runs twice, with the predicate
 * parameter zero and non-zero.
 */
TEST_F(mme_tu104_sim_test, pred_alu)
{
   static const enum mme_tu104_pred modes[] = {
      MME_TU104_PRED_UUUU,
      MME_TU104_PRED_TTTT,
      MME_TU104_PRED_FFFF,
      MME_TU104_PRED_TTUU,
      MME_TU104_PRED_FFUU,
      MME_TU104_PRED_TFUU,
      MME_TU104_PRED_TUUU,
      MME_TU104_PRED_FUUU,
      MME_TU104_PRED_UUTT,
      MME_TU104_PRED_UUTF,
      MME_TU104_PRED_UUTU,
      MME_TU104_PRED_UUFT,
      MME_TU104_PRED_UUFF,
      MME_TU104_PRED_UUFU,
      MME_TU104_PRED_UUUT,
      MME_TU104_PRED_UUUF,
   };

   for (uint32_t m = 0; m < ARRAY_SIZE(modes); m++) {
      mme_builder b;
      mme_builder_init(&b, devinfo);

      mme_value pred = mme_load(&b);

      /* Initial register contents, distinct for every mode */
      mme_value dst0 = mme_mov(&b, mme_imm(m * 100 + 13));
      mme_value dst1 = mme_mov(&b, mme_imm(m * 100 + 62));

      mme_tu104_asm(&b, inst) {
         inst.pred = mme_value_as_reg(pred);
         inst.pred_mode = modes[m];

         inst.alu[0].dst = mme_value_as_reg(dst0);
         inst.alu[0].src[0] = MME_TU104_REG_IMM;
         inst.imm[0] = m * 100 + 25;

         inst.alu[1].dst = mme_value_as_reg(dst1);
         inst.alu[1].src[0] = MME_TU104_REG_IMM;
         inst.imm[1] = m * 100 + 73;
      }

      /* Each mode gets its own pair of dwords in the data buffer */
      mme_store_imm_addr(&b, data_addr + m * 8 + 0, dst0);
      mme_store_imm_addr(&b, data_addr + m * 8 + 4, dst1);

      auto macro = mme_builder_finish_vec(&b);

      for (uint32_t run = 0; run < 2; run++) {
         reset_push();

         /* run == 0 passes a zero predicate, run == 1 a non-zero one */
         const std::vector<uint32_t> params = { run * 25894 };

         test_macro(&b, macro, params);
      }
   }
}
395 
TEST_F(mme_tu104_sim_test,pred_out)396 TEST_F(mme_tu104_sim_test, pred_out)
397 {
398    static const enum mme_tu104_pred preds[] = {
399       MME_TU104_PRED_UUUU,
400       MME_TU104_PRED_TTTT,
401       MME_TU104_PRED_FFFF,
402       MME_TU104_PRED_TTUU,
403       MME_TU104_PRED_FFUU,
404       MME_TU104_PRED_TFUU,
405       MME_TU104_PRED_TUUU,
406       MME_TU104_PRED_FUUU,
407       MME_TU104_PRED_UUTT,
408       MME_TU104_PRED_UUTF,
409       MME_TU104_PRED_UUTU,
410       MME_TU104_PRED_UUFT,
411       MME_TU104_PRED_UUFF,
412       MME_TU104_PRED_UUFU,
413       MME_TU104_PRED_UUUT,
414       MME_TU104_PRED_UUUF,
415    };
416 
417    for (uint32_t i = 0; i < ARRAY_SIZE(preds); i++) {
418       mme_builder b;
419       mme_builder_init(&b, devinfo);
420 
421       mme_value pred = mme_load(&b);
422 
423       mme_tu104_asm(&b, inst) {
424          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0) >> 2);
425          inst.imm[1] = i * 100 + 25;
426          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
427          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
428       }
429 
430       mme_tu104_asm(&b, inst) {
431          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1) >> 2);
432          inst.imm[1] = i * 100 + 75;
433          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
434          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
435       }
436 
437       mme_tu104_asm(&b, inst) {
438          inst.pred = mme_value_as_reg(pred);
439          inst.pred_mode = preds[i];
440          inst.imm[0] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0) >> 2);
441          inst.imm[1] = (1<<12) | (NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1) >> 2);
442          inst.out[0].mthd = MME_TU104_OUT_OP_IMM0;
443          inst.out[0].emit = MME_TU104_OUT_OP_IMM1;
444          inst.out[1].mthd = MME_TU104_OUT_OP_IMM1;
445          inst.out[1].emit = MME_TU104_OUT_OP_IMM0;
446       }
447 
448       mme_value v0 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 0));
449       mme_value v1 = mme_state(&b, NVC597_SET_MME_SHADOW_SCRATCH(i*2 + 1));
450 
451       mme_store_imm_addr(&b, data_addr + i * 8 + 0, v0);
452       mme_store_imm_addr(&b, data_addr + i * 8 + 4, v1);
453 
454       auto macro = mme_builder_finish_vec(&b);
455 
456       for (uint32_t j = 0; j < 2; j++) {
457          reset_push();
458 
459          std::vector<uint32_t> params;
460          params.push_back(j * 25894);
461 
462          test_macro(&b, macro, params);
463       }
464    }
465 }
466 
/* Adds two loaded parameters and stores the result at data_addr. */
TEST_F(mme_tu104_sim_test, add)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_value total = mme_add(&b, lhs, rhs);
   mme_store_imm_addr(&b, data_addr, total);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
485 
/*
 * Adds a loaded parameter (or zero) to a set of immediates, with the
 * immediate on either side of the operation.  The same macro is run for
 * several parameter values.
 */
TEST_F(mme_tu104_sim_test, add_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Immediate as the second operand */
   mme_value x_plus_1 = mme_add(&b, x, mme_imm(0x00000001));
   mme_value x_plus_ffffffff = mme_add(&b, x, mme_imm(0xffffffff));
   mme_value x_plus_ffff8000 = mme_add(&b, x, mme_imm(0xffff8000));

   /* Immediate as the first operand */
   mme_value one_plus_x = mme_add(&b, mme_imm(0x00000001), x);
   mme_value ffffffff_plus_x = mme_add(&b, mme_imm(0xffffffff), x);
   mme_value ffff8000_plus_x = mme_add(&b, mme_imm(0xffff8000), x);

   /* Zero plus an immediate */
   mme_value zero_plus_1 = mme_add(&b, mme_zero(), mme_imm(0x00000001));
   mme_value zero_plus_ffffffff = mme_add(&b, mme_zero(), mme_imm(0xffffffff));
   mme_value zero_plus_ffff8000 = mme_add(&b, mme_zero(), mme_imm(0xffff8000));

   mme_store_imm_addr(&b, data_addr + 0,  x_plus_1);
   mme_store_imm_addr(&b, data_addr + 4,  x_plus_ffffffff);
   mme_store_imm_addr(&b, data_addr + 8,  x_plus_ffff8000);
   mme_store_imm_addr(&b, data_addr + 12, one_plus_x);
   mme_store_imm_addr(&b, data_addr + 16, ffffffff_plus_x);
   mme_store_imm_addr(&b, data_addr + 20, ffff8000_plus_x);
   mme_store_imm_addr(&b, data_addr + 24, zero_plus_1);
   mme_store_imm_addr(&b, data_addr + 28, zero_plus_ffffffff);
   mme_store_imm_addr(&b, data_addr + 32, zero_plus_ffff8000);

   auto macro = mme_builder_finish_vec(&b);

   const uint32_t inputs[] = {
      0x0000ffff,
      0x00008000,
      0x0001ffff,
      0xffffffff,
   };

   for (const uint32_t input : inputs) {
      reset_push();

      const std::vector<uint32_t> params = { input };
      test_macro(&b, macro, params);
   }
}
531 
/*
 * Loads two 64-bit values (two parameters each), adds them with
 * mme_add64() and stores both halves of the result.
 */
TEST_F(mme_tu104_sim_test, addc)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Braced initializers evaluate left to right, so the four loads are
    * issued in exactly this order.
    */
   struct mme_value64 lhs = { mme_load(&b), mme_load(&b) };
   struct mme_value64 rhs = { mme_load(&b), mme_load(&b) };

   struct mme_value64 total = mme_add64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, total.lo);
   mme_store_imm_addr(&b, data_addr + 4, total.hi);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = {
      0x80008650, 0x596,
      0x8000a8f6, 0x836,
   };
   test_macro(&b, macro, params);
}
555 
TEST_F(mme_tu104_sim_test,addc_imm)556 TEST_F(mme_tu104_sim_test, addc_imm)
557 {
558    mme_builder b;
559    mme_builder_init(&b, devinfo);
560 
561    mme_value x_lo = mme_load(&b);
562    mme_value x_hi = mme_load(&b);
563 
564    mme_value v1_lo = mme_alloc_reg(&b);
565    mme_value v1_hi = mme_alloc_reg(&b);
566    mme_tu104_asm(&b, i) {
567       i.alu[0].dst = mme_value_as_reg(v1_lo);
568       i.alu[0].op = MME_TU104_ALU_OP_ADD;
569       i.alu[0].src[0] = mme_value_as_reg(x_lo);
570       i.alu[0].src[1] = MME_TU104_REG_IMM;
571       i.imm[0] = 0x0001;
572       i.alu[1].dst = mme_value_as_reg(v1_hi);
573       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
574       i.alu[1].src[0] = mme_value_as_reg(x_hi);
575       i.alu[1].src[1] = MME_TU104_REG_IMM;
576       i.imm[1] = 0x0000;
577    }
578 
579    mme_value v2_lo = mme_alloc_reg(&b);
580    mme_value v2_hi = mme_alloc_reg(&b);
581    mme_tu104_asm(&b, i) {
582       i.alu[0].dst = mme_value_as_reg(v2_lo);
583       i.alu[0].op = MME_TU104_ALU_OP_ADD;
584       i.alu[0].src[0] = mme_value_as_reg(x_lo);
585       i.alu[0].src[1] = MME_TU104_REG_IMM;
586       i.imm[0] = 0x0000;
587       i.alu[1].dst = mme_value_as_reg(v2_hi);
588       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
589       i.alu[1].src[0] = mme_value_as_reg(x_hi);
590       i.alu[1].src[1] = MME_TU104_REG_IMM;
591       i.imm[1] = 0x0001;
592    }
593 
594    mme_value v3_lo = mme_alloc_reg(&b);
595    mme_value v3_hi = mme_alloc_reg(&b);
596    mme_tu104_asm(&b, i) {
597       i.alu[0].dst = mme_value_as_reg(v3_lo);
598       i.alu[0].op = MME_TU104_ALU_OP_ADD;
599       i.alu[0].src[0] = mme_value_as_reg(x_lo);
600       i.alu[0].src[1] = MME_TU104_REG_IMM;
601       i.imm[0] = 0x0000;
602       i.alu[1].dst = mme_value_as_reg(v3_hi);
603       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
604       i.alu[1].src[0] = mme_value_as_reg(x_hi);
605       i.alu[1].src[1] = MME_TU104_REG_IMM;
606       i.imm[1] = 0xffff;
607    }
608 
609    mme_value v4_lo = mme_alloc_reg(&b);
610    mme_value v4_hi = mme_alloc_reg(&b);
611    mme_tu104_asm(&b, i) {
612       i.alu[0].dst = mme_value_as_reg(v4_lo);
613       i.alu[0].op = MME_TU104_ALU_OP_ADD;
614       i.alu[0].src[0] = mme_value_as_reg(x_lo);
615       i.alu[0].src[1] = MME_TU104_REG_IMM;
616       i.imm[0] = 0x0000;
617       i.alu[1].dst = mme_value_as_reg(v4_hi);
618       i.alu[1].op = MME_TU104_ALU_OP_ADDC;
619       i.alu[1].src[0] = mme_value_as_reg(x_hi);
620       i.alu[1].src[1] = MME_TU104_REG_IMM;
621       i.imm[1] = 0x8000;
622    }
623 
624    mme_store_imm_addr(&b, data_addr + 0,  v1_lo);
625    mme_store_imm_addr(&b, data_addr + 4,  v1_hi);
626    mme_store_imm_addr(&b, data_addr + 8,  v2_lo);
627    mme_store_imm_addr(&b, data_addr + 12, v2_hi);
628    mme_store_imm_addr(&b, data_addr + 16, v3_lo);
629    mme_store_imm_addr(&b, data_addr + 20, v3_hi);
630    mme_store_imm_addr(&b, data_addr + 24, v4_lo);
631    mme_store_imm_addr(&b, data_addr + 28, v4_hi);
632 
633    auto macro = mme_builder_finish_vec(&b);
634 
635    uint64_t vals[] = {
636       0x0000ffffffffffffull,
637       0x0000ffffffff8000ull,
638       0x0000ffff00000000ull,
639       0x0000800000000000ull,
640       0x00008000ffffffffull,
641       0x0001ffff00000000ull,
642       0xffffffff00000000ull,
643       0xffffffffffffffffull,
644    };
645 
646    for (uint32_t i = 0; i < ARRAY_SIZE(vals); i++) {
647       reset_push();
648 
649       std::vector<uint32_t> params;
650       params.push_back(low32(vals[i]));
651       params.push_back(high32(vals[i]));
652 
653       test_macro(&b, macro, params);
654    }
655 }
656 
/* Subtracts the second loaded parameter from the first and stores it. */
TEST_F(mme_tu104_sim_test, sub)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_value delta = mme_sub(&b, lhs, rhs);
   mme_store_imm_addr(&b, data_addr, delta);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
675 
/*
 * Loads two 64-bit values (two parameters each), subtracts them with
 * mme_sub64() and stores both halves of the result.
 */
TEST_F(mme_tu104_sim_test, subb)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   /* Braced initializers evaluate left to right, so the four loads are
    * issued in exactly this order.
    */
   struct mme_value64 lhs = { mme_load(&b), mme_load(&b) };
   struct mme_value64 rhs = { mme_load(&b), mme_load(&b) };

   struct mme_value64 delta = mme_sub64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, delta.lo);
   mme_store_imm_addr(&b, data_addr + 4, delta.hi);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = {
      0x80008650, 0x596,
      0x8000a8f6, 0x836,
   };
   test_macro(&b, macro, params);
}
699 
/* Multiplies two loaded parameters and stores the result at data_addr. */
TEST_F(mme_tu104_sim_test, mul)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);
   mme_value product = mme_mul(&b, lhs, rhs);
   mme_store_imm_addr(&b, data_addr, product);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 25, 138 };
   test_macro(&b, macro, params);
}
718 
/*
 * Multiplies a loaded parameter by several immediates with mme_mul(), with
 * the immediate on either side, and runs the macro for positive and
 * negative inputs.
 */
TEST_F(mme_tu104_sim_test, mul_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Immediate as the second operand */
   mme_value x_by_1 = mme_mul(&b, x, mme_imm(0x00000001));
   mme_value x_by_ffffffff = mme_mul(&b, x, mme_imm(0xffffffff));
   mme_value x_by_ffff8000 = mme_mul(&b, x, mme_imm(0xffff8000));

   /* Immediate as the first operand */
   mme_value one_by_x = mme_mul(&b, mme_imm(0x00000001), x);
   mme_value ffffffff_by_x = mme_mul(&b, mme_imm(0xffffffff), x);
   mme_value ffff8000_by_x = mme_mul(&b, mme_imm(0xffff8000), x);

   mme_store_imm_addr(&b, data_addr + 0,  x_by_1);
   mme_store_imm_addr(&b, data_addr + 4,  x_by_ffffffff);
   mme_store_imm_addr(&b, data_addr + 8,  x_by_ffff8000);
   mme_store_imm_addr(&b, data_addr + 12, one_by_x);
   mme_store_imm_addr(&b, data_addr + 16, ffffffff_by_x);
   mme_store_imm_addr(&b, data_addr + 20, ffff8000_by_x);

   auto macro = mme_builder_finish_vec(&b);

   const int32_t inputs[] = { 1, -5, -1, 5 };

   for (const int32_t input : inputs) {
      reset_push();

      /* Signed inputs are passed as their 32-bit unsigned bit pattern */
      const std::vector<uint32_t> params = { static_cast<uint32_t>(input) };
      test_macro(&b, macro, params);
   }
}
753 
/*
 * Multiplies two loaded parameters with mme_imul_32x32_64() and stores
 * both halves of the 64-bit result.
 */
TEST_F(mme_tu104_sim_test, mul_mulh)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);

   struct mme_value64 product = mme_imul_32x32_64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, product.lo);
   mme_store_imm_addr(&b, data_addr + 4, product.hi);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 0x80008650, 0x596 };
   test_macro(&b, macro, params);
}
775 
776 static inline struct mme_value
mme_mulu(struct mme_builder * b,struct mme_value x,struct mme_value y)777 mme_mulu(struct mme_builder *b, struct mme_value x, struct mme_value y)
778 {
779    return mme_alu(b, MME_ALU_OP_MULU, x, y);
780 }
781 
/*
 * Same layout as mul_imm, but every product is built with the local
 * mme_mulu() helper (MME_ALU_OP_MULU).
 */
TEST_F(mme_tu104_sim_test, mulu_imm)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_load(&b);

   /* Immediate as the second operand */
   mme_value x_by_1 = mme_mulu(&b, x, mme_imm(0x00000001));
   mme_value x_by_ffffffff = mme_mulu(&b, x, mme_imm(0xffffffff));
   mme_value x_by_ffff8000 = mme_mulu(&b, x, mme_imm(0xffff8000));

   /* Immediate as the first operand */
   mme_value one_by_x = mme_mulu(&b, mme_imm(0x00000001), x);
   mme_value ffffffff_by_x = mme_mulu(&b, mme_imm(0xffffffff), x);
   mme_value ffff8000_by_x = mme_mulu(&b, mme_imm(0xffff8000), x);

   mme_store_imm_addr(&b, data_addr + 0,  x_by_1);
   mme_store_imm_addr(&b, data_addr + 4,  x_by_ffffffff);
   mme_store_imm_addr(&b, data_addr + 8,  x_by_ffff8000);
   mme_store_imm_addr(&b, data_addr + 12, one_by_x);
   mme_store_imm_addr(&b, data_addr + 16, ffffffff_by_x);
   mme_store_imm_addr(&b, data_addr + 20, ffff8000_by_x);

   auto macro = mme_builder_finish_vec(&b);

   const int32_t inputs[] = { 1, -5, -1, 5 };

   for (const int32_t input : inputs) {
      reset_push();

      /* Signed inputs are passed as their 32-bit unsigned bit pattern */
      const std::vector<uint32_t> params = { static_cast<uint32_t>(input) };
      test_macro(&b, macro, params);
   }
}
816 
/*
 * Multiplies two loaded parameters with mme_umul_32x32_64() and stores
 * both halves of the 64-bit result.
 */
TEST_F(mme_tu104_sim_test, mulu_mulh)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value lhs = mme_load(&b);
   mme_value rhs = mme_load(&b);

   struct mme_value64 product = mme_umul_32x32_64(&b, lhs, rhs);

   mme_store_imm_addr(&b, data_addr + 0, product.lo);
   mme_store_imm_addr(&b, data_addr + 4, product.hi);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 0x80008650, 0x596 };
   test_macro(&b, macro, params);
}
838 
/* Applies mme_clz() to one loaded parameter and stores the result. */
TEST_F(mme_tu104_sim_test, clz)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value input = mme_load(&b);
   mme_value bits = mme_clz(&b, input);
   mme_store_imm_addr(&b, data_addr, bits);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 0x00406fe0 };
   test_macro(&b, macro, params);
}
854 
855 #define SHIFT_TEST(op)                                               \
856 TEST_F(mme_tu104_sim_test, op)                                       \
857 {                                                                    \
858    mme_builder b;                                                    \
859    mme_builder_init(&b, devinfo);                                 \
860                                                                      \
861    mme_value val = mme_load(&b);                                     \
862    mme_value shift1 = mme_load(&b);                                  \
863    mme_value shift2 = mme_load(&b);                                  \
864    mme_store_imm_addr(&b, data_addr + 0, mme_##op(&b, val, shift1)); \
865    mme_store_imm_addr(&b, data_addr + 4, mme_##op(&b, val, shift2)); \
866                                                                      \
867    auto macro = mme_builder_finish_vec(&b);                          \
868                                                                      \
869    std::vector<uint32_t> params;                                     \
870    params.push_back(0x0c406fe0);                                     \
871    params.push_back(5);                                              \
872    params.push_back(51);                                             \
873                                                                      \
874    test_macro(&b, macro, params);                                    \
875 }
876 
877 SHIFT_TEST(sll)
SHIFT_TEST(srl)878 SHIFT_TEST(srl)
879 SHIFT_TEST(sra)
880 
881 #undef SHIFT_TEST
882 
883 TEST_F(mme_tu104_sim_test, bfe)
884 {
885    const uint32_t canary = 0xc0ffee01;
886 
887    mme_builder b;
888    mme_builder_init(&b, devinfo);
889 
890    mme_value val = mme_load(&b);
891    mme_value pos = mme_load(&b);
892 
893    mme_store_imm_addr(&b, data_addr + 0, mme_bfe(&b, val, pos, 1), true);
894    mme_store_imm_addr(&b, data_addr + 4, mme_bfe(&b, val, pos, 2), true);
895    mme_store_imm_addr(&b, data_addr + 8, mme_bfe(&b, val, pos, 5), true);
896 
897    auto macro = mme_builder_finish_vec(&b);
898 
899    for (unsigned i = 0; i < 31; i++) {
900       std::vector<uint32_t> params;
901       params.push_back(canary);
902       params.push_back(i);
903 
904       test_macro(&b, macro, params);
905 
906       ASSERT_EQ(data[0], (canary >> i) & 0x1);
907       ASSERT_EQ(data[1], (canary >> i) & 0x3);
908       ASSERT_EQ(data[2], (canary >> i) & 0x1f);
909    }
910 }
911 
/* Applies mme_not() to one loaded parameter and stores the result. */
TEST_F(mme_tu104_sim_test, not)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value input = mme_load(&b);
   mme_value inverted = mme_not(&b, input);
   mme_store_imm_addr(&b, data_addr + 0, inverted);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { 0x0c406fe0 };
   test_macro(&b, macro, params);
}
928 
929 #define BITOP_TEST(op)                                               \
930 TEST_F(mme_tu104_sim_test, op)                                       \
931 {                                                                    \
932    mme_builder b;                                                    \
933    mme_builder_init(&b, devinfo);                                 \
934                                                                      \
935    mme_value x = mme_load(&b);                                       \
936    mme_value y = mme_load(&b);                                       \
937    mme_value v1 = mme_##op(&b, x, y);                                \
938    mme_value v2 = mme_##op(&b, x, mme_imm(0xffff8000));              \
939    mme_value v3 = mme_##op(&b, x, mme_imm(0xffffffff));              \
940    mme_store_imm_addr(&b, data_addr + 0, v1);                        \
941    mme_store_imm_addr(&b, data_addr + 4, v2);                        \
942    mme_store_imm_addr(&b, data_addr + 8, v3);                        \
943                                                                      \
944    auto macro = mme_builder_finish_vec(&b);                          \
945                                                                      \
946    std::vector<uint32_t> params;                                     \
947    params.push_back(0x0c406fe0);                                     \
948    params.push_back(0x00fff0c0);                                     \
949                                                                      \
950    test_macro(&b, macro, params);                                    \
951 }
952 
953 BITOP_TEST(and)
BITOP_TEST(and_not)954 BITOP_TEST(and_not)
955 BITOP_TEST(nand)
956 BITOP_TEST(or)
957 BITOP_TEST(xor)
958 
959 #undef BITOP_TEST
960 
961 TEST_F(mme_tu104_sim_test, merge)
962 {
963    mme_builder b;
964    mme_builder_init(&b, devinfo);
965 
966    mme_value x = mme_load(&b);
967    mme_value y = mme_load(&b);
968 
969    mme_value m1 = mme_merge(&b, x, y, 12, 12, 20);
970    mme_value m2 = mme_merge(&b, x, y, 12, 8,  20);
971    mme_value m3 = mme_merge(&b, x, y, 8,  12, 20);
972    mme_value m4 = mme_merge(&b, x, y, 12, 16, 8);
973    mme_value m5 = mme_merge(&b, x, y, 24, 12, 8);
974 
975    mme_store_imm_addr(&b, data_addr + 0,  m1);
976    mme_store_imm_addr(&b, data_addr + 4,  m2);
977    mme_store_imm_addr(&b, data_addr + 8,  m3);
978    mme_store_imm_addr(&b, data_addr + 12, m4);
979    mme_store_imm_addr(&b, data_addr + 16, m5);
980 
981    auto macro = mme_builder_finish_vec(&b);
982 
983    std::vector<uint32_t> params;
984    params.push_back(0x0c406fe0);
985    params.push_back(0x76543210u);
986 
987    test_macro(&b, macro, params);
988 }
989 
990 #define COMPARISON_TEST(op)                     \
991 TEST_F(mme_tu104_sim_test, op)                  \
992 {                                               \
993    mme_builder b;                               \
994    mme_builder_init(&b, devinfo);            \
995                                                 \
996    mme_value x = mme_load(&b);                  \
997    mme_value y = mme_load(&b);                  \
998    mme_value z = mme_load(&b);                  \
999    mme_value w = mme_load(&b);                  \
1000                                                 \
1001    mme_value v1 = mme_##op(&b, x, y);           \
1002    mme_value v2 = mme_##op(&b, y, x);           \
1003    mme_value v3 = mme_##op(&b, y, z);           \
1004    mme_value v4 = mme_##op(&b, z, y);           \
1005    mme_value v5 = mme_##op(&b, w, z);           \
1006    mme_value v6 = mme_##op(&b, z, w);           \
1007    mme_value v7 = mme_##op(&b, w, w);           \
1008                                                 \
1009    mme_store_imm_addr(&b, data_addr + 0,  v1);  \
1010    mme_store_imm_addr(&b, data_addr + 4,  v2);  \
1011    mme_store_imm_addr(&b, data_addr + 8,  v3);  \
1012    mme_store_imm_addr(&b, data_addr + 12, v4);  \
1013    mme_store_imm_addr(&b, data_addr + 16, v5);  \
1014    mme_store_imm_addr(&b, data_addr + 20, v6);  \
1015    mme_store_imm_addr(&b, data_addr + 24, v7);  \
1016                                                 \
1017    auto macro = mme_builder_finish_vec(&b);     \
1018                                                 \
1019    std::vector<uint32_t> params;                \
1020    params.push_back(-5);                        \
1021    params.push_back(-10);                       \
1022    params.push_back(5);                         \
1023    params.push_back(10);                        \
1024                                                 \
1025    test_macro(&b, macro, params);               \
1026 }
1027 
1028 COMPARISON_TEST(slt)
COMPARISON_TEST(sltu)1029 COMPARISON_TEST(sltu)
1030 COMPARISON_TEST(sle)
1031 COMPARISON_TEST(sleu)
1032 COMPARISON_TEST(seq)
1033 
1034 #undef COMPARISON_TEST
1035 
1036 static inline void
1037 mme_inc_whole_inst(mme_builder *b, mme_value val)
1038 {
1039    mme_tu104_asm(b, i) {
1040       i.alu[0].dst = mme_value_as_reg(val);
1041       i.alu[0].op = MME_TU104_ALU_OP_ADD;
1042       i.alu[0].src[0] = mme_value_as_reg(val);
1043       i.alu[0].src[1] = MME_TU104_REG_IMM;
1044       i.imm[0] = 1;
1045    }
1046 }
1047 
/* Checks mme_loop() for several trip counts, including zero.  The loop body
 * adds the count to an accumulator, so after the loop the accumulator must
 * equal count * count; a second register is bumped exactly once after the
 * loop.
 */
TEST_F(mme_tu104_sim_test, loop)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value count = mme_load(&b);

   mme_value in_loop = mme_mov(&b, mme_zero());
   mme_value after_loop = mme_mov(&b, mme_zero());

   mme_loop(&b, count) {
      mme_tu104_asm(&b, i) { } /* noop */
      mme_add_to(&b, in_loop, in_loop, count);
   }
   mme_add_to(&b, after_loop, after_loop, mme_imm(1));

   /* Three trailing noops */
   mme_tu104_asm(&b, i) { }
   mme_tu104_asm(&b, i) { }
   mme_tu104_asm(&b, i) { }

   mme_store_imm_addr(&b, data_addr + 0,  count);
   mme_store_imm_addr(&b, data_addr + 4,  in_loop);
   mme_store_imm_addr(&b, data_addr + 8,  after_loop);

   auto macro = mme_builder_finish_vec(&b);

   const uint32_t counts[] = {0, 1, 5, 9};

   for (uint32_t i = 0; i < ARRAY_SIZE(counts); i++) {
      reset_push();

      const std::vector<uint32_t> params = { counts[i] };
      test_macro(&b, macro, params);

      ASSERT_EQ(data[0], counts[i]);
      ASSERT_EQ(data[1], counts[i] * counts[i]);
      ASSERT_EQ(data[2], 1);
   }
}
1087 
/* Emits a hand-assembled JAL followed by ten single-register increments of
 * x and expects x to end up at 5, i.e. only five of the ten increments are
 * expected to execute.  The instruction stream must not be reordered: the
 * jump immediate is tied to the exact sequence emitted below.
 */
TEST_F(mme_tu104_sim_test, jal)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value x = mme_mov(&b, mme_zero());
   mme_value y = mme_mov(&b, mme_zero());

   /* NOTE(review): the immediate presumably encodes a relative jump target
    * in its low bits with bit 15 as a control flag -- confirm against the
    * TU104 MME ISA description.
    */
   mme_tu104_asm(&b, i) {
      i.alu[0].op = MME_TU104_ALU_OP_JAL;
      i.imm[0] = (1 << 15) | 6;
   }

   for (uint32_t j = 0; j < 10; j++)
      mme_inc_whole_inst(&b, x);

   /* Disabled second jump that would exercise y; y is still stored below
    * but never modified and never asserted on.
    */
//   mme_tu104_asm(&b, i) {
//      i.alu[0].op = MME_TU104_ALU_OP_JAL;
//      i.imm[0] = 6;
//   }
//
//   for (uint32_t j = 0; j < 10; j++)
//      mme_inc_whole_inst(&b, y);

   mme_store_imm_addr(&b, data_addr + 0, x);
   mme_store_imm_addr(&b, data_addr + 4, y);

   auto macro = mme_builder_finish_vec(&b);

   std::vector<uint32_t> params;
   test_macro(&b, macro, params);
   ASSERT_EQ(data[0], 5);
}
1121 
/* Emits a hand-assembled BEQ with a positive immediate offset in front of
 * ten single-register increments and stores all ten registers.  There are
 * no explicit assertions here; the only checking is whatever test_macro()
 * does after simulating the macro and running it on the GPU.
 */
TEST_F(mme_tu104_sim_test, bxx_fwd)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[10];
   for (uint32_t i = 0; i < 10; i++)
      vals[i] = mme_mov(&b, mme_zero());

   /* No sources are set on the BEQ.  NOTE(review): presumably they default
    * to operands that compare equal so the branch is always taken --
    * confirm against the mme_tu104_asm() instruction defaults.
    */
   mme_tu104_asm(&b, i) {
      i.alu[0].op = MME_TU104_ALU_OP_BEQ;
      i.imm[0] = (1 << 15) | 6;
   }

   /* One increment per register; the branch offset is tied to this exact
    * sequence.
    */
   for (uint32_t j = 0; j < 10; j++)
      mme_inc_whole_inst(&b, vals[j]);

   for (uint32_t j = 0; j < 10; j++)
      mme_store_imm_addr(&b, data_addr + j * 4, vals[j]);

   auto macro = mme_builder_finish_vec(&b);

   std::vector<uint32_t> params;
   test_macro(&b, macro, params);
}
1147 
/* Combines two hand-assembled JALs with a BEQ carrying a negative offset.
 * Fifteen registers start at zero and each gets one increment instruction;
 * the assertions expect registers 0-2 to stay 0 and registers 3-14 to be 1.
 * The branch immediates are tied to the exact instruction sequence emitted
 * below, so nothing here may be reordered.
 */
TEST_F(mme_tu104_sim_test, bxx_bwd)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[15];
   for (uint32_t i = 0; i < 15; i++)
      vals[i] = mme_mov(&b, mme_zero());

   /* First jump, with offset 12 */
   mme_tu104_asm(&b, i) {
      i.alu[0].op = MME_TU104_ALU_OP_JAL;
      i.imm[0] = (1 << 15) | 12;
   }

   for (uint32_t j = 0; j < 10; j++)
      mme_inc_whole_inst(&b, vals[j]);

   /* Second jump, with offset 2 */
   mme_tu104_asm(&b, i) {
      i.alu[0].op = MME_TU104_ALU_OP_JAL;
      i.imm[0] = (1 << 15) | 2;
   }

   /* Backward branch: -8 masked to 13 bits.  NOTE(review): presumably the
    * offset field is a 13-bit two's-complement value -- confirm against the
    * ISA description.
    */
   mme_tu104_asm(&b, i) {
      i.alu[0].op = MME_TU104_ALU_OP_BEQ;
      i.imm[0] = (1 << 15) | ((-8) & 0x1fff);
   }

   for (uint32_t j = 10; j < 15; j++)
      mme_inc_whole_inst(&b, vals[j]);

   for (uint32_t j = 0; j < 15; j++)
      mme_store_imm_addr(&b, data_addr + j * 4, vals[j]);

   auto macro = mme_builder_finish_vec(&b);

   std::vector<uint32_t> params;
   test_macro(&b, macro, params);
   /* The first three increments are expected to be skipped */
   for (uint32_t j = 0; j < 3; j++)
      ASSERT_EQ(data[j], 0);
   for (uint32_t j = 3; j < 15; j++)
      ASSERT_EQ(data[j], 1);
}
1190 
/* Zeroes ten data dwords, then emits a hand-assembled BEQ with immediate
 * (1 << 15) | 0x1000 ahead of increments and stores that are expected never
 * to take effect: all ten dwords must still read 0 afterwards.
 */
TEST_F(mme_tu104_sim_test, bxx_exit)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value vals[10];
   for (uint32_t i = 0; i < 10; i++)
      vals[i] = mme_mov(&b, mme_zero());

   /* Establish a known-zero baseline in the data buffer */
   for (uint32_t i = 0; i < 10; i++)
      mme_store_imm_addr(&b, data_addr + i * 4, mme_imm(0));

   /* NOTE(review): presumably 0x1000 in the offset field makes the branch
    * terminate the macro (test name: bxx_exit) -- confirm against the ISA
    * description.
    */
   mme_tu104_asm(&b, i) {
      i.alu[0].op = MME_TU104_ALU_OP_BEQ;
      i.imm[0] = (1 << 15) | 0x1000;
   }

   /* those writes won't be visible */
   for (uint32_t j = 0; j < 10; j++)
      mme_inc_whole_inst(&b, vals[j]);

   for (uint32_t i = 0; i < 10; i++)
      mme_store_imm_addr(&b, data_addr + i * 4, vals[i]);

   std::vector<uint32_t> params;

   auto macro = mme_builder_finish_vec(&b);
   test_macro(&b, macro, params);

   /* Only the zero stores emitted before the branch may have landed */
   uint32_t i;
   for (i = 0; i < 10; i++)
      ASSERT_EQ(data[i], 0);
}
1224 
/* Zeroes ten data dwords, calls mme_exit(), and then emits stores of the
 * values 0..9 that must never become visible: every dword is expected to
 * still read 0 afterwards.
 */
TEST_F(mme_tu104_sim_test, mme_exit)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[10];
   for (uint32_t n = 0; n < 10; n++)
      regs[n] = mme_mov(&b, mme_zero());

   for (uint32_t n = 0; n < 10; n++)
      mme_store_imm_addr(&b, data_addr + n * 4, mme_imm(0));

   /* End the macro early */
   mme_exit(&b);

   /* Nothing emitted past this point is expected to take effect */
   for (uint32_t n = 0; n < 10; n++)
      regs[n] = mme_mov(&b, mme_imm(n));

   for (uint32_t n = 0; n < 10; n++)
      mme_store_imm_addr(&b, data_addr + n * 4, regs[n]);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params;
   test_macro(&b, macro, params);

   for (uint32_t i = 0; i < 10; i++)
      ASSERT_EQ(data[i], 0);
}
1257 
/* Checks mme_exit_if(): a condition that is false (0 == 1) must not end the
 * macro, while a chain of "5 <= value" checks must end it once the value 5
 * is reached.  Dwords 0..4 are therefore expected to hold their own index
 * and dwords 5..9 to keep the initial zero.
 */
TEST_F(mme_tu104_sim_test, mme_exit_if)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value regs[10];
   for (uint32_t n = 0; n < 10; n++)
      regs[n] = mme_mov(&b, mme_zero());

   for (uint32_t n = 0; n < 10; n++)
      mme_store_imm_addr(&b, data_addr + n * 4, mme_imm(0));

   /* 0 == 1 is false, so execution must carry on */
   mme_exit_if(&b, ieq, mme_zero(), mme_imm(1));

   for (uint32_t n = 0; n < 10; n++)
      regs[n] = mme_mov(&b, mme_imm(n));

   for (uint32_t n = 0; n < 10; n++) {
      /* Leave the macro as soon as the value reaches 5 */
      mme_exit_if(&b, ile, mme_imm(5), regs[n]);
      mme_store_imm_addr(&b, data_addr + n * 4, regs[n]);
   }

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params;
   test_macro(&b, macro, params);

   for (uint32_t i = 0; i < 10; i++)
      ASSERT_EQ(data[i], i < 5 ? i : 0);
}
1291 
/* Host-side reference implementations of the MME comparison conditions.
 * IF_TEST and WHILE_TEST paste the condition name onto "c_" to compute the
 * expected result.  The i* variants compare as signed 32-bit integers and
 * the u* variants as unsigned 32-bit integers; callers pass uint32_t values
 * and rely on the implicit conversion to the parameter type.
 */
static bool c_ilt(int32_t x, int32_t y) { return x < y; }
static bool c_ult(uint32_t x, uint32_t y) { return x < y; }
static bool c_ile(int32_t x, int32_t y) { return x <= y; }
static bool c_ule(uint32_t x, uint32_t y) { return x <= y; }
static bool c_ieq(int32_t x, int32_t y) { return x == y; }
static bool c_ige(int32_t x, int32_t y) { return x >= y; }
static bool c_uge(uint32_t x, uint32_t y) { return x >= y; }
static bool c_igt(int32_t x, int32_t y) { return x > y; }
static bool c_ugt(uint32_t x, uint32_t y) { return x > y; }
static bool c_ine(int32_t x, int32_t y) { return x != y; }
1302 
1303 #define IF_TEST(op)                                                  \
1304 TEST_F(mme_tu104_sim_test, if_##op)                                  \
1305 {                                                                    \
1306    mme_builder b;                                                    \
1307    mme_builder_init(&b, devinfo);                                    \
1308                                                                      \
1309    mme_value x = mme_load(&b);                                       \
1310    mme_value y = mme_load(&b);                                       \
1311    mme_value i = mme_mov(&b, mme_zero());                            \
1312                                                                      \
1313    mme_start_if_##op(&b, x, y);                                      \
1314    {                                                                 \
1315       mme_add_to(&b, i, i, mme_imm(1));                              \
1316       mme_add_to(&b, i, i, mme_imm(1));                              \
1317    }                                                                 \
1318    mme_end_if(&b);                                                   \
1319    mme_start_if_##op(&b, x, mme_imm(56));                            \
1320    {                                                                 \
1321       mme_add_to(&b, i, i, mme_imm(1));                              \
1322    }                                                                 \
1323    mme_end_if(&b);                                                   \
1324    mme_add_to(&b, i, i, mme_imm(1));                                 \
1325    mme_add_to(&b, i, i, mme_imm(1));                                 \
1326    mme_add_to(&b, i, i, mme_imm(1));                                 \
1327                                                                      \
1328    mme_store_imm_addr(&b, data_addr + 0, i);                         \
1329                                                                      \
1330    auto macro = mme_builder_finish_vec(&b);                          \
1331                                                                      \
1332    uint32_t vals[] = {23, 56, (uint32_t)-5, (uint32_t)-10, 56, 14};  \
1333                                                                      \
1334    for (uint32_t i = 0; i < ARRAY_SIZE(vals) - 1; i++) {             \
1335       reset_push();                                                  \
1336                                                                      \
1337       std::vector<uint32_t> params;                                  \
1338       params.push_back(vals[i + 0]);                                 \
1339       params.push_back(vals[i + 1]);                                 \
1340                                                                      \
1341       test_macro(&b, macro, params);                                 \
1342                                                                      \
1343       uint32_t expected = 3;                                         \
1344       if (c_##op(params[0], params[1]))                              \
1345          expected += 2;                                              \
1346       if (c_##op(params[0], 56))                                     \
1347          expected += 1;                                              \
1348                                                                      \
1349       ASSERT_EQ(data[0], expected);                                  \
1350    }                                                                 \
1351 }
1352 
1353 IF_TEST(ilt)
IF_TEST(ult)1354 IF_TEST(ult)
1355 IF_TEST(ile)
1356 IF_TEST(ule)
1357 IF_TEST(ieq)
1358 IF_TEST(ige)
1359 IF_TEST(uge)
1360 IF_TEST(igt)
1361 IF_TEST(ugt)
1362 IF_TEST(ine)
1363 
1364 #undef IF_TEST
1365 
1366 #define WHILE_TEST(op, start, step, bound)            \
1367 TEST_F(mme_tu104_sim_test, while_##op)                \
1368 {                                                     \
1369    mme_builder b;                                     \
1370    mme_builder_init(&b, devinfo);                  \
1371                                                       \
1372    mme_value x = mme_mov(&b, mme_zero());             \
1373    mme_value y = mme_mov(&b, mme_zero());             \
1374    mme_value z = mme_mov(&b, mme_imm(start));         \
1375    mme_value w = mme_mov(&b, mme_zero());             \
1376    mme_value v = mme_mov(&b, mme_zero());             \
1377                                                       \
1378    for (uint32_t j = 0; j < 5; j++)                   \
1379       mme_inc_whole_inst(&b, x);                      \
1380                                                       \
1381    mme_while(&b, op, z, mme_imm(bound)) {             \
1382       for (uint32_t j = 0; j < 5; j++)                \
1383          mme_inc_whole_inst(&b, y);                   \
1384                                                       \
1385       mme_add_to(&b, z, z, mme_imm(step));            \
1386                                                       \
1387       for (uint32_t j = 0; j < 5; j++)                \
1388          mme_inc_whole_inst(&b, w);                   \
1389    }                                                  \
1390                                                       \
1391    for (uint32_t j = 0; j < 5; j++)                   \
1392       mme_inc_whole_inst(&b, v);                      \
1393                                                       \
1394    mme_store_imm_addr(&b, data_addr + 0, x);          \
1395    mme_store_imm_addr(&b, data_addr + 4, y);          \
1396    mme_store_imm_addr(&b, data_addr + 8, z);          \
1397    mme_store_imm_addr(&b, data_addr + 12, w);         \
1398    mme_store_imm_addr(&b, data_addr + 16, v);         \
1399                                                       \
1400    auto macro = mme_builder_finish_vec(&b);           \
1401                                                       \
1402    uint32_t end = (uint32_t)(start), count = 0;       \
1403    while (c_##op(end, (bound))) {                     \
1404       end += (uint32_t)(step);                        \
1405       count++;                                        \
1406    }                                                  \
1407                                                       \
1408    std::vector<uint32_t> params;                      \
1409    test_macro(&b, macro, params);                     \
1410    ASSERT_EQ(data[0], 5);                             \
1411    ASSERT_EQ(data[1], 5 * count);                     \
1412    ASSERT_EQ(data[2], end);                           \
1413    ASSERT_EQ(data[3], 5 * count);                     \
1414    ASSERT_EQ(data[4], 5);                             \
1415 }
1416 
1417 WHILE_TEST(ilt, 0, 1, 7)
1418 WHILE_TEST(ult, 0, 1, 7)
1419 WHILE_TEST(ile, -10, 2, 0)
1420 WHILE_TEST(ule, 0, 1, 7)
1421 WHILE_TEST(ieq, 0, 5, 0)
1422 WHILE_TEST(ige, 5, -1, -5)
1423 WHILE_TEST(uge, 15, -2, 2)
1424 WHILE_TEST(igt, 7, -3, -10)
1425 WHILE_TEST(ugt, 1604, -30, 1000)
1426 WHILE_TEST(ine, 0, 1, 7)
1427 
1428 #undef WHILE_TEST
1429 
1430 TEST_F(mme_tu104_sim_test, nested_while)
1431 {
1432    mme_builder b;
1433    mme_builder_init(&b, devinfo);
1434 
1435    mme_value n = mme_load(&b);
1436    mme_value m = mme_load(&b);
1437 
1438    mme_value count = mme_mov(&b, mme_zero());
1439 
1440    mme_value i = mme_mov(&b, mme_zero());
1441    mme_value j = mme_mov(&b, mme_imm(0xffff));
1442    mme_while(&b, ine, i, n) {
1443       mme_mov_to(&b, j, mme_zero());
1444       mme_while(&b, ine, j, m) {
1445          mme_add_to(&b, count, count, mme_imm(1));
1446          mme_add_to(&b, j, j, mme_imm(1));
1447       }
1448 
1449       mme_add_to(&b, i, i, mme_imm(1));
1450    }
1451 
1452    mme_store_imm_addr(&b, data_addr + 0, i);
1453    mme_store_imm_addr(&b, data_addr + 4, j);
1454    mme_store_imm_addr(&b, data_addr + 8, count);
1455 
1456    auto macro = mme_builder_finish_vec(&b);
1457 
1458    std::vector<uint32_t> params;
1459    params.push_back(3);
1460    params.push_back(5);
1461 
1462    test_macro(&b, macro, params);
1463    ASSERT_EQ(data[0], 3);
1464    ASSERT_EQ(data[1], 5);
1465    ASSERT_EQ(data[2], 15);
1466 }
1467 
/* Disabled test for a do/while-style loop built from a raw BLE branch.
 * It uses mme_alu(), mme_alu_imm(), mme_branch(), mme_end() and a
 * two-argument test_macro(), none of which match the helpers used by the
 * live tests in this file.  NOTE(review): presumably written against an
 * older builder API; it needs porting before it can be re-enabled.
 */
#if 0
TEST_F(mme_tu104_sim_test, do_ble)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_alu(&b, R5, ADD, LOAD0, ZERO);
   mme_alu(&b, R6, ADD, ZERO, ZERO);
   mme_alu(&b, R7, ADD, ZERO, ZERO);

   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R6, ADD, R6, IMM, 1);
   mme_branch(&b, BLE, R6, R5, -3, 2);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);
   mme_alu_imm(&b, R7, ADD, R7, IMM, 1);

   mme_store_imm_addr(&b, data_addr + 0,  MME_TU104_REG_R7);

   mme_end(&b);

   uint32_t counts[] = {0, 1, 5, 9};

   for (uint32_t i = 0; i < ARRAY_SIZE(counts); i++) {
      reset_push();

      std::vector<uint32_t> params;
      params.push_back(counts[i]);

      test_macro(&b, params);
   }
}
#endif
1503 
/* Round-trips two parameters through mme_dwrite()/mme_dread() at indices 5
 * and 8, reading them back in the opposite order, and stores what was read
 * to the data buffer.
 */
TEST_F(mme_tu104_sim_test, dread_dwrite)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_dwrite(&b, mme_imm(5), first);
   mme_dwrite(&b, mme_imm(8), second);

   mme_value second_back = mme_dread(&b, mme_imm(8));
   mme_value first_back = mme_dread(&b, mme_imm(5));

   mme_store_imm_addr(&b, data_addr + 0, second_back);
   mme_store_imm_addr(&b, data_addr + 4, first_back);

   auto macro = mme_builder_finish_vec(&b);

   const std::vector<uint32_t> params = { (uint32_t)-10, 5 };

   test_macro(&b, macro, params);
}
1529 
/* Writes two canary parameters to indices 5 and 8 with mme_dwrite(), then
 * issues MME_DMA_WRITE for 20 dwords starting at MME data RAM dword 3 into
 * the data buffer.  The canaries are expected at data[5 - 3] and
 * data[8 - 3], with zero everywhere else.  This test pushes and submits
 * the commands itself instead of going through test_macro().
 */
TEST_F(mme_tu104_sim_test, dwrite_dma)
{
   const uint32_t canary5 = 0xc0ffee01;
   const uint32_t canary8 = canary5 & 0x00ffff00;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_dwrite(&b, mme_imm(5), first);
   mme_dwrite(&b, mme_imm(8), second);

   auto macro = mme_builder_finish_vec(&b);

   push_macro(0, macro);

   P_1INC(p, NVC597, CALL_MME_MACRO(0));
   P_INLINE_DATA(p, canary5);
   P_INLINE_DATA(p, canary8);

   P_MTHD(p, NVC597, SET_MME_MEM_ADDRESS_A);
   P_NVC597_SET_MME_MEM_ADDRESS_A(p, high32(data_addr));
   P_NVC597_SET_MME_MEM_ADDRESS_B(p, low32(data_addr));
   /* The DMA source begins at dword 3 of the MME RAM */
   P_NVC597_SET_MME_DATA_RAM_ADDRESS(p, 3);
   P_IMMD(p, NVC597, MME_DMA_WRITE, 20);

   submit_push();

   for (uint32_t i = 0; i < 20; i++) {
      switch (i) {
      case 5 - 3: {
         ASSERT_EQ(data[i], canary5);
         break;
      }
      case 8 - 3: {
         ASSERT_EQ(data[i], canary8);
         break;
      }
      default: {
         ASSERT_EQ(data[i], 0);
         break;
      }
      }
   }
}
1571 
/* Sweeps indices 0..MME_TU104_DRAM_COUNT in chunk_size steps.  Each macro
 * call takes (start, count), writes every index in that range to the
 * mme_dwrite() slot of the same index, reads the slots back with
 * mme_dread(), and stores them to consecutive dwords of the data buffer
 * for the host to check.  This test submits the pushes itself instead of
 * going through test_macro().
 */
TEST_F(mme_tu104_sim_test, dram_limit)
{
   static const uint32_t chunk_size = 32;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value start = mme_load(&b);
   mme_value count = mme_load(&b);

   /* Pass 1: slot[idx] = idx */
   mme_value wr_idx = mme_mov(&b, start);
   mme_loop(&b, count) {
      mme_dwrite(&b, wr_idx, wr_idx);
      mme_add_to(&b, wr_idx, wr_idx, mme_imm(1));
   }

   /* Pass 2: copy the same slots out to memory, one dword at a time */
   mme_value rd_idx = mme_mov(&b, start);
   struct mme_value64 dst = mme_mov64(&b, mme_imm64(data_addr));

   mme_loop(&b, count) {
      mme_value readback = mme_dread(&b, rd_idx);
      mme_store(&b, dst, readback);
      mme_add_to(&b, rd_idx, rd_idx, mme_imm(1));
      mme_add64_to(&b, dst, dst, mme_imm64(4));
   }

   auto macro = mme_builder_finish_vec(&b);

   for (uint32_t i = 0; i < MME_TU104_DRAM_COUNT; i += chunk_size) {
      reset_push();

      push_macro(0, macro);

      P_1INC(p, NVC597, CALL_MME_MACRO(0));
      P_INLINE_DATA(p, i);
      P_INLINE_DATA(p, chunk_size);

      submit_push();

      for (uint32_t j = 0; j < chunk_size; j++)
         ASSERT_EQ(data[j], i + j);
   }
}
1615 
/* Has the macro itself emit the MME DMA methods: data RAM address 0, the
 * data buffer address, and MME_DMA_READ_FIFOED with the value 2.  After a
 * load barrier it loads two values and stores them 256 bytes into the data
 * buffer.  The host seeds the first 64 dwords with 1000 + index beforehand.
 */
TEST_F(mme_tu104_sim_test, dma_read_fifoed)
{
   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_mthd(&b, NVC597_SET_MME_DATA_RAM_ADDRESS);
   mme_emit(&b, mme_zero());

   mme_mthd(&b, NVC597_SET_MME_MEM_ADDRESS_A);
   mme_emit(&b, mme_imm(high32(data_addr)));
   mme_emit(&b, mme_imm(low32(data_addr)));

   mme_mthd(&b, NVC597_MME_DMA_READ_FIFOED);
   mme_emit(&b, mme_imm(2));

   mme_tu104_load_barrier(&b);

   mme_value first = mme_load(&b);
   mme_value second = mme_load(&b);

   mme_store_imm_addr(&b, data_addr + 256 + 0, first);
   mme_store_imm_addr(&b, data_addr + 256 + 4, second);

   auto macro = mme_builder_finish_vec(&b);

   P_IMMD(p, NVC597, SET_MME_DATA_FIFO_CONFIG, FIFO_SIZE_SIZE_4KB);

   /* Recognisable source data for the DMA read */
   for (uint32_t n = 0; n < 64; n++)
      data[n] = 1000 + n;

   const std::vector<uint32_t> params = { 7 };

   test_macro(&b, macro, params);
}
1651 
/* Sweeps indices 0..MME_TU104_SCRATCH_COUNT in chunk_size steps.  Each
 * macro call takes (start, count), writes every index in that range to the
 * SET_MME_SHADOW_SCRATCH entry of the same index, reads the entries back
 * with mme_state_arr(), and stores them to consecutive dwords of the data
 * buffer for the host to check.  This test submits the pushes itself
 * instead of going through test_macro().
 */
TEST_F(mme_tu104_sim_test, scratch_limit)
{
   static const uint32_t chunk_size = 32;

   mme_builder b;
   mme_builder_init(&b, devinfo);

   mme_value start = mme_load(&b);
   mme_value count = mme_load(&b);

   /* Pass 1: scratch[idx] = idx */
   mme_value wr_idx = mme_mov(&b, start);
   mme_loop(&b, count) {
      mme_mthd_arr(&b, NVC597_SET_MME_SHADOW_SCRATCH(0), wr_idx);
      mme_emit(&b, wr_idx);
      mme_add_to(&b, wr_idx, wr_idx, mme_imm(1));
   }

   /* Pass 2: copy the same entries out to memory, one dword at a time */
   mme_value rd_idx = mme_mov(&b, start);
   struct mme_value64 dst = mme_mov64(&b, mme_imm64(data_addr));

   mme_loop(&b, count) {
      mme_value readback =
         mme_state_arr(&b, NVC597_SET_MME_SHADOW_SCRATCH(0), rd_idx);
      mme_store(&b, dst, readback);
      mme_add_to(&b, rd_idx, rd_idx, mme_imm(1));
      mme_add64_to(&b, dst, dst, mme_imm64(4));
   }

   auto macro = mme_builder_finish_vec(&b);

   for (uint32_t i = 0; i < MME_TU104_SCRATCH_COUNT; i += chunk_size) {
      reset_push();

      push_macro(0, macro);

      P_1INC(p, NVC597, CALL_MME_MACRO(0));
      P_INLINE_DATA(p, i);
      P_INLINE_DATA(p, chunk_size);

      submit_push();

      for (uint32_t j = 0; j < chunk_size; j++)
         ASSERT_EQ(data[j], i + j);
   }
}
1696