• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2022 Collabora Ltd.
3  * SPDX-License-Identifier: MIT
4  */
5 #include "mme_tu104_sim.h"
6 
7 #include <inttypes.h>
8 
9 #include "mme_tu104.h"
10 #include "util/u_math.h"
11 
12 #include "nvk_clc597.h"
13 
/* Full state of one simulated TU104 MME (Macro Method Expander) execution:
 * the macro's parameter stream, bound memory ranges, data RAM, DMA read
 * FIFO, shadow scratch, the method currently being assembled, ALU state,
 * and control-flow bookkeeping.
 */
struct mme_tu104_sim {
   /* Remaining macro parameters; consumed by LOAD0/LOAD1 accesses */
   uint32_t param_count;
   const uint32_t *params;

   /* Values most recently popped from the parameter stream for
    * LOAD0/LOAD1 (see load_params()).
    */
   uint32_t load[2];

   /* Bound memory ranges */
   uint32_t mem_count;
   struct mme_tu104_sim_mem *mems;

   /* SET_MME_MEM_ADDRESS_A/B */
   uint64_t mem_addr;

   /* RAM, accessed by DREAD/DWRITE */
   struct {
      uint32_t data[MME_TU104_DRAM_COUNT];

      /* SET_MME_MEM_RAM_ADDRESS */
      uint32_t addr;
   } ram;

   struct {
      /* Filled by MME_DMA_READ_FIFOED; drained as the new parameter
       * stream in finish_dma_read_fifo().
       */
      struct {
         uint32_t data[1024];
         uint64_t count;
      } read_fifo;
   } dma;

   /* NVC597_SET_MME_SHADOW_SCRATCH(i) */
   uint32_t scratch[256];

   /* Method currently being assembled by eval_out(); emitted lazily by
    * flush_mthd() so multi-dword methods can accumulate data first.
    */
   struct {
      unsigned mthd:16;       /* method address */
      unsigned inc:4;         /* per-dword method increment */
      bool has_mthd:1;        /* true while a method is pending */
      unsigned _pad:5;
      unsigned data_len:8;    /* dwords valid in data[] */
      uint32_t data[8];
   } mthd;

   /* Bitmask of GPRs written so far; load_reg() asserts reads are only
    * from registers that have been set.
    */
   uint32_t set_regs;
   uint32_t regs[23];
   uint32_t alu_res[2];       /* results of the two ALUs this instruction */
   uint32_t alu_carry;        /* carry/borrow/high-bits from the last ALU op */

   uint16_t ip;               /* current instruction pointer */
   uint16_t next_ip;          /* instruction pointer for the next step */
   bool stop;                 /* set when a branch requests macro exit */

   /* LOOP instruction state */
   uint32_t loop_count;       /* iterations remaining */
   uint16_t loop_start;       /* ip of the LOOP instruction */
   uint16_t loop_end;         /* ip of the last instruction in the loop */
};
67 
68 static bool
inst_loads_reg(const struct mme_tu104_inst * inst,enum mme_tu104_reg reg)69 inst_loads_reg(const struct mme_tu104_inst *inst,
70                enum mme_tu104_reg reg)
71 {
72    return inst->pred == reg ||
73           inst->alu[0].src[0] == reg ||
74           inst->alu[0].src[1] == reg ||
75           inst->alu[1].src[0] == reg ||
76           inst->alu[1].src[1] == reg;
77 }
78 
79 static bool
inst_loads_out(const struct mme_tu104_inst * inst,enum mme_tu104_out_op out)80 inst_loads_out(const struct mme_tu104_inst *inst,
81                enum mme_tu104_out_op out)
82 {
83    return inst->out[0].mthd == out ||
84           inst->out[0].emit == out ||
85           inst->out[1].mthd == out ||
86           inst->out[1].emit == out;
87 }
88 
89 static void
load_params(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst)90 load_params(struct mme_tu104_sim *sim,
91             const struct mme_tu104_inst *inst)
92 {
93    const bool has_load0 = inst_loads_reg(inst, MME_TU104_REG_LOAD0) ||
94                           inst_loads_out(inst, MME_TU104_OUT_OP_LOAD0);
95    const bool has_load1 = inst_loads_reg(inst, MME_TU104_REG_LOAD1) ||
96                           inst_loads_out(inst, MME_TU104_OUT_OP_LOAD1);
97    assert(has_load0 || !has_load1);
98 
99    if (has_load0) {
100       sim->load[0] = *sim->params;
101       sim->params++;
102       sim->param_count--;
103    }
104 
105    if (has_load1) {
106       sim->load[1] = *sim->params;
107       sim->params++;
108       sim->param_count--;
109    }
110 }
111 
112 static uint32_t
load_state(struct mme_tu104_sim * sim,uint16_t state)113 load_state(struct mme_tu104_sim *sim, uint16_t state)
114 {
115    assert(state % 4 == 0);
116 
117    if (NVC597_SET_MME_SHADOW_SCRATCH(0) <= state &&
118        state < NVC597_CALL_MME_MACRO(0)) {
119       uint32_t i = (state - NVC597_SET_MME_SHADOW_SCRATCH(0)) / 4;
120       assert(i <= ARRAY_SIZE(sim->scratch));
121       return sim->scratch[i];
122    }
123 
124    return 0;
125 }
126 
127 static uint32_t *
find_mem(struct mme_tu104_sim * sim,uint64_t addr,const char * op_desc)128 find_mem(struct mme_tu104_sim *sim, uint64_t addr, const char *op_desc)
129 {
130    for (uint32_t i = 0; i < sim->mem_count; i++) {
131       if (addr < sim->mems[i].addr)
132          continue;
133 
134       uint64_t offset = addr - sim->mems[i].addr;
135       if (offset >= sim->mems[i].size)
136          continue;
137 
138       assert(sim->mems[i].data != NULL);
139       return (uint32_t *)((char *)sim->mems[i].data + offset);
140    }
141 
142    fprintf(stderr, "FAULT in %s at address 0x%"PRIx64"\n", op_desc, addr);
143    abort();
144 }
145 
146 static void
finish_dma_read_fifo(struct mme_tu104_sim * sim)147 finish_dma_read_fifo(struct mme_tu104_sim *sim)
148 {
149    if (sim->dma.read_fifo.count == 0)
150       return;
151 
152    for (uint32_t i = 0; i < sim->dma.read_fifo.count; i++) {
153       uint32_t *src = find_mem(sim, sim->mem_addr + i * 4,
154                                "MME_DMA_READ_FIFOED");
155       assert(src != NULL);
156       sim->dma.read_fifo.data[i] = *src;
157    }
158 
159    sim->param_count = sim->dma.read_fifo.count;
160    sim->params = sim->dma.read_fifo.data;
161 }
162 
163 static void
flush_mthd(struct mme_tu104_sim * sim)164 flush_mthd(struct mme_tu104_sim *sim)
165 {
166    if (!sim->mthd.has_mthd)
167       return;
168 
169    uint16_t mthd = sim->mthd.mthd;
170    const uint32_t *p = sim->mthd.data;
171    const uint32_t *end = sim->mthd.data + sim->mthd.data_len;
172    while (p < end) {
173       uint32_t dw_used = 1;
174       if (NVC597_SET_MME_SHADOW_SCRATCH(0) <= mthd &&
175           mthd < NVC597_CALL_MME_MACRO(0)) {
176          uint32_t i = (mthd - NVC597_SET_MME_SHADOW_SCRATCH(0)) / 4;
177          assert(i <= ARRAY_SIZE(sim->scratch));
178          sim->scratch[i] = *p;
179       } else {
180          switch (mthd) {
181          case NVC597_SET_REPORT_SEMAPHORE_A: {
182             assert(p + 4 <= end);
183             uint64_t addr = ((uint64_t)p[0] << 32) | p[1];
184             uint32_t data = p[2];
185             assert(p[3] == 0x10000000);
186             dw_used = 4;
187 
188             uint32_t *mem = find_mem(sim, addr, "SET_REPORT_SEMAPHORE");
189             *mem = data;
190             break;
191          }
192          case NVC597_SET_MME_DATA_RAM_ADDRESS:
193             sim->ram.addr = *p;
194             break;
195          case NVC597_SET_MME_MEM_ADDRESS_A:
196             assert(p + 2 <= end);
197             sim->mem_addr = ((uint64_t)p[0] << 32) | p[1];
198             dw_used = 2;
199             break;
200          case NVC597_MME_DMA_READ_FIFOED:
201             sim->dma.read_fifo.count = *p;
202             break;
203          default:
204             fprintf(stdout, "%s:\n", P_PARSE_NVC597_MTHD(mthd));
205             P_DUMP_NVC597_MTHD_DATA(stdout, mthd, *p, "    ");
206             break;
207          }
208       }
209 
210       p += dw_used;
211       assert(sim->mthd.inc == 1);
212       mthd += dw_used * 4;
213    }
214 
215    sim->mthd.has_mthd = false;
216 }
217 
/* Handles the EXTENDED ALU op.  The only extended method we know about
 * appears to be some sort of barrier required when using READ_FIFOED.
 */
static void
eval_extended(struct mme_tu104_sim *sim,
              uint32_t x, uint32_t y)
{
   assert(x == 0x1000);
   assert(y == 1);

   /* Retire the pending method, then service the DMA read */
   flush_mthd(sim);
   finish_dma_read_fifo(sim);
}
230 
231 static uint32_t
load_reg(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst,uint32_t imm_idx,enum mme_tu104_reg reg)232 load_reg(struct mme_tu104_sim *sim,
233          const struct mme_tu104_inst *inst,
234          uint32_t imm_idx, enum mme_tu104_reg reg)
235 {
236    if (reg <= MME_TU104_REG_R23) {
237       assert(sim->set_regs & BITFIELD_BIT(reg));
238       return sim->regs[reg];
239    }
240 
241    switch (reg) {
242    case MME_TU104_REG_ZERO:
243       return 0;
244    case MME_TU104_REG_IMM:
245       assert(imm_idx < 2);
246       /* Immediates are treated as signed for ALU ops */
247       return (int16_t)inst->imm[imm_idx];
248    case MME_TU104_REG_IMMPAIR:
249       assert(imm_idx < 2);
250       /* Immediates are treated as signed for ALU ops */
251       return (int16_t)inst->imm[1 - imm_idx];
252    case MME_TU104_REG_IMM32:
253       return ((uint32_t)inst->imm[0] << 16) | inst->imm[1];
254    case MME_TU104_REG_LOAD0:
255       return sim->load[0];
256    case MME_TU104_REG_LOAD1:
257       return sim->load[1];
258    default:
259       unreachable("Unhandled register type");
260    }
261 }
262 
263 static uint8_t
load_pred(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst)264 load_pred(struct mme_tu104_sim *sim,
265           const struct mme_tu104_inst *inst)
266 {
267    if (inst->pred_mode == MME_TU104_PRED_UUUU)
268       return 0xf;
269 
270    uint32_t val = load_reg(sim, inst, -1, inst->pred);
271    const char *pred = mme_tu104_pred_to_str(inst->pred_mode);
272 
273    uint8_t mask = 0;
274    for (unsigned i = 0; i < 4; i++) {
275       if (pred[i] != (val ? 'T' : 'F'))
276          mask |= BITFIELD_BIT(i);
277    }
278 
279    return mask;
280 }
281 
282 static void
store_reg(struct mme_tu104_sim * sim,enum mme_tu104_reg reg,uint32_t val)283 store_reg(struct mme_tu104_sim *sim,
284           enum mme_tu104_reg reg,
285           uint32_t val)
286 {
287    if (reg <= MME_TU104_REG_R23) {
288       sim->set_regs |= BITFIELD_BIT(reg);
289       sim->regs[reg] = val;
290    } else if (reg <= MME_TU104_REG_ZERO) {
291       /* Do nothing */
292    } else {
293       unreachable("Unhandled register type");
294    }
295 }
296 
297 static bool
eval_cond(enum mme_tu104_alu_op op,uint32_t x,uint32_t y)298 eval_cond(enum mme_tu104_alu_op op, uint32_t x, uint32_t y)
299 {
300    switch (op) {
301    case MME_TU104_ALU_OP_BLT:
302    case MME_TU104_ALU_OP_SLT:
303       return (int32_t)x < (int32_t)y;
304    case MME_TU104_ALU_OP_BLTU:
305    case MME_TU104_ALU_OP_SLTU:
306       return (uint32_t)x < (uint32_t)y;
307    case MME_TU104_ALU_OP_BLE:
308    case MME_TU104_ALU_OP_SLE:
309       return (int32_t)x <= (int32_t)y;
310    case MME_TU104_ALU_OP_BLEU:
311    case MME_TU104_ALU_OP_SLEU:
312       return (uint32_t)x <= (uint32_t)y;
313    case MME_TU104_ALU_OP_BEQ:
314    case MME_TU104_ALU_OP_SEQ:
315       return x == y;
316    default:
317       unreachable("Not a comparison op");
318    }
319 }
320 
321 static void
eval_alu(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst,uint32_t alu_idx)322 eval_alu(struct mme_tu104_sim *sim,
323          const struct mme_tu104_inst *inst,
324          uint32_t alu_idx)
325 {
326    const struct mme_tu104_alu *alu = &inst->alu[alu_idx];
327    const uint32_t x = load_reg(sim, inst, alu_idx, alu->src[0]);
328    const uint32_t y = load_reg(sim, inst, alu_idx, alu->src[1]);
329 
330    uint32_t res = 0;
331    switch (inst->alu[alu_idx].op) {
332    case MME_TU104_ALU_OP_ADD:
333       res = x + y;
334       sim->alu_carry = res < x;
335       break;
336    case MME_TU104_ALU_OP_ADDC:
337       assert(alu_idx == 1);
338       assert(inst->alu[0].op == MME_TU104_ALU_OP_ADD);
339       res = x + y + sim->alu_carry;
340       break;
341    case MME_TU104_ALU_OP_SUB:
342       res = x - y;
343       sim->alu_carry = res > x;
344       break;
345    case MME_TU104_ALU_OP_SUBB:
346       assert(alu_idx == 1);
347       assert(inst->alu[0].op == MME_TU104_ALU_OP_SUB);
348       res = x - y - sim->alu_carry;
349       break;
350    case MME_TU104_ALU_OP_MUL: {
351       /* Sign extend but use uint64_t for the multiply so that we avoid
352        * undefined behavior from possible signed multiply roll-over.
353        */
354       const uint64_t x_u64 = (int64_t)(int32_t)x;
355       const uint64_t y_u64 = (int64_t)(int32_t)y;
356       const uint64_t xy_u64 = x_u64 * y_u64;
357       res = xy_u64;
358       sim->alu_carry = xy_u64 >> 32;
359       break;
360    }
361    case MME_TU104_ALU_OP_MULH:
362       assert(inst->alu[alu_idx].src[0] == MME_TU104_REG_ZERO);
363       assert(inst->alu[alu_idx].src[1] == MME_TU104_REG_ZERO);
364       res = sim->alu_carry;
365       break;
366    case MME_TU104_ALU_OP_MULU: {
367       const uint64_t x_u64 = x;
368       const uint64_t y_u64 = y;
369       const uint64_t xy_u64 = x_u64 * y_u64;
370       res = xy_u64;
371       sim->alu_carry = xy_u64 >> 32;
372       break;
373    }
374    case MME_TU104_ALU_OP_EXTENDED:
375       eval_extended(sim, x, y);
376       break;
377    case MME_TU104_ALU_OP_CLZ:
378       res = __builtin_clz(x);
379       break;
380    case MME_TU104_ALU_OP_SLL:
381       res = x << (y & 31);
382       break;
383    case MME_TU104_ALU_OP_SRL:
384       res = x >> (y & 31);
385       break;
386    case MME_TU104_ALU_OP_SRA:
387       res = (int32_t)x >> (y & 31);
388       break;
389    case MME_TU104_ALU_OP_AND:
390       res = x & y;
391       break;
392    case MME_TU104_ALU_OP_NAND:
393       res = ~(x & y);
394       break;
395    case MME_TU104_ALU_OP_OR:
396       res = x | y;
397       break;
398    case MME_TU104_ALU_OP_XOR:
399       res = x ^ y;
400       break;
401    case MME_TU104_ALU_OP_MERGE: {
402       uint16_t immed = inst->imm[alu_idx];
403       uint32_t dst_pos  = (immed >> 10) & 0x3f;
404       uint32_t bits     = (immed >> 5)  & 0x1f;
405       uint32_t src_pos  = (immed >> 0)  & 0x1f;
406       res = (x & ~(BITFIELD_MASK(bits) << dst_pos)) |
407             (((y >> src_pos) & BITFIELD_MASK(bits)) << dst_pos);
408       break;
409    }
410    case MME_TU104_ALU_OP_SLT:
411    case MME_TU104_ALU_OP_SLTU:
412    case MME_TU104_ALU_OP_SLE:
413    case MME_TU104_ALU_OP_SLEU:
414    case MME_TU104_ALU_OP_SEQ:
415       res = eval_cond(inst->alu[alu_idx].op, x, y) ? ~0u : 0u;
416       break;
417    case MME_TU104_ALU_OP_STATE:
418       flush_mthd(sim);
419       res = load_state(sim, (uint16_t)(x + y) * 4);
420       break;
421    case MME_TU104_ALU_OP_LOOP:
422       assert(sim->loop_count == 0);
423       assert(inst->alu[alu_idx].dst == MME_TU104_REG_ZERO);
424       assert(inst->alu[alu_idx].src[1] == MME_TU104_REG_ZERO);
425       sim->loop_count = MAX2(1, x) - 1;
426       sim->loop_start = sim->ip;
427       sim->loop_end = sim->ip + inst->imm[alu_idx] - 1;
428       assert(sim->loop_end > sim->ip);
429       break;
430    case MME_TU104_ALU_OP_JAL: {
431       assert(inst->alu[alu_idx].dst == MME_TU104_REG_ZERO);
432       assert(inst->alu[alu_idx].src[0] == MME_TU104_REG_ZERO);
433       assert(inst->alu[alu_idx].src[1] == MME_TU104_REG_ZERO);
434       /* No idea what bit 15 does.  The NVIDIA blob always sets it. */
435       assert(inst->imm[alu_idx] & BITFIELD_BIT(15));
436       uint16_t offset = (inst->imm[alu_idx] & BITFIELD_MASK(15));
437       sim->next_ip = sim->ip + offset;
438       res = 0;
439       break;
440    }
441    case MME_TU104_ALU_OP_BLT:
442    case MME_TU104_ALU_OP_BLTU:
443    case MME_TU104_ALU_OP_BLE:
444    case MME_TU104_ALU_OP_BLEU:
445    case MME_TU104_ALU_OP_BEQ: {
446       assert(inst->alu[alu_idx].dst == MME_TU104_REG_ZERO);
447       bool expect = (inst->imm[alu_idx] & BITFIELD_BIT(15)) != 0;
448       if (eval_cond(inst->alu[alu_idx].op, x, y) == expect) {
449          int16_t offset = util_mask_sign_extend(inst->imm[alu_idx], 13);
450          if ((uint16_t)offset == 0xf000) {
451             sim->stop = true;
452             break;
453          }
454 
455          assert((int)sim->ip + offset >= 0);
456          assert((int)sim->ip + offset < 0x1000);
457          sim->next_ip = sim->ip + offset;
458       }
459       break;
460    }
461    case MME_TU104_ALU_OP_DREAD:
462       assert(inst->alu[alu_idx].src[1] == MME_TU104_REG_ZERO);
463       assert(x < ARRAY_SIZE(sim->ram.data));
464       res = sim->ram.data[x];
465       break;
466    case MME_TU104_ALU_OP_DWRITE:
467       assert(inst->alu[alu_idx].dst == MME_TU104_REG_ZERO);
468       assert(x < ARRAY_SIZE(sim->ram.data));
469       sim->ram.data[x] = y;
470       break;
471    default:
472       unreachable("Unhandled ALU op");
473    }
474 
475    sim->alu_res[alu_idx] = res;
476    store_reg(sim, inst->alu[alu_idx].dst, res);
477 }
478 
479 static uint32_t
load_out(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst,enum mme_tu104_out_op op)480 load_out(struct mme_tu104_sim *sim,
481          const struct mme_tu104_inst *inst,
482          enum mme_tu104_out_op op)
483 {
484    switch (op) {
485    case MME_TU104_OUT_OP_ALU0:
486       return sim->alu_res[0];
487    case MME_TU104_OUT_OP_ALU1:
488       return sim->alu_res[1];
489    case MME_TU104_OUT_OP_LOAD0:
490       return sim->load[0];
491    case MME_TU104_OUT_OP_LOAD1:
492       return sim->load[1];
493    case MME_TU104_OUT_OP_IMM0:
494       return inst->imm[0];
495    case MME_TU104_OUT_OP_IMM1:
496       return inst->imm[1];
497    case MME_TU104_OUT_OP_IMMHIGH0:
498       return inst->imm[0] >> 12;
499    case MME_TU104_OUT_OP_IMMHIGH1:
500       return inst->imm[1] >> 12;
501    case MME_TU104_OUT_OP_IMM32:
502       return ((uint32_t)inst->imm[0] << 16) | inst->imm[1];
503    default:
504       unreachable("Unhandled output op");
505    }
506 }
507 
508 static void
eval_out(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst,uint32_t out_idx)509 eval_out(struct mme_tu104_sim *sim,
510          const struct mme_tu104_inst *inst,
511          uint32_t out_idx)
512 {
513    if (inst->out[out_idx].mthd != MME_TU104_OUT_OP_NONE) {
514       uint32_t data = load_out(sim, inst, inst->out[out_idx].mthd);
515 
516       flush_mthd(sim);
517       sim->mthd.mthd = (data & 0xfff) << 2;
518       sim->mthd.inc = (data >> 12) & 0xf;
519       sim->mthd.has_mthd = true;
520       sim->mthd.data_len = 0;
521    }
522 
523    if (inst->out[out_idx].emit != MME_TU104_OUT_OP_NONE) {
524       uint32_t data = load_out(sim, inst, inst->out[out_idx].emit);
525 
526       assert(sim->mthd.data_len < ARRAY_SIZE(sim->mthd.data));
527       sim->mthd.data[sim->mthd.data_len++] = data;
528    }
529 }
530 
531 void
mme_tu104_sim(uint32_t inst_count,const struct mme_tu104_inst * insts,uint32_t param_count,const uint32_t * params,uint32_t mem_count,struct mme_tu104_sim_mem * mems)532 mme_tu104_sim(uint32_t inst_count, const struct mme_tu104_inst *insts,
533               uint32_t param_count, const uint32_t *params,
534               uint32_t mem_count, struct mme_tu104_sim_mem *mems)
535 {
536    struct mme_tu104_sim sim = {
537       .param_count = param_count,
538       .params = params,
539       .mem_count = mem_count,
540       .mems = mems,
541    };
542 
543    bool end_next = false;
544    while (true) {
545       assert(sim.ip < inst_count);
546       const struct mme_tu104_inst *inst = &insts[sim.ip];
547       sim.next_ip = sim.ip + 1;
548 
549       load_params(&sim, inst);
550 
551       uint8_t pred = load_pred(&sim, inst);
552 
553       /* No idea why the HW has this rule but it does */
554       assert(inst->alu[0].op != MME_TU104_ALU_OP_STATE ||
555              inst->alu[1].op != MME_TU104_ALU_OP_STATE);
556 
557       if (pred & BITFIELD_BIT(0))
558          eval_alu(&sim, inst, 0);
559       if (pred & BITFIELD_BIT(1))
560          eval_alu(&sim, inst, 1);
561       if (pred & BITFIELD_BIT(2))
562          eval_out(&sim, inst, 0);
563       if (pred & BITFIELD_BIT(3))
564          eval_out(&sim, inst, 1);
565 
566       if (end_next || sim.stop)
567          break;
568 
569       end_next = inst->end_next;
570 
571       if (sim.loop_count > 0 && sim.ip == sim.loop_end) {
572          sim.loop_count--;
573          sim.next_ip = sim.loop_start + 1;
574       }
575 
576       sim.ip = sim.next_ip;
577    }
578 
579    flush_mthd(&sim);
580 }
581