1 /*
2 * Copyright © 2022 Collabora Ltd.
3 * SPDX-License-Identifier: MIT
4 */
5 #include "mme_tu104_sim.h"
6
7 #include <inttypes.h>
8
9 #include "mme_tu104.h"
10 #include "util/u_math.h"
11
12 #include "nvk_clc597.h"
13
/* Complete simulator state for one macro execution.
 *
 * One instance lives on the stack for the duration of mme_tu104_sim() and
 * models the macro engine's registers, RAM, shadow scratch, and the
 * currently-batched method write.
 */
struct mme_tu104_sim {
   /* Remaining caller-supplied parameters; consumed one dword at a time by
    * LOAD0/LOAD1 (see load_params) and re-pointed at the DMA read FIFO by
    * finish_dma_read_fifo.
    */
   uint32_t param_count;
   const uint32_t *params;

   /* Values most recently consumed into LOAD0/LOAD1 */
   uint32_t load[2];

   /* Bound memory ranges */
   uint32_t mem_count;
   struct mme_tu104_sim_mem *mems;

   /* SET_MME_MEM_ADDRESS_A/B */
   uint64_t mem_addr;

   /* RAM, accessed by DREAD/DWRITE */
   struct {
      uint32_t data[MME_TU104_DRAM_COUNT];

      /* SET_MME_MEM_RAM_ADDRESS */
      uint32_t addr;
   } ram;

   struct {
      /* Backing store for MME_DMA_READ_FIFOED; filled from mem_addr and
       * then served back through params/param_count.
       */
      struct {
         uint32_t data[1024];
         uint64_t count;
      } read_fifo;
   } dma;

   /* NVC597_SET_MME_SHADOW_SCRATCH(i) */
   uint32_t scratch[256];

   /* The method write currently being assembled by eval_out and flushed by
    * flush_mthd.
    */
   struct {
      unsigned mthd:16;     /* method offset in bytes */
      unsigned inc:4;       /* per-dword method increment (only 1 supported) */
      bool has_mthd:1;      /* true while a method is pending */
      unsigned _pad:5;
      unsigned data_len:8;  /* number of valid entries in data[] */
      uint32_t data[8];
   } mthd;

   /* Bitmask of GPRs that have been written; reads of unset GPRs assert */
   uint32_t set_regs;
   uint32_t regs[23];

   /* Results of the two ALUs for the current instruction, plus the shared
    * carry used by ADDC/SUBB/MULH.
    */
   uint32_t alu_res[2];
   uint32_t alu_carry;

   uint16_t ip;       /* instruction currently executing */
   uint16_t next_ip;  /* where to go after this instruction */
   bool stop;         /* set by a branch with the 0xf000 "stop" offset */

   /* LOOP op state: remaining iterations and the [start, end] body range */
   uint32_t loop_count;
   uint16_t loop_start;
   uint16_t loop_end;
};
67
68 static bool
inst_loads_reg(const struct mme_tu104_inst * inst,enum mme_tu104_reg reg)69 inst_loads_reg(const struct mme_tu104_inst *inst,
70 enum mme_tu104_reg reg)
71 {
72 return inst->pred == reg ||
73 inst->alu[0].src[0] == reg ||
74 inst->alu[0].src[1] == reg ||
75 inst->alu[1].src[0] == reg ||
76 inst->alu[1].src[1] == reg;
77 }
78
79 static bool
inst_loads_out(const struct mme_tu104_inst * inst,enum mme_tu104_out_op out)80 inst_loads_out(const struct mme_tu104_inst *inst,
81 enum mme_tu104_out_op out)
82 {
83 return inst->out[0].mthd == out ||
84 inst->out[0].emit == out ||
85 inst->out[1].mthd == out ||
86 inst->out[1].emit == out;
87 }
88
89 static void
load_params(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst)90 load_params(struct mme_tu104_sim *sim,
91 const struct mme_tu104_inst *inst)
92 {
93 const bool has_load0 = inst_loads_reg(inst, MME_TU104_REG_LOAD0) ||
94 inst_loads_out(inst, MME_TU104_OUT_OP_LOAD0);
95 const bool has_load1 = inst_loads_reg(inst, MME_TU104_REG_LOAD1) ||
96 inst_loads_out(inst, MME_TU104_OUT_OP_LOAD1);
97 assert(has_load0 || !has_load1);
98
99 if (has_load0) {
100 sim->load[0] = *sim->params;
101 sim->params++;
102 sim->param_count--;
103 }
104
105 if (has_load1) {
106 sim->load[1] = *sim->params;
107 sim->params++;
108 sim->param_count--;
109 }
110 }
111
112 static uint32_t
load_state(struct mme_tu104_sim * sim,uint16_t state)113 load_state(struct mme_tu104_sim *sim, uint16_t state)
114 {
115 assert(state % 4 == 0);
116
117 if (NVC597_SET_MME_SHADOW_SCRATCH(0) <= state &&
118 state < NVC597_CALL_MME_MACRO(0)) {
119 uint32_t i = (state - NVC597_SET_MME_SHADOW_SCRATCH(0)) / 4;
120 assert(i <= ARRAY_SIZE(sim->scratch));
121 return sim->scratch[i];
122 }
123
124 return 0;
125 }
126
127 static uint32_t *
find_mem(struct mme_tu104_sim * sim,uint64_t addr,const char * op_desc)128 find_mem(struct mme_tu104_sim *sim, uint64_t addr, const char *op_desc)
129 {
130 for (uint32_t i = 0; i < sim->mem_count; i++) {
131 if (addr < sim->mems[i].addr)
132 continue;
133
134 uint64_t offset = addr - sim->mems[i].addr;
135 if (offset >= sim->mems[i].size)
136 continue;
137
138 assert(sim->mems[i].data != NULL);
139 return (uint32_t *)((char *)sim->mems[i].data + offset);
140 }
141
142 fprintf(stderr, "FAULT in %s at address 0x%"PRIx64"\n", op_desc, addr);
143 abort();
144 }
145
146 static void
finish_dma_read_fifo(struct mme_tu104_sim * sim)147 finish_dma_read_fifo(struct mme_tu104_sim *sim)
148 {
149 if (sim->dma.read_fifo.count == 0)
150 return;
151
152 for (uint32_t i = 0; i < sim->dma.read_fifo.count; i++) {
153 uint32_t *src = find_mem(sim, sim->mem_addr + i * 4,
154 "MME_DMA_READ_FIFOED");
155 assert(src != NULL);
156 sim->dma.read_fifo.data[i] = *src;
157 }
158
159 sim->param_count = sim->dma.read_fifo.count;
160 sim->params = sim->dma.read_fifo.data;
161 }
162
163 static void
flush_mthd(struct mme_tu104_sim * sim)164 flush_mthd(struct mme_tu104_sim *sim)
165 {
166 if (!sim->mthd.has_mthd)
167 return;
168
169 uint16_t mthd = sim->mthd.mthd;
170 const uint32_t *p = sim->mthd.data;
171 const uint32_t *end = sim->mthd.data + sim->mthd.data_len;
172 while (p < end) {
173 uint32_t dw_used = 1;
174 if (NVC597_SET_MME_SHADOW_SCRATCH(0) <= mthd &&
175 mthd < NVC597_CALL_MME_MACRO(0)) {
176 uint32_t i = (mthd - NVC597_SET_MME_SHADOW_SCRATCH(0)) / 4;
177 assert(i <= ARRAY_SIZE(sim->scratch));
178 sim->scratch[i] = *p;
179 } else {
180 switch (mthd) {
181 case NVC597_SET_REPORT_SEMAPHORE_A: {
182 assert(p + 4 <= end);
183 uint64_t addr = ((uint64_t)p[0] << 32) | p[1];
184 uint32_t data = p[2];
185 assert(p[3] == 0x10000000);
186 dw_used = 4;
187
188 uint32_t *mem = find_mem(sim, addr, "SET_REPORT_SEMAPHORE");
189 *mem = data;
190 break;
191 }
192 case NVC597_SET_MME_DATA_RAM_ADDRESS:
193 sim->ram.addr = *p;
194 break;
195 case NVC597_SET_MME_MEM_ADDRESS_A:
196 assert(p + 2 <= end);
197 sim->mem_addr = ((uint64_t)p[0] << 32) | p[1];
198 dw_used = 2;
199 break;
200 case NVC597_MME_DMA_READ_FIFOED:
201 sim->dma.read_fifo.count = *p;
202 break;
203 default:
204 fprintf(stdout, "%s:\n", P_PARSE_NVC597_MTHD(mthd));
205 P_DUMP_NVC597_MTHD_DATA(stdout, mthd, *p, " ");
206 break;
207 }
208 }
209
210 p += dw_used;
211 assert(sim->mthd.inc == 1);
212 mthd += dw_used * 4;
213 }
214
215 sim->mthd.has_mthd = false;
216 }
217
/* Handles the EXTENDED ALU op.
 *
 * The only extended method we know about appears to be some sort of
 * barrier required when using READ_FIFOED, so flush any pending method
 * write and complete the DMA.
 */
static void
eval_extended(struct mme_tu104_sim *sim,
              uint32_t x, uint32_t y)
{
   assert(x == 0x1000);
   assert(y == 1);

   flush_mthd(sim);
   finish_dma_read_fifo(sim);
}
230
231 static uint32_t
load_reg(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst,uint32_t imm_idx,enum mme_tu104_reg reg)232 load_reg(struct mme_tu104_sim *sim,
233 const struct mme_tu104_inst *inst,
234 uint32_t imm_idx, enum mme_tu104_reg reg)
235 {
236 if (reg <= MME_TU104_REG_R23) {
237 assert(sim->set_regs & BITFIELD_BIT(reg));
238 return sim->regs[reg];
239 }
240
241 switch (reg) {
242 case MME_TU104_REG_ZERO:
243 return 0;
244 case MME_TU104_REG_IMM:
245 assert(imm_idx < 2);
246 /* Immediates are treated as signed for ALU ops */
247 return (int16_t)inst->imm[imm_idx];
248 case MME_TU104_REG_IMMPAIR:
249 assert(imm_idx < 2);
250 /* Immediates are treated as signed for ALU ops */
251 return (int16_t)inst->imm[1 - imm_idx];
252 case MME_TU104_REG_IMM32:
253 return ((uint32_t)inst->imm[0] << 16) | inst->imm[1];
254 case MME_TU104_REG_LOAD0:
255 return sim->load[0];
256 case MME_TU104_REG_LOAD1:
257 return sim->load[1];
258 default:
259 unreachable("Unhandled register type");
260 }
261 }
262
263 static uint8_t
load_pred(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst)264 load_pred(struct mme_tu104_sim *sim,
265 const struct mme_tu104_inst *inst)
266 {
267 if (inst->pred_mode == MME_TU104_PRED_UUUU)
268 return 0xf;
269
270 uint32_t val = load_reg(sim, inst, -1, inst->pred);
271 const char *pred = mme_tu104_pred_to_str(inst->pred_mode);
272
273 uint8_t mask = 0;
274 for (unsigned i = 0; i < 4; i++) {
275 if (pred[i] != (val ? 'T' : 'F'))
276 mask |= BITFIELD_BIT(i);
277 }
278
279 return mask;
280 }
281
282 static void
store_reg(struct mme_tu104_sim * sim,enum mme_tu104_reg reg,uint32_t val)283 store_reg(struct mme_tu104_sim *sim,
284 enum mme_tu104_reg reg,
285 uint32_t val)
286 {
287 if (reg <= MME_TU104_REG_R23) {
288 sim->set_regs |= BITFIELD_BIT(reg);
289 sim->regs[reg] = val;
290 } else if (reg <= MME_TU104_REG_ZERO) {
291 /* Do nothing */
292 } else {
293 unreachable("Unhandled register type");
294 }
295 }
296
297 static bool
eval_cond(enum mme_tu104_alu_op op,uint32_t x,uint32_t y)298 eval_cond(enum mme_tu104_alu_op op, uint32_t x, uint32_t y)
299 {
300 switch (op) {
301 case MME_TU104_ALU_OP_BLT:
302 case MME_TU104_ALU_OP_SLT:
303 return (int32_t)x < (int32_t)y;
304 case MME_TU104_ALU_OP_BLTU:
305 case MME_TU104_ALU_OP_SLTU:
306 return (uint32_t)x < (uint32_t)y;
307 case MME_TU104_ALU_OP_BLE:
308 case MME_TU104_ALU_OP_SLE:
309 return (int32_t)x <= (int32_t)y;
310 case MME_TU104_ALU_OP_BLEU:
311 case MME_TU104_ALU_OP_SLEU:
312 return (uint32_t)x <= (uint32_t)y;
313 case MME_TU104_ALU_OP_BEQ:
314 case MME_TU104_ALU_OP_SEQ:
315 return x == y;
316 default:
317 unreachable("Not a comparison op");
318 }
319 }
320
321 static void
eval_alu(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst,uint32_t alu_idx)322 eval_alu(struct mme_tu104_sim *sim,
323 const struct mme_tu104_inst *inst,
324 uint32_t alu_idx)
325 {
326 const struct mme_tu104_alu *alu = &inst->alu[alu_idx];
327 const uint32_t x = load_reg(sim, inst, alu_idx, alu->src[0]);
328 const uint32_t y = load_reg(sim, inst, alu_idx, alu->src[1]);
329
330 uint32_t res = 0;
331 switch (inst->alu[alu_idx].op) {
332 case MME_TU104_ALU_OP_ADD:
333 res = x + y;
334 sim->alu_carry = res < x;
335 break;
336 case MME_TU104_ALU_OP_ADDC:
337 assert(alu_idx == 1);
338 assert(inst->alu[0].op == MME_TU104_ALU_OP_ADD);
339 res = x + y + sim->alu_carry;
340 break;
341 case MME_TU104_ALU_OP_SUB:
342 res = x - y;
343 sim->alu_carry = res > x;
344 break;
345 case MME_TU104_ALU_OP_SUBB:
346 assert(alu_idx == 1);
347 assert(inst->alu[0].op == MME_TU104_ALU_OP_SUB);
348 res = x - y - sim->alu_carry;
349 break;
350 case MME_TU104_ALU_OP_MUL: {
351 /* Sign extend but use uint64_t for the multiply so that we avoid
352 * undefined behavior from possible signed multiply roll-over.
353 */
354 const uint64_t x_u64 = (int64_t)(int32_t)x;
355 const uint64_t y_u64 = (int64_t)(int32_t)y;
356 const uint64_t xy_u64 = x_u64 * y_u64;
357 res = xy_u64;
358 sim->alu_carry = xy_u64 >> 32;
359 break;
360 }
361 case MME_TU104_ALU_OP_MULH:
362 assert(inst->alu[alu_idx].src[0] == MME_TU104_REG_ZERO);
363 assert(inst->alu[alu_idx].src[1] == MME_TU104_REG_ZERO);
364 res = sim->alu_carry;
365 break;
366 case MME_TU104_ALU_OP_MULU: {
367 const uint64_t x_u64 = x;
368 const uint64_t y_u64 = y;
369 const uint64_t xy_u64 = x_u64 * y_u64;
370 res = xy_u64;
371 sim->alu_carry = xy_u64 >> 32;
372 break;
373 }
374 case MME_TU104_ALU_OP_EXTENDED:
375 eval_extended(sim, x, y);
376 break;
377 case MME_TU104_ALU_OP_CLZ:
378 res = __builtin_clz(x);
379 break;
380 case MME_TU104_ALU_OP_SLL:
381 res = x << (y & 31);
382 break;
383 case MME_TU104_ALU_OP_SRL:
384 res = x >> (y & 31);
385 break;
386 case MME_TU104_ALU_OP_SRA:
387 res = (int32_t)x >> (y & 31);
388 break;
389 case MME_TU104_ALU_OP_AND:
390 res = x & y;
391 break;
392 case MME_TU104_ALU_OP_NAND:
393 res = ~(x & y);
394 break;
395 case MME_TU104_ALU_OP_OR:
396 res = x | y;
397 break;
398 case MME_TU104_ALU_OP_XOR:
399 res = x ^ y;
400 break;
401 case MME_TU104_ALU_OP_MERGE: {
402 uint16_t immed = inst->imm[alu_idx];
403 uint32_t dst_pos = (immed >> 10) & 0x3f;
404 uint32_t bits = (immed >> 5) & 0x1f;
405 uint32_t src_pos = (immed >> 0) & 0x1f;
406 res = (x & ~(BITFIELD_MASK(bits) << dst_pos)) |
407 (((y >> src_pos) & BITFIELD_MASK(bits)) << dst_pos);
408 break;
409 }
410 case MME_TU104_ALU_OP_SLT:
411 case MME_TU104_ALU_OP_SLTU:
412 case MME_TU104_ALU_OP_SLE:
413 case MME_TU104_ALU_OP_SLEU:
414 case MME_TU104_ALU_OP_SEQ:
415 res = eval_cond(inst->alu[alu_idx].op, x, y) ? ~0u : 0u;
416 break;
417 case MME_TU104_ALU_OP_STATE:
418 flush_mthd(sim);
419 res = load_state(sim, (uint16_t)(x + y) * 4);
420 break;
421 case MME_TU104_ALU_OP_LOOP:
422 assert(sim->loop_count == 0);
423 assert(inst->alu[alu_idx].dst == MME_TU104_REG_ZERO);
424 assert(inst->alu[alu_idx].src[1] == MME_TU104_REG_ZERO);
425 sim->loop_count = MAX2(1, x) - 1;
426 sim->loop_start = sim->ip;
427 sim->loop_end = sim->ip + inst->imm[alu_idx] - 1;
428 assert(sim->loop_end > sim->ip);
429 break;
430 case MME_TU104_ALU_OP_JAL: {
431 assert(inst->alu[alu_idx].dst == MME_TU104_REG_ZERO);
432 assert(inst->alu[alu_idx].src[0] == MME_TU104_REG_ZERO);
433 assert(inst->alu[alu_idx].src[1] == MME_TU104_REG_ZERO);
434 /* No idea what bit 15 does. The NVIDIA blob always sets it. */
435 assert(inst->imm[alu_idx] & BITFIELD_BIT(15));
436 uint16_t offset = (inst->imm[alu_idx] & BITFIELD_MASK(15));
437 sim->next_ip = sim->ip + offset;
438 res = 0;
439 break;
440 }
441 case MME_TU104_ALU_OP_BLT:
442 case MME_TU104_ALU_OP_BLTU:
443 case MME_TU104_ALU_OP_BLE:
444 case MME_TU104_ALU_OP_BLEU:
445 case MME_TU104_ALU_OP_BEQ: {
446 assert(inst->alu[alu_idx].dst == MME_TU104_REG_ZERO);
447 bool expect = (inst->imm[alu_idx] & BITFIELD_BIT(15)) != 0;
448 if (eval_cond(inst->alu[alu_idx].op, x, y) == expect) {
449 int16_t offset = util_mask_sign_extend(inst->imm[alu_idx], 13);
450 if ((uint16_t)offset == 0xf000) {
451 sim->stop = true;
452 break;
453 }
454
455 assert((int)sim->ip + offset >= 0);
456 assert((int)sim->ip + offset < 0x1000);
457 sim->next_ip = sim->ip + offset;
458 }
459 break;
460 }
461 case MME_TU104_ALU_OP_DREAD:
462 assert(inst->alu[alu_idx].src[1] == MME_TU104_REG_ZERO);
463 assert(x < ARRAY_SIZE(sim->ram.data));
464 res = sim->ram.data[x];
465 break;
466 case MME_TU104_ALU_OP_DWRITE:
467 assert(inst->alu[alu_idx].dst == MME_TU104_REG_ZERO);
468 assert(x < ARRAY_SIZE(sim->ram.data));
469 sim->ram.data[x] = y;
470 break;
471 default:
472 unreachable("Unhandled ALU op");
473 }
474
475 sim->alu_res[alu_idx] = res;
476 store_reg(sim, inst->alu[alu_idx].dst, res);
477 }
478
479 static uint32_t
load_out(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst,enum mme_tu104_out_op op)480 load_out(struct mme_tu104_sim *sim,
481 const struct mme_tu104_inst *inst,
482 enum mme_tu104_out_op op)
483 {
484 switch (op) {
485 case MME_TU104_OUT_OP_ALU0:
486 return sim->alu_res[0];
487 case MME_TU104_OUT_OP_ALU1:
488 return sim->alu_res[1];
489 case MME_TU104_OUT_OP_LOAD0:
490 return sim->load[0];
491 case MME_TU104_OUT_OP_LOAD1:
492 return sim->load[1];
493 case MME_TU104_OUT_OP_IMM0:
494 return inst->imm[0];
495 case MME_TU104_OUT_OP_IMM1:
496 return inst->imm[1];
497 case MME_TU104_OUT_OP_IMMHIGH0:
498 return inst->imm[0] >> 12;
499 case MME_TU104_OUT_OP_IMMHIGH1:
500 return inst->imm[1] >> 12;
501 case MME_TU104_OUT_OP_IMM32:
502 return ((uint32_t)inst->imm[0] << 16) | inst->imm[1];
503 default:
504 unreachable("Unhandled output op");
505 }
506 }
507
508 static void
eval_out(struct mme_tu104_sim * sim,const struct mme_tu104_inst * inst,uint32_t out_idx)509 eval_out(struct mme_tu104_sim *sim,
510 const struct mme_tu104_inst *inst,
511 uint32_t out_idx)
512 {
513 if (inst->out[out_idx].mthd != MME_TU104_OUT_OP_NONE) {
514 uint32_t data = load_out(sim, inst, inst->out[out_idx].mthd);
515
516 flush_mthd(sim);
517 sim->mthd.mthd = (data & 0xfff) << 2;
518 sim->mthd.inc = (data >> 12) & 0xf;
519 sim->mthd.has_mthd = true;
520 sim->mthd.data_len = 0;
521 }
522
523 if (inst->out[out_idx].emit != MME_TU104_OUT_OP_NONE) {
524 uint32_t data = load_out(sim, inst, inst->out[out_idx].emit);
525
526 assert(sim->mthd.data_len < ARRAY_SIZE(sim->mthd.data));
527 sim->mthd.data[sim->mthd.data_len++] = data;
528 }
529 }
530
/* Runs a TU104 MME macro to completion.
 *
 * insts/inst_count:   the decoded macro program
 * params/param_count: dword parameters consumed by LOAD0/LOAD1
 * mems/mem_count:     bound GPU memory ranges for DMA and semaphores
 *
 * Execution stops when an instruction with end_next set has executed one
 * more instruction (delay slot), or when a branch requests a stop.
 */
void
mme_tu104_sim(uint32_t inst_count, const struct mme_tu104_inst *insts,
              uint32_t param_count, const uint32_t *params,
              uint32_t mem_count, struct mme_tu104_sim_mem *mems)
{
   struct mme_tu104_sim sim = {
      .param_count = param_count,
      .params = params,
      .mem_count = mem_count,
      .mems = mems,
   };

   bool end_next = false;
   while (true) {
      assert(sim.ip < inst_count);
      const struct mme_tu104_inst *inst = &insts[sim.ip];
      /* Default successor; branches/JAL/loops may overwrite next_ip */
      sim.next_ip = sim.ip + 1;

      load_params(&sim, inst);

      uint8_t pred = load_pred(&sim, inst);

      /* No idea why the HW has this rule but it does */
      assert(inst->alu[0].op != MME_TU104_ALU_OP_STATE ||
             inst->alu[1].op != MME_TU104_ALU_OP_STATE);

      /* Predicate mask bits: 0/1 gate the ALUs, 2/3 gate the outputs */
      if (pred & BITFIELD_BIT(0))
         eval_alu(&sim, inst, 0);
      if (pred & BITFIELD_BIT(1))
         eval_alu(&sim, inst, 1);
      if (pred & BITFIELD_BIT(2))
         eval_out(&sim, inst, 0);
      if (pred & BITFIELD_BIT(3))
         eval_out(&sim, inst, 1);

      if (end_next || sim.stop)
         break;

      /* end_next acts as a one-instruction delay slot before stopping */
      end_next = inst->end_next;

      /* Loop back-edge: at loop_end, jump to the instruction after
       * loop_start until the iteration count is exhausted.
       */
      if (sim.loop_count > 0 && sim.ip == sim.loop_end) {
         sim.loop_count--;
         sim.next_ip = sim.loop_start + 1;
      }

      sim.ip = sim.next_ip;
   }

   /* Emit whatever method write is still pending at exit */
   flush_mthd(&sim);
}
581