/*
 * Copyright © 2021 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
23
24 #include <assert.h>
25 #include <ctype.h>
26 #include <errno.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/mman.h>
31 #include <unistd.h>
32
33 #include "util/u_math.h"
34
35 #include "freedreno_pm4.h"
36
37 #include "isaspec.h"
38
39 #include "emu.h"
40 #include "util.h"
41
/*
 * Rotate-left helpers.  The rotate amount is masked to the type width so
 * that r == 0 (or a multiple of the width) does not perform an undefined
 * shift by >= the operand width, and the rotl64() operand is widened to
 * uint64_t so both shifts are actually done in 64 bits (the original
 * macros invoked undefined behavior in both cases; see C11 6.5.7p3).
 */
#define rotl32(x,r) (((uint32_t)(x) << ((r) & 31)) | ((uint32_t)(x) >> ((32 - (r)) & 31)))
#define rotl64(x,r) (((uint64_t)(x) << ((r) & 63)) | ((uint64_t)(x) >> ((64 - (r)) & 63)))
44
/* Named register handles used by the opcode handlers below: the call
 * stack pointer and stack slots live in SQE-private registers, and
 * DRAW_STATE_SET_HDR is watched by the CWRITE handler.
 */
EMU_SQE_REG(SP);
EMU_SQE_REG(STACK0);
EMU_CONTROL_REG(DRAW_STATE_SET_HDR);
48
/**
 * AFUC emulator. Currently only supports a6xx.
 *
 * TODO: to add a5xx it might be easier to compile this multiple times
 * with conditional compilation to deal with differences between generations.
 */
55
static uint32_t
emu_alu(struct emu *emu, afuc_opc opc, uint32_t src1, uint32_t src2)
{
   /* Evaluate a single ALU opcode.  Carry in/out for the 64b add/sub
    * instruction pairs is threaded through emu->carry.
    */
   uint64_t tmp;
   switch (opc) {
   case OPC_ADD:
      /* 64b intermediate so the carry-out is visible in bit 32: */
      tmp = (uint64_t)src1 + (uint64_t)src2;
      emu->carry = tmp >> 32;
      return (uint32_t)tmp;
   case OPC_ADDHI:
      /* High half of a 64b add; consumes carry from a preceding ADD: */
      return src1 + src2 + emu->carry;
   case OPC_SUB:
      /* On borrow the 64b difference wraps, leaving ~0 in the upper
       * word, so carry acts as a -1 borrow for SUBHI:
       */
      tmp = (uint64_t)src1 - (uint64_t)src2;
      emu->carry = tmp >> 32;
      return (uint32_t)tmp;
   case OPC_SUBHI:
      /* High half of a 64b subtract; carry is 0 or ~0 (borrow): */
      return src1 - src2 + emu->carry;
   case OPC_AND:
      return src1 & src2;
   case OPC_OR:
      return src1 | src2;
   case OPC_XOR:
      return src1 ^ src2;
   case OPC_NOT:
      /* Unary; src2 is ignored: */
      return ~src1;
   case OPC_SHL:
      return src1 << src2;
   case OPC_USHR:
      /* Logical (unsigned) shift right: */
      return src1 >> src2;
   case OPC_ISHR:
      /* Arithmetic (sign-extending) shift right: */
      return (int32_t)src1 >> src2;
   case OPC_ROT:
      /* A negative rotate amount selects a 64b rotate instead --
       * NOTE(review): presumably mirrors hw behavior, confirm against
       * the afuc ISA notes:
       */
      if (src2 & 0x80000000)
         return rotl64(src1, -*(int32_t *)&src2);
      else
         return rotl32(src1, src2);
   case OPC_MUL8:
      /* 8b x 8b multiply of the low bytes of each source: */
      return (src1 & 0xff) * (src2 & 0xff);
   case OPC_MIN:
      return MIN2(src1, src2);
   case OPC_MAX:
      return MAX2(src1, src2);
   case OPC_CMP:
      /* Produces a magic flags value consumed by conditional branches.
       * NOTE(review): the 0x00/0x2b/0x1e encodings match observed hw
       * behavior -- confirm against the firmware disassembly:
       */
      if (src1 > src2)
         return 0x00;
      else if (src1 == src2)
         return 0x2b;
      return 0x1e;
   case OPC_BIC:
      /* Bit-clear: src1 AND NOT src2: */
      return src1 & ~src2;
   case OPC_MSB:
      /* Index of the most-significant set bit of src2 (0 if none): */
      if (!src2)
         return 0;
      return util_last_bit(src2) - 1;
   case OPC_SETBIT: {
      /* src2 packs the bit index in its upper bits and the new bit
       * value in its LSB:
       */
      unsigned bit = src2 >> 1;
      unsigned val = src2 & 1;
      return (src1 & ~(1u << bit)) | (val << bit);
   }
   default:
      printf("unhandled alu opc: 0x%02x\n", opc);
      exit(1);
   }
}
120
121 /**
122 * Helper to calculate load/store address based on LOAD_STORE_HI
123 */
124 static uintptr_t
load_store_addr(struct emu * emu,unsigned gpr)125 load_store_addr(struct emu *emu, unsigned gpr)
126 {
127 EMU_CONTROL_REG(LOAD_STORE_HI);
128
129 uintptr_t addr = emu_get_reg32(emu, &LOAD_STORE_HI);
130 addr <<= 32;
131
132 return addr + emu_get_gpr_reg(emu, gpr);
133 }
134
static void
emu_instr(struct emu *emu, struct afuc_instr *instr)
{
   /* Execute one (iteration of one) decoded instruction.  (rep)
    * looping and branch-delay-slot sequencing live in emu_step();
    * branch opcodes here only record emu->branch_target.
    */
   uint32_t rem = emu_get_gpr_reg(emu, REG_REM);

   switch (instr->opc) {
   case OPC_NOP:
      break;
   case OPC_MSB:
   case OPC_ADD ... OPC_BIC: {
      /* Two-source ALU op; the second source is either an immediate
       * or a GPR:
       */
      uint32_t val = emu_alu(emu, instr->opc,
                             emu_get_gpr_reg(emu, instr->src1),
                             instr->has_immed ? instr->immed :
                             emu_get_gpr_reg(emu, instr->src2));
      emu_set_gpr_reg(emu, instr->dst, val);

      if (instr->xmov) {
         /* (xmovN) modifier: transfer up to N further dwords from the
          * src2 "pipe" register, decrementing $rem before each one.
          * NOTE(review): in the m==3 case the middle transfer targets
          * instr->dst rather than $data -- this appears deliberate
          * (matching documented afuc behavior) but confirm against
          * the afuc ISA notes.
          */
         unsigned m = MIN2(instr->xmov, rem);

         assert(m <= 3);

         if (m == 1) {
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
         } else if (m == 2) {
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
         } else if (m == 3) {
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, instr->dst,
                            emu_get_gpr_reg(emu, instr->src2));
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
         }
      }
      break;
   }
   case OPC_MOVI: {
      /* Move shifted immediate into a GPR: */
      uint32_t val = instr->immed << instr->shift;
      emu_set_gpr_reg(emu, instr->dst, val);
      break;
   }
   case OPC_SETBITI: {
      /* Set a single bit (immediate bit index): */
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);
      emu_set_gpr_reg(emu, instr->dst, src | (1u << instr->bit));
      break;
   }
   case OPC_CLRBIT: {
      /* Clear a single bit (immediate bit index): */
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);
      emu_set_gpr_reg(emu, instr->dst, src & ~(1u << instr->bit));
      break;
   }
   case OPC_UBFX: {
      /* Unsigned bitfield extract of bits [lo..hi]: */
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);
      unsigned lo = instr->bit, hi = instr->immed;
      uint32_t dst = (src >> lo) & BITFIELD_MASK(hi - lo + 1);
      emu_set_gpr_reg(emu, instr->dst, dst);
      break;
   }
   case OPC_BFI: {
      /* Bitfield insert: OR the masked/shifted field into dst
       * (existing dst bits are not cleared first):
       */
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);
      unsigned lo = instr->bit, hi = instr->immed;
      src = (src & BITFIELD_MASK(hi - lo + 1)) << lo;
      emu_set_gpr_reg(emu, instr->dst, emu_get_gpr_reg(emu, instr->dst) | src);
      break;
   }
   case OPC_CWRITE: {
      /* Write src1 to control reg (src2 + immed), optionally
       * pre-incrementing the src2 GPR by the immediate:
       */
      uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);
      uint32_t src2 = emu_get_gpr_reg(emu, instr->src2);
      uint32_t reg = src2 + instr->immed;

      if (instr->preincrement) {
         emu_set_gpr_reg(emu, instr->src2, reg);
      }

      emu_set_control_reg(emu, reg, src1);

      /* The (sdsN) modifier updates the draw-state base(s): */
      for (unsigned i = 0; i < instr->sds; i++) {
         uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);

         /* TODO: There is likely a DRAW_STATE_SET_BASE register on a6xx, as
          * there is on a7xx, and we should be writing that instead of setting
          * the base directly.
          */
         if (reg == emu_reg_offset(&DRAW_STATE_SET_HDR))
            emu_set_draw_state_base(emu, i, src1);
      }
      break;
   }
   case OPC_CREAD: {
      /* Read control reg (src1 + immed) into dst, optionally
       * pre-incrementing the src1 GPR:
       */
      uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);

      if (instr->preincrement) {
         emu_set_gpr_reg(emu, instr->src1, src1 + instr->immed);
      }

      emu_set_gpr_reg(emu, instr->dst,
                      emu_get_control_reg(emu, src1 + instr->immed));
      break;
   }
   case OPC_SWRITE: {
      /* Write src1 to SQE-private reg (src2 + immed): */
      uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);
      uint32_t src2 = emu_get_gpr_reg(emu, instr->src2);

      if (instr->preincrement) {
         emu_set_gpr_reg(emu, instr->src2, src2 + instr->immed);
      }

      emu_set_sqe_reg(emu, src2 + instr->immed, src1);
      break;
   }
   case OPC_SREAD: {
      /* Read SQE-private reg (src1 + immed) into dst: */
      uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);

      if (instr->preincrement) {
         emu_set_gpr_reg(emu, instr->src1, src1 + instr->immed);
      }

      emu_set_gpr_reg(emu, instr->dst,
                      emu_get_sqe_reg(emu, src1 + instr->immed));
      break;
   }
   case OPC_LOAD: {
      /* Load a dword from memory at {LOAD_STORE_HI, $src1} + immed.
       * Note the address is formed before the pre-increment below:
       */
      uintptr_t addr = load_store_addr(emu, instr->src1) +
            instr->immed;

      if (instr->preincrement) {
         uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);
         emu_set_gpr_reg(emu, instr->src1, src1 + instr->immed);
      }

      uint32_t val = emu_mem_read_dword(emu, addr);

      emu_set_gpr_reg(emu, instr->dst, val);

      break;
   }
   case OPC_STORE: {
      /* Store $src1 to memory at {LOAD_STORE_HI, $src2} + immed: */
      uintptr_t addr = load_store_addr(emu, instr->src2) +
            instr->immed;

      if (instr->preincrement) {
         uint32_t src2 = emu_get_gpr_reg(emu, instr->src2);
         emu_set_gpr_reg(emu, instr->src2, src2 + instr->immed);
      }

      uint32_t val = emu_get_gpr_reg(emu, instr->src1);

      emu_mem_write_dword(emu, addr, val);

      break;
   }
   case OPC_BRNEI ... OPC_BREQB: {
      /* Conditional branches (compare vs immediate, or test a bit).
       * Only the target is recorded here; it is applied by emu_step()
       * after the delay-slot instruction executes:
       */
      uint32_t off = emu->gpr_regs.pc + instr->offset;
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);

      if (instr->opc == OPC_BRNEI) {
         if (src != instr->immed)
            emu->branch_target = off;
      } else if (instr->opc == OPC_BREQI) {
         if (src == instr->immed)
            emu->branch_target = off;
      } else if (instr->opc == OPC_BRNEB) {
         if (!(src & (1 << instr->bit)))
            emu->branch_target = off;
      } else if (instr->opc == OPC_BREQB) {
         if (src & (1 << instr->bit))
            emu->branch_target = off;
      } else {
         assert(0);
      }
      break;
   }
   case OPC_RET: {
      /* Pop the return PC from the SQE-reg call stack: */
      unsigned sp = emu_get_reg32(emu, &SP);
      assert(sp > 0);

      /* counter-part to 'call' instruction, also has a delay slot: */
      emu->branch_target = emu_get_sqe_reg(emu, emu_reg_offset(&STACK0) + sp - 1);
      emu_set_reg32(emu, &SP, sp - 1);

      break;
   }
   case OPC_CALL: {
      /* Push the return PC and branch to the literal target: */
      unsigned sp = emu_get_reg32(emu, &SP);
      assert(sp + emu_reg_offset(&STACK0) < ARRAY_SIZE(emu->sqe_regs.val));

      /* call looks to have same delay-slot behavior as branch/etc, so
       * presumably the return PC is two instructions later:
       */
      emu_set_sqe_reg(emu, emu_reg_offset(&STACK0) + sp, emu->gpr_regs.pc + 2);
      emu_set_reg32(emu, &SP, sp + 1);
      emu->branch_target = instr->literal;

      break;
   }
   case OPC_WAITIN: {
      /* Stop executing and wait for the next packet; emu_step() does
       * the packet-table dispatch when it sees waitin set:
       */
      assert(!emu->branch_target);
      emu->run_mode = false;
      emu->waitin = true;
      break;
   }
   /* OPC_PREEMPTLEAVE6 */
   case OPC_SETSECURE: {
      // TODO this acts like a conditional branch, but in which case
      // does it branch?
      break;
   }
   default:
      printf("unhandled opc: 0x%02x\n", instr->opc);
      exit(1);
   }

   /* (rep) instructions consume one count of $rem per iteration: */
   if (instr->rep) {
      assert(rem > 0);
      emu_set_gpr_reg(emu, REG_REM, --rem);
   }
}
368
void
emu_step(struct emu *emu)
{
   /* Fetch/decode/execute a single instruction (including all of its
    * (rep) iterations), then apply any pending delay-slot branch and,
    * after a waitin, dispatch via the jump table to the handler for
    * the next packet (whose header is in $01).
    */
   struct afuc_instr *instr;
   bool decoded = isa_decode((void *)&instr,
                             (void *)&emu->instrs[emu->gpr_regs.pc],
                             &(struct isa_decode_options) {
                                 .gpu_id = gpuver,
                             });

   if (!decoded) {
      uint32_t instr_val = emu->instrs[emu->gpr_regs.pc];
      if ((instr_val >> 27) == 0) {
         /* This is printed as an undecoded literal to show the immediate
          * payload, but when executing it's just a NOP.
          */
         instr = calloc(1, sizeof(struct afuc_instr));
         instr->opc = OPC_NOP;
      } else {
         printf("unmatched instruction: 0x%08x\n", instr_val);
         exit(1);
      }
   }

   emu_main_prompt(emu);

   /* Consume a branch target recorded by the *previous* instruction;
    * it takes effect only after this (delay slot) instruction runs:
    */
   uint32_t branch_target = emu->branch_target;
   emu->branch_target = 0;

   /* Same for waitin -- packet dispatch happens after the delay slot: */
   bool waitin = emu->waitin;
   emu->waitin = false;

   if (instr->rep) {
      /* (rep) re-executes the instruction until $rem reaches zero
       * (emu_instr() decrements $rem once per iteration):
       */
      do {
         if (!emu_get_gpr_reg(emu, REG_REM))
            break;

         emu_clear_state_change(emu);
         emu_instr(emu, instr);

         /* defer last state-change dump until after any
          * post-delay-slot handling below:
          */
         if (emu_get_gpr_reg(emu, REG_REM))
            emu_dump_state_change(emu);
      } while (true);
   } else {
      emu_clear_state_change(emu);
      emu_instr(emu, instr);
   }

   emu->gpr_regs.pc++;

   /* Pending branch from the previous instruction wins over pc++: */
   if (branch_target) {
      emu->gpr_regs.pc = branch_target;
   }

   if (waitin) {
      /* $01 holds the next packet header; jump to its handler with
       * $rem set to the payload size:
       */
      uint32_t hdr = emu_get_gpr_reg(emu, 1);
      uint32_t id, count;

      if (pkt_is_type4(hdr)) {
         id = afuc_pm4_id("PKT4");
         count = type4_pkt_size(hdr);

         /* Possibly a hack, not sure what the hw actually
          * does here, but we want to mask out the pkt
          * type field from the hdr, so that PKT4 handler
          * doesn't see it and interpret it as part as the
          * register offset:
          */
         emu->gpr_regs.val[1] &= 0x0fffffff;
      } else if (pkt_is_type7(hdr)) {
         id = cp_type7_opcode(hdr);
         count = type7_pkt_size(hdr);
      } else {
         printf("Invalid opcode: 0x%08x\n", hdr);
         exit(1); /* GPU goes *boom* */
      }

      assert(id < ARRAY_SIZE(emu->jmptbl));

      emu_set_gpr_reg(emu, REG_REM, count);
      emu->gpr_regs.pc = emu->jmptbl[id];
   }

   emu_dump_state_change(emu);

   free(instr);
}
459
460 void
emu_run_bootstrap(struct emu * emu)461 emu_run_bootstrap(struct emu *emu)
462 {
463 EMU_CONTROL_REG(PACKET_TABLE_WRITE_ADDR);
464
465 emu->quiet = true;
466 emu->run_mode = true;
467
468 while (emu_get_reg32(emu, &PACKET_TABLE_WRITE_ADDR) < 0x80) {
469 emu_step(emu);
470 }
471 }
472
473
474 static void
check_access(struct emu * emu,uintptr_t gpuaddr,unsigned sz)475 check_access(struct emu *emu, uintptr_t gpuaddr, unsigned sz)
476 {
477 if ((gpuaddr % sz) != 0) {
478 printf("unaligned access fault: %p\n", (void *)gpuaddr);
479 exit(1);
480 }
481
482 if ((gpuaddr + sz) >= EMU_MEMORY_SIZE) {
483 printf("iova fault: %p\n", (void *)gpuaddr);
484 exit(1);
485 }
486 }
487
488 uint32_t
emu_mem_read_dword(struct emu * emu,uintptr_t gpuaddr)489 emu_mem_read_dword(struct emu *emu, uintptr_t gpuaddr)
490 {
491 check_access(emu, gpuaddr, 4);
492 return *(uint32_t *)(emu->gpumem + gpuaddr);
493 }
494
495 static void
mem_write_dword(struct emu * emu,uintptr_t gpuaddr,uint32_t val)496 mem_write_dword(struct emu *emu, uintptr_t gpuaddr, uint32_t val)
497 {
498 check_access(emu, gpuaddr, 4);
499 *(uint32_t *)(emu->gpumem + gpuaddr) = val;
500 }
501
void
emu_mem_write_dword(struct emu *emu, uintptr_t gpuaddr, uint32_t val)
{
   /* External entry point for dword writes: in addition to the raw
    * write, record the written address so the state-change dump can
    * report it.  The assert checks that no earlier tracked write is
    * still pending -- presumably gpumem_written is reset to ~0 when
    * the state change is cleared/dumped; confirm in emu.h / the
    * state-change helpers.
    */
   mem_write_dword(emu, gpuaddr, val);
   assert(emu->gpumem_written == ~0);
   emu->gpumem_written = gpuaddr;
}
509
510 void
emu_init(struct emu * emu)511 emu_init(struct emu *emu)
512 {
513 emu->gpumem = mmap(NULL, EMU_MEMORY_SIZE,
514 PROT_READ | PROT_WRITE,
515 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
516 0, 0);
517 if (emu->gpumem == MAP_FAILED) {
518 printf("Could not allocate GPU memory: %s\n", strerror(errno));
519 exit(1);
520 }
521
522 /* Copy the instructions into GPU memory: */
523 for (unsigned i = 0; i < emu->sizedwords; i++) {
524 mem_write_dword(emu, EMU_INSTR_BASE + (4 * i), emu->instrs[i]);
525 }
526
527 EMU_GPU_REG(CP_SQE_INSTR_BASE);
528 EMU_GPU_REG(CP_LPAC_SQE_INSTR_BASE);
529 EMU_CONTROL_REG(BV_INSTR_BASE);
530 EMU_CONTROL_REG(LPAC_INSTR_BASE);
531
532 /* Setup the address of the SQE fw, just use the normal CPU ptr address: */
533 switch (emu->processor) {
534 case EMU_PROC_SQE:
535 emu_set_reg64(emu, &CP_SQE_INSTR_BASE, EMU_INSTR_BASE);
536 break;
537 case EMU_PROC_BV:
538 emu_set_reg64(emu, &BV_INSTR_BASE, EMU_INSTR_BASE);
539 break;
540 case EMU_PROC_LPAC:
541 if (gpuver >= 7)
542 emu_set_reg64(emu, &LPAC_INSTR_BASE, EMU_INSTR_BASE);
543 else
544 emu_set_reg64(emu, &CP_LPAC_SQE_INSTR_BASE, EMU_INSTR_BASE);
545 break;
546 }
547
548 if (emu->gpu_id == 730) {
549 emu_set_control_reg(emu, 0xef, 1 << 21);
550 emu_set_control_reg(emu, 0, 7 << 28);
551 } else if (emu->gpu_id == 660) {
552 emu_set_control_reg(emu, 0, 3 << 28);
553 } else if (emu->gpu_id == 650) {
554 emu_set_control_reg(emu, 0, 1 << 28);
555 }
556 }
557
558 void
emu_fini(struct emu * emu)559 emu_fini(struct emu *emu)
560 {
561 uint32_t *instrs = emu->instrs;
562 unsigned sizedwords = emu->sizedwords;
563 unsigned gpu_id = emu->gpu_id;
564
565 munmap(emu->gpumem, EMU_MEMORY_SIZE);
566 memset(emu, 0, sizeof(*emu));
567
568 emu->instrs = instrs;
569 emu->sizedwords = sizedwords;
570 emu->gpu_id = gpu_id;
571 }
572