/*
 * Copyright (C) 2022-2023 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "genxml/gen_macros.h"
#include "decode.h"

#if PAN_ARCH >= 10
/* Limit for Mali-G610. -1 because we're not including the active frame */
#define MAX_CALL_STACK_DEPTH (8 - 1)

struct queue_ctx {
   /* Size of CSHWIF register file in 32-bit registers */
   unsigned nr_regs;

   /* CSHWIF register file */
   uint32_t *regs;

   /* Current instruction pointer (CPU pointer for convenience) */
   uint64_t *ip;

   /* Current instruction end pointer */
   uint64_t *end;

   /* Call stack. Depth=0 means root */
   struct {
      /* Link register to return to */
      uint64_t *lr;

      /* End pointer, there is a return (or exit) after */
      uint64_t *end;
   } call_stack[MAX_CALL_STACK_DEPTH];
   uint8_t call_stack_depth;

   unsigned gpu_id;
};
static uint32_t
cs_get_u32(struct queue_ctx *qctx, uint8_t reg)
{
   assert(reg < qctx->nr_regs);
   return qctx->regs[reg];
}

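/* 64-bit "d" registers are a pair of adjacent 32-bit "r" registers, low word
 * first: d(n) = r(n+1):r(n).
 */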
static uint64_t
cs_get_u64(struct queue_ctx *qctx, uint8_t reg)
{
   return (((uint64_t)cs_get_u32(qctx, reg + 1)) << 32) | cs_get_u32(qctx, reg);
}

static void
pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
                      struct queue_ctx *qctx, struct MALI_CEU_RUN_COMPUTE *I)
{
   const char *axes[4] = {"x_axis", "y_axis", "z_axis"};

   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_COMPUTE.%s #%u\n", axes[I->task_axis], I->task_increment);

   ctx->indent++;

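   /* The *_select fields pick between pre-bound descriptor register pairs:
    * resource tables (SRT) at d0/d2, FAU tables at d8/d10, shader program
    * descriptors (SPD) at d16/d18 and thread storage descriptors (TSD) at
    * d24/d26.
    */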
   unsigned reg_srt = 0 + (I->srt_select * 2);
   unsigned reg_fau = 8 + (I->fau_select * 2);
   unsigned reg_spd = 16 + (I->spd_select * 2);
   unsigned reg_tsd = 24 + (I->tsd_select * 2);

   GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");

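   /* A FAU pointer packs the table address in its low 48 bits; judging by
    * the unpacking below, the word count lives in the top byte.
    */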
   mali_ptr fau = cs_get_u64(qctx, reg_fau);

   if (fau)
      GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");

   GENX(pandecode_shader)
   (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
             "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
   pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
   pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
   pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
   pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));

   ctx->indent--;
}

static void
pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
                   struct queue_ctx *qctx, struct MALI_CEU_RUN_IDVS *I)
{
   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_IDVS%s", I->malloc_enable ? "" : ".no_malloc");

   if (I->draw_id_register_enable)
      fprintf(fp, " r%u", I->draw_id);

   fprintf(fp, "\n");

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   uint32_t tiler_flags_raw = cs_get_u32(qctx, 56);
   tiler_flags_raw |= I->flags_override;
   pan_unpack(&tiler_flags_raw, PRIMITIVE_FLAGS, tiler_flags);

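   /* IDVS state is per-stage: position, varying and fragment work can each
    * select their own resource table, FAU table and thread storage
    * descriptor from fixed register pairs, with the position stage always
    * using the first pair of each group.
    */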
   unsigned reg_position_srt = 0;
   unsigned reg_position_fau = 8;
   unsigned reg_position_tsd = 24;

   unsigned reg_vary_srt = I->varying_srt_select ? 2 : 0;
   unsigned reg_vary_fau = I->varying_fau_select ? 10 : 8;
   unsigned reg_vary_tsd = I->varying_tsd_select ? 26 : 24;

   unsigned reg_frag_srt = I->fragment_srt_select ? 4 : 0;
   unsigned reg_frag_fau = 12;
   unsigned reg_frag_tsd = I->fragment_tsd_select ? 28 : 24;

   uint64_t position_srt = cs_get_u64(qctx, reg_position_srt);
   uint64_t vary_srt = cs_get_u64(qctx, reg_vary_srt);
   uint64_t frag_srt = cs_get_u64(qctx, reg_frag_srt);

   if (position_srt)
      GENX(pandecode_resource_tables)(ctx, position_srt, "Position resources");

   if (vary_srt)
      GENX(pandecode_resource_tables)(ctx, vary_srt, "Varying resources");

   if (frag_srt)
      GENX(pandecode_resource_tables)(ctx, frag_srt, "Fragment resources");

   mali_ptr position_fau = cs_get_u64(qctx, reg_position_fau);
   mali_ptr vary_fau = cs_get_u64(qctx, reg_vary_fau);
   mali_ptr fragment_fau = cs_get_u64(qctx, reg_frag_fau);

   if (position_fau) {
      uint64_t lo = position_fau & BITFIELD64_MASK(48);
      uint64_t hi = position_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Position FAU");
   }

   if (vary_fau) {
      uint64_t lo = vary_fau & BITFIELD64_MASK(48);
      uint64_t hi = vary_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Varying FAU");
   }

   if (fragment_fau) {
      uint64_t lo = fragment_fau & BITFIELD64_MASK(48);
      uint64_t hi = fragment_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
   }

   if (cs_get_u64(qctx, 16)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 16), "Position shader", qctx->gpu_id);
   }

   if (tiler_flags.secondary_shader) {
      uint64_t ptr = cs_get_u64(qctx, 18);

      GENX(pandecode_shader)(ctx, ptr, "Varying shader", qctx->gpu_id);
   }

   if (cs_get_u64(qctx, 20)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 20), "Fragment shader", qctx->gpu_id);
   }

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_position_tsd),
             "Position Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_position_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_vary_tsd),
             "Varying Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_vary_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_frag_tsd),
             "Fragment Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_frag_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
   pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));

   pandecode_log(ctx, "Vertex offset: %d\n", (int32_t)cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Instance offset: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
   pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
   pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
   pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));

   if (tiler_flags.secondary_shader)
      pandecode_log(ctx, "Varying allocation: %u\n", cs_get_u32(qctx, 48));

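   /* The blend descriptor pointer packs the descriptor count in its low
    * 3 bits, as suggested by the masking below.
    */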
   mali_ptr blend = cs_get_u64(qctx, 50);
   GENX(pandecode_blend_descs)(ctx, blend & ~7, blend & 7, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));

   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
   DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
   DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
   DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");

   ctx->indent--;
}

static void
pandecode_run_fragment(struct pandecode_context *ctx, struct queue_ctx *qctx,
                       struct MALI_CEU_RUN_FRAGMENT *I)
{
   ctx->indent++;

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");

   /* TODO: Tile enable map */
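   /* The low 6 bits of the framebuffer descriptor pointer appear to carry a
    * tag, masked off before decoding.
    */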
   GENX(pandecode_fbd)
   (ctx, cs_get_u64(qctx, 40) & ~0x3full, true, qctx->gpu_id);

   ctx->indent--;
}

static void
print_indirect(unsigned address, int16_t offset, FILE *fp)
{
   if (offset)
      fprintf(fp, "[d%u + %d]", address, offset);
   else
      fprintf(fp, "[d%u]", address);
}

static void
print_reg_tuple(unsigned base, uint16_t mask, FILE *fp)
{
   bool first_reg = true;

   u_foreach_bit(i, mask) {
      fprintf(fp, "%sr%u", first_reg ? "" : ":", base + i);
      first_reg = false;
   }

   if (mask == 0)
      fprintf(fp, "_");
}

static void
disassemble_ceu_instr(struct pandecode_context *ctx, uint64_t dword,
                      unsigned indent, bool verbose, FILE *fp,
                      struct queue_ctx *qctx)
{
   if (verbose) {
      fprintf(fp, " ");
      for (unsigned b = 0; b < 8; ++b)
         fprintf(fp, " %02x", (uint8_t)(dword >> (8 * b)));
   }

   for (int i = 0; i < indent; ++i)
      fprintf(fp, " ");

   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)&dword;
   pan_unpack(bytes, CEU_BASE, base);

   switch (base.opcode) {
   case MALI_CEU_OPCODE_NOP: {
      pan_unpack(bytes, CEU_NOP, I);

      if (I.ignored)
         fprintf(fp, "NOP // 0x%" PRIX64 "\n", I.ignored);
      else
         fprintf(fp, "NOP\n");
      break;
   }

   case MALI_CEU_OPCODE_MOVE: {
      pan_unpack(bytes, CEU_MOVE, I);

      fprintf(fp, "MOVE d%u, #0x%" PRIX64 "\n", I.destination, I.immediate);
      break;
   }

   case MALI_CEU_OPCODE_MOVE32: {
      pan_unpack(bytes, CEU_MOVE32, I);
      fprintf(fp, "MOVE32 r%u, #0x%X\n", I.destination, I.immediate);
      break;
   }

   case MALI_CEU_OPCODE_WAIT: {
      bool first = true;
      pan_unpack(bytes, CEU_WAIT, I);
      fprintf(fp, "WAIT ");

      u_foreach_bit(i, I.slots) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, "\n");
      break;
   }

   case MALI_CEU_OPCODE_RUN_COMPUTE: {
      pan_unpack(bytes, CEU_RUN_COMPUTE, I);
      pandecode_run_compute(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CEU_OPCODE_RUN_IDVS: {
      pan_unpack(bytes, CEU_RUN_IDVS, I);
      pandecode_run_idvs(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CEU_OPCODE_RUN_FRAGMENT: {
      pan_unpack(bytes, CEU_RUN_FRAGMENT, I);
      fprintf(fp, "RUN_FRAGMENT%s\n",
              I.enable_tem ? ".tile_enable_map_enable" : "");
      pandecode_run_fragment(ctx, qctx, &I);
      break;
   }

   case MALI_CEU_OPCODE_ADD_IMMEDIATE32: {
      pan_unpack(bytes, CEU_ADD_IMMEDIATE32, I);

      fprintf(fp, "ADD_IMMEDIATE32 r%u, r%u, #%d\n", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CEU_OPCODE_ADD_IMMEDIATE64: {
      pan_unpack(bytes, CEU_ADD_IMMEDIATE64, I);

      fprintf(fp, "ADD_IMMEDIATE64 d%u, d%u, #%d\n", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CEU_OPCODE_LOAD_MULTIPLE: {
      pan_unpack(bytes, CEU_LOAD_MULTIPLE, I);

      fprintf(fp, "LOAD_MULTIPLE ");
      print_reg_tuple(I.base, I.mask, fp);
      fprintf(fp, ", ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, "\n");
      break;
   }

   case MALI_CEU_OPCODE_STORE_MULTIPLE: {
      pan_unpack(bytes, CEU_STORE_MULTIPLE, I);

      fprintf(fp, "STORE_MULTIPLE ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, ", ");
      print_reg_tuple(I.base, I.mask, fp);
      fprintf(fp, "\n");
      break;
   }

   case MALI_CEU_OPCODE_SET_SB_ENTRY: {
      pan_unpack(bytes, CEU_SET_SB_ENTRY, I);

      fprintf(fp, "SET_SB_ENTRY #%u, #%u\n", I.endpoint_entry, I.other_entry);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_ADD32: {
      pan_unpack(bytes, CEU_SYNC_ADD32, I);
      bool first = true;
      fprintf(fp, "SYNC_ADD32%s%s signal(%u), wait(",
              I.error_propagate ? ".error_propagate" : "",
              I.scope_csg ? ".csg" : ".system", I.scoreboard_slot);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ") [d%u], r%u\n", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_ADD64: {
      pan_unpack(bytes, CEU_SYNC_ADD64, I);
      bool first = true;
      fprintf(fp, "SYNC_ADD64%s%s signal(%u), wait(",
              I.error_propagate ? ".error_propagate" : "",
              I.scope_csg ? ".csg" : ".system", I.scoreboard_slot);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ") [d%u], d%u\n", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_SET32: {
      pan_unpack(bytes, CEU_SYNC_SET32, I);
      bool first = true;
      fprintf(fp, "SYNC_SET32%s%s signal(%u), wait(",
              I.error_propagate ? ".error_propagate" : "",
              I.scope_csg ? ".csg" : ".system", I.scoreboard_slot);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ") [d%u], r%u\n", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_SET64: {
      pan_unpack(bytes, CEU_SYNC_SET64, I);
      bool first = true;
      fprintf(fp, "SYNC_SET64%s%s signal(%u), wait(",
              I.error_propagate ? ".error_propagate" : "",
              I.scope_csg ? ".csg" : ".system", I.scoreboard_slot);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ") [d%u], d%u\n", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_CALL: {
      pan_unpack(bytes, CEU_CALL, I);
      fprintf(fp, "CALL d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CEU_OPCODE_JUMP: {
      pan_unpack(bytes, CEU_JUMP, I);
      fprintf(fp, "JUMP d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CEU_OPCODE_REQ_RESOURCE: {
      pan_unpack(bytes, CEU_REQ_RESOURCE, I);

      fprintf(fp, "REQ_RESOURCE");
      if (I.compute)
         fprintf(fp, ".compute");
      if (I.fragment)
         fprintf(fp, ".fragment");
      if (I.tiler)
         fprintf(fp, ".tiler");
      if (I.idvs)
         fprintf(fp, ".idvs");
      fprintf(fp, "\n");
      break;
   }

   case MALI_CEU_OPCODE_SYNC_WAIT32: {
      pan_unpack(bytes, CEU_SYNC_WAIT32, I);

      fprintf(fp, "SYNC_WAIT32%s%s d%u, r%u\n", I.invert ? ".gt" : ".le",
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_WAIT64: {
      pan_unpack(bytes, CEU_SYNC_WAIT64, I);

      fprintf(fp, "SYNC_WAIT64%s%s d%u, d%u\n", I.invert ? ".gt" : ".le",
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_UMIN32: {
      pan_unpack(bytes, CEU_UMIN32, I);

      fprintf(fp, "UMIN32 r%u, r%u, r%u\n", I.destination, I.source_1,
              I.source_2);
      break;
   }

   case MALI_CEU_OPCODE_BRANCH: {
      pan_unpack(bytes, CEU_BRANCH, I);

      static const char *condition[] = {
         "le", "gt", "eq", "ne", "lt", "ge", "always",
      };
      fprintf(fp, "BRANCH.%s r%u, #%d\n", condition[I.condition], I.value,
              I.offset);

      break;
   }

   case MALI_CEU_OPCODE_FLUSH_CACHE2: {
      pan_unpack(bytes, CEU_FLUSH_CACHE2, I);
      static const char *mode[] = {
         "nop",
         "clean",
         "INVALID",
         "clean_invalidate",
      };

      fprintf(fp, "FLUSH_CACHE2.%s_l2.%s_lsc%s r%u, signal(%u), wait(",
              mode[I.l2_flush_mode], mode[I.lsc_flush_mode],
              I.other_invalidate ? ".invalidate_other" : "", I.latest_flush_id,
              I.scoreboard_entry);

      bool first = true;
      u_foreach_bit(i, I.scoreboard_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }
      fprintf(fp, ")\n");
      break;
   }

   case MALI_CEU_OPCODE_FINISH_TILING: {
      pan_unpack(bytes, CEU_FINISH_TILING, I);
      fprintf(fp, "FINISH_TILING\n");
      break;
   }

   case MALI_CEU_OPCODE_FINISH_FRAGMENT: {
      pan_unpack(bytes, CEU_FINISH_FRAGMENT, I);

      bool first = true;
      fprintf(fp, "FINISH_FRAGMENT%s d%u, d%u, signal(%u), wait(",
              I.increment_fragment_completed ? ".frag_end" : "",
              I.last_heap_chunk, I.first_heap_chunk, I.scoreboard_entry);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }
      fprintf(fp, ")\n");
      break;
   }

   case MALI_CEU_OPCODE_HEAP_OPERATION: {
      pan_unpack(bytes, CEU_HEAP_OPERATION, I);
      const char *counter_names[] = {"vt_start", "vt_end", "INVALID",
                                     "frag_end"};
      bool first = true;
      fprintf(fp, "HEAP_OPERATION.%s signal(%u), wait(",
              counter_names[I.operation], I.scoreboard_entry);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ")\n");
      break;
   }

   case MALI_CEU_OPCODE_HEAP_SET: {
      pan_unpack(bytes, CEU_HEAP_SET, I);
      fprintf(fp, "HEAP_SET d%u\n", I.address);
      break;
   }

   default: {
      fprintf(fp, "INVALID_%u 0x%" PRIX64 "\n", base.opcode, base.data);
      break;
   }
   }
}

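/*
 * For illustration, the disassembler above produces output shaped like the
 * following (not captured from a real trace; verbose byte dumps omitted):
 *
 *    MOVE d2, #0x1000
 *    MOVE32 r40, #0x10
 *    ADD_IMMEDIATE32 r41, r40, #-1
 *    WAIT 0,2
 *    RUN_FRAGMENT.tile_enable_map_enable
 */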
static bool
interpret_ceu_jump(struct pandecode_context *ctx, struct queue_ctx *qctx,
                   uint64_t reg_address, uint32_t reg_length)
{
   uint32_t address_lo = qctx->regs[reg_address];
   uint32_t address_hi = qctx->regs[reg_address + 1];
   uint32_t length = qctx->regs[reg_length];

   if (length % 8) {
      fprintf(stderr, "CS call/jump alignment error\n");
      return false;
   }

   /* Map the entire subqueue now */
   uint64_t address = ((uint64_t)address_hi << 32) | address_lo;
   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, address, length);

   qctx->ip = cs;
   qctx->end = cs + (length / 8);

   /* Skip the usual IP update */
   return true;
}

/*
 * Interpret a single instruction of the CEU, updating the register file,
 * instruction pointer, and call stack. Memory accesses and GPU controls are
 * ignored for now.
 *
 * Returns true if execution should continue.
 */
static bool
interpret_ceu_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
{
   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)qctx->ip;
   pan_unpack(bytes, CEU_BASE, base);

   assert(qctx->ip < qctx->end);

   switch (base.opcode) {
   case MALI_CEU_OPCODE_MOVE: {
      pan_unpack(bytes, CEU_MOVE, I);

      qctx->regs[I.destination + 0] = (uint32_t)I.immediate;
      qctx->regs[I.destination + 1] = (uint32_t)(I.immediate >> 32);
      break;
   }

   case MALI_CEU_OPCODE_MOVE32: {
      pan_unpack(bytes, CEU_MOVE32, I);

      qctx->regs[I.destination] = I.immediate;
      break;
   }

   case MALI_CEU_OPCODE_ADD_IMMEDIATE32: {
      pan_unpack(bytes, CEU_ADD_IMMEDIATE32, I);

      qctx->regs[I.destination] = qctx->regs[I.source] + I.immediate;
      break;
   }

   case MALI_CEU_OPCODE_ADD_IMMEDIATE64: {
      pan_unpack(bytes, CEU_ADD_IMMEDIATE64, I);

      int64_t value =
         (qctx->regs[I.source] | ((int64_t)qctx->regs[I.source + 1] << 32)) +
         I.immediate;

      qctx->regs[I.destination] = value;
      qctx->regs[I.destination + 1] = value >> 32;
      break;
   }

   case MALI_CEU_OPCODE_CALL: {
      pan_unpack(bytes, CEU_CALL, I);

      if (qctx->call_stack_depth == MAX_CALL_STACK_DEPTH) {
         fprintf(stderr, "CS call stack overflow\n");
         return false;
      }

      assert(qctx->call_stack_depth < MAX_CALL_STACK_DEPTH);

      qctx->ip++;

      /* Note: tail calls are not optimized in the hardware. */
      assert(qctx->ip <= qctx->end);

      unsigned depth = qctx->call_stack_depth++;

      qctx->call_stack[depth].lr = qctx->ip;
      qctx->call_stack[depth].end = qctx->end;

      return interpret_ceu_jump(ctx, qctx, I.address, I.length);
   }

   case MALI_CEU_OPCODE_JUMP: {
      pan_unpack(bytes, CEU_JUMP, I);

      if (qctx->call_stack_depth == 0) {
         fprintf(stderr, "Cannot jump from the entrypoint\n");
         return false;
      }

      return interpret_ceu_jump(ctx, qctx, I.address, I.length);
   }

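   /* The remaining opcodes don't touch the register file, IP or call stack;
    * memory accesses and GPU controls are ignored for now (see above).
    */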
   default:
      break;
   }

   /* Update IP first to point to the next instruction, so call doesn't
    * require special handling (even for tail calls).
    */
   qctx->ip++;

   while (qctx->ip == qctx->end) {
      /* Graceful termination */
      if (qctx->call_stack_depth == 0)
         return false;

      /* Pop off the call stack */
      unsigned old_depth = --qctx->call_stack_depth;

      qctx->ip = qctx->call_stack[old_depth].lr;
      qctx->end = qctx->call_stack[old_depth].end;
   }

   return true;
}

void
GENX(pandecode_cs)(struct pandecode_context *ctx, mali_ptr queue, uint32_t size,
                   unsigned gpu_id, uint32_t *regs)
{
   pandecode_dump_file_open(ctx);

   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, queue, size);

   /* Mali-G610 has 96 registers. Other devices are not yet supported; we can
    * make this configurable later when we encounter new Malis.
    */
   struct queue_ctx qctx = {
      .nr_regs = 96,
      .regs = regs,
      .ip = cs,
      .end = cs + (size / 8),
      .gpu_id = gpu_id,
   };

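   /* Interleave disassembly with interpretation: following CALL/JUMP targets
    * and tracking register writes lets register-indirect operands decode
    * with up-to-date values.
    */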
   if (size) {
      do {
         disassemble_ceu_instr(ctx, *(qctx.ip), 1 + qctx.call_stack_depth, true,
                               ctx->dump_stream, &qctx);
      } while (interpret_ceu_instr(ctx, &qctx));
   }

   fflush(ctx->dump_stream);
   pandecode_map_read_write(ctx);
}
#endif