/*
 * Copyright (C) 2022-2023 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "genxml/gen_macros.h"
#include "decode.h"

#if PAN_ARCH >= 10
/* Limit for Mali-G610. -1 because we're not including the active frame */
#define MAX_CALL_STACK_DEPTH (8 - 1)

struct queue_ctx {
   /* Size of CSHWIF register file in 32-bit registers */
   unsigned nr_regs;

   /* CSHWIF register file */
   uint32_t *regs;

   /* Current instruction pointer (CPU pointer for convenience) */
   uint64_t *ip;

   /* Current instruction end pointer */
   uint64_t *end;

   /* Call stack. Depth=0 means root */
   struct {
      /* Link register to return to */
      uint64_t *lr;

      /* End pointer; there is a return (or exit) after it */
      uint64_t *end;
   } call_stack[MAX_CALL_STACK_DEPTH];
   uint8_t call_stack_depth;

   unsigned gpu_id;
};

static uint32_t
cs_get_u32(struct queue_ctx *qctx, uint8_t reg)
{
   assert(reg < qctx->nr_regs);
   return qctx->regs[reg];
}

static uint64_t
cs_get_u64(struct queue_ctx *qctx, uint8_t reg)
{
   return (((uint64_t)cs_get_u32(qctx, reg + 1)) << 32) | cs_get_u32(qctx, reg);
}
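
/* Illustration with made-up values: 64-bit "d" registers are pairs of
 * consecutive 32-bit registers, low word first, so with r4 = 0x89ABCDEF and
 * r5 = 0x01234567, cs_get_u64(qctx, 4) returns 0x0123456789ABCDEF.
 */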

static void
pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
                      struct queue_ctx *qctx, struct MALI_CEU_RUN_COMPUTE *I)
{
   const char *axes[4] = {"x_axis", "y_axis", "z_axis"};

   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_COMPUTE.%s #%u\n", axes[I->task_axis], I->task_increment);

   ctx->indent++;

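   /* Each *_select picks a 64-bit register pair within its bank: resource
    * tables live in r0-r7, FAUs in r8-r15, shader program descriptors in
    * r16-r23 and thread storage descriptors in r24-r31.
    */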
   unsigned reg_srt = 0 + (I->srt_select * 2);
   unsigned reg_fau = 8 + (I->fau_select * 2);
   unsigned reg_spd = 16 + (I->spd_select * 2);
   unsigned reg_tsd = 24 + (I->tsd_select * 2);

   GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");

   mali_ptr fau = cs_get_u64(qctx, reg_fau);

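   /* FAU register pairs pack a 48-bit address in the low bits; the top byte
    * (bits 56-63) is passed to pandecode_fau separately, hence the mask and
    * shift below. Bits 48-55 appear unused here.
    */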
   if (fau)
      GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");

   GENX(pandecode_shader)
   (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
             "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
   pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
   pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
   pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
   pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));

   ctx->indent--;
}

static void
pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
                   struct queue_ctx *qctx, struct MALI_CEU_RUN_IDVS *I)
{
   /* Print the instruction. Ignore the selects and the flags override
    * since we'll print them implicitly later.
    */
   fprintf(fp, "RUN_IDVS%s", I->malloc_enable ? "" : ".no_malloc");

   if (I->draw_id_register_enable)
      fprintf(fp, " r%u", I->draw_id);

   fprintf(fp, "\n");

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   uint32_t tiler_flags_raw = cs_get_u32(qctx, 56);
   tiler_flags_raw |= I->flags_override;
   pan_unpack(&tiler_flags_raw, PRIMITIVE_FLAGS, tiler_flags);

   unsigned reg_position_srt = 0;
   unsigned reg_position_fau = 8;
   unsigned reg_position_tsd = 24;

   unsigned reg_vary_srt = I->varying_srt_select ? 2 : 0;
   unsigned reg_vary_fau = I->varying_fau_select ? 10 : 8;
   unsigned reg_vary_tsd = I->varying_tsd_select ? 26 : 24;

   unsigned reg_frag_srt = I->fragment_srt_select ? 4 : 0;
   unsigned reg_frag_fau = 12;
   unsigned reg_frag_tsd = I->fragment_tsd_select ? 28 : 24;

   uint64_t position_srt = cs_get_u64(qctx, reg_position_srt);
   uint64_t vary_srt = cs_get_u64(qctx, reg_vary_srt);
   uint64_t frag_srt = cs_get_u64(qctx, reg_frag_srt);

   if (position_srt)
      GENX(pandecode_resource_tables)(ctx, position_srt, "Position resources");

   if (vary_srt)
      GENX(pandecode_resource_tables)(ctx, vary_srt, "Varying resources");

   if (frag_srt)
      GENX(pandecode_resource_tables)(ctx, frag_srt, "Fragment resources");

   mali_ptr position_fau = cs_get_u64(qctx, reg_position_fau);
   mali_ptr vary_fau = cs_get_u64(qctx, reg_vary_fau);
   mali_ptr fragment_fau = cs_get_u64(qctx, reg_frag_fau);

   if (position_fau) {
      uint64_t lo = position_fau & BITFIELD64_MASK(48);
      uint64_t hi = position_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Position FAU");
   }

   if (vary_fau) {
      uint64_t lo = vary_fau & BITFIELD64_MASK(48);
      uint64_t hi = vary_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Varying FAU");
   }

   if (fragment_fau) {
      uint64_t lo = fragment_fau & BITFIELD64_MASK(48);
      uint64_t hi = fragment_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
   }

   if (cs_get_u64(qctx, 16)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 16), "Position shader", qctx->gpu_id);
   }

   if (tiler_flags.secondary_shader) {
      uint64_t ptr = cs_get_u64(qctx, 18);

      GENX(pandecode_shader)(ctx, ptr, "Varying shader", qctx->gpu_id);
   }

   if (cs_get_u64(qctx, 20)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 20), "Fragment shader", qctx->gpu_id);
   }

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_position_tsd),
             "Position Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_position_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_vary_tsd),
             "Varying Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_vary_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_frag_tsd),
             "Fragment Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_frag_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
   pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));

   pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Instance offset: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
   pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
   pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
   pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));

   if (tiler_flags.secondary_shader)
      pandecode_log(ctx, "Varying allocation: %u\n", cs_get_u32(qctx, 48));

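   /* d50 carries the blend descriptor array: the pointer is 8-byte aligned,
    * and the low three bits are split off and passed to
    * pandecode_blend_descs separately (presumably a descriptor count).
    */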
   mali_ptr blend = cs_get_u64(qctx, 50);
   GENX(pandecode_blend_descs)(ctx, blend & ~7, blend & 7, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));

   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
   DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
   DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
   DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");

   ctx->indent--;
}

static void
pandecode_run_fragment(struct pandecode_context *ctx, struct queue_ctx *qctx,
                       struct MALI_CEU_RUN_FRAGMENT *I)
{
   ctx->indent++;

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");

   /* TODO: Tile enable map */
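   /* The FBD pointer in d40 is 64-byte aligned; the low six bits are
    * presumably flags and are masked off before the descriptor is fetched.
    */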
   GENX(pandecode_fbd)
   (ctx, cs_get_u64(qctx, 40) & ~0x3full, true, qctx->gpu_id);

   ctx->indent--;
}

static void
print_indirect(unsigned address, int16_t offset, FILE *fp)
{
   if (offset)
      fprintf(fp, "[d%u + %d]", address, offset);
   else
      fprintf(fp, "[d%u]", address);
}

static void
print_reg_tuple(unsigned base, uint16_t mask, FILE *fp)
{
   bool first_reg = true;

   u_foreach_bit(i, mask) {
      fprintf(fp, "%sr%u", first_reg ? "" : ":", base + i);
      first_reg = false;
   }

   if (mask == 0)
      fprintf(fp, "_");
}
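
/* Example with made-up operands: print_reg_tuple(4, 0b111, fp) prints
 * "r4:r5:r6", an empty mask prints "_", and print_indirect(2, 8, fp) prints
 * "[d2 + 8]", so a load below may disassemble as
 * "LOAD_MULTIPLE r4:r5:r6, [d2 + 8]".
 */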

static void
disassemble_ceu_instr(struct pandecode_context *ctx, uint64_t dword,
                      unsigned indent, bool verbose, FILE *fp,
                      struct queue_ctx *qctx)
{
   if (verbose) {
      fprintf(fp, " ");
      for (unsigned b = 0; b < 8; ++b)
         fprintf(fp, " %02x", (uint8_t)(dword >> (8 * b)));
   }

   for (int i = 0; i < indent; ++i)
      fprintf(fp, "  ");

   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)&dword;
   pan_unpack(bytes, CEU_BASE, base);

   switch (base.opcode) {
   case MALI_CEU_OPCODE_NOP: {
      pan_unpack(bytes, CEU_NOP, I);

      if (I.ignored)
         fprintf(fp, "NOP // 0x%" PRIX64 "\n", I.ignored);
      else
         fprintf(fp, "NOP\n");
      break;
   }

   case MALI_CEU_OPCODE_MOVE: {
      pan_unpack(bytes, CEU_MOVE, I);

      fprintf(fp, "MOVE d%u, #0x%" PRIX64 "\n", I.destination, I.immediate);
      break;
   }

   case MALI_CEU_OPCODE_MOVE32: {
      pan_unpack(bytes, CEU_MOVE32, I);
      fprintf(fp, "MOVE32 r%u, #0x%X\n", I.destination, I.immediate);
      break;
   }

   case MALI_CEU_OPCODE_WAIT: {
      bool first = true;
      pan_unpack(bytes, CEU_WAIT, I);
      fprintf(fp, "WAIT ");

      u_foreach_bit(i, I.slots) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, "\n");
      break;
   }

   case MALI_CEU_OPCODE_RUN_COMPUTE: {
      pan_unpack(bytes, CEU_RUN_COMPUTE, I);
      pandecode_run_compute(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CEU_OPCODE_RUN_IDVS: {
      pan_unpack(bytes, CEU_RUN_IDVS, I);
      pandecode_run_idvs(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CEU_OPCODE_RUN_FRAGMENT: {
      pan_unpack(bytes, CEU_RUN_FRAGMENT, I);
      fprintf(fp, "RUN_FRAGMENT%s\n",
              I.enable_tem ? ".tile_enable_map_enable" : "");
      pandecode_run_fragment(ctx, qctx, &I);
      break;
   }

   case MALI_CEU_OPCODE_ADD_IMMEDIATE32: {
      pan_unpack(bytes, CEU_ADD_IMMEDIATE32, I);

      fprintf(fp, "ADD_IMMEDIATE32 r%u, r%u, #%d\n", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CEU_OPCODE_ADD_IMMEDIATE64: {
      pan_unpack(bytes, CEU_ADD_IMMEDIATE64, I);

      fprintf(fp, "ADD_IMMEDIATE64 d%u, d%u, #%d\n", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CEU_OPCODE_LOAD_MULTIPLE: {
      pan_unpack(bytes, CEU_LOAD_MULTIPLE, I);

      fprintf(fp, "LOAD_MULTIPLE ");
      print_reg_tuple(I.base, I.mask, fp);
      fprintf(fp, ", ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, "\n");
      break;
   }

   case MALI_CEU_OPCODE_STORE_MULTIPLE: {
      pan_unpack(bytes, CEU_STORE_MULTIPLE, I);

      fprintf(fp, "STORE_MULTIPLE ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, ", ");
      print_reg_tuple(I.base, I.mask, fp);
      fprintf(fp, "\n");
      break;
   }

   case MALI_CEU_OPCODE_SET_SB_ENTRY: {
      pan_unpack(bytes, CEU_SET_SB_ENTRY, I);

      fprintf(fp, "SET_SB_ENTRY #%u, #%u\n", I.endpoint_entry, I.other_entry);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_ADD32: {
      pan_unpack(bytes, CEU_SYNC_ADD32, I);
      bool first = true;
      fprintf(fp, "SYNC_ADD32%s%s signal(%u), wait(",
              I.error_propagate ? ".error_propagate" : "",
              I.scope_csg ? ".csg" : ".system", I.scoreboard_slot);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ") [d%u], r%u\n", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_ADD64: {
      pan_unpack(bytes, CEU_SYNC_ADD64, I);
      bool first = true;
      fprintf(fp, "SYNC_ADD64%s%s signal(%u), wait(",
              I.error_propagate ? ".error_propagate" : "",
              I.scope_csg ? ".csg" : ".system", I.scoreboard_slot);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ") [d%u], d%u\n", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_SET32: {
      pan_unpack(bytes, CEU_SYNC_SET32, I);
      bool first = true;
      fprintf(fp, "SYNC_SET32%s%s signal(%u), wait(",
              I.error_propagate ? ".error_propagate" : "",
              I.scope_csg ? ".csg" : ".system", I.scoreboard_slot);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ") [d%u], r%u\n", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_SET64: {
      pan_unpack(bytes, CEU_SYNC_SET64, I);
      bool first = true;
      fprintf(fp, "SYNC_SET64%s%s signal(%u), wait(",
              I.error_propagate ? ".error_propagate" : "",
              I.scope_csg ? ".csg" : ".system", I.scoreboard_slot);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ") [d%u], d%u\n", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_CALL: {
      pan_unpack(bytes, CEU_CALL, I);
      fprintf(fp, "CALL d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CEU_OPCODE_JUMP: {
      pan_unpack(bytes, CEU_JUMP, I);
      fprintf(fp, "JUMP d%u, r%u\n", I.address, I.length);
      break;
   }

   case MALI_CEU_OPCODE_REQ_RESOURCE: {
      pan_unpack(bytes, CEU_REQ_RESOURCE, I);

      fprintf(fp, "REQ_RESOURCE");
      if (I.compute)
         fprintf(fp, ".compute");
      if (I.fragment)
         fprintf(fp, ".fragment");
      if (I.tiler)
         fprintf(fp, ".tiler");
      if (I.idvs)
         fprintf(fp, ".idvs");
      fprintf(fp, "\n");
      break;
   }

   case MALI_CEU_OPCODE_SYNC_WAIT32: {
      pan_unpack(bytes, CEU_SYNC_WAIT32, I);

      fprintf(fp, "SYNC_WAIT32%s%s d%u, r%u\n", I.invert ? ".gt" : ".le",
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_SYNC_WAIT64: {
      pan_unpack(bytes, CEU_SYNC_WAIT64, I);

      fprintf(fp, "SYNC_WAIT64%s%s d%u, d%u\n", I.invert ? ".gt" : ".le",
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   case MALI_CEU_OPCODE_UMIN32: {
      pan_unpack(bytes, CEU_UMIN32, I);

      fprintf(fp, "UMIN32 r%u, r%u, r%u\n", I.destination, I.source_1,
              I.source_2);
      break;
   }

   case MALI_CEU_OPCODE_BRANCH: {
      pan_unpack(bytes, CEU_BRANCH, I);

      static const char *condition[] = {
         "le", "gt", "eq", "ne", "lt", "ge", "always",
      };
      fprintf(fp, "BRANCH.%s r%u, #%d\n", condition[I.condition], I.value,
              I.offset);

      break;
   }

   case MALI_CEU_OPCODE_FLUSH_CACHE2: {
      pan_unpack(bytes, CEU_FLUSH_CACHE2, I);
      static const char *mode[] = {
         "nop",
         "clean",
         "INVALID",
         "clean_invalidate",
      };

      fprintf(fp, "FLUSH_CACHE2.%s_l2.%s_lsc%s r%u, signal(%u), wait(",
              mode[I.l2_flush_mode], mode[I.lsc_flush_mode],
              I.other_invalidate ? ".invalidate_other" : "", I.latest_flush_id,
              I.scoreboard_entry);

      bool first = true;
      u_foreach_bit(i, I.scoreboard_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }
      fprintf(fp, ")\n");
      break;
   }

   case MALI_CEU_OPCODE_FINISH_TILING: {
      pan_unpack(bytes, CEU_FINISH_TILING, I);
      fprintf(fp, "FINISH_TILING\n");
      break;
   }

   case MALI_CEU_OPCODE_FINISH_FRAGMENT: {
      pan_unpack(bytes, CEU_FINISH_FRAGMENT, I);

      bool first = true;
      fprintf(fp, "FINISH_FRAGMENT%s d%u, d%u, signal(%u), wait(",
              I.increment_fragment_completed ? ".frag_end" : "",
              I.last_heap_chunk, I.first_heap_chunk, I.scoreboard_entry);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }
      fprintf(fp, ")\n");
      break;
   }

   case MALI_CEU_OPCODE_HEAP_OPERATION: {
      pan_unpack(bytes, CEU_HEAP_OPERATION, I);
      const char *counter_names[] = {"vt_start", "vt_end", NULL, "frag_end"};
      bool first = true;
      fprintf(fp, "HEAP_OPERATION.%s signal(%u), wait(",
              counter_names[I.operation], I.scoreboard_entry);

      u_foreach_bit(i, I.wait_mask) {
         fprintf(fp, "%s%u", first ? "" : ",", i);
         first = false;
      }

      fprintf(fp, ")\n");
      break;
   }

   case MALI_CEU_OPCODE_HEAP_SET: {
      pan_unpack(bytes, CEU_HEAP_SET, I);
      fprintf(fp, "HEAP_SET d%u\n", I.address);
      break;
   }

   default: {
      fprintf(fp, "INVALID_%u 0x%" PRIX64 "\n", base.opcode, base.data);
      break;
   }
   }
}

static bool
interpret_ceu_jump(struct pandecode_context *ctx, struct queue_ctx *qctx,
                   uint64_t reg_address, uint32_t reg_length)
{
   uint32_t address_lo = qctx->regs[reg_address];
   uint32_t address_hi = qctx->regs[reg_address + 1];
   uint32_t length = qctx->regs[reg_length];

   if (length % 8) {
      fprintf(stderr, "CS call alignment error\n");
      return false;
   }

   /* Map the entire subqueue now */
   uint64_t address = ((uint64_t)address_hi << 32) | address_lo;
   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, address, length);

   qctx->ip = cs;
   qctx->end = cs + (length / 8);

   /* Skip the usual IP update */
   return true;
}

/*
 * Interpret a single instruction of the CEU, updating the register file,
 * instruction pointer, and call stack. Memory access and GPU controls are
 * ignored for now.
 *
 * Returns true if execution should continue.
 */
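
/* For example, "MOVE32 r10, #0x5" just writes 5 to regs[10] and advances ip
 * by one 64-bit word, while the RUN_* instructions fall through the default
 * case and only advance ip.
 */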
static bool
interpret_ceu_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
{
   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)qctx->ip;
   pan_unpack(bytes, CEU_BASE, base);

   assert(qctx->ip < qctx->end);

   switch (base.opcode) {
   case MALI_CEU_OPCODE_MOVE: {
      pan_unpack(bytes, CEU_MOVE, I);

      qctx->regs[I.destination + 0] = (uint32_t)I.immediate;
      qctx->regs[I.destination + 1] = (uint32_t)(I.immediate >> 32);
      break;
   }

   case MALI_CEU_OPCODE_MOVE32: {
      pan_unpack(bytes, CEU_MOVE32, I);

      qctx->regs[I.destination] = I.immediate;
      break;
   }

   case MALI_CEU_OPCODE_ADD_IMMEDIATE32: {
      pan_unpack(bytes, CEU_ADD_IMMEDIATE32, I);

      qctx->regs[I.destination] = qctx->regs[I.source] + I.immediate;
      break;
   }

   case MALI_CEU_OPCODE_ADD_IMMEDIATE64: {
      pan_unpack(bytes, CEU_ADD_IMMEDIATE64, I);

      int64_t value =
         (qctx->regs[I.source] | ((int64_t)qctx->regs[I.source + 1] << 32)) +
         I.immediate;

      qctx->regs[I.destination] = value;
      qctx->regs[I.destination + 1] = value >> 32;
      break;
   }

   case MALI_CEU_OPCODE_CALL: {
      pan_unpack(bytes, CEU_CALL, I);

      if (qctx->call_stack_depth == MAX_CALL_STACK_DEPTH) {
         fprintf(stderr, "CS call stack overflow\n");
         return false;
      }

      assert(qctx->call_stack_depth < MAX_CALL_STACK_DEPTH);

      qctx->ip++;

      /* Note: tail calls are not optimized in the hardware. */
      assert(qctx->ip <= qctx->end);

      unsigned depth = qctx->call_stack_depth++;

      qctx->call_stack[depth].lr = qctx->ip;
      qctx->call_stack[depth].end = qctx->end;

      return interpret_ceu_jump(ctx, qctx, I.address, I.length);
   }

   case MALI_CEU_OPCODE_JUMP: {
      pan_unpack(bytes, CEU_JUMP, I);

      if (qctx->call_stack_depth == 0) {
         fprintf(stderr, "Cannot jump from the entrypoint\n");
         return false;
      }

      return interpret_ceu_jump(ctx, qctx, I.address, I.length);
   }

   default:
      break;
   }

   /* Update IP first to point to the next instruction, so call doesn't
    * require special handling (even for tail calls).
    */
   qctx->ip++;

   while (qctx->ip == qctx->end) {
      /* Graceful termination */
      if (qctx->call_stack_depth == 0)
         return false;

      /* Pop off the call stack */
      unsigned old_depth = --qctx->call_stack_depth;

      qctx->ip = qctx->call_stack[old_depth].lr;
      qctx->end = qctx->call_stack[old_depth].end;
   }

   return true;
}

void
GENX(pandecode_cs)(struct pandecode_context *ctx, mali_ptr queue, uint32_t size,
                   unsigned gpu_id, uint32_t *regs)
{
   pandecode_dump_file_open(ctx);

   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, queue, size);

   /* Mali-G610 has 96 registers. Other devices are not yet supported; we can
    * make this configurable later when we encounter new Malis.
    */
   struct queue_ctx qctx = {
      .nr_regs = 96,
      .regs = regs,
      .ip = cs,
      .end = cs + (size / 8),
      .gpu_id = gpu_id,
   };

   if (size) {
      do {
         disassemble_ceu_instr(ctx, *(qctx.ip), 1 + qctx.call_stack_depth, true,
                               ctx->dump_stream, &qctx);
      } while (interpret_ceu_instr(ctx, &qctx));
   }

   fflush(ctx->dump_stream);
   pandecode_map_read_write(ctx);
}
#endif