/*
 * Copyright (C) 2022-2023 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "util/bitset.h"
#include "util/hash_table.h"
#include "util/list.h"
#include "util/ralloc.h"

#include "genxml/gen_macros.h"
#include "decode.h"

#if PAN_ARCH >= 10

#include "genxml/cs_builder.h"

/* Limit for Mali-G610. -1 because we're not including the active frame */
#define MAX_CALL_STACK_DEPTH (8 - 1)

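/* Convenience wrapper: cast `packed` to the packed form of descriptor type T
 * and unpack it into a local struct named by the last argument.
 */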
#define cs_unpack(packed, T, unpacked) pan_cast_and_unpack(packed, T, unpacked)

struct queue_ctx {
   /* Size of CSHWIF register file in 32-bit registers */
   unsigned nr_regs;

   /* CSHWIF register file */
   uint32_t *regs;

   /* Current instruction pointer (CPU pointer for convenience) */
   uint64_t *ip;

   /* Current instruction end pointer */
   uint64_t *end;

   /* Whether currently inside an exception handler */
   bool in_exception_handler;

   /* Call stack. Depth=0 means root */
   struct {
      /* Link register to return to */
      uint64_t *lr;

      /* End pointer, there is a return (or exit) after */
      uint64_t *end;
   } call_stack[MAX_CALL_STACK_DEPTH + 1]; /* +1 for exception handler */
   uint8_t call_stack_depth;

   unsigned gpu_id;
};

static void
print_indirect(unsigned address, int16_t offset, FILE *fp)
{
   if (offset)
      fprintf(fp, "[d%u + %d]", address, offset);
   else
      fprintf(fp, "[d%u]", address);
}

static void
print_reg_tuple(unsigned base, uint16_t mask, FILE *fp)
{
   bool first_reg = true;

   u_foreach_bit(i, mask) {
      fprintf(fp, "%sr%u", first_reg ? "" : ":", base + i);
      first_reg = false;
   }

   if (mask == 0)
      fprintf(fp, "_");
}

static const char *conditions_str[] = {
   "le", "gt", "eq", "ne", "lt", "ge", "always",
};

static void
print_cs_instr(FILE *fp, const uint64_t *instr)
{
   cs_unpack(instr, CS_BASE, base);
   switch (base.opcode) {
   case MALI_CS_OPCODE_NOP: {
      cs_unpack(instr, CS_NOP, I);
      if (I.ignored)
         fprintf(fp, "NOP // 0x%" PRIX64, I.ignored);
      else
         fprintf(fp, "NOP");
      break;
   }

   case MALI_CS_OPCODE_MOVE: {
      cs_unpack(instr, CS_MOVE, I);
      fprintf(fp, "MOVE d%u, #0x%" PRIX64, I.destination, I.immediate);
      break;
   }

   case MALI_CS_OPCODE_MOVE32: {
      cs_unpack(instr, CS_MOVE32, I);
      fprintf(fp, "MOVE32 r%u, #0x%X", I.destination, I.immediate);
      break;
   }

   case MALI_CS_OPCODE_WAIT: {
      cs_unpack(instr, CS_WAIT, I);
      fprintf(fp, "WAIT%s #%x", I.progress_increment ? ".progress_inc" : "",
              I.wait_mask);
      break;
   }

   case MALI_CS_OPCODE_RUN_COMPUTE: {
      const char *axes[4] = {"x_axis", "y_axis", "z_axis", "unknown"};
      cs_unpack(instr, CS_RUN_COMPUTE, I);

      /* Print the instruction. Ignore the selects and the flags override
       * since we'll print them implicitly later.
       */
      fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u",
              I.progress_increment ? ".progress_inc" : "", axes[I.task_axis],
              I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
              I.task_increment);
      break;
   }

   case MALI_CS_OPCODE_RUN_TILING: {
      cs_unpack(instr, CS_RUN_TILING, I);
      fprintf(fp, "RUN_TILING%s.srt%d.spd%d.tsd%d.fau%d",
              I.progress_increment ? ".progress_inc" : "", I.srt_select,
              I.spd_select, I.tsd_select, I.fau_select);
      break;
   }

   case MALI_CS_OPCODE_RUN_IDVS: {
      cs_unpack(instr, CS_RUN_IDVS, I);
      fprintf(
         fp,
         "RUN_IDVS%s%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%x",
         I.progress_increment ? ".progress_inc" : "",
         I.malloc_enable ? "" : ".no_malloc",
         I.draw_id_register_enable ? ".draw_id_enable" : "",
         I.varying_srt_select, I.varying_fau_select, I.varying_tsd_select,
         I.fragment_srt_select, I.fragment_tsd_select, I.draw_id,
         I.flags_override);
      break;
   }

   case MALI_CS_OPCODE_RUN_FRAGMENT: {
      static const char *tile_order[] = {
         "zorder",  "horizontal",     "vertical",     "unknown",
         "unknown", "rev_horizontal", "rev_vertical", "unknown",
         "unknown", "unknown",        "unknown",      "unknown",
         "unknown", "unknown",        "unknown",      "unknown",
      };
      cs_unpack(instr, CS_RUN_FRAGMENT, I);

      fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s",
              I.progress_increment ? ".progress_inc" : "",
              I.enable_tem ? ".tile_enable_map_enable" : "",
              tile_order[I.tile_order]);
      break;
   }

   case MALI_CS_OPCODE_RUN_FULLSCREEN: {
      cs_unpack(instr, CS_RUN_FULLSCREEN, I);
      fprintf(fp, "RUN_FULLSCREEN%s r%u, #%x",
              I.progress_increment ? ".progress_inc" : "", I.dcd,
              I.flags_override);
      break;
   }

   case MALI_CS_OPCODE_FINISH_TILING: {
      cs_unpack(instr, CS_FINISH_TILING, I);
      fprintf(fp, "FINISH_TILING%s",
              I.progress_increment ? ".progress_inc" : "");
      break;
   }

   case MALI_CS_OPCODE_FINISH_FRAGMENT: {
      cs_unpack(instr, CS_FINISH_FRAGMENT, I);
      fprintf(fp, "FINISH_FRAGMENT%s d%u, d%u, #%x, #%u",
              I.increment_fragment_completed ? ".frag_end" : "",
              I.last_heap_chunk, I.first_heap_chunk, I.wait_mask,
              I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
      cs_unpack(instr, CS_ADD_IMMEDIATE32, I);

      fprintf(fp, "ADD_IMMEDIATE32 r%u, r%u, #%d", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
      cs_unpack(instr, CS_ADD_IMMEDIATE64, I);

      fprintf(fp, "ADD_IMMEDIATE64 d%u, d%u, #%d", I.destination, I.source,
              I.immediate);
      break;
   }

   case MALI_CS_OPCODE_UMIN32: {
      cs_unpack(instr, CS_UMIN32, I);

      fprintf(fp, "UMIN32 r%u, r%u, r%u", I.destination, I.source_1,
              I.source_2);
      break;
   }

   case MALI_CS_OPCODE_LOAD_MULTIPLE: {
      cs_unpack(instr, CS_LOAD_MULTIPLE, I);

      fprintf(fp, "LOAD_MULTIPLE ");
      print_reg_tuple(I.base_register, I.mask, fp);
      fprintf(fp, ", ");
      print_indirect(I.address, I.offset, fp);
      break;
   }

   case MALI_CS_OPCODE_STORE_MULTIPLE: {
      cs_unpack(instr, CS_STORE_MULTIPLE, I);

      fprintf(fp, "STORE_MULTIPLE ");
      print_indirect(I.address, I.offset, fp);
      fprintf(fp, ", ");
      print_reg_tuple(I.base_register, I.mask, fp);
      break;
   }

   case MALI_CS_OPCODE_BRANCH: {
      cs_unpack(instr, CS_BRANCH, I);
      fprintf(fp, "BRANCH.%s r%u, #%d", conditions_str[I.condition], I.value,
              I.offset);
      break;
   }

   case MALI_CS_OPCODE_SET_SB_ENTRY: {
      cs_unpack(instr, CS_SET_SB_ENTRY, I);
      fprintf(fp, "SET_SB_ENTRY #%u, #%u", I.endpoint_entry, I.other_entry);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_WAIT: {
      cs_unpack(instr, CS_PROGRESS_WAIT, I);
      fprintf(fp, "PROGRESS_WAIT d%u, #%u", I.source, I.queue);
      break;
   }

   case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
      cs_unpack(instr, CS_SET_EXCEPTION_HANDLER, I);
      fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_CALL: {
      cs_unpack(instr, CS_CALL, I);
      fprintf(fp, "CALL d%u, r%u", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_JUMP: {
      cs_unpack(instr, CS_JUMP, I);
      fprintf(fp, "JUMP d%u, r%u", I.address, I.length);
      break;
   }

   case MALI_CS_OPCODE_REQ_RESOURCE: {
      cs_unpack(instr, CS_REQ_RESOURCE, I);
      fprintf(fp, "REQ_RESOURCE%s%s%s%s", I.compute ? ".compute" : "",
              I.fragment ? ".fragment" : "", I.tiler ? ".tiler" : "",
              I.idvs ? ".idvs" : "");
      break;
   }

   case MALI_CS_OPCODE_FLUSH_CACHE2: {
      cs_unpack(instr, CS_FLUSH_CACHE2, I);
      static const char *mode[] = {
         "nop",
         "clean",
         "INVALID",
         "clean_invalidate",
      };

      fprintf(fp, "FLUSH_CACHE2.%s_l2.%s_lsc%s r%u, #%x, #%u",
              mode[I.l2_flush_mode], mode[I.lsc_flush_mode],
              I.other_invalidate ? ".invalidate_other" : ".nop_other",
              I.latest_flush_id, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_ADD32: {
      cs_unpack(instr, CS_SYNC_ADD32, I);
      fprintf(fp, "SYNC_ADD32%s%s [d%u], r%u, #%x, #%u",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_SET32: {
      cs_unpack(instr, CS_SYNC_SET32, I);
      fprintf(fp, "SYNC_SET32%s%s [d%u], r%u, #%x, #%u",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_WAIT32: {
      cs_unpack(instr, CS_SYNC_WAIT32, I);
      fprintf(fp, "SYNC_WAIT32.%s%s d%u, r%u", conditions_str[I.condition],
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   case MALI_CS_OPCODE_STORE_STATE: {
      static const char *states_str[] = {
         "SYSTEM_TIMESTAMP",
         "CYCLE_COUNT",
         "DISJOINT_COUNT",
         "ERROR_STATE",
      };

      cs_unpack(instr, CS_STORE_STATE, I);
      fprintf(fp, "STORE_STATE.%s d%u, #%i, #%x, #%u",
              I.state >= ARRAY_SIZE(states_str) ? "UNKNOWN_STATE"
                                                : states_str[I.state],
              I.address, I.offset, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_PROT_REGION: {
      cs_unpack(instr, CS_PROT_REGION, I);
      fprintf(fp, "PROT_REGION #%u", I.size);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_STORE: {
      cs_unpack(instr, CS_PROGRESS_STORE, I);
      fprintf(fp, "PROGRESS_STORE d%u", I.source);
      break;
   }

   case MALI_CS_OPCODE_PROGRESS_LOAD: {
      cs_unpack(instr, CS_PROGRESS_LOAD, I);
      fprintf(fp, "PROGRESS_LOAD d%u", I.destination);
      break;
   }

   case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
      cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
      fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u",
              I.progress_increment ? ".progress_inc" : "", I.srt_select,
              I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task);

      break;
   }

   case MALI_CS_OPCODE_ERROR_BARRIER: {
      cs_unpack(instr, CS_ERROR_BARRIER, I);
      fprintf(fp, "ERROR_BARRIER");
      break;
   }

   case MALI_CS_OPCODE_HEAP_SET: {
      cs_unpack(instr, CS_HEAP_SET, I);
      fprintf(fp, "HEAP_SET d%u", I.address);
      break;
   }

   case MALI_CS_OPCODE_HEAP_OPERATION: {
      cs_unpack(instr, CS_HEAP_OPERATION, I);
      const char *counter_names[] = {"vt_start", "vt_end", NULL, "frag_end"};
      fprintf(fp, "HEAP_OPERATION.%s #%x, #%u",
              counter_names[I.operation] ? counter_names[I.operation]
                                         : "unknown",
              I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_TRACE_POINT: {
      cs_unpack(instr, CS_TRACE_POINT, I);
      fprintf(fp, "TRACE_POINT r%d:r%d, #%x, #%u", I.base_register,
              I.base_register + I.register_count - 1, I.wait_mask,
              I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_ADD64: {
      cs_unpack(instr, CS_SYNC_ADD64, I);
      fprintf(fp, "SYNC_ADD64%s%s [d%u], d%u, #%x, #%u",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_SET64: {
      cs_unpack(instr, CS_SYNC_SET64, I);
      fprintf(fp, "SYNC_SET64%s%s [d%u], d%u, #%x, #%u",
              I.error_propagate ? ".error_propagate" : "",
              I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
              I.data, I.wait_mask, I.signal_slot);
      break;
   }

   case MALI_CS_OPCODE_SYNC_WAIT64: {
      cs_unpack(instr, CS_SYNC_WAIT64, I);

      fprintf(fp, "SYNC_WAIT64.%s%s d%u, d%u", conditions_str[I.condition],
              I.error_reject ? ".reject" : ".inherit", I.address, I.data);
      break;
   }

   default: {
      fprintf(fp, "UNKNOWN_%u 0x%" PRIX64 "\n", base.opcode, base.data);
      break;
   }
   }
}

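/* Register file accessors. A 64-bit "d" register overlays two consecutive
 * 32-bit "r" registers: dN holds its low word in rN and its high word in
 * rN+1 (the CS expects N to be even, see the alignment asserts in the CFG
 * code below).
 */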
static uint32_t
cs_get_u32(struct queue_ctx *qctx, uint8_t reg)
{
   assert(reg < qctx->nr_regs);
   return qctx->regs[reg];
}

static uint64_t
cs_get_u64(struct queue_ctx *qctx, uint8_t reg)
{
   return (((uint64_t)cs_get_u32(qctx, reg + 1)) << 32) | cs_get_u32(qctx, reg);
}

static void
pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
                      struct queue_ctx *qctx, struct MALI_CS_RUN_COMPUTE *I)
{
   if (qctx->in_exception_handler)
      return;

   ctx->indent++;

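   /* RUN_COMPUTE state lives in fixed register ranges: SRT pointers from d0,
    * FAU pointers from d8, shader program descriptors from d16 and thread
    * storage descriptors from d24. Each *_select field picks a 64-bit pair
    * within its range, hence the "* 2" scaling.
    */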
   unsigned reg_srt = 0 + (I->srt_select * 2);
   unsigned reg_fau = 8 + (I->fau_select * 2);
   unsigned reg_spd = 16 + (I->spd_select * 2);
   unsigned reg_tsd = 24 + (I->tsd_select * 2);

   GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");

   uint64_t fau = cs_get_u64(qctx, reg_fau);

   if (fau)
      GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");

   GENX(pandecode_shader)
   (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
             "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
   pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
   pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
   pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
   pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));

   ctx->indent--;
}

static void
pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp,
                               struct queue_ctx *qctx,
                               struct MALI_CS_RUN_COMPUTE_INDIRECT *I)
{
   if (qctx->in_exception_handler)
      return;

   ctx->indent++;

   unsigned reg_srt = 0 + (I->srt_select * 2);
   unsigned reg_fau = 8 + (I->fau_select * 2);
   unsigned reg_spd = 16 + (I->spd_select * 2);
   unsigned reg_tsd = 24 + (I->tsd_select * 2);

   GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");

   uint64_t fau = cs_get_u64(qctx, reg_fau);

   if (fau)
      GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");

   GENX(pandecode_shader)
   (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
             "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
   pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
   pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
   pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
   pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));

   ctx->indent--;
}

static void
pandecode_run_tiling(struct pandecode_context *ctx, FILE *fp,
                     struct queue_ctx *qctx, struct MALI_CS_RUN_TILING *I)
{
   if (qctx->in_exception_handler)
      return;

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   struct mali_primitive_flags_packed tiler_flags_packed = {
      .opaque[0] = cs_get_u32(qctx, 56) | I->flags_override,
   };
   pan_unpack(&tiler_flags_packed, PRIMITIVE_FLAGS, tiler_flags);

   unsigned reg_srt = I->srt_select * 2;
   unsigned reg_fau = 8 + I->fau_select * 2;
   unsigned reg_spd = 16 + I->spd_select * 2;
   unsigned reg_tsd = 24 + I->tsd_select * 2;

   uint64_t srt = cs_get_u64(qctx, reg_srt);
   uint64_t fau = cs_get_u64(qctx, reg_fau);
   uint64_t spd = cs_get_u64(qctx, reg_spd);
   uint64_t tsd = cs_get_u64(qctx, reg_tsd);

   if (srt)
      GENX(pandecode_resource_tables)(ctx, srt, "Fragment resources");

   if (fau) {
      uint64_t lo = fau & BITFIELD64_MASK(48);
      uint64_t hi = fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
   }

   if (spd) {
      GENX(pandecode_shader)
      (ctx, spd, "Fragment shader", qctx->gpu_id);
   }

   DUMP_ADDR(ctx, LOCAL_STORAGE, tsd, "Fragment Local Storage @%" PRIx64 ":\n",
             tsd);

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
   pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));

   pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
   pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
   pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
   pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));
   pandecode_log(ctx, "Vertex position array: %" PRIx64 "\n",
                 cs_get_u64(qctx, 48));

   uint64_t blend = cs_get_u64(qctx, 50);
   GENX(pandecode_blend_descs)(ctx, blend & ~15, blend & 15, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));

   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
   DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
   DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
   pandecode_log(ctx, "Vertex bounds: %u\n", cs_get_u32(qctx, 59));
   DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");

   ctx->indent--;
}

static void
pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
                   struct queue_ctx *qctx, struct MALI_CS_RUN_IDVS *I)
{
   if (qctx->in_exception_handler)
      return;

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   struct mali_primitive_flags_packed tiler_flags_packed = {
      .opaque[0] = cs_get_u32(qctx, 56) | I->flags_override,
   };
   pan_unpack(&tiler_flags_packed, PRIMITIVE_FLAGS, tiler_flags);

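   /* IDVS runs up to three stages. The position stage always uses the
    * primary slots (d0 SRT, d8 FAU, d24 TSD); the varying and fragment
    * stages switch to secondary slots when the corresponding *_select bit
    * is set.
    */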
   unsigned reg_position_srt = 0;
   unsigned reg_position_fau = 8;
   unsigned reg_position_tsd = 24;

   unsigned reg_vary_srt = I->varying_srt_select ? 2 : 0;
   unsigned reg_vary_fau = I->varying_fau_select ? 10 : 8;
   unsigned reg_vary_tsd = I->varying_tsd_select ? 26 : 24;

   unsigned reg_frag_srt = I->fragment_srt_select ? 4 : 0;
   unsigned reg_frag_fau = 12;
   unsigned reg_frag_tsd = I->fragment_tsd_select ? 28 : 24;

   uint64_t position_srt = cs_get_u64(qctx, reg_position_srt);
   uint64_t vary_srt = cs_get_u64(qctx, reg_vary_srt);
   uint64_t frag_srt = cs_get_u64(qctx, reg_frag_srt);

   if (position_srt)
      GENX(pandecode_resource_tables)(ctx, position_srt, "Position resources");

   if (vary_srt)
      GENX(pandecode_resource_tables)(ctx, vary_srt, "Varying resources");

   if (frag_srt)
      GENX(pandecode_resource_tables)(ctx, frag_srt, "Fragment resources");

   uint64_t position_fau = cs_get_u64(qctx, reg_position_fau);
   uint64_t vary_fau = cs_get_u64(qctx, reg_vary_fau);
   uint64_t fragment_fau = cs_get_u64(qctx, reg_frag_fau);

   if (position_fau) {
      uint64_t lo = position_fau & BITFIELD64_MASK(48);
      uint64_t hi = position_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Position FAU");
   }

   if (vary_fau) {
      uint64_t lo = vary_fau & BITFIELD64_MASK(48);
      uint64_t hi = vary_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Varying FAU");
   }

   if (fragment_fau) {
      uint64_t lo = fragment_fau & BITFIELD64_MASK(48);
      uint64_t hi = fragment_fau >> 56;

      GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
   }

   if (cs_get_u64(qctx, 16)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 16), "Position shader", qctx->gpu_id);
   }

   if (tiler_flags.secondary_shader) {
      uint64_t ptr = cs_get_u64(qctx, 18);

      GENX(pandecode_shader)(ctx, ptr, "Varying shader", qctx->gpu_id);
   }

   if (cs_get_u64(qctx, 20)) {
      GENX(pandecode_shader)
      (ctx, cs_get_u64(qctx, 20), "Fragment shader", qctx->gpu_id);
   }

   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_position_tsd),
             "Position Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_position_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_vary_tsd),
             "Varying Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_vary_tsd));
   DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_frag_tsd),
             "Fragment Local Storage @%" PRIx64 ":\n",
             cs_get_u64(qctx, reg_frag_tsd));

   pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
   pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
   pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));

   pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
   pandecode_log(ctx, "Instance offset: %u\n", cs_get_u32(qctx, 37));
   pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
   pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
   pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
   pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));

   if (tiler_flags.secondary_shader)
      pandecode_log(ctx, "Varying allocation: %u\n", cs_get_u32(qctx, 48));

   uint64_t blend = cs_get_u64(qctx, 50);
   GENX(pandecode_blend_descs)(ctx, blend & ~15, blend & 15, 0, qctx->gpu_id);

   DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");

   if (tiler_flags.index_type)
      pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));

   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
   DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
   DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
   DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");

   ctx->indent--;
}

static void
pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
                       struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I)
{
   if (qctx->in_exception_handler)
      return;

   ctx->indent++;

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");

   /* TODO: Tile enable map */
   GENX(pandecode_fbd)
   (ctx, cs_get_u64(qctx, 40) & ~0x3full, true, qctx->gpu_id);

   ctx->indent--;
}

static void
pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp,
                         struct queue_ctx *qctx,
                         struct MALI_CS_RUN_FULLSCREEN *I)
{
   if (qctx->in_exception_handler)
      return;

   ctx->indent++;

   /* Merge flag overrides with the register flags */
   struct mali_primitive_flags_packed tiler_flags_packed = {
      .opaque[0] = cs_get_u32(qctx, 56) | I->flags_override,
   };
   pan_unpack(&tiler_flags_packed, PRIMITIVE_FLAGS, tiler_flags);
   DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");

   GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);

   DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");

   pan_unpack(
      PANDECODE_PTR(ctx, cs_get_u64(qctx, I->dcd), struct mali_draw_packed),
      DRAW, dcd);
   GENX(pandecode_dcd)(ctx, &dcd, 0, qctx->gpu_id);

   ctx->indent--;
}

static bool
interpret_cs_jump(struct pandecode_context *ctx, struct queue_ctx *qctx,
                  uint64_t reg_address, uint32_t reg_length)
{
   uint32_t address_lo = qctx->regs[reg_address];
   uint32_t address_hi = qctx->regs[reg_address + 1];
   uint32_t length = qctx->regs[reg_length];

   if (length % 8) {
      fprintf(stderr, "CS call alignment error\n");
      return false;
   }

   /* Map the entire subqueue now */
   uint64_t address = ((uint64_t)address_hi << 32) | address_lo;
   /* Return if the jump is for an exception handler that's set to zero */
   if (qctx->in_exception_handler && (!address || !length)) {
      qctx->in_exception_handler = false;
      qctx->call_stack_depth--;
      return true;
   }
   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, address, length);

   qctx->ip = cs;
   qctx->end = cs + (length / 8);

   /* Skip the usual IP update */
   return true;
}

static bool
eval_cond(struct queue_ctx *qctx, enum mali_cs_condition cond, uint32_t reg)
{
   int32_t val = qctx->regs[reg];

   switch (cond) {
   case MALI_CS_CONDITION_LEQUAL:
      return val <= 0;
   case MALI_CS_CONDITION_EQUAL:
      return val == 0;
   case MALI_CS_CONDITION_LESS:
      return val < 0;
   case MALI_CS_CONDITION_GREATER:
      return val > 0;
   case MALI_CS_CONDITION_NEQUAL:
      return val != 0;
   case MALI_CS_CONDITION_GEQUAL:
      return val >= 0;
   case MALI_CS_CONDITION_ALWAYS:
      return true;
   default:
      assert(!"Invalid condition");
      return false;
   }
}

static void
interpret_cs_branch(struct pandecode_context *ctx, struct queue_ctx *qctx,
                    int16_t offset, enum mali_cs_condition cond, uint32_t reg)
{
   if (eval_cond(qctx, cond, reg))
      qctx->ip += offset;
}

/*
 * Interpret a single instruction of the CS, updating the register file,
 * instruction pointer, and call stack. Memory access and GPU controls are
 * ignored for now.
 *
 * Returns true if execution should continue.
 */
static bool
interpret_cs_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
{
   FILE *fp = ctx->dump_stream;
   /* Unpack the base so we get the opcode */
   uint8_t *bytes = (uint8_t *)qctx->ip;
   cs_unpack(bytes, CS_BASE, base);

   assert(qctx->ip < qctx->end);

   /* Don't try to keep track of registers/operations inside exception handler */
   if (qctx->in_exception_handler) {
      assert(base.opcode != MALI_CS_OPCODE_SET_EXCEPTION_HANDLER);
      goto no_interpret;
   }

   switch (base.opcode) {
   case MALI_CS_OPCODE_RUN_COMPUTE: {
      cs_unpack(bytes, CS_RUN_COMPUTE, I);
      pandecode_run_compute(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_TILING: {
      cs_unpack(bytes, CS_RUN_TILING, I);
      pandecode_run_tiling(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_IDVS: {
      cs_unpack(bytes, CS_RUN_IDVS, I);
      pandecode_run_idvs(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_FRAGMENT: {
      cs_unpack(bytes, CS_RUN_FRAGMENT, I);
      pandecode_run_fragment(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_FULLSCREEN: {
      cs_unpack(bytes, CS_RUN_FULLSCREEN, I);
      pandecode_run_fullscreen(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
      cs_unpack(bytes, CS_RUN_COMPUTE_INDIRECT, I);
      pandecode_run_compute_indirect(ctx, fp, qctx, &I);
      break;
   }

   case MALI_CS_OPCODE_MOVE: {
      cs_unpack(bytes, CS_MOVE, I);

      qctx->regs[I.destination + 0] = (uint32_t)I.immediate;
      qctx->regs[I.destination + 1] = (uint32_t)(I.immediate >> 32);
      break;
   }

   case MALI_CS_OPCODE_MOVE32: {
      cs_unpack(bytes, CS_MOVE32, I);

      qctx->regs[I.destination] = I.immediate;
      break;
   }

   case MALI_CS_OPCODE_LOAD_MULTIPLE: {
      cs_unpack(bytes, CS_LOAD_MULTIPLE, I);
      uint64_t addr =
         ((uint64_t)qctx->regs[I.address + 1] << 32) | qctx->regs[I.address];
      addr += I.offset;

      uint32_t *src =
         pandecode_fetch_gpu_mem(ctx, addr, util_last_bit(I.mask) * 4);

      for (uint32_t i = 0; i < 16; i++) {
         if (I.mask & BITFIELD_BIT(i))
            qctx->regs[I.base_register + i] = src[i];
      }
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
      cs_unpack(bytes, CS_ADD_IMMEDIATE32, I);

      qctx->regs[I.destination] = qctx->regs[I.source] + I.immediate;
      break;
   }

   case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
      cs_unpack(bytes, CS_ADD_IMMEDIATE64, I);

      int64_t value =
         (qctx->regs[I.source] | ((int64_t)qctx->regs[I.source + 1] << 32)) +
         I.immediate;

      qctx->regs[I.destination] = value;
      qctx->regs[I.destination + 1] = value >> 32;
      break;
   }

   case MALI_CS_OPCODE_CALL: {
      cs_unpack(bytes, CS_CALL, I);

      if (qctx->call_stack_depth == MAX_CALL_STACK_DEPTH) {
         fprintf(stderr, "CS call stack overflow\n");
         return false;
      }

      assert(qctx->call_stack_depth < MAX_CALL_STACK_DEPTH);

      qctx->ip++;

      /* Note: tail calls are not optimized in the hardware. */
      assert(qctx->ip <= qctx->end);

      unsigned depth = qctx->call_stack_depth++;

      qctx->call_stack[depth].lr = qctx->ip;
      qctx->call_stack[depth].end = qctx->end;

      return interpret_cs_jump(ctx, qctx, I.address, I.length);
   }

   case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
      cs_unpack(bytes, CS_SET_EXCEPTION_HANDLER, I);

      assert(qctx->call_stack_depth < MAX_CALL_STACK_DEPTH);

      qctx->ip++;

      /* Note: tail calls are not optimized in the hardware. */
      assert(qctx->ip <= qctx->end);

      unsigned depth = qctx->call_stack_depth++;

      qctx->call_stack[depth].lr = qctx->ip;
      qctx->call_stack[depth].end = qctx->end;

      /* The exception handler can use the full frame stack depth, but we
       * don't try to keep track of nested JUMP/CALLs, since we don't know
       * what the register/memory contents will be when the handler is
       * triggered. */
      qctx->in_exception_handler = true;

      return interpret_cs_jump(ctx, qctx, I.address, I.length);
   }

   case MALI_CS_OPCODE_JUMP: {
      cs_unpack(bytes, CS_JUMP, I);

      if (qctx->call_stack_depth == 0) {
         fprintf(stderr, "Cannot jump from the entrypoint\n");
         return false;
      }

      return interpret_cs_jump(ctx, qctx, I.address, I.length);
   }

   case MALI_CS_OPCODE_BRANCH: {
      cs_unpack(bytes, CS_BRANCH, I);

      interpret_cs_branch(ctx, qctx, I.offset, I.condition, I.value);
      break;
   }

   default:
      break;
   }

no_interpret:

   /* Update IP first to point to the next instruction, so call doesn't
    * require special handling (even for tail calls).
    */
   qctx->ip++;

   while (qctx->ip == qctx->end) {
      /* Graceful termination */
      if (qctx->call_stack_depth == 0)
         return false;

      /* Pop off the call stack */
      unsigned old_depth = --qctx->call_stack_depth;

      qctx->ip = qctx->call_stack[old_depth].lr;
      qctx->end = qctx->call_stack[old_depth].end;
      qctx->in_exception_handler = false;
   }

   return true;
}

void
GENX(pandecode_interpret_cs)(struct pandecode_context *ctx, uint64_t queue,
                             uint32_t size, unsigned gpu_id, uint32_t *regs)
{
   pandecode_dump_file_open(ctx);

   uint64_t *cs = pandecode_fetch_gpu_mem(ctx, queue, size);

   /* Mali-G610 has 96 registers. Other devices are not yet supported; we can
    * make this configurable later when we encounter new Malis.
    */
   struct queue_ctx qctx = {
      .nr_regs = 96,
      .regs = regs,
      .ip = cs,
      .end = cs + (size / 8),
      .gpu_id = gpu_id,

      /* If this is a kernel mode queue, we don't see the root ring buffer and
       * we must adjust the initial call stack depth accordingly.
       */
      .call_stack_depth = ctx->usermode_queue ? 0 : 1,
   };
   FILE *fp = ctx->dump_stream;

   if (size) {
      do {
         uint64_t instr = *qctx.ip;

         fprintf(fp, " ");
         for (unsigned b = 0; b < 8; ++b)
            fprintf(fp, " %02x", (uint8_t)(instr >> (8 * b)));

         for (int i = 0; i < 1 + qctx.call_stack_depth; ++i)
            fprintf(fp, "  ");

         print_cs_instr(fp, qctx.ip);
         fprintf(fp, "\n");
      } while (interpret_cs_instr(ctx, &qctx));
   }

   fflush(ctx->dump_stream);
   pandecode_map_read_write(ctx);
}

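/*
 * Static analysis of CS binaries: the code below builds a small control-flow
 * graph over the instruction stream so that indirect JUMP/CALL targets can
 * be recovered by constant-folding MOVE/ADD chains. Block bounds are
 * instruction indices; a successor of ~0 means "none".
 */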
struct cs_code_block {
   struct list_head node;
   unsigned start;
   unsigned size;
   struct util_dynarray predecessors;
   unsigned successors[2];
};

struct cs_indirect_branch_target {
   uint64_t address;
   uint32_t length;
};

struct cs_indirect_branch {
   unsigned instr_idx;
   bool has_unknown_targets;
   struct util_dynarray targets;
};

struct cs_code_cfg {
   uint64_t *instrs;
   unsigned instr_count;
   struct cs_code_block **blk_map;
   struct util_dynarray indirect_branches;
};

static struct cs_code_block *
cs_code_block_alloc(void *alloc_ctx, unsigned start, unsigned size)
{
   struct cs_code_block *block = rzalloc(alloc_ctx, struct cs_code_block);

   block->start = start;
   block->size = size;
   memset(block->successors, ~0, sizeof(block->successors));
   list_inithead(&block->node);
   util_dynarray_init(&block->predecessors, alloc_ctx);
   return block;
}

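/* Replay the instructions along the resolved path (the blocks on blk_stack,
 * then cur_blk) into a scratch register file, emulating only the ALU-style
 * opcodes, so that the address/length registers of the indirect branch can
 * be turned into a concrete target.
 */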
static void
record_indirect_branch_target(struct cs_code_cfg *cfg,
                              struct list_head *blk_stack,
                              struct cs_code_block *cur_blk, unsigned blk_offs,
                              struct cs_indirect_branch *ibranch)
{
   union {
      uint32_t u32[256];
      uint64_t u64[128];
   } reg_file = {0};

   list_add(&cur_blk->node, blk_stack);
   list_for_each_entry(struct cs_code_block, blk, blk_stack, node) {
      for (; blk_offs < blk->size &&
             blk->start + blk_offs != ibranch->instr_idx;
           blk_offs++) {
         const uint64_t *instr = &cfg->instrs[blk->start + blk_offs];
         cs_unpack(instr, CS_BASE, base);
         switch (base.opcode) {
         case MALI_CS_OPCODE_MOVE: {
            cs_unpack(instr, CS_MOVE, I);

            assert(I.destination % 2 == 0 &&
                   "Destination register should be aligned to 2");

            reg_file.u64[I.destination / 2] = I.immediate;
            break;
         }

         case MALI_CS_OPCODE_MOVE32: {
            cs_unpack(instr, CS_MOVE32, I);
            reg_file.u32[I.destination] = I.immediate;
            break;
         }

         case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
            cs_unpack(instr, CS_ADD_IMMEDIATE32, I);
            reg_file.u32[I.destination] = reg_file.u32[I.source] + I.immediate;
            break;
         }

         case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
            cs_unpack(instr, CS_ADD_IMMEDIATE64, I);

            assert(I.destination % 2 == 0 &&
                   "Destination register should be aligned to 2");
            assert(I.source % 2 == 0 &&
                   "Source register should be aligned to 2");

            reg_file.u64[I.destination / 2] =
               reg_file.u64[I.source / 2] + I.immediate;
            break;
         }

         case MALI_CS_OPCODE_UMIN32: {
            cs_unpack(instr, CS_UMIN32, I);
            reg_file.u32[I.destination] =
               MIN2(reg_file.u32[I.source_1], reg_file.u32[I.source_2]);
            break;
         }

         default:
            break;
         }
      }
      blk_offs = 0;
   }
   list_delinit(&cur_blk->node);

   uint64_t *instr = &cfg->instrs[ibranch->instr_idx];
   cs_unpack(instr, CS_JUMP, I);

   assert(I.address % 2 == 0 && "Address register should be aligned to 2");

   struct cs_indirect_branch_target target = {
      .address = reg_file.u64[I.address / 2],
      .length = reg_file.u32[I.length],
   };

   util_dynarray_append(&ibranch->targets, struct cs_indirect_branch_target,
                        target);
}

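/* Walk backwards from the indirect branch, tracking in track_map which
 * registers still need a definition. MOVE/MOVE32 resolve a register,
 * ADD/UMIN transfer the dependency to their sources, and loads from memory
 * make the target unknowable. Once all tracked registers are resolved, the
 * discovered path is replayed forward by record_indirect_branch_target().
 */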
static void
collect_indirect_branch_targets_recurse(struct cs_code_cfg *cfg,
                                        struct list_head *blk_stack,
                                        BITSET_WORD *track_map,
                                        struct cs_code_block *cur_blk,
                                        int instr_ptr,
                                        struct cs_indirect_branch *ibranch)
{
   for (; instr_ptr >= (int)cur_blk->start; instr_ptr--) {
      assert(instr_ptr >= 0);
      const uint64_t *instr = &cfg->instrs[instr_ptr];
      cs_unpack(instr, CS_BASE, base);
      switch (base.opcode) {
      case MALI_CS_OPCODE_MOVE: {
         cs_unpack(instr, CS_MOVE, I);
         BITSET_CLEAR(track_map, I.destination);
         BITSET_CLEAR(track_map, I.destination + 1);
         break;
      }

      case MALI_CS_OPCODE_MOVE32: {
         cs_unpack(instr, CS_MOVE32, I);
         BITSET_CLEAR(track_map, I.destination);
         break;
      }

      case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
         cs_unpack(instr, CS_ADD_IMMEDIATE32, I);
         if (BITSET_TEST(track_map, I.destination)) {
            BITSET_SET(track_map, I.source);
            BITSET_CLEAR(track_map, I.destination);
         }
         break;
      }

      case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
         cs_unpack(instr, CS_ADD_IMMEDIATE64, I);
         if (BITSET_TEST(track_map, I.destination)) {
            BITSET_SET(track_map, I.source);
            BITSET_CLEAR(track_map, I.destination);
         }
         if (BITSET_TEST(track_map, I.destination + 1)) {
            BITSET_SET(track_map, I.source + 1);
            BITSET_CLEAR(track_map, I.destination + 1);
         }
         break;
      }

      case MALI_CS_OPCODE_UMIN32: {
         cs_unpack(instr, CS_UMIN32, I);
         if (BITSET_TEST(track_map, I.destination)) {
            BITSET_SET(track_map, I.source_1);
            BITSET_SET(track_map, I.source_2);
            BITSET_CLEAR(track_map, I.destination);
         }
         break;
      }

      case MALI_CS_OPCODE_LOAD_MULTIPLE: {
         cs_unpack(instr, CS_LOAD_MULTIPLE, I);
         for (unsigned i = 0; i < 16; i++) {
            if ((I.mask & BITFIELD_BIT(i)) &&
                BITSET_TEST(track_map, I.base_register + i)) {
               ibranch->has_unknown_targets = true;
               return;
            }
         }
         break;
      }

      case MALI_CS_OPCODE_PROGRESS_LOAD: {
         cs_unpack(instr, CS_PROGRESS_LOAD, I);
         if (BITSET_TEST(track_map, I.destination) ||
             BITSET_TEST(track_map, I.destination + 1)) {
            ibranch->has_unknown_targets = true;
            return;
         }
         break;
      }

      default:
         break;
      }

      if (__bitset_is_empty(track_map, BITSET_WORDS(256))) {
         record_indirect_branch_target(cfg, blk_stack, cur_blk,
                                       instr_ptr - cur_blk->start, ibranch);
         return;
      }
   }

   assert(!__bitset_is_empty(track_map, BITSET_WORDS(256)));

   if (util_dynarray_num_elements(&cur_blk->predecessors, unsigned) == 0) {
      ibranch->has_unknown_targets = true;
      return;
   }

   list_add(&cur_blk->node, blk_stack);
   util_dynarray_foreach(&cur_blk->predecessors, unsigned, pred) {
      struct cs_code_block *prev_blk = cfg->blk_map[*pred];

      /* If the node is already in the block stack, we skip it
       * and consider this path leading to an unknown target. */
      if (!list_is_empty(&prev_blk->node)) {
         ibranch->has_unknown_targets = true;
         continue;
      }

      collect_indirect_branch_targets_recurse(
         cfg, blk_stack, track_map, prev_blk,
         prev_blk->start + prev_blk->size - 1, ibranch);
   }
   list_delinit(&cur_blk->node);

   return;
}

static void
collect_indirect_branch_targets(struct cs_code_cfg *cfg,
                                struct cs_indirect_branch *ibranch)
{
   uint64_t *instr = &cfg->instrs[ibranch->instr_idx];
   struct cs_code_block *cur_blk = cfg->blk_map[ibranch->instr_idx];
   struct list_head blk_stack;
   BITSET_DECLARE(track_map, 256) = {0};

   list_inithead(&blk_stack);

   cs_unpack(instr, CS_JUMP, I);
   BITSET_SET(track_map, I.address);
   BITSET_SET(track_map, I.address + 1);
   BITSET_SET(track_map, I.length);

   collect_indirect_branch_targets_recurse(cfg, &blk_stack, track_map, cur_blk,
                                           ibranch->instr_idx - 1, ibranch);
}

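/* Build (and cache in `symbols`) the CFG for a CS binary: a linear scan
 * assigns each instruction to a block, splitting blocks at branch targets,
 * then indirect branch targets are resolved and the binaries they point to
 * are processed recursively.
 */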
static struct cs_code_cfg *
get_cs_cfg(struct pandecode_context *ctx, struct hash_table_u64 *symbols,
           uint64_t bin, uint32_t bin_size)
{
   uint32_t instr_count = bin_size / sizeof(uint64_t);
   struct cs_code_cfg *cfg = _mesa_hash_table_u64_search(symbols, bin);

   if (cfg) {
      assert(cfg->instr_count == instr_count);
      return cfg;
   }

   uint64_t *instrs = pandecode_fetch_gpu_mem(ctx, bin, bin_size);

   cfg = rzalloc(symbols, struct cs_code_cfg);
   _mesa_hash_table_u64_insert(symbols, bin, cfg);

   util_dynarray_init(&cfg->indirect_branches, cfg);

   cfg->blk_map = rzalloc_array(cfg, struct cs_code_block *, instr_count);
   cfg->instrs = instrs;
   cfg->instr_count = instr_count;

   struct cs_code_block *block = cs_code_block_alloc(cfg, 0, 0);

   for (unsigned i = 0; i < instr_count; i++) {
      const uint64_t *instr = &instrs[i];

      if (!cfg->blk_map[i]) {
         cfg->blk_map[i] = block;
         block->size++;
      } else {
         if (block->successors[0] == ~0)
            block->successors[0] = i;

         block = cfg->blk_map[i];
         util_dynarray_append(&block->predecessors, unsigned, i - 1);
      }

      cs_unpack(instr, CS_BASE, base);

      if (base.opcode == MALI_CS_OPCODE_JUMP ||
          base.opcode == MALI_CS_OPCODE_CALL) {
         struct cs_indirect_branch ibranch = {
            .instr_idx = i,
         };

         util_dynarray_append(&cfg->indirect_branches,
                              struct cs_indirect_branch, ibranch);
      }

      if (base.opcode != MALI_CS_OPCODE_BRANCH)
         continue;

      cs_unpack(instr, CS_BRANCH, I);

      unsigned target = MIN2(i + 1 + I.offset, instr_count);

      /* If the target of the branch is the next instruction, it's just a NOP,
       * and we consider it the same block. */
      if (target == i + 1)
         continue;

      if (I.offset < 0 && cfg->blk_map[target]->start != target) {
         struct cs_code_block *old = cfg->blk_map[target];
         struct cs_code_block *new =
            cs_code_block_alloc(cfg, target, old->start + old->size - target);

         util_dynarray_append(&new->predecessors, unsigned, target - 1);
         memcpy(&new->successors, &old->successors, sizeof(new->successors));

         old->successors[0] = target;
         old->successors[1] = ~0;
         old->size = new->start - old->start;

         for (unsigned j = 0; j <= new->size; j++)
            cfg->blk_map[new->start + j] = new;
      }

      if (I.offset > 0 && target < instr_count && !cfg->blk_map[target]) {
         struct cs_code_block *new = cs_code_block_alloc(cfg, target, 1);

         cfg->blk_map[target] = new;
         util_dynarray_append(&new->predecessors, unsigned, i);
      }

      block->successors[0] = target;
      if (I.condition != MALI_CS_CONDITION_ALWAYS)
         block->successors[1] = i + 1;

      block = cs_code_block_alloc(cfg, i + 1, 0);

      if (target == i + 1 || I.condition != MALI_CS_CONDITION_ALWAYS)
         util_dynarray_append(&block->predecessors, unsigned, i);
   }

   util_dynarray_foreach(&cfg->indirect_branches, struct cs_indirect_branch,
                         ibranch)
   {
      collect_indirect_branch_targets(cfg, ibranch);
      util_dynarray_foreach(&ibranch->targets, struct cs_indirect_branch_target,
                            target)
      {
         get_cs_cfg(ctx, symbols, target->address, target->length);
      }
   }

   return cfg;
}

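/* Print the disassembly of one CS binary, emitting labels at block
 * boundaries and comments annotating branch targets and RUN_* tracepoints.
 */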
static void
print_cs_binary(struct pandecode_context *ctx, uint64_t bin,
                struct cs_code_cfg *cfg, const char *name)
{
   pandecode_log(ctx, "%s@%" PRIx64 "{\n", name, bin);
   unsigned ibranch_idx = 0;

   ctx->indent++;
   for (unsigned i = 0; i < cfg->instr_count; i++) {
      if (i && cfg->blk_map[i - 1] != cfg->blk_map[i]) {
         ctx->indent--;
         pandecode_log(ctx, "label_%" PRIx64 ":\n", bin + i * sizeof(uint64_t));
         ctx->indent++;
      }

      pandecode_make_indent(ctx);
      print_cs_instr(ctx->dump_stream, &cfg->instrs[i]);
      cs_unpack(&cfg->instrs[i], CS_BASE, base);
      switch (base.opcode) {
      case MALI_CS_OPCODE_JUMP:
      case MALI_CS_OPCODE_CALL: {
         struct cs_indirect_branch *ibranch = util_dynarray_element(
            &cfg->indirect_branches, struct cs_indirect_branch, ibranch_idx);

         assert(ibranch->instr_idx == i);
         fprintf(ctx->dump_stream, " // ");
         util_dynarray_foreach(&ibranch->targets,
                               struct cs_indirect_branch_target, target)
         {
            fprintf(ctx->dump_stream, "%scs@%" PRIx64,
                    target == ibranch->targets.data ? "" : ",",
                    target->address);
         }
         if (ibranch->has_unknown_targets)
            fprintf(ctx->dump_stream, "%s??", ibranch->targets.size ? "," : "");
         ibranch_idx++;
         break;
      }

      case MALI_CS_OPCODE_BRANCH: {
         cs_unpack(&cfg->instrs[i], CS_BRANCH, I);
         fprintf(ctx->dump_stream, " // ");

         unsigned target = i + 1 + I.offset;

         if (target < cfg->instr_count)
            fprintf(ctx->dump_stream, "label_%" PRIx64,
                    bin + (target * sizeof(uint64_t)));
         else
            fprintf(ctx->dump_stream, "end_of_cs");
         break;
      }

      case MALI_CS_OPCODE_RUN_IDVS:
      case MALI_CS_OPCODE_RUN_FRAGMENT:
      case MALI_CS_OPCODE_RUN_COMPUTE:
      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
         fprintf(ctx->dump_stream, " // tracepoint_%" PRIx64,
                 bin + (i * sizeof(uint64_t)));
         break;

      default:
         break;
      }

      fprintf(ctx->dump_stream, "\n");
   }
   ctx->indent--;
   pandecode_log(ctx, "} // %s@%" PRIx64 "\n\n", name, bin);
}

void
GENX(pandecode_cs_binary)(struct pandecode_context *ctx, uint64_t bin,
                          uint32_t bin_size, unsigned gpu_id)
{
   if (!bin_size)
      return;

   pandecode_dump_file_open(ctx);

   struct hash_table_u64 *symbols = _mesa_hash_table_u64_create(NULL);
   struct cs_code_cfg *main_cfg = get_cs_cfg(ctx, symbols, bin, bin_size);

   print_cs_binary(ctx, bin, main_cfg, "main_cs");
   hash_table_u64_foreach(symbols, he)
   {
      struct cs_code_cfg *other_cfg = he.data;
      if (other_cfg == main_cfg)
         continue;

      print_cs_binary(ctx, he.key, other_cfg, "cs");
   }

   ralloc_free(symbols);

   pandecode_map_read_write(ctx);
}

void
GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
                         uint32_t trace_size, unsigned gpu_id)
{
   pandecode_dump_file_open(ctx);

   void *trace_data = pandecode_fetch_gpu_mem(ctx, trace, trace_size);

   while (trace_size > 0) {
      uint32_t regs[256] = {};
      uint64_t *ip = trace_data;

      uint64_t *instr = pandecode_fetch_gpu_mem(ctx, *ip, sizeof(*instr));

      /* Mali-G610 has 96 registers. Other devices are not yet supported; we
       * can make this configurable later when we encounter new Malis.
       */
      struct queue_ctx qctx = {
         .nr_regs = 96,
         .regs = regs,
         .ip = instr,
         .end = instr + 1,
         .gpu_id = gpu_id,
      };

      pandecode_make_indent(ctx);
      print_cs_instr(ctx->dump_stream, instr);
      fprintf(ctx->dump_stream, " // from tracepoint_%" PRIx64 "\n", *ip);

      cs_unpack(instr, CS_BASE, base);

      switch (base.opcode) {
      case MALI_CS_OPCODE_RUN_IDVS: {
         struct cs_run_idvs_trace *idvs_trace = trace_data;

         assert(trace_size >= sizeof(*idvs_trace));
         cs_unpack(instr, CS_RUN_IDVS, I);
         memcpy(regs, idvs_trace->sr, sizeof(idvs_trace->sr));

         if (I.draw_id_register_enable)
            regs[I.draw_id] = idvs_trace->draw_id;

         pandecode_run_idvs(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = idvs_trace + 1;
         trace_size -= sizeof(*idvs_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_FRAGMENT: {
         struct cs_run_fragment_trace *frag_trace = trace_data;

         assert(trace_size >= sizeof(*frag_trace));
         cs_unpack(instr, CS_RUN_FRAGMENT, I);
         memcpy(&regs[40], frag_trace->sr, sizeof(frag_trace->sr));
         pandecode_run_fragment(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = frag_trace + 1;
         trace_size -= sizeof(*frag_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_COMPUTE: {
         struct cs_run_compute_trace *comp_trace = trace_data;

         assert(trace_size >= sizeof(*comp_trace));
         cs_unpack(instr, CS_RUN_COMPUTE, I);
         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
         pandecode_run_compute(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = comp_trace + 1;
         trace_size -= sizeof(*comp_trace);
         break;
      }

      case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
         struct cs_run_compute_trace *comp_trace = trace_data;

         assert(trace_size >= sizeof(*comp_trace));
         cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
         memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
         pandecode_run_compute_indirect(ctx, ctx->dump_stream, &qctx, &I);
         trace_data = comp_trace + 1;
         trace_size -= sizeof(*comp_trace);
         break;
      }

      default:
         assert(!"Invalid trace packet");
         break;
      }

      pandecode_log(ctx, "\n");
   }

   fflush(ctx->dump_stream);
   pandecode_map_read_write(ctx);
}
#endif