1 /*
2 * Copyright (C) 2022-2023 Collabora, Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "util/bitset.h"
25 #include "util/hash_table.h"
26 #include "util/list.h"
27 #include "util/ralloc.h"
28
29 #include "genxml/gen_macros.h"
30 #include "decode.h"
31
32 #if PAN_ARCH >= 10
33
34 #include "genxml/cs_builder.h"
35
36 /* Limit for Mali-G610. -1 because we're not including the active frame */
37 #define MAX_CALL_STACK_DEPTH (8 - 1)
38
39 #define cs_unpack(packed, T, unpacked) pan_cast_and_unpack(packed, T, unpacked)
40
41 struct queue_ctx {
42 /* Size of CSHWIF register file in 32-bit registers */
43 unsigned nr_regs;
44
45 /* CSHWIF register file */
46 uint32_t *regs;
47
48 /* Current instruction pointer (CPU pointer for convenience) */
49 uint64_t *ip;
50
51 /* Current instruction end pointer */
52 uint64_t *end;
53
54 /* Whether currently inside an exception handler */
55 bool in_exception_handler;
56
57 /* Call stack. Depth=0 means root */
58 struct {
59 /* Link register to return to */
60 uint64_t *lr;
61
/* End pointer; a return (or exit) happens once it is reached */
63 uint64_t *end;
64 } call_stack[MAX_CALL_STACK_DEPTH + 1]; /* +1 for exception handler */
65 uint8_t call_stack_depth;
66
67 unsigned gpu_id;
68 };
69
70 static void
print_indirect(unsigned address, int16_t offset, FILE *fp)
72 {
73 if (offset)
74 fprintf(fp, "[d%u + %d]", address, offset);
75 else
76 fprintf(fp, "[d%u]", address);
77 }
78
79 static void
print_reg_tuple(unsigned base, uint16_t mask, FILE *fp)
81 {
82 bool first_reg = true;
83
84 u_foreach_bit(i, mask) {
85 fprintf(fp, "%sr%u", first_reg ? "" : ":", base + i);
86 first_reg = false;
87 }
88
89 if (mask == 0)
90 fprintf(fp, "_");
91 }
92
93 static const char *conditions_str[] = {
94 "le", "gt", "eq", "ne", "lt", "ge", "always",
95 };
96
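/*
 * Disassemble a single 64-bit CS instruction to fp. No trailing newline is
 * printed; callers append it along with any annotations.
 */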
97 static void
print_cs_instr(FILE *fp, const uint64_t *instr)
99 {
100 cs_unpack(instr, CS_BASE, base);
101 switch (base.opcode) {
102 case MALI_CS_OPCODE_NOP: {
103 cs_unpack(instr, CS_NOP, I);
104 if (I.ignored)
105 fprintf(fp, "NOP // 0x%" PRIX64, I.ignored);
106 else
107 fprintf(fp, "NOP");
108 break;
109 }
110
111 case MALI_CS_OPCODE_MOVE: {
112 cs_unpack(instr, CS_MOVE, I);
113 fprintf(fp, "MOVE d%u, #0x%" PRIX64, I.destination, I.immediate);
114 break;
115 }
116
117 case MALI_CS_OPCODE_MOVE32: {
118 cs_unpack(instr, CS_MOVE32, I);
119 fprintf(fp, "MOVE32 r%u, #0x%X", I.destination, I.immediate);
120 break;
121 }
122
123 case MALI_CS_OPCODE_WAIT: {
124 cs_unpack(instr, CS_WAIT, I);
125 fprintf(fp, "WAIT%s #%x", I.progress_increment ? ".progress_inc" : "",
126 I.wait_mask);
127 break;
128 }
129
130 case MALI_CS_OPCODE_RUN_COMPUTE: {
/* Guard against an out-of-range task_axis value */
const char *axes[4] = {"x_axis", "y_axis", "z_axis", "unknown_axis"};
132 cs_unpack(instr, CS_RUN_COMPUTE, I);
133
134 /* Print the instruction. Ignore the selects and the flags override
135 * since we'll print them implicitly later.
136 */
137 fprintf(fp, "RUN_COMPUTE%s.%s.srt%d.spd%d.tsd%d.fau%d #%u",
138 I.progress_increment ? ".progress_inc" : "", axes[I.task_axis],
139 I.srt_select, I.spd_select, I.tsd_select, I.fau_select,
140 I.task_increment);
141 break;
142 }
143
144 case MALI_CS_OPCODE_RUN_TILING: {
145 cs_unpack(instr, CS_RUN_TILING, I);
146 fprintf(fp, "RUN_TILING%s.srt%d.spd%d.tsd%d.fau%d",
147 I.progress_increment ? ".progress_inc" : "", I.srt_select,
148 I.spd_select, I.tsd_select, I.fau_select);
149 break;
150 }
151
152 case MALI_CS_OPCODE_RUN_IDVS: {
153 cs_unpack(instr, CS_RUN_IDVS, I);
154 fprintf(
155 fp,
156 "RUN_IDVS%s%s%s.varying_srt%d.varying_fau%d.varying_tsd%d.frag_srt%d.frag_tsd%d r%u, #%x",
157 I.progress_increment ? ".progress_inc" : "",
158 I.malloc_enable ? "" : ".no_malloc",
159 I.draw_id_register_enable ? ".draw_id_enable" : "",
160 I.varying_srt_select, I.varying_fau_select, I.varying_tsd_select,
161 I.fragment_srt_select, I.fragment_tsd_select, I.draw_id,
162 I.flags_override);
163 break;
164 }
165
166 case MALI_CS_OPCODE_RUN_FRAGMENT: {
167 static const char *tile_order[] = {
168 "zorder", "horizontal", "vertical", "unknown",
169 "unknown", "rev_horizontal", "rev_vertical", "unknown",
170 "unknown", "unknown", "unknown", "unknown",
171 "unknown", "unknown", "unknown", "unknown",
172 };
173 cs_unpack(instr, CS_RUN_FRAGMENT, I);
174
175 fprintf(fp, "RUN_FRAGMENT%s%s.tile_order=%s",
176 I.progress_increment ? ".progress_inc" : "",
177 I.enable_tem ? ".tile_enable_map_enable" : "",
178 tile_order[I.tile_order]);
179 break;
180 }
181
182 case MALI_CS_OPCODE_RUN_FULLSCREEN: {
183 cs_unpack(instr, CS_RUN_FULLSCREEN, I);
184 fprintf(fp, "RUN_FULLSCREEN%s r%u, #%x",
185 I.progress_increment ? ".progress_inc" : "", I.dcd,
186 I.flags_override);
187 break;
188 }
189
190 case MALI_CS_OPCODE_FINISH_TILING: {
191 cs_unpack(instr, CS_FINISH_TILING, I);
192 fprintf(fp, "FINISH_TILING%s",
193 I.progress_increment ? ".progress_inc" : "");
194 break;
195 }
196
197 case MALI_CS_OPCODE_FINISH_FRAGMENT: {
198 cs_unpack(instr, CS_FINISH_FRAGMENT, I);
199 fprintf(fp, "FINISH_FRAGMENT%s d%u, d%u, #%x, #%u",
200 I.increment_fragment_completed ? ".frag_end" : "",
201 I.last_heap_chunk, I.first_heap_chunk, I.wait_mask,
202 I.signal_slot);
203 break;
204 }
205
206 case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
207 cs_unpack(instr, CS_ADD_IMMEDIATE32, I);
208
209 fprintf(fp, "ADD_IMMEDIATE32 r%u, r%u, #%d", I.destination, I.source,
210 I.immediate);
211 break;
212 }
213
214 case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
215 cs_unpack(instr, CS_ADD_IMMEDIATE64, I);
216
217 fprintf(fp, "ADD_IMMEDIATE64 d%u, d%u, #%d", I.destination, I.source,
218 I.immediate);
219 break;
220 }
221
222 case MALI_CS_OPCODE_UMIN32: {
223 cs_unpack(instr, CS_UMIN32, I);
224
225 fprintf(fp, "UMIN32 r%u, r%u, r%u", I.destination, I.source_1,
226 I.source_2);
227 break;
228 }
229
230 case MALI_CS_OPCODE_LOAD_MULTIPLE: {
231 cs_unpack(instr, CS_LOAD_MULTIPLE, I);
232
233 fprintf(fp, "LOAD_MULTIPLE ");
234 print_reg_tuple(I.base_register, I.mask, fp);
235 fprintf(fp, ", ");
236 print_indirect(I.address, I.offset, fp);
237 break;
238 }
239
240 case MALI_CS_OPCODE_STORE_MULTIPLE: {
241 cs_unpack(instr, CS_STORE_MULTIPLE, I);
242
243 fprintf(fp, "STORE_MULTIPLE ");
244 print_indirect(I.address, I.offset, fp);
245 fprintf(fp, ", ");
246 print_reg_tuple(I.base_register, I.mask, fp);
247 break;
248 }
249
250 case MALI_CS_OPCODE_BRANCH: {
251 cs_unpack(instr, CS_BRANCH, I);
252 fprintf(fp, "BRANCH.%s r%u, #%d", conditions_str[I.condition], I.value,
253 I.offset);
254 break;
255 }
256
257 case MALI_CS_OPCODE_SET_SB_ENTRY: {
258 cs_unpack(instr, CS_SET_SB_ENTRY, I);
259 fprintf(fp, "SET_SB_ENTRY #%u, #%u", I.endpoint_entry, I.other_entry);
260 break;
261 }
262
263 case MALI_CS_OPCODE_PROGRESS_WAIT: {
264 cs_unpack(instr, CS_PROGRESS_WAIT, I);
265 fprintf(fp, "PROGRESS_WAIT d%u, #%u", I.source, I.queue);
266 break;
267 }
268
269 case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
270 cs_unpack(instr, CS_SET_EXCEPTION_HANDLER, I);
271 fprintf(fp, "SET_EXCEPTION_HANDLER d%u, r%u", I.address, I.length);
272 break;
273 }
274
275 case MALI_CS_OPCODE_CALL: {
276 cs_unpack(instr, CS_CALL, I);
277 fprintf(fp, "CALL d%u, r%u", I.address, I.length);
278 break;
279 }
280
281 case MALI_CS_OPCODE_JUMP: {
282 cs_unpack(instr, CS_JUMP, I);
283 fprintf(fp, "JUMP d%u, r%u", I.address, I.length);
284 break;
285 }
286
287 case MALI_CS_OPCODE_REQ_RESOURCE: {
288 cs_unpack(instr, CS_REQ_RESOURCE, I);
289 fprintf(fp, "REQ_RESOURCE%s%s%s%s", I.compute ? ".compute" : "",
290 I.fragment ? ".fragment" : "", I.tiler ? ".tiler" : "",
291 I.idvs ? ".idvs" : "");
292 break;
293 }
294
295 case MALI_CS_OPCODE_FLUSH_CACHE2: {
296 cs_unpack(instr, CS_FLUSH_CACHE2, I);
297 static const char *mode[] = {
298 "nop",
299 "clean",
300 "INVALID",
301 "clean_invalidate",
302 };
303
304 fprintf(fp, "FLUSH_CACHE2.%s_l2.%s_lsc%s r%u, #%x, #%u",
305 mode[I.l2_flush_mode], mode[I.lsc_flush_mode],
306 I.other_invalidate ? ".invalidate_other" : ".nop_other",
307 I.latest_flush_id, I.wait_mask, I.signal_slot);
308 break;
309 }
310
311 case MALI_CS_OPCODE_SYNC_ADD32: {
312 cs_unpack(instr, CS_SYNC_ADD32, I);
313 fprintf(fp, "SYNC_ADD32%s%s [d%u], r%u, #%x, #%u",
314 I.error_propagate ? ".error_propagate" : "",
315 I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
316 I.data, I.wait_mask, I.signal_slot);
317 break;
318 }
319
320 case MALI_CS_OPCODE_SYNC_SET32: {
321 cs_unpack(instr, CS_SYNC_SET32, I);
322 fprintf(fp, "SYNC_SET32.%s%s [d%u], r%u, #%x, #%u",
323 I.error_propagate ? ".error_propagate" : "",
324 I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
325 I.data, I.wait_mask, I.signal_slot);
326 break;
327 }
328
329 case MALI_CS_OPCODE_SYNC_WAIT32: {
330 cs_unpack(instr, CS_SYNC_WAIT32, I);
331 fprintf(fp, "SYNC_WAIT32%s%s d%u, r%u", conditions_str[I.condition],
332 I.error_reject ? ".reject" : ".inherit", I.address, I.data);
333 break;
334 }
335
336 case MALI_CS_OPCODE_STORE_STATE: {
337 static const char *states_str[] = {
338 "SYSTEM_TIMESTAMP",
339 "CYCLE_COUNT",
340 "DISJOINT_COUNT",
341 "ERROR_STATE",
342 };
343
344 cs_unpack(instr, CS_STORE_STATE, I);
345 fprintf(fp, "STORE_STATE.%s d%u, #%i, #%x, #%u",
346 I.state >= ARRAY_SIZE(states_str) ? "UNKNOWN_STATE"
347 : states_str[I.state],
348 I.address, I.offset, I.wait_mask, I.signal_slot);
349 break;
350 }
351
352 case MALI_CS_OPCODE_PROT_REGION: {
353 cs_unpack(instr, CS_PROT_REGION, I);
354 fprintf(fp, "PROT_REGION #%u", I.size);
355 break;
356 }
357
358 case MALI_CS_OPCODE_PROGRESS_STORE: {
359 cs_unpack(instr, CS_PROGRESS_STORE, I);
360 fprintf(fp, "PROGRESS_STORE d%u", I.source);
361 break;
362 }
363
364 case MALI_CS_OPCODE_PROGRESS_LOAD: {
365 cs_unpack(instr, CS_PROGRESS_LOAD, I);
366 fprintf(fp, "PROGRESS_LOAD d%u", I.destination);
367 break;
368 }
369
370 case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
371 cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
372 fprintf(fp, "RUN_COMPUTE_INDIRECT%s.srt%d.spd%d.tsd%d.fau%d #%u",
373 I.progress_increment ? ".progress_inc" : "", I.srt_select,
374 I.spd_select, I.tsd_select, I.fau_select, I.workgroups_per_task);
375
376 break;
377 }
378
379 case MALI_CS_OPCODE_ERROR_BARRIER: {
380 cs_unpack(instr, CS_ERROR_BARRIER, I);
381 fprintf(fp, "ERROR_BARRIER");
382 break;
383 }
384
385 case MALI_CS_OPCODE_HEAP_SET: {
386 cs_unpack(instr, CS_HEAP_SET, I);
387 fprintf(fp, "HEAP_SET d%u", I.address);
388 break;
389 }
390
391 case MALI_CS_OPCODE_HEAP_OPERATION: {
392 cs_unpack(instr, CS_HEAP_OPERATION, I);
393 const char *counter_names[] = {"vt_start", "vt_end", NULL, "frag_end"};
394 fprintf(fp, "HEAP_OPERATION.%s #%x, #%d", counter_names[I.operation],
395 I.wait_mask, I.signal_slot);
396 break;
397 }
398
399 case MALI_CS_OPCODE_TRACE_POINT: {
400 cs_unpack(instr, CS_TRACE_POINT, I);
401 fprintf(fp, "TRACE_POINT r%d:r%d, #%x, #%u", I.base_register,
402 I.base_register + I.register_count - 1, I.wait_mask,
403 I.signal_slot);
404 break;
405 }
406
407 case MALI_CS_OPCODE_SYNC_ADD64: {
408 cs_unpack(instr, CS_SYNC_ADD64, I);
409 fprintf(fp, "SYNC_ADD64%s%s [d%u], d%u, #%x, #%u",
410 I.error_propagate ? ".error_propagate" : "",
411 I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
412 I.data, I.wait_mask, I.signal_slot);
413 break;
414 }
415
416 case MALI_CS_OPCODE_SYNC_SET64: {
417 cs_unpack(instr, CS_SYNC_SET64, I);
418 fprintf(fp, "SYNC_SET64.%s%s [d%u], d%u, #%x, #%u",
419 I.error_propagate ? ".error_propagate" : "",
420 I.scope == MALI_CS_SYNC_SCOPE_CSG ? ".csg" : ".system", I.address,
421 I.data, I.wait_mask, I.signal_slot);
422 break;
423 }
424
425 case MALI_CS_OPCODE_SYNC_WAIT64: {
426 cs_unpack(instr, CS_SYNC_WAIT64, I);
427
428 fprintf(fp, "SYNC_WAIT64%s%s d%u, d%u", conditions_str[I.condition],
429 I.error_reject ? ".reject" : ".inherit", I.address, I.data);
430 break;
431 }
432
433 default: {
434 fprintf(fp, "UNKNOWN_%u 0x%" PRIX64 "\n", base.opcode, base.data);
435 break;
436 }
437 }
438 }
439
440 static uint32_t
cs_get_u32(struct queue_ctx *qctx, uint8_t reg)
442 {
443 assert(reg < qctx->nr_regs);
444 return qctx->regs[reg];
445 }
446
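/* 64-bit "d" registers are stored as two consecutive 32-bit registers,
 * low word first.
 */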
447 static uint64_t
cs_get_u64(struct queue_ctx *qctx, uint8_t reg)
449 {
450 return (((uint64_t)cs_get_u32(qctx, reg + 1)) << 32) | cs_get_u32(qctx, reg);
451 }
452
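/*
 * Decode the state consumed by a RUN_COMPUTE job. The SRT/FAU/SPD/TSD
 * selects pick one of several 64-bit register pairs holding descriptor
 * pointers; the referenced resource tables, FAU region, shader and local
 * storage are dumped along with the job dimension registers (r32-r39).
 */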
453 static void
pandecode_run_compute(struct pandecode_context *ctx, FILE *fp,
455 struct queue_ctx *qctx, struct MALI_CS_RUN_COMPUTE *I)
456 {
457 if (qctx->in_exception_handler)
458 return;
459
460 ctx->indent++;
461
462 unsigned reg_srt = 0 + (I->srt_select * 2);
463 unsigned reg_fau = 8 + (I->fau_select * 2);
464 unsigned reg_spd = 16 + (I->spd_select * 2);
465 unsigned reg_tsd = 24 + (I->tsd_select * 2);
466
467 GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");
468
469 uint64_t fau = cs_get_u64(qctx, reg_fau);
470
471 if (fau)
472 GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
473
474 GENX(pandecode_shader)
475 (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
476
477 DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
478 "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
479
480 pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
481 DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
482 pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
483 pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
484 pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
485 pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
486 pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
487 pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));
488
489 ctx->indent--;
490 }
491
492 static void
pandecode_run_compute_indirect(struct pandecode_context *ctx, FILE *fp,
494 struct queue_ctx *qctx,
495 struct MALI_CS_RUN_COMPUTE_INDIRECT *I)
496 {
497 if (qctx->in_exception_handler)
498 return;
499
500 ctx->indent++;
501
502 unsigned reg_srt = 0 + (I->srt_select * 2);
503 unsigned reg_fau = 8 + (I->fau_select * 2);
504 unsigned reg_spd = 16 + (I->spd_select * 2);
505 unsigned reg_tsd = 24 + (I->tsd_select * 2);
506
507 GENX(pandecode_resource_tables)(ctx, cs_get_u64(qctx, reg_srt), "Resources");
508
509 uint64_t fau = cs_get_u64(qctx, reg_fau);
510
511 if (fau)
512 GENX(pandecode_fau)(ctx, fau & BITFIELD64_MASK(48), fau >> 56, "FAU");
513
514 GENX(pandecode_shader)
515 (ctx, cs_get_u64(qctx, reg_spd), "Shader", qctx->gpu_id);
516
517 DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_tsd),
518 "Local Storage @%" PRIx64 ":\n", cs_get_u64(qctx, reg_tsd));
519
520 pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
521 DUMP_CL(ctx, COMPUTE_SIZE_WORKGROUP, &qctx->regs[33], "Workgroup size\n");
522 pandecode_log(ctx, "Job offset X: %u\n", cs_get_u32(qctx, 34));
523 pandecode_log(ctx, "Job offset Y: %u\n", cs_get_u32(qctx, 35));
524 pandecode_log(ctx, "Job offset Z: %u\n", cs_get_u32(qctx, 36));
525 pandecode_log(ctx, "Job size X: %u\n", cs_get_u32(qctx, 37));
526 pandecode_log(ctx, "Job size Y: %u\n", cs_get_u32(qctx, 38));
527 pandecode_log(ctx, "Job size Z: %u\n", cs_get_u32(qctx, 39));
528
529 ctx->indent--;
530 }
531
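/*
 * Decode the state consumed by a RUN_TILING job: the primitive flags
 * (register r56 merged with the instruction's flag override), the resource
 * table, FAU region, shader and local storage selected by the instruction,
 * plus the fixed draw-state registers (scissor, depth clamps, blend,
 * depth/stencil, indices, ...).
 */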
532 static void
pandecode_run_tiling(struct pandecode_context *ctx, FILE *fp,
534 struct queue_ctx *qctx, struct MALI_CS_RUN_TILING *I)
535 {
536 if (qctx->in_exception_handler)
537 return;
538
539 ctx->indent++;
540
541 /* Merge flag overrides with the register flags */
542 struct mali_primitive_flags_packed tiler_flags_packed = {
543 .opaque[0] = cs_get_u32(qctx, 56) | I->flags_override,
544 };
545 pan_unpack(&tiler_flags_packed, PRIMITIVE_FLAGS, tiler_flags);
546
547 unsigned reg_srt = I->srt_select * 2;
548 unsigned reg_fau = 8 + I->fau_select * 2;
549 unsigned reg_spd = 16 + I->spd_select * 2;
unsigned reg_tsd = 24 + I->tsd_select * 2;
551
552 uint64_t srt = cs_get_u64(qctx, reg_srt);
553 uint64_t fau = cs_get_u64(qctx, reg_fau);
554 uint64_t spd = cs_get_u64(qctx, reg_spd);
555 uint64_t tsd = cs_get_u64(qctx, reg_tsd);
556
557 if (srt)
558 GENX(pandecode_resource_tables)(ctx, srt, "Fragment resources");
559
560 if (fau) {
561 uint64_t lo = fau & BITFIELD64_MASK(48);
562 uint64_t hi = fau >> 56;
563
564 GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
565 }
566
567 if (spd) {
568 GENX(pandecode_shader)
569 (ctx, spd, "Fragment shader", qctx->gpu_id);
570 }
571
572 DUMP_ADDR(ctx, LOCAL_STORAGE, tsd, "Fragment Local Storage @%" PRIx64 ":\n",
573 tsd);
574
575 pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
576 pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
577 pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));
578
579 if (tiler_flags.index_type)
580 pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));
581
582 pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
583 pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));
584
585 if (tiler_flags.index_type)
586 pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));
587
588 GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);
589
590 DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
591 pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
592 pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
593 pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));
594 pandecode_log(ctx, "Vertex position array: %" PRIx64 "\n",
595 cs_get_u64(qctx, 48));
596
597 uint64_t blend = cs_get_u64(qctx, 50);
598 GENX(pandecode_blend_descs)(ctx, blend & ~15, blend & 15, 0, qctx->gpu_id);
599
600 DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");
601
602 if (tiler_flags.index_type)
603 pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));
604
605 DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
606 DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
607 DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
608 pandecode_log(ctx, "Vertex bounds: %u\n", cs_get_u32(qctx, 59));
609 DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");
610
611 ctx->indent--;
612 }
613
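/*
 * Decode the state consumed by a RUN_IDVS job. IDVS runs up to three
 * stages (position, varying, fragment), each with its own resource table,
 * FAU region and local storage selected by the instruction; the varying
 * shader is only dumped when the primitive flags request a secondary
 * shader.
 */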
614 static void
pandecode_run_idvs(struct pandecode_context *ctx, FILE *fp,
616 struct queue_ctx *qctx, struct MALI_CS_RUN_IDVS *I)
617 {
618 if (qctx->in_exception_handler)
619 return;
620
621 ctx->indent++;
622
623 /* Merge flag overrides with the register flags */
624 struct mali_primitive_flags_packed tiler_flags_packed = {
625 .opaque[0] = cs_get_u32(qctx, 56) | I->flags_override,
626 };
627 pan_unpack(&tiler_flags_packed, PRIMITIVE_FLAGS, tiler_flags);
628
629 unsigned reg_position_srt = 0;
630 unsigned reg_position_fau = 8;
631 unsigned reg_position_tsd = 24;
632
633 unsigned reg_vary_srt = I->varying_srt_select ? 2 : 0;
634 unsigned reg_vary_fau = I->varying_fau_select ? 10 : 8;
635 unsigned reg_vary_tsd = I->varying_tsd_select ? 26 : 24;
636
637 unsigned reg_frag_srt = I->fragment_srt_select ? 4 : 0;
638 unsigned reg_frag_fau = 12;
639 unsigned reg_frag_tsd = I->fragment_tsd_select ? 28 : 24;
640
641 uint64_t position_srt = cs_get_u64(qctx, reg_position_srt);
642 uint64_t vary_srt = cs_get_u64(qctx, reg_vary_srt);
643 uint64_t frag_srt = cs_get_u64(qctx, reg_frag_srt);
644
645 if (position_srt)
646 GENX(pandecode_resource_tables)(ctx, position_srt, "Position resources");
647
648 if (vary_srt)
649 GENX(pandecode_resource_tables)(ctx, vary_srt, "Varying resources");
650
651 if (frag_srt)
652 GENX(pandecode_resource_tables)(ctx, frag_srt, "Fragment resources");
653
654 uint64_t position_fau = cs_get_u64(qctx, reg_position_fau);
655 uint64_t vary_fau = cs_get_u64(qctx, reg_vary_fau);
656 uint64_t fragment_fau = cs_get_u64(qctx, reg_frag_fau);
657
658 if (position_fau) {
659 uint64_t lo = position_fau & BITFIELD64_MASK(48);
660 uint64_t hi = position_fau >> 56;
661
662 GENX(pandecode_fau)(ctx, lo, hi, "Position FAU");
663 }
664
665 if (vary_fau) {
666 uint64_t lo = vary_fau & BITFIELD64_MASK(48);
667 uint64_t hi = vary_fau >> 56;
668
669 GENX(pandecode_fau)(ctx, lo, hi, "Varying FAU");
670 }
671
672 if (fragment_fau) {
673 uint64_t lo = fragment_fau & BITFIELD64_MASK(48);
674 uint64_t hi = fragment_fau >> 56;
675
676 GENX(pandecode_fau)(ctx, lo, hi, "Fragment FAU");
677 }
678
679 if (cs_get_u64(qctx, 16)) {
680 GENX(pandecode_shader)
681 (ctx, cs_get_u64(qctx, 16), "Position shader", qctx->gpu_id);
682 }
683
684 if (tiler_flags.secondary_shader) {
685 uint64_t ptr = cs_get_u64(qctx, 18);
686
687 GENX(pandecode_shader)(ctx, ptr, "Varying shader", qctx->gpu_id);
688 }
689
690 if (cs_get_u64(qctx, 20)) {
691 GENX(pandecode_shader)
692 (ctx, cs_get_u64(qctx, 20), "Fragment shader", qctx->gpu_id);
693 }
694
695 DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_position_tsd),
696 "Position Local Storage @%" PRIx64 ":\n",
697 cs_get_u64(qctx, reg_position_tsd));
698 DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_vary_tsd),
699 "Varying Local Storage @%" PRIx64 ":\n",
700 cs_get_u64(qctx, reg_vary_tsd));
701 DUMP_ADDR(ctx, LOCAL_STORAGE, cs_get_u64(qctx, reg_frag_tsd),
702 "Fragment Local Storage @%" PRIx64 ":\n",
703 cs_get_u64(qctx, reg_frag_tsd));
704
705 pandecode_log(ctx, "Global attribute offset: %u\n", cs_get_u32(qctx, 32));
706 pandecode_log(ctx, "Index count: %u\n", cs_get_u32(qctx, 33));
707 pandecode_log(ctx, "Instance count: %u\n", cs_get_u32(qctx, 34));
708
709 if (tiler_flags.index_type)
710 pandecode_log(ctx, "Index offset: %u\n", cs_get_u32(qctx, 35));
711
712 pandecode_log(ctx, "Vertex offset: %d\n", cs_get_u32(qctx, 36));
713 pandecode_log(ctx, "Instance offset: %u\n", cs_get_u32(qctx, 37));
714 pandecode_log(ctx, "Tiler DCD flags2: %X\n", cs_get_u32(qctx, 38));
715
716 if (tiler_flags.index_type)
717 pandecode_log(ctx, "Index array size: %u\n", cs_get_u32(qctx, 39));
718
719 GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);
720
721 DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
722 pandecode_log(ctx, "Low depth clamp: %f\n", uif(cs_get_u32(qctx, 44)));
723 pandecode_log(ctx, "High depth clamp: %f\n", uif(cs_get_u32(qctx, 45)));
724 pandecode_log(ctx, "Occlusion: %" PRIx64 "\n", cs_get_u64(qctx, 46));
725
726 if (tiler_flags.secondary_shader)
727 pandecode_log(ctx, "Varying allocation: %u\n", cs_get_u32(qctx, 48));
728
729 uint64_t blend = cs_get_u64(qctx, 50);
730 GENX(pandecode_blend_descs)(ctx, blend & ~15, blend & 15, 0, qctx->gpu_id);
731
732 DUMP_ADDR(ctx, DEPTH_STENCIL, cs_get_u64(qctx, 52), "Depth/stencil");
733
734 if (tiler_flags.index_type)
735 pandecode_log(ctx, "Indices: %" PRIx64 "\n", cs_get_u64(qctx, 54));
736
737 DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
738 DUMP_CL(ctx, DCD_FLAGS_0, &qctx->regs[57], "DCD Flags 0\n");
739 DUMP_CL(ctx, DCD_FLAGS_1, &qctx->regs[58], "DCD Flags 1\n");
740 DUMP_CL(ctx, PRIMITIVE_SIZE, &qctx->regs[60], "Primitive size\n");
741
742 ctx->indent--;
743 }
744
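/* Decode a RUN_FRAGMENT job: the scissor from r42 and the framebuffer
 * descriptor pointed to by r40 (the low 6 bits are masked off).
 */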
745 static void
pandecode_run_fragment(struct pandecode_context *ctx, FILE *fp,
747 struct queue_ctx *qctx, struct MALI_CS_RUN_FRAGMENT *I)
748 {
749 if (qctx->in_exception_handler)
750 return;
751
752 ctx->indent++;
753
754 DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
755
756 /* TODO: Tile enable map */
757 GENX(pandecode_fbd)
758 (ctx, cs_get_u64(qctx, 40) & ~0x3full, true, qctx->gpu_id);
759
760 ctx->indent--;
761 }
762
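/* Decode a RUN_FULLSCREEN job: primitive flags (r56 merged with the
 * instruction's override), the tiler context from r40, the scissor from
 * r42 and the DCD pointed to by the register named in the instruction.
 */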
763 static void
pandecode_run_fullscreen(struct pandecode_context *ctx, FILE *fp,
765 struct queue_ctx *qctx,
766 struct MALI_CS_RUN_FULLSCREEN *I)
767 {
768 if (qctx->in_exception_handler)
769 return;
770
771 ctx->indent++;
772
773 /* Merge flag overrides with the register flags */
774 struct mali_primitive_flags_packed tiler_flags_packed = {
775 .opaque[0] = cs_get_u32(qctx, 56) | I->flags_override,
776 };
777 pan_unpack(&tiler_flags_packed, PRIMITIVE_FLAGS, tiler_flags);
778 DUMP_UNPACKED(ctx, PRIMITIVE_FLAGS, tiler_flags, "Primitive flags\n");
779
780 GENX(pandecode_tiler)(ctx, cs_get_u64(qctx, 40), qctx->gpu_id);
781
782 DUMP_CL(ctx, SCISSOR, &qctx->regs[42], "Scissor\n");
783
784 pan_unpack(
785 PANDECODE_PTR(ctx, cs_get_u64(qctx, I->dcd), struct mali_draw_packed),
786 DRAW, dcd);
787 GENX(pandecode_dcd)(ctx, &dcd, 0, qctx->gpu_id);
788
789 ctx->indent--;
790 }
791
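/*
 * Redirect execution to the buffer described by an (address, length)
 * register pair. Shared by CALL, JUMP and SET_EXCEPTION_HANDLER; returns
 * false on a malformed target so the interpreter stops, true otherwise
 * (the caller then skips the usual IP update).
 */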
792 static bool
interpret_cs_jump(struct pandecode_context *ctx, struct queue_ctx *qctx,
794 uint64_t reg_address, uint32_t reg_length)
795 {
796 uint32_t address_lo = qctx->regs[reg_address];
797 uint32_t address_hi = qctx->regs[reg_address + 1];
798 uint32_t length = qctx->regs[reg_length];
799
800 if (length % 8) {
801 fprintf(stderr, "CS call alignment error\n");
802 return false;
803 }
804
805 /* Map the entire subqueue now */
806 uint64_t address = ((uint64_t)address_hi << 32) | address_lo;
807 /* Return if the jump is for an exception handler that's set to zero */
808 if (qctx->in_exception_handler && (!address || !length)) {
809 qctx->in_exception_handler = false;
810 qctx->call_stack_depth--;
811 return true;
812 }
813 uint64_t *cs = pandecode_fetch_gpu_mem(ctx, address, length);
814
815 qctx->ip = cs;
816 qctx->end = cs + (length / 8);
817
818 /* Skip the usual IP update */
819 return true;
820 }
821
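/* Evaluate a branch condition against the named register, interpreted as a
 * signed 32-bit value compared with zero.
 */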
822 static bool
eval_cond(struct queue_ctx *qctx, enum mali_cs_condition cond, uint32_t reg)
824 {
825 int32_t val = qctx->regs[reg];
826
827 switch (cond) {
828 case MALI_CS_CONDITION_LEQUAL:
829 return val <= 0;
830 case MALI_CS_CONDITION_EQUAL:
831 return val == 0;
832 case MALI_CS_CONDITION_LESS:
833 return val < 0;
834 case MALI_CS_CONDITION_GREATER:
835 return val > 0;
836 case MALI_CS_CONDITION_NEQUAL:
837 return val != 0;
838 case MALI_CS_CONDITION_GEQUAL:
839 return val >= 0;
840 case MALI_CS_CONDITION_ALWAYS:
841 return true;
842 default:
843 assert(!"Invalid condition");
844 return false;
845 }
846 }
847
848 static void
interpret_cs_branch(struct pandecode_context *ctx, struct queue_ctx *qctx,
850 int16_t offset, enum mali_cs_condition cond, uint32_t reg)
851 {
852 if (eval_cond(qctx, cond, reg))
853 qctx->ip += offset;
854 }
855
856 /*
857 * Interpret a single instruction of the CS, updating the register file,
858 * instruction pointer, and call stack. Memory access and GPU controls are
859 * ignored for now.
860 *
861 * Returns true if execution should continue.
862 */
863 static bool
interpret_cs_instr(struct pandecode_context *ctx, struct queue_ctx *qctx)
865 {
866 FILE *fp = ctx->dump_stream;
867 /* Unpack the base so we get the opcode */
868 uint8_t *bytes = (uint8_t *)qctx->ip;
869 cs_unpack(bytes, CS_BASE, base);
870
871 assert(qctx->ip < qctx->end);
872
873 /* Don't try to keep track of registers/operations inside exception handler */
874 if (qctx->in_exception_handler) {
875 assert(base.opcode != MALI_CS_OPCODE_SET_EXCEPTION_HANDLER);
876 goto no_interpret;
877 }
878
879 switch (base.opcode) {
880 case MALI_CS_OPCODE_RUN_COMPUTE: {
881 cs_unpack(bytes, CS_RUN_COMPUTE, I);
882 pandecode_run_compute(ctx, fp, qctx, &I);
883 break;
884 }
885
886 case MALI_CS_OPCODE_RUN_TILING: {
887 cs_unpack(bytes, CS_RUN_TILING, I);
888 pandecode_run_tiling(ctx, fp, qctx, &I);
889 break;
890 }
891
892 case MALI_CS_OPCODE_RUN_IDVS: {
893 cs_unpack(bytes, CS_RUN_IDVS, I);
894 pandecode_run_idvs(ctx, fp, qctx, &I);
895 break;
896 }
897
898 case MALI_CS_OPCODE_RUN_FRAGMENT: {
899 cs_unpack(bytes, CS_RUN_FRAGMENT, I);
900 pandecode_run_fragment(ctx, fp, qctx, &I);
901 break;
902 }
903
904 case MALI_CS_OPCODE_RUN_FULLSCREEN: {
905 cs_unpack(bytes, CS_RUN_FULLSCREEN, I);
906 pandecode_run_fullscreen(ctx, fp, qctx, &I);
907 break;
908 }
909
910 case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
911 cs_unpack(bytes, CS_RUN_COMPUTE_INDIRECT, I);
912 pandecode_run_compute_indirect(ctx, fp, qctx, &I);
913 break;
914 }
915
916 case MALI_CS_OPCODE_MOVE: {
917 cs_unpack(bytes, CS_MOVE, I);
918
919 qctx->regs[I.destination + 0] = (uint32_t)I.immediate;
920 qctx->regs[I.destination + 1] = (uint32_t)(I.immediate >> 32);
921 break;
922 }
923
924 case MALI_CS_OPCODE_MOVE32: {
925 cs_unpack(bytes, CS_MOVE32, I);
926
927 qctx->regs[I.destination] = I.immediate;
928 break;
929 }
930
931 case MALI_CS_OPCODE_LOAD_MULTIPLE: {
932 cs_unpack(bytes, CS_LOAD_MULTIPLE, I);
933 uint64_t addr =
934 ((uint64_t)qctx->regs[I.address + 1] << 32) | qctx->regs[I.address];
935 addr += I.offset;
936
937 uint32_t *src =
938 pandecode_fetch_gpu_mem(ctx, addr, util_last_bit(I.mask) * 4);
939
940 for (uint32_t i = 0; i < 16; i++) {
941 if (I.mask & BITFIELD_BIT(i))
942 qctx->regs[I.base_register + i] = src[i];
943 }
944 break;
945 }
946
947 case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
948 cs_unpack(bytes, CS_ADD_IMMEDIATE32, I);
949
950 qctx->regs[I.destination] = qctx->regs[I.source] + I.immediate;
951 break;
952 }
953
954 case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
955 cs_unpack(bytes, CS_ADD_IMMEDIATE64, I);
956
957 int64_t value =
958 (qctx->regs[I.source] | ((int64_t)qctx->regs[I.source + 1] << 32)) +
959 I.immediate;
960
961 qctx->regs[I.destination] = value;
962 qctx->regs[I.destination + 1] = value >> 32;
963 break;
964 }
965
966 case MALI_CS_OPCODE_CALL: {
967 cs_unpack(bytes, CS_CALL, I);
968
969 if (qctx->call_stack_depth == MAX_CALL_STACK_DEPTH) {
970 fprintf(stderr, "CS call stack overflow\n");
971 return false;
972 }
973
974 assert(qctx->call_stack_depth < MAX_CALL_STACK_DEPTH);
975
976 qctx->ip++;
977
978 /* Note: tail calls are not optimized in the hardware. */
979 assert(qctx->ip <= qctx->end);
980
981 unsigned depth = qctx->call_stack_depth++;
982
983 qctx->call_stack[depth].lr = qctx->ip;
984 qctx->call_stack[depth].end = qctx->end;
985
986 return interpret_cs_jump(ctx, qctx, I.address, I.length);
987 }
988
989 case MALI_CS_OPCODE_SET_EXCEPTION_HANDLER: {
990 cs_unpack(bytes, CS_SET_EXCEPTION_HANDLER, I);
991
992 assert(qctx->call_stack_depth < MAX_CALL_STACK_DEPTH);
993
994 qctx->ip++;
995
996 /* Note: tail calls are not optimized in the hardware. */
997 assert(qctx->ip <= qctx->end);
998
999 unsigned depth = qctx->call_stack_depth++;
1000
1001 qctx->call_stack[depth].lr = qctx->ip;
1002 qctx->call_stack[depth].end = qctx->end;
1003
/* The exception handler can use the full call-stack depth, but we don't
 * try to track nested JUMP/CALL instructions inside it since we don't know
 * what the register/memory contents will be when the handler is triggered. */
1007 qctx->in_exception_handler = true;
1008
1009 return interpret_cs_jump(ctx, qctx, I.address, I.length);
1010 }
1011
1012 case MALI_CS_OPCODE_JUMP: {
1013 cs_unpack(bytes, CS_JUMP, I);
1014
1015 if (qctx->call_stack_depth == 0) {
1016 fprintf(stderr, "Cannot jump from the entrypoint\n");
1017 return false;
1018 }
1019
1020 return interpret_cs_jump(ctx, qctx, I.address, I.length);
1021 }
1022
1023 case MALI_CS_OPCODE_BRANCH: {
1024 cs_unpack(bytes, CS_BRANCH, I);
1025
1026 interpret_cs_branch(ctx, qctx, I.offset, I.condition, I.value);
1027 break;
1028 }
1029
1030 default:
1031 break;
1032 }
1033
1034 no_interpret:
1035
1036 /* Update IP first to point to the next instruction, so call doesn't
1037 * require special handling (even for tail calls).
1038 */
1039 qctx->ip++;
1040
1041 while (qctx->ip == qctx->end) {
1042 /* Graceful termination */
1043 if (qctx->call_stack_depth == 0)
1044 return false;
1045
1046 /* Pop off the call stack */
1047 unsigned old_depth = --qctx->call_stack_depth;
1048
1049 qctx->ip = qctx->call_stack[old_depth].lr;
1050 qctx->end = qctx->call_stack[old_depth].end;
1051 qctx->in_exception_handler = false;
1052 }
1053
1054 return true;
1055 }
1056
1057 void
GENX(pandecode_interpret_cs)(struct pandecode_context *ctx, uint64_t queue,
1059 uint32_t size, unsigned gpu_id, uint32_t *regs)
1060 {
1061 pandecode_dump_file_open(ctx);
1062
1063 uint64_t *cs = pandecode_fetch_gpu_mem(ctx, queue, size);
1064
/* Mali-G610 has 96 registers. Other devices are not yet supported; we can
 * make this configurable later when we encounter new Malis.
 */
1068 struct queue_ctx qctx = {
1069 .nr_regs = 96,
1070 .regs = regs,
1071 .ip = cs,
1072 .end = cs + (size / 8),
1073 .gpu_id = gpu_id,
1074
1075 /* If this is a kernel mode queue, we don't see the root ring buffer and
1076 * we must adjust the initial call stack depth accordingly.
1077 */
1078 .call_stack_depth = ctx->usermode_queue ? 0 : 1,
1079 };
1080 FILE *fp = ctx->dump_stream;
1081
1082 if (size) {
1083 do {
1084 uint64_t instr = *qctx.ip;
1085
1086 fprintf(fp, " ");
1087 for (unsigned b = 0; b < 8; ++b)
1088 fprintf(fp, " %02x", (uint8_t)(instr >> (8 * b)));
1089
1090 for (int i = 0; i < 1 + qctx.call_stack_depth; ++i)
1091 fprintf(fp, " ");
1092
1093 print_cs_instr(fp, qctx.ip);
1094 fprintf(fp, "\n");
1095 } while (interpret_cs_instr(ctx, &qctx));
1096 }
1097
1098 fflush(ctx->dump_stream);
1099 pandecode_map_read_write(ctx);
1100 }
1101
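/*
 * Basic block used by the static CS disassembler: a contiguous range of
 * instructions (start index + size) with predecessor instruction indices
 * and up to two successor instruction indices (~0 meaning none).
 */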
1102 struct cs_code_block {
1103 struct list_head node;
1104 unsigned start;
1105 unsigned size;
1106 struct util_dynarray predecessors;
1107 unsigned successors[2];
1108 };
1109
1110 struct cs_indirect_branch_target {
1111 uint64_t address;
1112 uint32_t length;
1113 };
1114
1115 struct cs_indirect_branch {
1116 unsigned instr_idx;
1117 bool has_unknown_targets;
1118 struct util_dynarray targets;
1119 };
1120
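/* Control-flow graph for one CS binary: the raw instructions, a
 * per-instruction block map, and the indirect branches (JUMP/CALL)
 * discovered while building it.
 */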
1121 struct cs_code_cfg {
1122 uint64_t *instrs;
1123 unsigned instr_count;
1124 struct cs_code_block **blk_map;
1125 struct util_dynarray indirect_branches;
1126 };
1127
1128 static struct cs_code_block *
cs_code_block_alloc(void *alloc_ctx, unsigned start, unsigned size)
1130 {
1131 struct cs_code_block *block = rzalloc(alloc_ctx, struct cs_code_block);
1132
1133 block->start = start;
1134 block->size = size;
1135 memset(block->successors, ~0, sizeof(block->successors));
1136 list_inithead(&block->node);
1137 util_dynarray_init(&block->predecessors, alloc_ctx);
1138 return block;
1139 }
1140
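/*
 * Re-run the constant-producing instructions (MOVE, MOVE32, ADD_IMMEDIATE,
 * UMIN32) along the resolved block stack to reconstruct the register file,
 * then record the (address, length) pair consumed by the indirect branch.
 */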
1141 static void
record_indirect_branch_target(struct cs_code_cfg *cfg,
1143 struct list_head *blk_stack,
1144 struct cs_code_block *cur_blk, unsigned blk_offs,
1145 struct cs_indirect_branch *ibranch)
1146 {
1147 union {
1148 uint32_t u32[256];
1149 uint64_t u64[128];
1150 } reg_file = {0};
1151
1152 list_add(&cur_blk->node, blk_stack);
1153 list_for_each_entry(struct cs_code_block, blk, blk_stack, node) {
1154 for (; blk_offs < blk->size &&
1155 blk->start + blk_offs != ibranch->instr_idx;
1156 blk_offs++) {
1157 const uint64_t *instr = &cfg->instrs[blk->start + blk_offs];
1158 cs_unpack(instr, CS_BASE, base);
1159 switch (base.opcode) {
1160 case MALI_CS_OPCODE_MOVE: {
1161 cs_unpack(instr, CS_MOVE, I);
1162
1163 assert(I.destination % 2 == 0 &&
1164 "Destination register should be aligned to 2");
1165
1166 reg_file.u64[I.destination / 2] = I.immediate;
1167 break;
1168 }
1169
1170 case MALI_CS_OPCODE_MOVE32: {
1171 cs_unpack(instr, CS_MOVE32, I);
1172 reg_file.u32[I.destination] = I.immediate;
1173 break;
1174 }
1175
1176 case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
1177 cs_unpack(instr, CS_ADD_IMMEDIATE32, I);
1178 reg_file.u32[I.destination] = reg_file.u32[I.source] + I.immediate;
1179 break;
1180 }
1181
1182 case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
1183 cs_unpack(instr, CS_ADD_IMMEDIATE64, I);
1184
1185 assert(I.destination % 2 == 0 &&
1186 "Destination register should be aligned to 2");
1187 assert(I.source % 2 == 0 &&
1188 "Source register should be aligned to 2");
1189
1190 reg_file.u64[I.destination / 2] =
1191 reg_file.u64[I.source / 2] + I.immediate;
1192 break;
1193 }
1194
1195 case MALI_CS_OPCODE_UMIN32: {
1196 cs_unpack(instr, CS_UMIN32, I);
1197 reg_file.u32[I.destination] =
1198 MIN2(reg_file.u32[I.source_1], reg_file.u32[I.source_2]);
1199 break;
1200 }
1201
1202 default:
1203 break;
1204 }
1205 }
1206 blk_offs = 0;
1207 }
1208 list_delinit(&cur_blk->node);
1209
1210 uint64_t *instr = &cfg->instrs[ibranch->instr_idx];
1211 cs_unpack(instr, CS_JUMP, I);
1212
1213 assert(I.address % 2 == 0 && "Address register should be aligned to 2");
1214
1215 struct cs_indirect_branch_target target = {
1216 .address = reg_file.u64[I.address / 2],
1217 .length = reg_file.u32[I.length],
1218 };
1219
1220 util_dynarray_append(&ibranch->targets, struct cs_indirect_branch_target,
1221 target);
1222 }
1223
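/*
 * Walk backwards from an indirect branch, tracking which registers still
 * feed its address/length operands. Once every tracked register has been
 * traced back to an immediate, the target can be computed; if a tracked
 * register is loaded from memory (LOAD_MULTIPLE, PROGRESS_LOAD), or the
 * walk runs out of predecessors or loops, the branch is flagged as having
 * unknown targets.
 */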
1224 static void
collect_indirect_branch_targets_recurse(struct cs_code_cfg *cfg,
1226 struct list_head *blk_stack,
1227 BITSET_WORD *track_map,
1228 struct cs_code_block *cur_blk,
1229 int instr_ptr,
1230 struct cs_indirect_branch *ibranch)
1231 {
1232 for (; instr_ptr >= (int)cur_blk->start; instr_ptr--) {
1233 assert(instr_ptr >= 0);
1234 const uint64_t *instr = &cfg->instrs[instr_ptr];
1235 cs_unpack(instr, CS_BASE, base);
1236 switch (base.opcode) {
1237 case MALI_CS_OPCODE_MOVE: {
1238 cs_unpack(instr, CS_MOVE, I);
1239 BITSET_CLEAR(track_map, I.destination);
1240 BITSET_CLEAR(track_map, I.destination + 1);
1241 break;
1242 }
1243
1244 case MALI_CS_OPCODE_MOVE32: {
1245 cs_unpack(instr, CS_MOVE32, I);
1246 BITSET_CLEAR(track_map, I.destination);
1247 break;
1248 }
1249
1250 case MALI_CS_OPCODE_ADD_IMMEDIATE32: {
1251 cs_unpack(instr, CS_ADD_IMMEDIATE32, I);
1252 if (BITSET_TEST(track_map, I.destination)) {
1253 BITSET_SET(track_map, I.source);
1254 BITSET_CLEAR(track_map, I.destination);
1255 }
1256 break;
1257 }
1258
1259 case MALI_CS_OPCODE_ADD_IMMEDIATE64: {
1260 cs_unpack(instr, CS_ADD_IMMEDIATE64, I);
1261 if (BITSET_TEST(track_map, I.destination)) {
1262 BITSET_SET(track_map, I.source);
1263 BITSET_CLEAR(track_map, I.destination);
1264 }
1265 if (BITSET_TEST(track_map, I.destination + 1)) {
1266 BITSET_SET(track_map, I.source + 1);
1267 BITSET_CLEAR(track_map, I.destination + 1);
1268 }
1269 break;
1270 }
1271
1272 case MALI_CS_OPCODE_UMIN32: {
1273 cs_unpack(instr, CS_UMIN32, I);
1274 if (BITSET_TEST(track_map, I.destination)) {
1275 BITSET_SET(track_map, I.source_1);
1276 BITSET_SET(track_map, I.source_2);
1277 BITSET_CLEAR(track_map, I.destination);
1278 }
1279 break;
1280 }
1281
1282 case MALI_CS_OPCODE_LOAD_MULTIPLE: {
1283 cs_unpack(instr, CS_LOAD_MULTIPLE, I);
1284 for (unsigned i = 0; i < 16; i++) {
1285 if ((I.mask & BITFIELD_BIT(i)) &&
1286 BITSET_TEST(track_map, I.base_register + i)) {
1287 ibranch->has_unknown_targets = true;
1288 return;
1289 }
1290 }
1291 break;
1292 }
1293
1294 case MALI_CS_OPCODE_PROGRESS_LOAD: {
1295 cs_unpack(instr, CS_PROGRESS_LOAD, I);
if (BITSET_TEST(track_map, I.destination) ||
    BITSET_TEST(track_map, I.destination + 1)) {
   ibranch->has_unknown_targets = true;
   return;
}
1303 break;
1304 }
1305
1306 default:
1307 break;
1308 }
1309
1310 if (__bitset_is_empty(track_map, BITSET_WORDS(256))) {
1311 record_indirect_branch_target(cfg, blk_stack, cur_blk,
1312 instr_ptr - cur_blk->start, ibranch);
1313 return;
1314 }
1315 }
1316
1317 assert(!__bitset_is_empty(track_map, BITSET_WORDS(256)));
1318
1319 if (util_dynarray_num_elements(&cur_blk->predecessors, unsigned) == 0) {
1320 ibranch->has_unknown_targets = true;
1321 return;
1322 }
1323
1324 list_add(&cur_blk->node, blk_stack);
1325 util_dynarray_foreach(&cur_blk->predecessors, unsigned, pred) {
1326 struct cs_code_block *prev_blk = cfg->blk_map[*pred];
1327
1328 /* If the node is already in the block stack, we skip it
1329 * and consider this path leading to an unknown target. */
if (!list_is_empty(&prev_blk->node)) {
1331 ibranch->has_unknown_targets = true;
1332 continue;
1333 }
1334
1335 collect_indirect_branch_targets_recurse(
1336 cfg, blk_stack, track_map, prev_blk,
1337 prev_blk->start + prev_blk->size - 1, ibranch);
1338 }
1339 list_delinit(&cur_blk->node);
1340
1341 return;
1342 }
1343
1344 static void
collect_indirect_branch_targets(struct cs_code_cfg *cfg,
1346 struct cs_indirect_branch *ibranch)
1347 {
1348 uint64_t *instr = &cfg->instrs[ibranch->instr_idx];
1349 struct cs_code_block *cur_blk = cfg->blk_map[ibranch->instr_idx];
1350 struct list_head blk_stack;
1351 BITSET_DECLARE(track_map, 256) = {0};
1352
1353 list_inithead(&blk_stack);
1354
1355 cs_unpack(instr, CS_JUMP, I);
1356 BITSET_SET(track_map, I.address);
1357 BITSET_SET(track_map, I.address + 1);
1358 BITSET_SET(track_map, I.length);
1359
1360 collect_indirect_branch_targets_recurse(cfg, &blk_stack, track_map, cur_blk,
1361 ibranch->instr_idx - 1, ibranch);
1362 }
1363
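/*
 * Build (or return the cached) control-flow graph for a CS binary: split
 * it into basic blocks at branch targets, collect indirect branches, and
 * recurse into any JUMP/CALL targets that could be resolved statically.
 */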
1364 static struct cs_code_cfg *
get_cs_cfg(struct pandecode_context *ctx, struct hash_table_u64 *symbols,
1366 uint64_t bin, uint32_t bin_size)
1367 {
1368 uint32_t instr_count = bin_size / sizeof(uint64_t);
1369 struct cs_code_cfg *cfg = _mesa_hash_table_u64_search(symbols, bin);
1370
1371 if (cfg) {
1372 assert(cfg->instr_count == instr_count);
1373 return cfg;
1374 }
1375
1376 uint64_t *instrs = pandecode_fetch_gpu_mem(ctx, bin, bin_size);
1377
1378 cfg = rzalloc(symbols, struct cs_code_cfg);
1379 _mesa_hash_table_u64_insert(symbols, bin, cfg);
1380
1381 util_dynarray_init(&cfg->indirect_branches, cfg);
1382
1383 cfg->blk_map = rzalloc_array(cfg, struct cs_code_block *, instr_count);
1384 cfg->instrs = instrs;
1385 cfg->instr_count = instr_count;
1386
1387 struct cs_code_block *block = cs_code_block_alloc(cfg, 0, 0);
1388
1389 for (unsigned i = 0; i < instr_count; i++) {
1390 const uint64_t *instr = &instrs[i];
1391
1392 if (!cfg->blk_map[i]) {
1393 cfg->blk_map[i] = block;
1394 block->size++;
1395 } else {
1396 if (block->successors[0] == ~0)
1397 block->successors[0] = i;
1398
1399 block = cfg->blk_map[i];
1400 util_dynarray_append(&block->predecessors, unsigned, i - 1);
1401 }
1402
1403 cs_unpack(instr, CS_BASE, base);
1404
1405 if (base.opcode == MALI_CS_OPCODE_JUMP ||
1406 base.opcode == MALI_CS_OPCODE_CALL) {
1407 struct cs_indirect_branch ibranch = {
1408 .instr_idx = i,
1409 };
1410
1411 util_dynarray_append(&cfg->indirect_branches,
1412 struct cs_indirect_branch, ibranch);
1413 }
1414
1415 if (base.opcode != MALI_CS_OPCODE_BRANCH)
1416 continue;
1417
1418 cs_unpack(instr, CS_BRANCH, I);
1419
1420 unsigned target = MIN2(i + 1 + I.offset, instr_count);
1421
1422 /* If the target of the branch is the next instruction, it's just a NOP,
1423 * and we consider it the same block. */
1424 if (target == i + 1)
1425 continue;
1426
1427 if (I.offset < 0 && cfg->blk_map[target]->start != target) {
1428 struct cs_code_block *old = cfg->blk_map[target];
1429 struct cs_code_block *new =
1430 cs_code_block_alloc(cfg, target, old->start + old->size - target);
1431
1432 util_dynarray_append(&new->predecessors, unsigned, target - 1);
1433 memcpy(&new->successors, &old->successors, sizeof(new->successors));
1434
1435 old->successors[0] = target;
1436 old->successors[1] = ~0;
1437 old->size = new->start - old->start;
1438
1439 for (unsigned j = 0; j <= new->size; j++)
1440 cfg->blk_map[new->start + j] = new;
1441 }
1442
1443 if (I.offset > 0 && target < instr_count && !cfg->blk_map[target]) {
1444 struct cs_code_block *new = cs_code_block_alloc(cfg, target, 1);
1445
1446 cfg->blk_map[target] = new;
1447 util_dynarray_append(&new->predecessors, unsigned, i);
1448 }
1449
1450 block->successors[0] = target;
1451 if (I.condition != MALI_CS_CONDITION_ALWAYS)
1452 block->successors[1] = i + 1;
1453
1454 block = cs_code_block_alloc(cfg, i + 1, 0);
1455
1456 if (target == i + 1 || I.condition != MALI_CS_CONDITION_ALWAYS)
1457 util_dynarray_append(&block->predecessors, unsigned, i);
1458 }
1459
1460 util_dynarray_foreach(&cfg->indirect_branches, struct cs_indirect_branch,
1461 ibranch)
1462 {
1463 collect_indirect_branch_targets(cfg, ibranch);
1464 util_dynarray_foreach(&ibranch->targets, struct cs_indirect_branch_target,
1465 target)
1466 {
1467 get_cs_cfg(ctx, symbols, target->address, target->length);
1468 }
1469 }
1470
1471 return cfg;
1472 }
1473
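/*
 * Print the disassembly of one CS binary, emitting a label at each basic
 * block boundary and annotating branches with their targets and RUN_*
 * instructions with the tracepoint name used by the trace decoder.
 */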
1474 static void
print_cs_binary(struct pandecode_context *ctx, uint64_t bin,
1476 struct cs_code_cfg *cfg, const char *name)
1477 {
1478 pandecode_log(ctx, "%s@%" PRIx64 "{\n", name, bin);
1479 unsigned ibranch_idx = 0;
1480
1481 ctx->indent++;
1482 for (unsigned i = 0; i < cfg->instr_count; i++) {
1483 if (i && cfg->blk_map[i - 1] != cfg->blk_map[i]) {
1484 ctx->indent--;
1485 pandecode_log(ctx, "label_%" PRIx64 ":\n", bin + i * sizeof(uint64_t));
1486 ctx->indent++;
1487 }
1488
1489 pandecode_make_indent(ctx);
1490 print_cs_instr(ctx->dump_stream, &cfg->instrs[i]);
1491 cs_unpack(&cfg->instrs[i], CS_BASE, base);
1492 switch (base.opcode) {
1493 case MALI_CS_OPCODE_JUMP:
1494 case MALI_CS_OPCODE_CALL: {
1495 struct cs_indirect_branch *ibranch = util_dynarray_element(
1496 &cfg->indirect_branches, struct cs_indirect_branch, ibranch_idx);
1497
1498 assert(ibranch->instr_idx == i);
1499 fprintf(ctx->dump_stream, " // ");
1500 util_dynarray_foreach(&ibranch->targets,
1501 struct cs_indirect_branch_target, target)
1502 {
1503 fprintf(ctx->dump_stream, "%scs@%" PRIx64,
1504 target == ibranch->targets.data ? "" : ",",
1505 target->address);
1506 }
1507 if (ibranch->has_unknown_targets)
1508 fprintf(ctx->dump_stream, "%s??", ibranch->targets.size ? "," : "");
1509 ibranch_idx++;
1510 break;
1511 }
1512
1513 case MALI_CS_OPCODE_BRANCH: {
1514 cs_unpack(&cfg->instrs[i], CS_BRANCH, I);
1515 fprintf(ctx->dump_stream, " // ");
1516
1517 unsigned target = i + 1 + I.offset;
1518
1519 if (target < cfg->instr_count)
1520 fprintf(ctx->dump_stream, "label_%" PRIx64,
1521 bin + (target * sizeof(uint64_t)));
1522 else
1523 fprintf(ctx->dump_stream, "end_of_cs");
1524 break;
1525 }
1526
1527 case MALI_CS_OPCODE_RUN_IDVS:
1528 case MALI_CS_OPCODE_RUN_FRAGMENT:
1529 case MALI_CS_OPCODE_RUN_COMPUTE:
1530 case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT:
1531 fprintf(ctx->dump_stream, " // tracepoint_%" PRIx64,
1532 bin + (i * sizeof(uint64_t)));
1533 break;
1534
1535 default:
1536 break;
1537 }
1538
1539 fprintf(ctx->dump_stream, "\n");
1540 }
1541 ctx->indent--;
1542 pandecode_log(ctx, "} // %s@%" PRIx64 "\n\n", name, bin);
1543 }
1544
1545 void
GENX(pandecode_cs_binary)(struct pandecode_context *ctx, uint64_t bin,
1547 uint32_t bin_size, unsigned gpu_id)
1548 {
1549 if (!bin_size)
1550 return;
1551
1552 pandecode_dump_file_open(ctx);
1553
1554 struct hash_table_u64 *symbols = _mesa_hash_table_u64_create(NULL);
1555 struct cs_code_cfg *main_cfg = get_cs_cfg(ctx, symbols, bin, bin_size);
1556
1557 print_cs_binary(ctx, bin, main_cfg, "main_cs");
1558 hash_table_u64_foreach(symbols, he)
1559 {
1560 struct cs_code_cfg *other_cfg = he.data;
1561 if (other_cfg == main_cfg)
1562 continue;
1563
1564 print_cs_binary(ctx, he.key, other_cfg, "cs");
1565 }
1566
1567 ralloc_free(symbols);
1568
1569 pandecode_map_read_write(ctx);
1570 }
1571
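/*
 * Decode a RUN_* trace buffer: each record holds the tracepoint's
 * instruction pointer and the shader registers saved when that instruction
 * executed, which lets us decode the job as if we had interpreted the
 * command stream.
 */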
1572 void
GENX(pandecode_cs_trace)(struct pandecode_context *ctx, uint64_t trace,
1574 uint32_t trace_size, unsigned gpu_id)
1575 {
1576 pandecode_dump_file_open(ctx);
1577
1578 void *trace_data = pandecode_fetch_gpu_mem(ctx, trace, trace_size);
1579
1580 while (trace_size > 0) {
1581 uint32_t regs[256] = {};
1582 uint64_t *ip = trace_data;
1583
1584 uint64_t *instr = pandecode_fetch_gpu_mem(ctx, *ip, sizeof(*instr));
1585
/* Mali-G610 has 96 registers. Other devices are not yet supported; we can
 * make this configurable later when we encounter new Malis.
 */
1589 struct queue_ctx qctx = {
1590 .nr_regs = 96,
1591 .regs = regs,
1592 .ip = instr,
1593 .end = instr + 1,
1594 .gpu_id = gpu_id,
1595 };
1596
1597 pandecode_make_indent(ctx);
1598 print_cs_instr(ctx->dump_stream, instr);
1599 fprintf(ctx->dump_stream, " // from tracepoint_%" PRIx64 "\n", *ip);
1600
1601 cs_unpack(instr, CS_BASE, base);
1602
1603 switch (base.opcode) {
1604 case MALI_CS_OPCODE_RUN_IDVS: {
1605 struct cs_run_idvs_trace *idvs_trace = trace_data;
1606
assert(trace_size >= sizeof(*idvs_trace));
1608 cs_unpack(instr, CS_RUN_IDVS, I);
1609 memcpy(regs, idvs_trace->sr, sizeof(idvs_trace->sr));
1610
1611 if (I.draw_id_register_enable)
1612 regs[I.draw_id] = idvs_trace->draw_id;
1613
1614 pandecode_run_idvs(ctx, ctx->dump_stream, &qctx, &I);
1615 trace_data = idvs_trace + 1;
1616 trace_size -= sizeof(*idvs_trace);
1617 break;
1618 }
1619
1620 case MALI_CS_OPCODE_RUN_FRAGMENT: {
1621 struct cs_run_fragment_trace *frag_trace = trace_data;
1622
assert(trace_size >= sizeof(*frag_trace));
1624 cs_unpack(instr, CS_RUN_FRAGMENT, I);
memcpy(&regs[40], frag_trace->sr, sizeof(frag_trace->sr));
1626 pandecode_run_fragment(ctx, ctx->dump_stream, &qctx, &I);
1627 trace_data = frag_trace + 1;
1628 trace_size -= sizeof(*frag_trace);
1629 break;
1630 }
1631
1632 case MALI_CS_OPCODE_RUN_COMPUTE: {
1633 struct cs_run_compute_trace *comp_trace = trace_data;
1634
assert(trace_size >= sizeof(*comp_trace));
1636 cs_unpack(instr, CS_RUN_COMPUTE, I);
1637 memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
1638 pandecode_run_compute(ctx, ctx->dump_stream, &qctx, &I);
1639 trace_data = comp_trace + 1;
1640 trace_size -= sizeof(*comp_trace);
1641 break;
1642 }
1643
1644 case MALI_CS_OPCODE_RUN_COMPUTE_INDIRECT: {
1645 struct cs_run_compute_trace *comp_trace = trace_data;
1646
assert(trace_size >= sizeof(*comp_trace));
1648 cs_unpack(instr, CS_RUN_COMPUTE_INDIRECT, I);
1649 memcpy(regs, comp_trace->sr, sizeof(comp_trace->sr));
1650 pandecode_run_compute_indirect(ctx, ctx->dump_stream, &qctx, &I);
1651 trace_data = comp_trace + 1;
1652 trace_size -= sizeof(*comp_trace);
1653 break;
1654 }
1655
1656 default:
1657 assert(!"Invalid trace packet");
1658 break;
1659 }
1660
1661 pandecode_log(ctx, "\n");
1662 }
1663
1664 fflush(ctx->dump_stream);
1665 pandecode_map_read_write(ctx);
1666 }
1667 #endif
1668