1 /*
2 * Copyright 2023 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 /* Utility for gathering context rolls for performance bottleneck analysis.
8 *
9 * Usage for radeonsi:
10 * AMD_ROLLS=filename app1
11 * AMD_ROLLS=filename app2
12 * ...
13 * AMD_ROLLS=filename appN
14 *
15 * sort filename | uniq -c | sort -n > rolls_sorted.txt
16 *
17 * Then try to reduce the most frequent context rolls.
18 */
19
20 #include "ac_debug.h"
21 #include "sid.h"
22 #include "sid_tables.h"
23
24 #include "util/bitset.h"
25 #include "util/u_dynarray.h"
26 #include "util/u_memory.h"
27
28 struct ac_context_reg_deltas {
29 uint32_t changed_masks[1024]; /* changes masks of context registers */
30 BITSET_DECLARE(changed, 1024); /* which context register was set */
31 bool acquire_mem; /* whether ACQUIRE_MEM rolled the context */
32 };
33
34 struct ac_context_reg_state {
35 uint32_t regs[1024];
36 struct ac_context_reg_deltas deltas;
37 };
38
39 struct ac_context_roll_ctx {
40 struct ac_context_reg_state *cur;
41 bool context_busy;
42
43 unsigned num_busy_contexts;
44 struct util_dynarray rolls;
45
46 const struct radeon_info *info;
47 };
48
ac_roll_context(struct ac_context_roll_ctx * ctx)49 static void ac_roll_context(struct ac_context_roll_ctx *ctx)
50 {
51 if (!ctx->context_busy)
52 return;
53
54 struct ac_context_reg_state *last = ctx->cur;
55 ctx->cur = CALLOC_STRUCT(ac_context_reg_state);
56 memcpy(ctx->cur->regs, last->regs, sizeof(last->regs));
57 ctx->context_busy = false;
58 ctx->num_busy_contexts++;
59
60 /* Ignore the first context at the beginning or after waiting for idle. */
61 if (ctx->num_busy_contexts > 1) {
62 util_dynarray_append(&ctx->rolls, struct ac_context_reg_state *, last);
63 } else {
64 FREE(last);
65 }
66 }
67
ac_record_wait_idle(struct ac_context_roll_ctx * ctx)68 static void ac_record_wait_idle(struct ac_context_roll_ctx *ctx)
69 {
70 ctx->num_busy_contexts = 0;
71 ctx->context_busy = false;
72 memset(&ctx->cur->deltas, 0, sizeof(ctx->cur->deltas));
73 }
74
ac_record_set_context_reg(struct ac_context_roll_ctx * ctx,unsigned reg_rel_dw_offset,unsigned value)75 static void ac_record_set_context_reg(struct ac_context_roll_ctx *ctx,
76 unsigned reg_rel_dw_offset, unsigned value)
77 {
78 if (!ac_register_exists(ctx->info->gfx_level, ctx->info->family,
79 SI_CONTEXT_REG_OFFSET + reg_rel_dw_offset * 4)) {
80 fprintf(stderr, "This register is not supported by this chip: 0x%X\n",
81 SI_CONTEXT_REG_OFFSET + reg_rel_dw_offset * 4);
82 abort();
83 }
84
85 assert(reg_rel_dw_offset < 1024);
86 BITSET_SET(ctx->cur->deltas.changed, reg_rel_dw_offset);
87 ctx->cur->deltas.changed_masks[reg_rel_dw_offset] |= ctx->cur->regs[reg_rel_dw_offset] ^ value;
88 ctx->cur->regs[reg_rel_dw_offset] = value;
89 }
90
get_reg_index(unsigned reg)91 static unsigned get_reg_index(unsigned reg)
92 {
93 return (reg - SI_CONTEXT_REG_OFFSET) / 4;
94 }
95
/* Walk one IB packet by packet and feed every packet that matters for context
 * tracking (context register writes, waits, draws) into the roll tracker.
 *
 * ctx:    roll tracker, updated in place
 * ib:     IB contents as an array of dwords
 * num_dw: number of dwords in "ib"
 *
 * Only type-3 packets are accepted; any other packet type aborts. Opcodes
 * without a case in the switch are skipped without being recorded.
 */
static void ac_ib_gather_context_rolls(struct ac_context_roll_ctx *ctx, uint32_t *ib, int num_dw)
{
   for (unsigned cur_dw = 0; cur_dw < num_dw;) {
      uint32_t header = ib[cur_dw++];
      unsigned type = PKT_TYPE_G(header);

      if (type != 3) {
         fprintf(stderr, "Unexpected type %u packet\n", type);
         abort();
      }

      /* The packet body following the header is "count + 1" dwords long (see
       * the skip at the bottom of the loop).
       */
      int count = PKT_COUNT_G(header);
      unsigned op = PKT3_IT_OPCODE_G(header);

      /* Cases that consume the packet body themselves end with "continue";
       * cases that end with "break" leave cur_dw at the first body dword and
       * rely on the common skip after the switch.
       */
      switch (op) {
      /* Record context register changes. */
      case PKT3_SET_CONTEXT_REG: {
         ac_roll_context(ctx);

         /* First body dword: starting register offset in the low 16 bits,
          * followed by "count" values for consecutive registers.
          */
         unsigned reg_dw = ib[cur_dw++];
         unsigned reg_rel_dw_offset = reg_dw & 0xFFFF;

         for (int i = 0; i < count; i++)
            ac_record_set_context_reg(ctx, reg_rel_dw_offset + i, ib[cur_dw++]);
         continue;
      }

      case PKT3_SET_CONTEXT_REG_PAIRS:
         ac_roll_context(ctx);

         /* The body is a sequence of (register offset, value) dword pairs. */
         for (int i = 0; i < (count + 1) / 2; i++) {
            unsigned reg_rel_dw_offset = ib[cur_dw++];
            ac_record_set_context_reg(ctx, reg_rel_dw_offset, ib[cur_dw++]);
         }
         continue;

      case PKT3_SET_CONTEXT_REG_PAIRS_PACKED: {
         ac_roll_context(ctx);

         unsigned reg_rel_dw_offset0 = 0, reg_rel_dw_offset1 = 0;
         /* Skip the first body dword (presumably the register count - TODO confirm). */
         cur_dw++;

         /* The remaining dwords come in groups of three: one dword holding two
          * 16-bit register offsets, then the value for each of them.
          */
         for (int i = 0; i < count; i++) {
            if (i % 3 == 0) {
               unsigned tmp = ib[cur_dw++];
               reg_rel_dw_offset0 = tmp & 0xffff;
               reg_rel_dw_offset1 = tmp >> 16;
            } else if (i % 3 == 1) {
               ac_record_set_context_reg(ctx, reg_rel_dw_offset0, ib[cur_dw++]);
            } else {
               ac_record_set_context_reg(ctx, reg_rel_dw_offset1, ib[cur_dw++]);
            }
         }
         continue;
      }

      case PKT3_CLEAR_STATE:
         ac_roll_context(ctx);

         /* Record CLEAR_STATE as a series of writes of fixed values.
          * NOTE(review): presumably these are the values the registers hold
          * after CLEAR_STATE - confirm against the driver's clear state.
          */
         ac_record_set_context_reg(ctx, get_reg_index(R_028000_DB_RENDER_CONTROL), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028004_DB_COUNT_CONTROL), 0);

         ac_record_set_context_reg(ctx, get_reg_index(R_028BDC_PA_SC_LINE_CNTL), 0x1000);
         ac_record_set_context_reg(ctx, get_reg_index(R_028BE0_PA_SC_AA_CONFIG), 0);

         ac_record_set_context_reg(ctx, get_reg_index(R_028BE4_PA_SU_VTX_CNTL), 0x5);
         ac_record_set_context_reg(ctx, get_reg_index(R_028BE8_PA_CL_GB_VERT_CLIP_ADJ), 0x3f800000);
         ac_record_set_context_reg(ctx, get_reg_index(R_028BEC_PA_CL_GB_VERT_DISC_ADJ), 0x3f800000);
         ac_record_set_context_reg(ctx, get_reg_index(R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ), 0x3f800000);
         ac_record_set_context_reg(ctx, get_reg_index(R_028BF4_PA_CL_GB_HORZ_DISC_ADJ), 0x3f800000);

         ac_record_set_context_reg(ctx, get_reg_index(R_02870C_SPI_SHADER_POS_FORMAT), 0);

         ac_record_set_context_reg(ctx, get_reg_index(R_028710_SPI_SHADER_Z_FORMAT), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028714_SPI_SHADER_COL_FORMAT), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_0286E0_SPI_BARYC_CNTL), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_0286CC_SPI_PS_INPUT_ENA), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_0286D0_SPI_PS_INPUT_ADDR), 0);

         ac_record_set_context_reg(ctx, get_reg_index(R_028804_DB_EQAA), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_02880C_DB_SHADER_CONTROL), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_02823C_CB_SHADER_MASK), 0xffffffff);
         ac_record_set_context_reg(ctx, get_reg_index(R_028238_CB_TARGET_MASK), 0xffffffff);
         ac_record_set_context_reg(ctx, get_reg_index(R_028810_PA_CL_CLIP_CNTL), 0x90000);
         ac_record_set_context_reg(ctx, get_reg_index(R_02881C_PA_CL_VS_OUT_CNTL), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028818_PA_CL_VTE_CNTL), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_02820C_PA_SC_CLIPRECT_RULE), 0xffff);
         ac_record_set_context_reg(ctx, get_reg_index(R_028A0C_PA_SC_LINE_STIPPLE), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028A4C_PA_SC_MODE_CNTL_1), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028234_PA_SU_HARDWARE_SCREEN_OFFSET), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_0286D8_SPI_PS_IN_CONTROL), 0x2);
         ac_record_set_context_reg(ctx, get_reg_index(R_028B90_VGT_GS_INSTANCE_CNT), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028B38_VGT_GS_MAX_VERT_OUT), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028B54_VGT_SHADER_STAGES_EN), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028B58_VGT_LS_HS_CONFIG), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028B6C_VGT_TF_PARAM), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028C44_PA_SC_BINNER_CNTL_0), 0x3);
         /* The registers below are only recorded for the chip generations selected
          * by each condition.
          */
         if (ctx->info->gfx_level >= GFX10) {
            ac_record_set_context_reg(ctx, get_reg_index(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028B4C_GE_NGG_SUBGRP_CNTL), 0);
         }
         if (ctx->info->gfx_level >= GFX11)
            ac_record_set_context_reg(ctx, get_reg_index(R_0283D0_PA_SC_VRS_OVERRIDE_CNTL), 0);
         else if (ctx->info->gfx_level == GFX10_3)
            ac_record_set_context_reg(ctx, get_reg_index(R_028064_DB_VRS_OVERRIDE_CNTL), 0);

         ac_record_set_context_reg(ctx, get_reg_index(R_028754_SX_PS_DOWNCONVERT), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028758_SX_BLEND_OPT_EPSILON), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_02875C_SX_BLEND_OPT_CONTROL), 0);

         ac_record_set_context_reg(ctx, get_reg_index(R_028AAC_VGT_ESGS_RING_ITEMSIZE), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028AB4_VGT_REUSE_OFF), 0);
         if (ctx->info->gfx_level <= GFX9)
            ac_record_set_context_reg(ctx, get_reg_index(R_028AA8_IA_MULTI_VGT_PARAM), 0xff);

         if (ctx->info->gfx_level == GFX9)
            ac_record_set_context_reg(ctx, get_reg_index(R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP), 0);
         if (ctx->info->gfx_level <= GFX10_3) {
            ac_record_set_context_reg(ctx, get_reg_index(R_028A44_VGT_GS_ONCHIP_CNTL), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028AB0_VGT_GSVS_RING_ITEMSIZE), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028A40_VGT_GS_MODE), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL), 0x1e);
            ac_record_set_context_reg(ctx, get_reg_index(R_028A6C_VGT_GS_OUT_PRIM_TYPE), 0);

            ac_record_set_context_reg(ctx, get_reg_index(R_028A60_VGT_GSVS_RING_OFFSET_1), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028A64_VGT_GSVS_RING_OFFSET_2), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028A68_VGT_GSVS_RING_OFFSET_3), 0);

            ac_record_set_context_reg(ctx, get_reg_index(R_028B5C_VGT_GS_VERT_ITEMSIZE), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028B60_VGT_GS_VERT_ITEMSIZE_1), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028B64_VGT_GS_VERT_ITEMSIZE_2), 0);
            ac_record_set_context_reg(ctx, get_reg_index(R_028B68_VGT_GS_VERT_ITEMSIZE_3), 0);
         }

         ac_record_set_context_reg(ctx, get_reg_index(R_028010_DB_RENDER_OVERRIDE2), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_0286C4_SPI_VS_OUT_CONFIG), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028A84_VGT_PRIMITIVEID_EN), 0);
         ac_record_set_context_reg(ctx, get_reg_index(R_028424_CB_DCC_CONTROL), 0);
         break;

      case PKT3_LOAD_CONTEXT_REG_INDEX:
      case PKT3_COPY_DATA:
         /* TODO */
         break;

      case PKT3_ACQUIRE_MEM:
         /* With the PWS_ENA2 bit set in the first body dword the packet is
          * recorded as a wait for idle; otherwise it rolls the context and the
          * new context is flagged as started by ACQUIRE_MEM.
          */
         if (G_580_PWS_ENA2(ib[cur_dw])) {
            ac_record_wait_idle(ctx);
         } else {
            ac_roll_context(ctx);
            ctx->cur->deltas.acquire_mem = true;
         }
         break;

      case PKT3_WAIT_REG_MEM:
         ac_record_wait_idle(ctx);
         break;

      case PKT3_EVENT_WRITE:
         /* Only a PS_PARTIAL_FLUSH event is recorded as a wait for idle. */
         if (G_490_EVENT_TYPE(ib[cur_dw]) == V_028A90_PS_PARTIAL_FLUSH)
            ac_record_wait_idle(ctx);
         break;

      /* Record draws. */
      case PKT3_DRAW_INDEX_AUTO:
      case PKT3_DRAW_INDEX_IMMD:
      case PKT3_DRAW_INDEX_MULTI_AUTO:
      case PKT3_DRAW_INDEX_2:
      case PKT3_DRAW_INDEX_OFFSET_2:
      case PKT3_DRAW_INDIRECT:
      case PKT3_DRAW_INDEX_INDIRECT:
      case PKT3_DRAW_INDIRECT_MULTI:
      case PKT3_DRAW_INDEX_INDIRECT_MULTI:
      case PKT3_DISPATCH_MESH_DIRECT:
      case PKT3_DISPATCH_MESH_INDIRECT_MULTI:
      case PKT3_DISPATCH_TASKMESH_GFX:
         /* Marks the current context as used, so that the next call to
          * ac_roll_context starts a new one.
          */
         ctx->context_busy = true;
         break;

      case PKT3_INDIRECT_BUFFER:
         /* Chaining. Note that the CHAIN bit is not set at this point, so we can't distinguish
          * between chaining and IB2.
          */
         /* Stops parsing this IB; any dwords after this packet are not examined. */
         return;

      /* Packets this utility cannot interpret: report the opcode and abort. */
      case PKT3_CONTEXT_REG_RMW:
      case PKT3_INDIRECT_BUFFER_SI:
      case PKT3_SURFACE_SYNC:
         fprintf(stderr, "Unhandled packet: 0x%x\n", op);
         abort();
         break;
      }

      /* Skip the packet body; reached by "break" cases and by opcodes without a case. */
      cur_dw += count + 1;
   }
}
293
ac_gather_context_rolls(FILE * f,uint32_t ** ibs,uint32_t * ib_dw_sizes,unsigned num_ibs,const struct radeon_info * info)294 void ac_gather_context_rolls(FILE *f, uint32_t **ibs, uint32_t *ib_dw_sizes, unsigned num_ibs,
295 const struct radeon_info *info)
296 {
297 struct ac_context_roll_ctx ctx;
298
299 /* Initialize. */
300 memset(&ctx, 0, sizeof(ctx));
301 ctx.info = info;
302 ctx.cur = CALLOC_STRUCT(ac_context_reg_state);
303 util_dynarray_init(&ctx.rolls, NULL);
304
305 /* Parse the IBs. */
306 for (unsigned i = 0; i < num_ibs; i++)
307 ac_ib_gather_context_rolls(&ctx, ibs[i], ib_dw_sizes[i]);
308
309 /* Roll the last context to add it to the list. */
310 ac_roll_context(&ctx);
311
312 /* Print context rolls. */
313 if (util_dynarray_num_elements(&ctx.rolls, struct ac_context_reg_state *)) {
314 /* Print the context rolls starting with the most frequent one. */
315 util_dynarray_foreach(&ctx.rolls, struct ac_context_reg_state *, iter) {
316 struct ac_context_reg_state *state = *iter;
317
318 unsigned i;
319 BITSET_FOREACH_SET(i, state->deltas.changed, 1024) {
320 unsigned reg_offset = SI_CONTEXT_REG_OFFSET + i * 4;
321 const struct si_reg *reg = ac_find_register(info->gfx_level, info->family,
322 reg_offset);
323
324 if (!reg) {
325 fprintf(f, "0x%X(0x%x) ", reg_offset, state->deltas.changed_masks[i]);
326 } else {
327 fprintf(f, "%s(0x%x) ", sid_strings + reg->name_offset,
328 state->deltas.changed_masks[i]);
329 }
330 }
331
332 if (state->deltas.acquire_mem)
333 fprintf(f, "ACQUIRE_MEM");
334
335 fprintf(f, "\n");
336 }
337 }
338
339 /* Free. */
340 FREE(ctx.cur);
341 util_dynarray_foreach(&ctx.rolls, struct ac_context_reg_state *, iter) {
342 FREE(*iter);
343 }
344 util_dynarray_fini(&ctx.rolls);
345 }
346