/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * Copyright 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_pm4.h"
#include "ac_sqtt.h"

#include "sid.h"
#include "ac_gpu_info.h"
#include "util/u_math.h"
#include "util/os_time.h"
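/* Layout of the single SQTT output buffer shared by all SEs (a sketch
 * inferred from the offset helpers below):
 *
 *    [ac_sqtt_data_info SE0][ac_sqtt_data_info SE1]...  one info struct per SE
 *    <padding up to 1 << SQTT_BUFFER_ALIGN_SHIFT>
 *    [SE0 data, buffer_size bytes][SE1 data, buffer_size bytes]...
 */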
uint64_t
ac_sqtt_get_info_offset(unsigned se)
{
   return sizeof(struct ac_sqtt_data_info) * se;
}

uint64_t
ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se)
{
   unsigned max_se = rad_info->max_se;
   uint64_t data_offset;

   data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
   data_offset += data->buffer_size * se;

   return data_offset;
}

static uint64_t
ac_sqtt_get_info_va(uint64_t va, unsigned se)
{
   return va + ac_sqtt_get_info_offset(se);
}

static uint64_t
ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *data,
                    unsigned se)
{
   return data->buffer_va + ac_sqtt_get_data_offset(rad_info, data, se);
}

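/* Initialize the lists and locks of all RGP record streams. Must be paired
 * with ac_sqtt_finish().
 */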
void
ac_sqtt_init(struct ac_sqtt *data)
{
   list_inithead(&data->rgp_pso_correlation.record);
   simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);

   list_inithead(&data->rgp_loader_events.record);
   simple_mtx_init(&data->rgp_loader_events.lock, mtx_plain);

   list_inithead(&data->rgp_code_object.record);
   simple_mtx_init(&data->rgp_code_object.lock, mtx_plain);

   list_inithead(&data->rgp_clock_calibration.record);
   simple_mtx_init(&data->rgp_clock_calibration.lock, mtx_plain);

   list_inithead(&data->rgp_queue_info.record);
   simple_mtx_init(&data->rgp_queue_info.lock, mtx_plain);

   list_inithead(&data->rgp_queue_event.record);
   simple_mtx_init(&data->rgp_queue_event.lock, mtx_plain);
}

void
ac_sqtt_finish(struct ac_sqtt *data)
{
   assert(data->rgp_pso_correlation.record_count == 0);
   simple_mtx_destroy(&data->rgp_pso_correlation.lock);

   assert(data->rgp_loader_events.record_count == 0);
   simple_mtx_destroy(&data->rgp_loader_events.lock);

   assert(data->rgp_code_object.record_count == 0);
   simple_mtx_destroy(&data->rgp_code_object.lock);

   assert(data->rgp_clock_calibration.record_count == 0);
   simple_mtx_destroy(&data->rgp_clock_calibration.lock);

   assert(data->rgp_queue_info.record_count == 0);
   simple_mtx_destroy(&data->rgp_queue_info.lock);

   assert(data->rgp_queue_event.record_count == 0);
   simple_mtx_destroy(&data->rgp_queue_event.lock);
}

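/* Return true if the trace for one SE fit in its buffer, i.e. the hardware
 * didn't wrap around and drop data.
 */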
bool
ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *data,
                    const struct ac_sqtt_data_info *info)
{
   if (rad_info->gfx_level >= GFX10) {
      /* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the number of
       * dropped bytes per SE via THREAD_TRACE_DROPPED_CNTR. However, this
       * doesn't seem reliable because it might still report non-zero even if
       * the SQTT buffer isn't full.
       *
       * The solution here is to compare the number of bytes written by the hw
       * (in units of 32 bytes) to the SQTT buffer size. If they are equal, the
       * buffer is full and should be resized.
       */
      return !(info->cur_offset * 32 == data->buffer_size - 32);
   }

   /* Otherwise, compare the current thread trace offset with the number
    * of written bytes.
    */
   return info->cur_offset == info->gfx9_write_counter;
}

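/* Record a correlation between the internal pipeline hash and the API-level
 * PSO hash, so RGP can map traced work back to application pipelines.
 */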
bool
ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash, uint64_t api_hash)
{
   struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation;
   struct rgp_pso_correlation_record *record;

   record = malloc(sizeof(struct rgp_pso_correlation_record));
   if (!record)
      return false;

   record->api_pso_hash = api_hash;
   record->pipeline_hash[0] = pipeline_hash;
   record->pipeline_hash[1] = pipeline_hash;
   memset(record->api_level_obj_name, 0, sizeof(record->api_level_obj_name));

   simple_mtx_lock(&pso_correlation->lock);
   list_addtail(&record->list, &pso_correlation->record);
   pso_correlation->record_count++;
   simple_mtx_unlock(&pso_correlation->lock);

   return true;
}

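/* Record that a code object (pipeline) was loaded to GPU memory, with a
 * timestamp, so RGP can match traced waves to the shader code at that
 * address.
 */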
bool
ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash,
                                     uint64_t base_address)
{
   struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events;
   struct rgp_loader_events_record *record;

   record = malloc(sizeof(struct rgp_loader_events_record));
   if (!record)
      return false;

   record->loader_event_type = RGP_LOAD_TO_GPU_MEMORY;
   record->reserved = 0;
   record->base_address = base_address & 0xffffffffffff; /* 48-bit GPU VA */
   record->code_object_hash[0] = pipeline_hash;
   record->code_object_hash[1] = pipeline_hash;
   record->time_stamp = os_time_get_nano();

   simple_mtx_lock(&loader_events->lock);
   list_addtail(&record->list, &loader_events->record);
   loader_events->record_count++;
   simple_mtx_unlock(&loader_events->lock);

   return true;
}

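/* Record a (CPU timestamp, GPU timestamp) sample pair; RGP uses these to
 * align the CPU and GPU timelines.
 */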
bool
ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp)
{
   struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
   struct rgp_clock_calibration_record *record;

   record = malloc(sizeof(struct rgp_clock_calibration_record));
   if (!record)
      return false;

   record->cpu_timestamp = cpu_timestamp;
   record->gpu_timestamp = gpu_timestamp;

   simple_mtx_lock(&clock_calibration->lock);
   list_addtail(&record->list, &clock_calibration->record);
   clock_calibration->record_count++;
   simple_mtx_unlock(&clock_calibration->lock);

   return true;
}

/* See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5260
 * On some HW SQTT can hang if we're not in one of the profiling pstates. */
bool
ac_check_profile_state(const struct radeon_info *info)
{
   char path[128];
   char data[128];
   int n;

   if (!info->pci.valid)
      return false; /* Unknown but optimistic. */

   snprintf(path, sizeof(path),
            "/sys/bus/pci/devices/%04x:%02x:%02x.%x/power_dpm_force_performance_level",
            info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func);

   FILE *f = fopen(path, "r");
   if (!f)
      return false; /* Unknown but optimistic. */
   n = fread(data, 1, sizeof(data) - 1, f);
   fclose(f);
   data[n] = 0;
   return strstr(data, "profile") == NULL;
}

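/* Return a unique command buffer ID for RGP markers; a per-queue (IP type)
 * counter is incremented atomically for each new command buffer.
 */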
union rgp_sqtt_marker_cb_id
ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type)
{
   union rgp_sqtt_marker_cb_id cb_id = {0};

   cb_id.global_cb_id.cb_index =
      p_atomic_inc_return(&data->cmdbuf_ids_per_queue[ip_type]);

   return cb_id;
}

static bool
ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se)
{
   /* No active CU on the SE means it is disabled. */
   return info->cu_mask[se][0] == 0;
}

static uint32_t
ac_sqtt_get_active_cu(const struct radeon_info *info, unsigned se)
{
   uint32_t cu_index;

   if (info->gfx_level >= GFX11) {
      /* GFX11 seems to operate on the last active CU. */
      cu_index = util_last_bit(info->cu_mask[se][0]) - 1;
   } else {
      /* Default to the first active CU. */
      cu_index = ffs(info->cu_mask[se][0]);
   }

   return cu_index;
}

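/* Gather the per-SE traces written by the hardware into *sqtt_trace. Returns
 * false if any SE buffer overflowed, in which case the capture should be
 * retried with a bigger buffer.
 */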
bool
ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
                  struct ac_sqtt_trace *sqtt_trace)
{
   unsigned max_se = info->max_se;
   void *ptr = data->ptr;

   memset(sqtt_trace, 0, sizeof(*sqtt_trace));

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t info_offset = ac_sqtt_get_info_offset(se);
      uint64_t data_offset = ac_sqtt_get_data_offset(info, data, se);
      void *info_ptr = (uint8_t *)ptr + info_offset;
      void *data_ptr = (uint8_t *)ptr + data_offset;
      struct ac_sqtt_data_info *trace_info = (struct ac_sqtt_data_info *)info_ptr;
      struct ac_sqtt_data_se data_se = {0};
      int active_cu = ac_sqtt_get_active_cu(info, se);

      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      if (!ac_is_sqtt_complete(info, data, trace_info))
         return false;

      data_se.data_ptr = data_ptr;
      data_se.info = *trace_info;
      data_se.shader_engine = se;

      /* RGP seems to expect units of WGP on GFX10+. */
      data_se.compute_unit = info->gfx_level >= GFX10 ? (active_cu / 2) : active_cu;

      sqtt_trace->traces[sqtt_trace->num_traces] = data_se;
      sqtt_trace->num_traces++;
   }

   sqtt_trace->rgp_code_object = &data->rgp_code_object;
   sqtt_trace->rgp_loader_events = &data->rgp_loader_events;
   sqtt_trace->rgp_pso_correlation = &data->rgp_pso_correlation;
   sqtt_trace->rgp_queue_info = &data->rgp_queue_info;
   sqtt_trace->rgp_queue_event = &data->rgp_queue_event;
   sqtt_trace->rgp_clock_calibration = &data->rgp_clock_calibration;

   return true;
}

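/* Build the SQ_THREAD_TRACE_CTRL value used to enable/disable SQTT on GFX10+.
 * The *_STALL_EN bits combined with HIWATER presumably stall the pipeline
 * instead of dropping trace data when the buffer is close to full.
 */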
uint32_t
ac_sqtt_get_ctrl(const struct radeon_info *info, bool enable)
{
   uint32_t ctrl;

   if (info->gfx_level >= GFX11) {
      ctrl = S_0367B0_MODE(enable) | S_0367B0_HIWATER(5) |
             S_0367B0_UTIL_TIMER_GFX11(1) | S_0367B0_RT_FREQ(2) | /* 4096 clk */
             S_0367B0_DRAW_EVENT_EN(1) | S_0367B0_SPI_STALL_EN(1) |
             S_0367B0_SQ_STALL_EN(1) | S_0367B0_REG_AT_HWM(2);
   } else {
      assert(info->gfx_level >= GFX10);

      ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) |
             S_008D1C_RT_FREQ(2) | /* 4096 clk */ S_008D1C_DRAW_EVENT_EN(1) |
             S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) |
             S_008D1C_SQ_STALL_EN(1) | S_008D1C_REG_DROP_ON_STALL(0);

      if (info->gfx_level == GFX10_3)
         ctrl |= S_008D1C_LOWATER_OFFSET(4);

      if (info->has_sqtt_auto_flush_mode_bug)
         ctrl |= S_008D1C_AUTO_FLUSH_MODE(1);
   }

   return ctrl;
}

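/* Compute the WTYPE_INCLUDE mask of HW shader stages to trace; each bit
 * selects one stage and the 0x7f default enables all seven of them.
 */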
uint32_t
ac_sqtt_get_shader_mask(const struct radeon_info *info)
{
   unsigned shader_mask = 0x7f; /* all shader stages */

   if (info->gfx_level >= GFX11) {
      /* Disable unsupported hw shader stages */
      shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
   }

   return shader_mask;
}

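/* Emit the PM4 packets that program and start SQTT. For each active SE, GRBM
 * is pointed at that SE, the buffer base/size, thread trace mask and token
 * mask are programmed, and tracing is enabled; a queue-specific start event
 * is then emitted with broadcasting restored.
 */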
void
ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
                   const struct ac_sqtt *sqtt, bool is_compute_queue)
{
   const uint32_t shifted_size = sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   const unsigned shader_mask = ac_sqtt_get_shader_mask(info);
   const unsigned max_se = info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      int active_cu = ac_sqtt_get_active_cu(info, se);

      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      /* Target SEx and SH0. */
      ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
                     S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (info->gfx_level >= GFX11) {
         /* Order seems important for the following 2 registers. */
         ac_pm4_set_reg(pm4, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
                        S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_0367B4_SQ_THREAD_TRACE_MASK,
                        S_0367B4_WTYPE_INCLUDE(shader_mask) | S_0367B4_SA_SEL(0) |
                        S_0367B4_WGP_SEL(active_cu / 2) | S_0367B4_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_0367B8_REG_INCLUDE(V_0367B8_REG_INCLUDE_SQDEC | V_0367B8_REG_INCLUDE_SHDEC |
                                                         V_0367B8_REG_INCLUDE_GFXUDEC | V_0367B8_REG_INCLUDE_COMP |
                                                         V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_0367B8_TOKEN_EXCLUDE_PERF;

         if (!sqtt->instruction_timing_enabled) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC |
                             V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
                             V_0367B8_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX11(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX11(1);

         ac_pm4_set_reg(pm4, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         ac_pm4_set_reg(pm4, R_0367B0_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, true));
      } else if (info->gfx_level >= GFX10) {
         /* Order seems important for the following 2 registers. */
         ac_pm4_set_reg(pm4, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                        S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_008D14_SQ_THREAD_TRACE_MASK,
                        S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
                        S_008D14_WGP_SEL(active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
                                                         V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_COMP |
                                                         V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!sqtt->instruction_timing_enabled) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |=
            S_008D18_TOKEN_EXCLUDE(token_exclude) | S_008D18_BOP_EVENTS_TOKEN_INCLUDE(info->gfx_level == GFX10_3);

         ac_pm4_set_reg(pm4, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         ac_pm4_set_reg(pm4, R_008D1C_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, true));
      } else {
         /* Order seems important for the following 4 registers. */
         ac_pm4_set_reg(pm4, R_030CDC_SQ_THREAD_TRACE_BASE2, S_030CDC_ADDR_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         ac_pm4_set_reg(pm4, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t sqtt_mask = S_030CC8_CU_SEL(active_cu) | S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
                              S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                              S_030CC8_SQ_STALL_EN(1);

         if (info->gfx_level < GFX9) {
            sqtt_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         ac_pm4_set_reg(pm4, R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);

         /* Trace all tokens and registers. */
         ac_pm4_set_reg(pm4, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                        S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         ac_pm4_set_reg(pm4, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                        S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         ac_pm4_set_reg(pm4, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         ac_pm4_set_reg(pm4, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (info->gfx_level == GFX9) {
            /* Reset thread trace status errors. */
            ac_pm4_set_reg(pm4, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
                              S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
                              S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
                              S_030CD8_MODE(1);

         if (info->gfx_level == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            sqtt_mode |= S_030CD8_TC_PERF_EN(1);
         }

         ac_pm4_set_reg(pm4, R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
      }
   }

   /* Restore global broadcasting. */
   ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) |
                  S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (is_compute_queue) {
      ac_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
      ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }
}

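/* Per-generation registers to read back after a trace: write pointer, status
 * and (dropped) counter, in the order of the fields of ac_sqtt_data_info.
 */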
static const uint32_t gfx8_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_sqtt_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static const uint32_t gfx11_sqtt_info_regs[] = {
   R_0367BC_SQ_THREAD_TRACE_WPTR,
   R_0367D0_SQ_THREAD_TRACE_STATUS,
   R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
};

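/* Emit COPY_DATA packets that snapshot the SQTT info registers of the given
 * SE into its ac_sqtt_data_info slot in the SQTT buffer.
 */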
static void
ac_sqtt_copy_info_regs(const struct radeon_info *info, struct ac_pm4_state *pm4,
                       const struct ac_sqtt *sqtt, uint32_t se_index)
{
   const uint32_t *sqtt_info_regs = NULL;

   if (info->gfx_level >= GFX11) {
      sqtt_info_regs = gfx11_sqtt_info_regs;
   } else if (info->gfx_level >= GFX10) {
      sqtt_info_regs = gfx10_sqtt_info_regs;
   } else if (info->gfx_level == GFX9) {
      sqtt_info_regs = gfx9_sqtt_info_regs;
   } else {
      assert(info->gfx_level == GFX8);
      sqtt_info_regs = gfx8_sqtt_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t info_va = ac_sqtt_get_info_va(sqtt->buffer_va, se_index);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_COPY_DATA, 4, 0));
      ac_pm4_cmd_add(pm4, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
      ac_pm4_cmd_add(pm4, sqtt_info_regs[i] >> 2);
      ac_pm4_cmd_add(pm4, 0); /* unused */
      ac_pm4_cmd_add(pm4, (info_va + i * 4));
      ac_pm4_cmd_add(pm4, (info_va + i * 4) >> 32);
   }

   if (info->gfx_level == GFX11) {
      /* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
       * To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
       * subtract the initial WPTR value from SQ_THREAD_TRACE_WPTR, as follows:
       *
       * 1) get the current buffer base address for this SE
       * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
       * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
       */
      uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se_index);
      uint64_t shifted_data_va = (data_va >> 5);
      uint32_t init_wptr_value = shifted_data_va & 0x1fffffff;

      ac_pm4_cmd_add(pm4, PKT3(PKT3_ATOMIC_MEM, 7, 0));
      ac_pm4_cmd_add(pm4, ATOMIC_OP(TC_OP_ATOMIC_SUB_RTN_32));
      ac_pm4_cmd_add(pm4, info_va);         /* addr lo */
      ac_pm4_cmd_add(pm4, info_va >> 32);   /* addr hi */
      ac_pm4_cmd_add(pm4, init_wptr_value); /* data lo */
      ac_pm4_cmd_add(pm4, 0);               /* data hi */
      ac_pm4_cmd_add(pm4, 0);               /* compare data lo */
      ac_pm4_cmd_add(pm4, 0);               /* compare data hi */
      ac_pm4_cmd_add(pm4, 0);               /* loop interval */
   }
}

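/* Emit the queue-specific stop event followed by THREAD_TRACE_FINISH;
 * ac_sqtt_emit_wait() then polls the FINISH_DONE status bit.
 */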
void
ac_sqtt_emit_stop(const struct radeon_info *info, struct ac_pm4_state *pm4,
                  bool is_compute_queue)
{
   /* Stop the thread trace with a different event based on the queue. */
   if (is_compute_queue) {
      ac_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
   } else {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
      ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
   }

   ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
   ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
}

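/* After ac_sqtt_emit_stop(): for each active SE, wait for the trace to
 * finish, disable the thread trace mode and copy the info registers back to
 * memory.
 */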
void
ac_sqtt_emit_wait(const struct radeon_info *info, struct ac_pm4_state *pm4,
                  const struct ac_sqtt *sqtt, bool is_compute_queue)
{
   const unsigned max_se = info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      /* Target SEi and SH0. */
      ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
                     S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (info->gfx_level >= GFX11) {
         /* Make sure to wait for the trace buffer. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_NOT_EQUAL); /* wait until FINISH_DONE != 0 */
         ac_pm4_cmd_add(pm4, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0); /* unused */
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_0367D0_FINISH_DONE); /* mask */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */

         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_0367B0_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, false));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until BUSY == 0 */
         ac_pm4_cmd_add(pm4, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0); /* unused */
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_0367D0_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */
      } else if (info->gfx_level >= GFX10) {
         if (!info->has_sqtt_rb_harvest_bug) {
            /* Make sure to wait for the trace buffer. */
            ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
            ac_pm4_cmd_add(pm4, WAIT_REG_MEM_NOT_EQUAL); /* wait until FINISH_DONE != 0 */
            ac_pm4_cmd_add(pm4, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
            ac_pm4_cmd_add(pm4, 0); /* unused */
            ac_pm4_cmd_add(pm4, 0); /* reference value */
            ac_pm4_cmd_add(pm4, ~C_008D20_FINISH_DONE); /* mask */
            ac_pm4_cmd_add(pm4, 4); /* poll interval */
         }

         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_008D1C_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, false));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until BUSY == 0 */
         ac_pm4_cmd_add(pm4, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0); /* unused */
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_008D20_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */
      } else {
         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until BUSY == 0 */
         ac_pm4_cmd_add(pm4, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0); /* unused */
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_030CE8_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */
      }

      ac_sqtt_copy_info_regs(info, pm4, sqtt, se);
   }

   /* Restore global broadcasting. */
   ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) |
                  S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));
}