/*
 * Copyright 2020 Advanced Micro Devices, Inc.
 * Copyright 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_pm4.h"
#include "ac_sqtt.h"

#include "sid.h"
#include "ac_gpu_info.h"
#include "util/u_math.h"
#include "util/os_time.h"

18 uint64_t
ac_sqtt_get_info_offset(unsigned se)19 ac_sqtt_get_info_offset(unsigned se)
20 {
21    return sizeof(struct ac_sqtt_data_info) * se;
22 }
23 
24 uint64_t
ac_sqtt_get_data_offset(const struct radeon_info * rad_info,const struct ac_sqtt * data,unsigned se)25 ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se)
26 {
27    unsigned max_se = rad_info->max_se;
28    uint64_t data_offset;
29 
30    data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
31    data_offset += data->buffer_size * se;
32 
33    return data_offset;
34 }
35 
/* GPU VA of the ac_sqtt_data_info slot for this SE. */
static uint64_t
ac_sqtt_get_info_va(uint64_t va, unsigned se)
{
   const uint64_t offset = ac_sqtt_get_info_offset(se);

   return va + offset;
}

42 static uint64_t
ac_sqtt_get_data_va(const struct radeon_info * rad_info,const struct ac_sqtt * data,unsigned se)43 ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *data,
44                     unsigned se)
45 {
46    return data->buffer_va + ac_sqtt_get_data_offset(rad_info, data, se);
47 }
48 
49 void
ac_sqtt_init(struct ac_sqtt * data)50 ac_sqtt_init(struct ac_sqtt *data)
51 {
52    list_inithead(&data->rgp_pso_correlation.record);
53    simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);
54 
55    list_inithead(&data->rgp_loader_events.record);
56    simple_mtx_init(&data->rgp_loader_events.lock, mtx_plain);
57 
58    list_inithead(&data->rgp_code_object.record);
59    simple_mtx_init(&data->rgp_code_object.lock, mtx_plain);
60 
61    list_inithead(&data->rgp_clock_calibration.record);
62    simple_mtx_init(&data->rgp_clock_calibration.lock, mtx_plain);
63 
64    list_inithead(&data->rgp_queue_info.record);
65    simple_mtx_init(&data->rgp_queue_info.lock, mtx_plain);
66 
67    list_inithead(&data->rgp_queue_event.record);
68    simple_mtx_init(&data->rgp_queue_event.lock, mtx_plain);
69 }
70 
/* Counterpart of ac_sqtt_init(): destroys the per-stream locks.
 * Callers must have drained every record list beforehand — a non-zero
 * record_count here means records would be leaked.
 */
void
ac_sqtt_finish(struct ac_sqtt *data)
{
   assert(data->rgp_pso_correlation.record_count == 0);
   simple_mtx_destroy(&data->rgp_pso_correlation.lock);

   assert(data->rgp_loader_events.record_count == 0);
   simple_mtx_destroy(&data->rgp_loader_events.lock);

   assert(data->rgp_code_object.record_count == 0);
   simple_mtx_destroy(&data->rgp_code_object.lock);

   assert(data->rgp_clock_calibration.record_count == 0);
   simple_mtx_destroy(&data->rgp_clock_calibration.lock);

   assert(data->rgp_queue_info.record_count == 0);
   simple_mtx_destroy(&data->rgp_queue_info.lock);

   assert(data->rgp_queue_event.record_count == 0);
   simple_mtx_destroy(&data->rgp_queue_event.lock);
}

93 bool
ac_is_sqtt_complete(const struct radeon_info * rad_info,const struct ac_sqtt * data,const struct ac_sqtt_data_info * info)94 ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *data,
95                     const struct ac_sqtt_data_info *info)
96 {
97    if (rad_info->gfx_level >= GFX10) {
98       /* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the number of
99        * dropped bytes per SE via THREAD_TRACE_DROPPED_CNTR. Though, this
100        * doesn't seem reliable because it might still report non-zero even if
101        * the SQTT buffer isn't full.
102        *
103        * The solution here is to compare the number of bytes written by the hw
104        * (in units of 32 bytes) to the SQTT buffer size. If it's equal, that
105        * means that the buffer is full and should be resized.
106        */
107       return !(info->cur_offset * 32 == data->buffer_size - 32);
108    }
109 
110    /* Otherwise, compare the current thread trace offset with the number
111     * of written bytes.
112     */
113    return info->cur_offset == info->gfx9_write_counter;
114 }
115 
116 bool
ac_sqtt_add_pso_correlation(struct ac_sqtt * sqtt,uint64_t pipeline_hash,uint64_t api_hash)117 ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash, uint64_t api_hash)
118 {
119    struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation;
120    struct rgp_pso_correlation_record *record;
121 
122    record = malloc(sizeof(struct rgp_pso_correlation_record));
123    if (!record)
124       return false;
125 
126    record->api_pso_hash = api_hash;
127    record->pipeline_hash[0] = pipeline_hash;
128    record->pipeline_hash[1] = pipeline_hash;
129    memset(record->api_level_obj_name, 0, sizeof(record->api_level_obj_name));
130 
131    simple_mtx_lock(&pso_correlation->lock);
132    list_addtail(&record->list, &pso_correlation->record);
133    pso_correlation->record_count++;
134    simple_mtx_unlock(&pso_correlation->lock);
135 
136    return true;
137 }
138 
139 bool
ac_sqtt_add_code_object_loader_event(struct ac_sqtt * sqtt,uint64_t pipeline_hash,uint64_t base_address)140 ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash,
141                                      uint64_t base_address)
142 {
143    struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events;
144    struct rgp_loader_events_record *record;
145 
146    record = malloc(sizeof(struct rgp_loader_events_record));
147    if (!record)
148       return false;
149 
150    record->loader_event_type = RGP_LOAD_TO_GPU_MEMORY;
151    record->reserved = 0;
152    record->base_address = base_address & 0xffffffffffff;
153    record->code_object_hash[0] = pipeline_hash;
154    record->code_object_hash[1] = pipeline_hash;
155    record->time_stamp = os_time_get_nano();
156 
157    simple_mtx_lock(&loader_events->lock);
158    list_addtail(&record->list, &loader_events->record);
159    loader_events->record_count++;
160    simple_mtx_unlock(&loader_events->lock);
161 
162    return true;
163 }
164 
165 bool
ac_sqtt_add_clock_calibration(struct ac_sqtt * sqtt,uint64_t cpu_timestamp,uint64_t gpu_timestamp)166 ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp)
167 {
168    struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
169    struct rgp_clock_calibration_record *record;
170 
171    record = malloc(sizeof(struct rgp_clock_calibration_record));
172    if (!record)
173       return false;
174 
175    record->cpu_timestamp = cpu_timestamp;
176    record->gpu_timestamp = gpu_timestamp;
177 
178    simple_mtx_lock(&clock_calibration->lock);
179    list_addtail(&record->list, &clock_calibration->record);
180    clock_calibration->record_count++;
181    simple_mtx_unlock(&clock_calibration->lock);
182 
183    return true;
184 }
185 
186 /* See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5260
187  * On some HW SQTT can hang if we're not in one of the profiling pstates. */
188 bool
ac_check_profile_state(const struct radeon_info * info)189 ac_check_profile_state(const struct radeon_info *info)
190 {
191    char path[128];
192    char data[128];
193    int n;
194 
195    if (!info->pci.valid)
196       return false; /* Unknown but optimistic. */
197 
198    snprintf(path, sizeof(path),
199             "/sys/bus/pci/devices/%04x:%02x:%02x.%x/power_dpm_force_performance_level",
200             info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func);
201 
202    FILE *f = fopen(path, "r");
203    if (!f)
204       return false; /* Unknown but optimistic. */
205    n = fread(data, 1, sizeof(data) - 1, f);
206    fclose(f);
207    data[n] = 0;
208    return strstr(data, "profile") == NULL;
209 }
210 
211 union rgp_sqtt_marker_cb_id
ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt * data,enum amd_ip_type ip_type)212 ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type)
213 {
214    union rgp_sqtt_marker_cb_id cb_id = {0};
215 
216    cb_id.global_cb_id.cb_index =
217       p_atomic_inc_return(&data->cmdbuf_ids_per_queue[ip_type]);
218 
219    return cb_id;
220 }
221 
222 static bool
ac_sqtt_se_is_disabled(const struct radeon_info * info,unsigned se)223 ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se)
224 {
225    /* No active CU on the SE means it is disabled. */
226    return info->cu_mask[se][0] == 0;
227 }
228 
229 static uint32_t
ac_sqtt_get_active_cu(const struct radeon_info * info,unsigned se)230 ac_sqtt_get_active_cu(const struct radeon_info *info, unsigned se)
231 {
232    uint32_t cu_index;
233 
234    if (info->gfx_level >= GFX11) {
235       /* GFX11 seems to operate on the last active CU. */
236       cu_index = util_last_bit(info->cu_mask[se][0]) - 1;
237    } else {
238       /* Default to the first active CU. */
239       cu_index = ffs(info->cu_mask[se][0]);
240    }
241 
242    return cu_index;
243 }
244 
245 bool
ac_sqtt_get_trace(struct ac_sqtt * data,const struct radeon_info * info,struct ac_sqtt_trace * sqtt_trace)246 ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
247                   struct ac_sqtt_trace *sqtt_trace)
248 {
249    unsigned max_se = info->max_se;
250    void *ptr = data->ptr;
251 
252    memset(sqtt_trace, 0, sizeof(*sqtt_trace));
253 
254    for (unsigned se = 0; se < max_se; se++) {
255       uint64_t info_offset = ac_sqtt_get_info_offset(se);
256       uint64_t data_offset = ac_sqtt_get_data_offset(info, data, se);
257       void *info_ptr = (uint8_t *)ptr + info_offset;
258       void *data_ptr = (uint8_t *)ptr + data_offset;
259       struct ac_sqtt_data_info *trace_info = (struct ac_sqtt_data_info *)info_ptr;
260       struct ac_sqtt_data_se data_se = {0};
261       int active_cu = ac_sqtt_get_active_cu(info, se);
262 
263       if (ac_sqtt_se_is_disabled(info, se))
264          continue;
265 
266       if (!ac_is_sqtt_complete(info, data, trace_info))
267          return false;
268 
269       data_se.data_ptr = data_ptr;
270       data_se.info = *trace_info;
271       data_se.shader_engine = se;
272 
273       /* RGP seems to expect units of WGP on GFX10+. */
274       data_se.compute_unit = info->gfx_level >= GFX10 ? (active_cu / 2) : active_cu;
275 
276       sqtt_trace->traces[sqtt_trace->num_traces] = data_se;
277       sqtt_trace->num_traces++;
278    }
279 
280    sqtt_trace->rgp_code_object = &data->rgp_code_object;
281    sqtt_trace->rgp_loader_events = &data->rgp_loader_events;
282    sqtt_trace->rgp_pso_correlation = &data->rgp_pso_correlation;
283    sqtt_trace->rgp_queue_info = &data->rgp_queue_info;
284    sqtt_trace->rgp_queue_event = &data->rgp_queue_event;
285    sqtt_trace->rgp_clock_calibration = &data->rgp_clock_calibration;
286 
287    return true;
288 }
289 
290 uint32_t
ac_sqtt_get_ctrl(const struct radeon_info * info,bool enable)291 ac_sqtt_get_ctrl(const struct radeon_info *info, bool enable)
292 {
293 
294    uint32_t ctrl;
295 
296    if (info->gfx_level >= GFX11) {
297       ctrl = S_0367B0_MODE(enable) | S_0367B0_HIWATER(5) |
298              S_0367B0_UTIL_TIMER_GFX11(1) | S_0367B0_RT_FREQ(2) | /* 4096 clk */
299              S_0367B0_DRAW_EVENT_EN(1) | S_0367B0_SPI_STALL_EN(1) |
300              S_0367B0_SQ_STALL_EN(1) | S_0367B0_REG_AT_HWM(2);
301    } else {
302       assert(info->gfx_level >= GFX10);
303 
304       ctrl = S_008D1C_MODE(enable) | S_008D1C_HIWATER(5) | S_008D1C_UTIL_TIMER(1) |
305              S_008D1C_RT_FREQ(2) | /* 4096 clk */ S_008D1C_DRAW_EVENT_EN(1) |
306              S_008D1C_REG_STALL_EN(1) | S_008D1C_SPI_STALL_EN(1) |
307              S_008D1C_SQ_STALL_EN(1) | S_008D1C_REG_DROP_ON_STALL(0);
308 
309       if (info->gfx_level == GFX10_3)
310          ctrl |= S_008D1C_LOWATER_OFFSET(4);
311 
312       if (info->has_sqtt_auto_flush_mode_bug)
313          ctrl |= S_008D1C_AUTO_FLUSH_MODE(1);
314    }
315 
316    return ctrl;
317 }
318 
319 uint32_t
ac_sqtt_get_shader_mask(const struct radeon_info * info)320 ac_sqtt_get_shader_mask(const struct radeon_info *info)
321 {
322    unsigned shader_mask = 0x7f; /* all shader stages */
323 
324    if (info->gfx_level >= GFX11) {
325       /* Disable unsupported hw shader stages */
326       shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
327    }
328 
329    return shader_mask;
330 }
331 
/* Emits the PM4 packets that program the SQTT registers for every enabled SE
 * and then start the thread trace.
 *
 * For each SE, GRBM_GFX_INDEX is pointed at that SE (SH0) and the trace
 * buffer base/size plus the filtering masks are written; the register layout
 * differs per gfx generation. Afterwards broadcasting is restored and the
 * trace is kicked off either via COMPUTE_THREAD_TRACE_ENABLE (compute queues)
 * or a THREAD_TRACE_START event (graphics queues).
 */
void
ac_sqtt_emit_start(const struct radeon_info *info, struct ac_pm4_state *pm4,
                   const struct ac_sqtt *sqtt, bool is_compute_queue)
{
   /* Base addresses/sizes are programmed in units of 1 << SQTT_BUFFER_ALIGN_SHIFT bytes. */
   const uint32_t shifted_size = sqtt->buffer_size >> SQTT_BUFFER_ALIGN_SHIFT;
   const unsigned shader_mask = ac_sqtt_get_shader_mask(info);
   const unsigned max_se = info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se);
      uint64_t shifted_va = data_va >> SQTT_BUFFER_ALIGN_SHIFT;
      int active_cu = ac_sqtt_get_active_cu(info, se);

      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      /* Target SEx and SH0. */
      ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
                     S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (info->gfx_level >= GFX11) {
         /* Order seems important for the following 2 registers. */
         ac_pm4_set_reg(pm4, R_0367A4_SQ_THREAD_TRACE_BUF0_SIZE,
                        S_0367A4_SIZE(shifted_size) | S_0367A4_BASE_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_0367A0_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_0367B4_SQ_THREAD_TRACE_MASK,
                        S_0367B4_WTYPE_INCLUDE(shader_mask) | S_0367B4_SA_SEL(0) |
                        S_0367B4_WGP_SEL(active_cu / 2) | S_0367B4_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_0367B8_REG_INCLUDE(V_0367B8_REG_INCLUDE_SQDEC | V_0367B8_REG_INCLUDE_SHDEC |
                                                         V_0367B8_REG_INCLUDE_GFXUDEC | V_0367B8_REG_INCLUDE_COMP |
                                                         V_0367B8_REG_INCLUDE_CONTEXT | V_0367B8_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_0367B8_TOKEN_EXCLUDE_PERF;

         if (!sqtt->instruction_timing_enabled) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_0367B8_TOKEN_EXCLUDE_VMEMEXEC | V_0367B8_TOKEN_EXCLUDE_ALUEXEC |
                             V_0367B8_TOKEN_EXCLUDE_VALUINST | V_0367B8_TOKEN_EXCLUDE_IMMEDIATE |
                             V_0367B8_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |= S_0367B8_TOKEN_EXCLUDE_GFX11(token_exclude) | S_0367B8_BOP_EVENTS_TOKEN_INCLUDE_GFX11(1);

         ac_pm4_set_reg(pm4, R_0367B8_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         ac_pm4_set_reg(pm4, R_0367B0_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, true));
      } else if (info->gfx_level >= GFX10) {
         /* Order seems important for the following 2 registers. */
         ac_pm4_set_reg(pm4, R_008D04_SQ_THREAD_TRACE_BUF0_SIZE,
                        S_008D04_SIZE(shifted_size) | S_008D04_BASE_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_008D00_SQ_THREAD_TRACE_BUF0_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_008D14_SQ_THREAD_TRACE_MASK,
                        S_008D14_WTYPE_INCLUDE(shader_mask) | S_008D14_SA_SEL(0) |
                        S_008D14_WGP_SEL(active_cu / 2) | S_008D14_SIMD_SEL(0));

         uint32_t sqtt_token_mask = S_008D18_REG_INCLUDE(V_008D18_REG_INCLUDE_SQDEC | V_008D18_REG_INCLUDE_SHDEC |
                                                         V_008D18_REG_INCLUDE_GFXUDEC | V_008D18_REG_INCLUDE_COMP |
                                                         V_008D18_REG_INCLUDE_CONTEXT | V_008D18_REG_INCLUDE_CONFIG);

         /* Performance counters with SQTT are considered deprecated. */
         uint32_t token_exclude = V_008D18_TOKEN_EXCLUDE_PERF;

         if (!sqtt->instruction_timing_enabled) {
            /* Reduce SQTT traffic when instruction timing isn't enabled. */
            token_exclude |= V_008D18_TOKEN_EXCLUDE_VMEMEXEC | V_008D18_TOKEN_EXCLUDE_ALUEXEC |
                             V_008D18_TOKEN_EXCLUDE_VALUINST | V_008D18_TOKEN_EXCLUDE_IMMEDIATE |
                             V_008D18_TOKEN_EXCLUDE_INST;
         }
         sqtt_token_mask |=
            S_008D18_TOKEN_EXCLUDE(token_exclude) | S_008D18_BOP_EVENTS_TOKEN_INCLUDE(info->gfx_level == GFX10_3);

         ac_pm4_set_reg(pm4, R_008D18_SQ_THREAD_TRACE_TOKEN_MASK, sqtt_token_mask);

         /* Should be emitted last (it enables thread traces). */
         ac_pm4_set_reg(pm4, R_008D1C_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, true));
      } else {
         /* GFX8-GFX9 path: older register layout at 0x030Cxx. */
         /* Order seems important for the following 4 registers. */
         ac_pm4_set_reg(pm4, R_030CDC_SQ_THREAD_TRACE_BASE2, S_030CDC_ADDR_HI(shifted_va >> 32));

         ac_pm4_set_reg(pm4, R_030CC0_SQ_THREAD_TRACE_BASE, shifted_va);

         ac_pm4_set_reg(pm4, R_030CC4_SQ_THREAD_TRACE_SIZE, S_030CC4_SIZE(shifted_size));

         ac_pm4_set_reg(pm4, R_030CD4_SQ_THREAD_TRACE_CTRL, S_030CD4_RESET_BUFFER(1));

         uint32_t sqtt_mask = S_030CC8_CU_SEL(active_cu) | S_030CC8_SH_SEL(0) | S_030CC8_SIMD_EN(0xf) |
                              S_030CC8_VM_ID_MASK(0) | S_030CC8_REG_STALL_EN(1) | S_030CC8_SPI_STALL_EN(1) |
                              S_030CC8_SQ_STALL_EN(1);

         if (info->gfx_level < GFX9) {
            sqtt_mask |= S_030CC8_RANDOM_SEED(0xffff);
         }

         ac_pm4_set_reg(pm4, R_030CC8_SQ_THREAD_TRACE_MASK, sqtt_mask);

         /* Trace all tokens and registers. */
         ac_pm4_set_reg(pm4, R_030CCC_SQ_THREAD_TRACE_TOKEN_MASK,
                        S_030CCC_TOKEN_MASK(0xbfff) | S_030CCC_REG_MASK(0xff) | S_030CCC_REG_DROP_ON_STALL(0));

         /* Enable SQTT perf counters for all CUs. */
         ac_pm4_set_reg(pm4, R_030CD0_SQ_THREAD_TRACE_PERF_MASK,
                        S_030CD0_SH0_MASK(0xffff) | S_030CD0_SH1_MASK(0xffff));

         ac_pm4_set_reg(pm4, R_030CE0_SQ_THREAD_TRACE_TOKEN_MASK2, 0xffffffff);

         ac_pm4_set_reg(pm4, R_030CEC_SQ_THREAD_TRACE_HIWATER, S_030CEC_HIWATER(4));

         if (info->gfx_level == GFX9) {
            /* Reset thread trace status errors. */
            ac_pm4_set_reg(pm4, R_030CE8_SQ_THREAD_TRACE_STATUS, S_030CE8_UTC_ERROR(0));
         }

         /* Enable the thread trace mode. */
         uint32_t sqtt_mode = S_030CD8_MASK_PS(1) | S_030CD8_MASK_VS(1) | S_030CD8_MASK_GS(1) | S_030CD8_MASK_ES(1) |
                              S_030CD8_MASK_HS(1) | S_030CD8_MASK_LS(1) | S_030CD8_MASK_CS(1) |
                              S_030CD8_AUTOFLUSH_EN(1) | /* periodically flush SQTT data to memory */
                              S_030CD8_MODE(1);

         if (info->gfx_level == GFX9) {
            /* Count SQTT traffic in TCC perf counters. */
            sqtt_mode |= S_030CD8_TC_PERF_EN(1);
         }

         ac_pm4_set_reg(pm4, R_030CD8_SQ_THREAD_TRACE_MODE, sqtt_mode);
      }
   }

   /* Restore global broadcasting. */
   ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX,  S_030800_SE_BROADCAST_WRITES(1) |
                  S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));

   /* Start the thread trace with a different event based on the queue. */
   if (is_compute_queue) {
      ac_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(1));
   } else {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
      ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_START) | EVENT_INDEX(0));
   }

}

/* Per-generation lists of the SQTT registers copied back to memory by
 * ac_sqtt_copy_info_regs(): write pointer, status, then a counter
 * (byte counter on GFX8/GFX9, dropped-byte counter on GFX10+).
 * The order matters: the copy loop writes them one dword at a time into
 * the ac_sqtt_data_info slot for the SE.
 */
static const uint32_t gfx8_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_008E40_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx9_sqtt_info_regs[] = {
   R_030CE4_SQ_THREAD_TRACE_WPTR,
   R_030CE8_SQ_THREAD_TRACE_STATUS,
   R_030CF0_SQ_THREAD_TRACE_CNTR,
};

static const uint32_t gfx10_sqtt_info_regs[] = {
   R_008D10_SQ_THREAD_TRACE_WPTR,
   R_008D20_SQ_THREAD_TRACE_STATUS,
   R_008D24_SQ_THREAD_TRACE_DROPPED_CNTR,
};

static const uint32_t gfx11_sqtt_info_regs[] = {
   R_0367BC_SQ_THREAD_TRACE_WPTR,
   R_0367D0_SQ_THREAD_TRACE_STATUS,
   R_0367E8_SQ_THREAD_TRACE_DROPPED_CNTR,
};

/* Emits PM4 packets that copy one SE's SQTT info registers (WPTR, STATUS,
 * counter) into that SE's ac_sqtt_data_info slot in the SQTT buffer, so the
 * CPU can read them back once the trace has stopped. GRBM_GFX_INDEX must
 * already target the SE (the caller, ac_sqtt_emit_wait, sets it up).
 */
static void
ac_sqtt_copy_info_regs(const struct radeon_info *info, struct ac_pm4_state *pm4,
                       const struct ac_sqtt *sqtt, uint32_t se_index)
{
   const uint32_t *sqtt_info_regs = NULL;

   /* Pick the register list for this generation (same order on all gens). */
   if (info->gfx_level >= GFX11) {
      sqtt_info_regs = gfx11_sqtt_info_regs;
   } else if (info->gfx_level >= GFX10) {
      sqtt_info_regs = gfx10_sqtt_info_regs;
   } else if (info->gfx_level == GFX9) {
      sqtt_info_regs = gfx9_sqtt_info_regs;
   } else {
      assert(info->gfx_level == GFX8);
      sqtt_info_regs = gfx8_sqtt_info_regs;
   }

   /* Get the VA where the info struct is stored for this SE. */
   uint64_t info_va = ac_sqtt_get_info_va(sqtt->buffer_va, se_index);

   /* Copy back the info struct one DWORD at a time. */
   for (unsigned i = 0; i < 3; i++) {
      ac_pm4_cmd_add(pm4, PKT3(PKT3_COPY_DATA, 4, 0));
      ac_pm4_cmd_add(pm4, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | COPY_DATA_DST_SEL(COPY_DATA_TC_L2) | COPY_DATA_WR_CONFIRM);
      ac_pm4_cmd_add(pm4, sqtt_info_regs[i] >> 2);
      ac_pm4_cmd_add(pm4, 0); /* unused */
      ac_pm4_cmd_add(pm4, (info_va + i * 4));
      ac_pm4_cmd_add(pm4, (info_va + i * 4) >> 32);
   }

   if (info->gfx_level == GFX11) {
      /* On GFX11, SQ_THREAD_TRACE_WPTR is incremented from the "initial WPTR address" instead of 0.
       * To get the number of bytes (in units of 32 bytes) written by SQTT, the workaround is to
       * subtract SQ_THREAD_TRACE_WPTR from the "initial WPTR address" as follow:
       *
       * 1) get the current buffer base address for this SE
       * 2) shift right by 5 bits because SQ_THREAD_TRACE_WPTR is 32-byte aligned
       * 3) mask off the higher 3 bits because WPTR.OFFSET is 29 bits
       */
      uint64_t data_va = ac_sqtt_get_data_va(info, sqtt, se_index);
      uint64_t shifted_data_va = (data_va >> 5);
      uint32_t init_wptr_value = shifted_data_va & 0x1fffffff;

      /* GPU-side atomic subtract on the WPTR dword just copied to info_va. */
      ac_pm4_cmd_add(pm4, PKT3(PKT3_ATOMIC_MEM, 7, 0));
      ac_pm4_cmd_add(pm4, ATOMIC_OP(TC_OP_ATOMIC_SUB_RTN_32));
      ac_pm4_cmd_add(pm4, info_va);         /* addr lo */
      ac_pm4_cmd_add(pm4, info_va >> 32);   /* addr hi */
      ac_pm4_cmd_add(pm4, init_wptr_value); /* data lo */
      ac_pm4_cmd_add(pm4, 0);               /* data hi */
      ac_pm4_cmd_add(pm4, 0);               /* compare data lo */
      ac_pm4_cmd_add(pm4, 0);               /* compare data hi */
      ac_pm4_cmd_add(pm4, 0);               /* loop interval */
   }
}

558 void
ac_sqtt_emit_stop(const struct radeon_info * info,struct ac_pm4_state * pm4,bool is_compute_queue)559 ac_sqtt_emit_stop(const struct radeon_info *info, struct ac_pm4_state *pm4,
560                   bool is_compute_queue)
561 {
562    /* Stop the thread trace with a different event based on the queue. */
563    if (is_compute_queue) {
564       ac_pm4_set_reg(pm4, R_00B878_COMPUTE_THREAD_TRACE_ENABLE, S_00B878_THREAD_TRACE_ENABLE(0));
565    } else {
566       ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
567       ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_STOP) | EVENT_INDEX(0));
568    }
569 
570    ac_pm4_cmd_add(pm4, PKT3(PKT3_EVENT_WRITE, 0, 0));
571    ac_pm4_cmd_add(pm4, EVENT_TYPE(V_028A90_THREAD_TRACE_FINISH) | EVENT_INDEX(0));
572 }
573 
/* Emits the PM4 packets that wait for SQTT to finish on every enabled SE,
 * disable the trace mode and copy the per-SE info registers back to memory.
 * The FINISH_DONE waits below rely on the THREAD_TRACE_FINISH event emitted
 * by ac_sqtt_emit_stop(), so this is expected to run after it.
 */
void
ac_sqtt_emit_wait(const struct radeon_info *info, struct ac_pm4_state *pm4,
                  const struct ac_sqtt *sqtt, bool is_compute_queue)
{
   const unsigned max_se = info->max_se;

   for (unsigned se = 0; se < max_se; se++) {
      if (ac_sqtt_se_is_disabled(info, se))
         continue;

      /* Target SEi and SH0. */
      ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_INDEX(se) |
                     S_030800_SH_INDEX(0) | S_030800_INSTANCE_BROADCAST_WRITES(1));

      if (info->gfx_level >= GFX11) {
         /* Make sure to wait for the trace buffer. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0); /* reference value */
         ac_pm4_cmd_add(pm4, ~C_0367D0_FINISH_DONE); /* mask: only the FINISH_DONE bit */
         ac_pm4_cmd_add(pm4, 4); /* poll interval */

         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_0367B0_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, false));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_0367D0_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0);              /* reference value */
         ac_pm4_cmd_add(pm4, ~C_0367D0_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4);              /* poll interval */
      } else if (info->gfx_level >= GFX10) {
         if (!info->has_sqtt_rb_harvest_bug) {
            /* Make sure to wait for the trace buffer. */
            ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
            ac_pm4_cmd_add(pm4, WAIT_REG_MEM_NOT_EQUAL); /* wait until the register is equal to the reference value */
            ac_pm4_cmd_add(pm4, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
            ac_pm4_cmd_add(pm4, 0);
            ac_pm4_cmd_add(pm4, 0); /* reference value */
            ac_pm4_cmd_add(pm4, ~C_008D20_FINISH_DONE); /* mask: only the FINISH_DONE bit */
            ac_pm4_cmd_add(pm4, 4); /* poll interval */
         }

         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_008D1C_SQ_THREAD_TRACE_CTRL, ac_sqtt_get_ctrl(info, false));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_008D20_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0);              /* reference value */
         ac_pm4_cmd_add(pm4, ~C_008D20_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4);              /* poll interval */
      } else {
         /* GFX8-GFX9: no FINISH_DONE bit to poll, only BUSY. */
         /* Disable the thread trace mode. */
         ac_pm4_set_reg(pm4, R_030CD8_SQ_THREAD_TRACE_MODE, S_030CD8_MODE(0));

         /* Wait for thread trace completion. */
         ac_pm4_cmd_add(pm4, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
         ac_pm4_cmd_add(pm4, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
         ac_pm4_cmd_add(pm4, R_030CE8_SQ_THREAD_TRACE_STATUS >> 2); /* register */
         ac_pm4_cmd_add(pm4, 0);
         ac_pm4_cmd_add(pm4, 0);              /* reference value */
         ac_pm4_cmd_add(pm4, ~C_030CE8_BUSY); /* mask */
         ac_pm4_cmd_add(pm4, 4);              /* poll interval */
      }

      /* Copy WPTR/STATUS/counter for this SE back to the info slot. */
      ac_sqtt_copy_info_regs(info, pm4, sqtt, se);
   }

   /* Restore global broadcasting. */
   ac_pm4_set_reg(pm4, R_030800_GRBM_GFX_INDEX, S_030800_SE_BROADCAST_WRITES(1) |
                  S_030800_SH_BROADCAST_WRITES(1) | S_030800_INSTANCE_BROADCAST_WRITES(1));
}