1 /*
2 * Copyright 2020 Advanced Micro Devices, Inc.
3 * Copyright 2020 Valve Corporation
4 *
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include "ac_sqtt.h"
9
10 #include "ac_gpu_info.h"
11 #include "util/u_math.h"
12 #include "util/os_time.h"
13
14 uint64_t
ac_sqtt_get_info_offset(unsigned se)15 ac_sqtt_get_info_offset(unsigned se)
16 {
17 return sizeof(struct ac_sqtt_data_info) * se;
18 }
19
20 uint64_t
ac_sqtt_get_data_offset(const struct radeon_info * rad_info,const struct ac_sqtt * data,unsigned se)21 ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se)
22 {
23 unsigned max_se = rad_info->max_se;
24 uint64_t data_offset;
25
26 data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
27 data_offset += data->buffer_size * se;
28
29 return data_offset;
30 }
31
/**
 * Address of the info record for shader engine \p se, given the base
 * address \p va of the SQTT buffer.
 */
uint64_t
ac_sqtt_get_info_va(uint64_t va, unsigned se)
{
   const uint64_t info_offset = ac_sqtt_get_info_offset(se);

   return va + info_offset;
}
37
/**
 * Address of the trace data area for shader engine \p se, given the base
 * address \p va of the SQTT buffer.
 */
uint64_t
ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *data, uint64_t va,
                    unsigned se)
{
   const uint64_t data_offset = ac_sqtt_get_data_offset(rad_info, data, se);

   return va + data_offset;
}
44
45 void
ac_sqtt_init(struct ac_sqtt * data)46 ac_sqtt_init(struct ac_sqtt *data)
47 {
48 list_inithead(&data->rgp_pso_correlation.record);
49 simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);
50
51 list_inithead(&data->rgp_loader_events.record);
52 simple_mtx_init(&data->rgp_loader_events.lock, mtx_plain);
53
54 list_inithead(&data->rgp_code_object.record);
55 simple_mtx_init(&data->rgp_code_object.lock, mtx_plain);
56
57 list_inithead(&data->rgp_clock_calibration.record);
58 simple_mtx_init(&data->rgp_clock_calibration.lock, mtx_plain);
59
60 list_inithead(&data->rgp_queue_info.record);
61 simple_mtx_init(&data->rgp_queue_info.lock, mtx_plain);
62
63 list_inithead(&data->rgp_queue_event.record);
64 simple_mtx_init(&data->rgp_queue_event.lock, mtx_plain);
65 }
66
67 void
ac_sqtt_finish(struct ac_sqtt * data)68 ac_sqtt_finish(struct ac_sqtt *data)
69 {
70 assert(data->rgp_pso_correlation.record_count == 0);
71 simple_mtx_destroy(&data->rgp_pso_correlation.lock);
72
73 assert(data->rgp_loader_events.record_count == 0);
74 simple_mtx_destroy(&data->rgp_loader_events.lock);
75
76 assert(data->rgp_code_object.record_count == 0);
77 simple_mtx_destroy(&data->rgp_code_object.lock);
78
79 assert(data->rgp_clock_calibration.record_count == 0);
80 simple_mtx_destroy(&data->rgp_clock_calibration.lock);
81
82 assert(data->rgp_queue_info.record_count == 0);
83 simple_mtx_destroy(&data->rgp_queue_info.lock);
84
85 assert(data->rgp_queue_event.record_count == 0);
86 simple_mtx_destroy(&data->rgp_queue_event.lock);
87 }
88
89 bool
ac_is_sqtt_complete(const struct radeon_info * rad_info,const struct ac_sqtt * data,const struct ac_sqtt_data_info * info)90 ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *data,
91 const struct ac_sqtt_data_info *info)
92 {
93 if (rad_info->gfx_level >= GFX10) {
94 /* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the number of
95 * dropped bytes per SE via THREAD_TRACE_DROPPED_CNTR. Though, this
96 * doesn't seem reliable because it might still report non-zero even if
97 * the SQTT buffer isn't full.
98 *
99 * The solution here is to compare the number of bytes written by the hw
100 * (in units of 32 bytes) to the SQTT buffer size. If it's equal, that
101 * means that the buffer is full and should be resized.
102 */
103 return !(info->cur_offset * 32 == data->buffer_size - 32);
104 }
105
106 /* Otherwise, compare the current thread trace offset with the number
107 * of written bytes.
108 */
109 return info->cur_offset == info->gfx9_write_counter;
110 }
111
112 uint32_t
ac_get_expected_buffer_size(struct radeon_info * rad_info,const struct ac_sqtt_data_info * info)113 ac_get_expected_buffer_size(struct radeon_info *rad_info, const struct ac_sqtt_data_info *info)
114 {
115 if (rad_info->gfx_level >= GFX10) {
116 uint32_t dropped_cntr_per_se = info->gfx10_dropped_cntr / rad_info->max_se;
117 return ((info->cur_offset * 32) + dropped_cntr_per_se) / 1024;
118 }
119
120 return (info->gfx9_write_counter * 32) / 1024;
121 }
122
123 bool
ac_sqtt_add_pso_correlation(struct ac_sqtt * sqtt,uint64_t pipeline_hash,uint64_t api_hash)124 ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash, uint64_t api_hash)
125 {
126 struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation;
127 struct rgp_pso_correlation_record *record;
128
129 record = malloc(sizeof(struct rgp_pso_correlation_record));
130 if (!record)
131 return false;
132
133 record->api_pso_hash = api_hash;
134 record->pipeline_hash[0] = pipeline_hash;
135 record->pipeline_hash[1] = pipeline_hash;
136 memset(record->api_level_obj_name, 0, sizeof(record->api_level_obj_name));
137
138 simple_mtx_lock(&pso_correlation->lock);
139 list_addtail(&record->list, &pso_correlation->record);
140 pso_correlation->record_count++;
141 simple_mtx_unlock(&pso_correlation->lock);
142
143 return true;
144 }
145
146 bool
ac_sqtt_add_code_object_loader_event(struct ac_sqtt * sqtt,uint64_t pipeline_hash,uint64_t base_address)147 ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash,
148 uint64_t base_address)
149 {
150 struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events;
151 struct rgp_loader_events_record *record;
152
153 record = malloc(sizeof(struct rgp_loader_events_record));
154 if (!record)
155 return false;
156
157 record->loader_event_type = RGP_LOAD_TO_GPU_MEMORY;
158 record->reserved = 0;
159 record->base_address = base_address & 0xffffffffffff;
160 record->code_object_hash[0] = pipeline_hash;
161 record->code_object_hash[1] = pipeline_hash;
162 record->time_stamp = os_time_get_nano();
163
164 simple_mtx_lock(&loader_events->lock);
165 list_addtail(&record->list, &loader_events->record);
166 loader_events->record_count++;
167 simple_mtx_unlock(&loader_events->lock);
168
169 return true;
170 }
171
172 bool
ac_sqtt_add_clock_calibration(struct ac_sqtt * sqtt,uint64_t cpu_timestamp,uint64_t gpu_timestamp)173 ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp)
174 {
175 struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
176 struct rgp_clock_calibration_record *record;
177
178 record = malloc(sizeof(struct rgp_clock_calibration_record));
179 if (!record)
180 return false;
181
182 record->cpu_timestamp = cpu_timestamp;
183 record->gpu_timestamp = gpu_timestamp;
184
185 simple_mtx_lock(&clock_calibration->lock);
186 list_addtail(&record->list, &clock_calibration->record);
187 clock_calibration->record_count++;
188 simple_mtx_unlock(&clock_calibration->lock);
189
190 return true;
191 }
192
193 /* See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5260
194 * On some HW SQTT can hang if we're not in one of the profiling pstates. */
195 bool
ac_check_profile_state(const struct radeon_info * info)196 ac_check_profile_state(const struct radeon_info *info)
197 {
198 char path[128];
199 char data[128];
200 int n;
201
202 if (!info->pci.valid)
203 return false; /* Unknown but optimistic. */
204
205 snprintf(path, sizeof(path),
206 "/sys/bus/pci/devices/%04x:%02x:%02x.%x/power_dpm_force_performance_level",
207 info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func);
208
209 FILE *f = fopen(path, "r");
210 if (!f)
211 return false; /* Unknown but optimistic. */
212 n = fread(data, 1, sizeof(data) - 1, f);
213 fclose(f);
214 data[n] = 0;
215 return strstr(data, "profile") == NULL;
216 }
217
218 union rgp_sqtt_marker_cb_id
ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt * data,enum amd_ip_type ip_type)219 ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type)
220 {
221 union rgp_sqtt_marker_cb_id cb_id = {0};
222
223 cb_id.global_cb_id.cb_index =
224 p_atomic_inc_return(&data->cmdbuf_ids_per_queue[ip_type]);
225
226 return cb_id;
227 }
228
229 bool
ac_sqtt_se_is_disabled(const struct radeon_info * info,unsigned se)230 ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se)
231 {
232 /* No active CU on the SE means it is disabled. */
233 return info->cu_mask[se][0] == 0;
234 }
235
236 uint32_t
ac_sqtt_get_active_cu(const struct radeon_info * info,unsigned se)237 ac_sqtt_get_active_cu(const struct radeon_info *info, unsigned se)
238 {
239 uint32_t cu_index;
240
241 if (info->gfx_level >= GFX11) {
242 /* GFX11 seems to operate on the last active CU. */
243 cu_index = util_last_bit(info->cu_mask[se][0]) - 1;
244 } else {
245 /* Default to the first active CU. */
246 cu_index = ffs(info->cu_mask[se][0]);
247 }
248
249 return cu_index;
250 }
251
252 bool
ac_sqtt_get_trace(struct ac_sqtt * data,const struct radeon_info * info,struct ac_sqtt_trace * sqtt_trace)253 ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
254 struct ac_sqtt_trace *sqtt_trace)
255 {
256 unsigned max_se = info->max_se;
257 void *ptr = data->ptr;
258
259 memset(sqtt_trace, 0, sizeof(*sqtt_trace));
260
261 for (unsigned se = 0; se < max_se; se++) {
262 uint64_t info_offset = ac_sqtt_get_info_offset(se);
263 uint64_t data_offset = ac_sqtt_get_data_offset(info, data, se);
264 void *info_ptr = (uint8_t *)ptr + info_offset;
265 void *data_ptr = (uint8_t *)ptr + data_offset;
266 struct ac_sqtt_data_info *trace_info = (struct ac_sqtt_data_info *)info_ptr;
267 struct ac_sqtt_data_se data_se = {0};
268 int active_cu = ac_sqtt_get_active_cu(info, se);
269
270 if (ac_sqtt_se_is_disabled(info, se))
271 continue;
272
273 if (!ac_is_sqtt_complete(info, data, trace_info))
274 return false;
275
276 data_se.data_ptr = data_ptr;
277 data_se.info = *trace_info;
278 data_se.shader_engine = se;
279
280 /* RGP seems to expect units of WGP on GFX10+. */
281 data_se.compute_unit = info->gfx_level >= GFX10 ? (active_cu / 2) : active_cu;
282
283 sqtt_trace->traces[sqtt_trace->num_traces] = data_se;
284 sqtt_trace->num_traces++;
285 }
286
287 sqtt_trace->rgp_code_object = &data->rgp_code_object;
288 sqtt_trace->rgp_loader_events = &data->rgp_loader_events;
289 sqtt_trace->rgp_pso_correlation = &data->rgp_pso_correlation;
290 sqtt_trace->rgp_queue_info = &data->rgp_queue_info;
291 sqtt_trace->rgp_queue_event = &data->rgp_queue_event;
292 sqtt_trace->rgp_clock_calibration = &data->rgp_clock_calibration;
293
294 return true;
295 }
296
297 uint32_t
ac_sqtt_get_shader_mask(const struct radeon_info * info)298 ac_sqtt_get_shader_mask(const struct radeon_info *info)
299 {
300 unsigned shader_mask = 0x7f; /* all shader stages */
301
302 if (info->gfx_level >= GFX11) {
303 /* Disable unsupported hw shader stages */
304 shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
305 }
306
307 return shader_mask;
308 }
309