• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2020 Advanced Micro Devices, Inc.
3  * Copyright 2020 Valve Corporation
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include "ac_sqtt.h"
9 
10 #include "ac_gpu_info.h"
11 #include "util/u_math.h"
12 #include "util/os_time.h"
13 
14 uint64_t
ac_sqtt_get_info_offset(unsigned se)15 ac_sqtt_get_info_offset(unsigned se)
16 {
17    return sizeof(struct ac_sqtt_data_info) * se;
18 }
19 
20 uint64_t
ac_sqtt_get_data_offset(const struct radeon_info * rad_info,const struct ac_sqtt * data,unsigned se)21 ac_sqtt_get_data_offset(const struct radeon_info *rad_info, const struct ac_sqtt *data, unsigned se)
22 {
23    unsigned max_se = rad_info->max_se;
24    uint64_t data_offset;
25 
26    data_offset = align64(sizeof(struct ac_sqtt_data_info) * max_se, 1 << SQTT_BUFFER_ALIGN_SHIFT);
27    data_offset += data->buffer_size * se;
28 
29    return data_offset;
30 }
31 
/* Address of the info slot of shader engine `se`, given the base address `va`. */
uint64_t
ac_sqtt_get_info_va(uint64_t va, unsigned se)
{
   const uint64_t info_offset = ac_sqtt_get_info_offset(se);

   return va + info_offset;
}
37 
/* Address of the trace data area of shader engine `se`, given the base
 * address `va`.
 */
uint64_t
ac_sqtt_get_data_va(const struct radeon_info *rad_info, const struct ac_sqtt *data, uint64_t va,
                    unsigned se)
{
   const uint64_t data_offset = ac_sqtt_get_data_offset(rad_info, data, se);

   return va + data_offset;
}
44 
45 void
ac_sqtt_init(struct ac_sqtt * data)46 ac_sqtt_init(struct ac_sqtt *data)
47 {
48    list_inithead(&data->rgp_pso_correlation.record);
49    simple_mtx_init(&data->rgp_pso_correlation.lock, mtx_plain);
50 
51    list_inithead(&data->rgp_loader_events.record);
52    simple_mtx_init(&data->rgp_loader_events.lock, mtx_plain);
53 
54    list_inithead(&data->rgp_code_object.record);
55    simple_mtx_init(&data->rgp_code_object.lock, mtx_plain);
56 
57    list_inithead(&data->rgp_clock_calibration.record);
58    simple_mtx_init(&data->rgp_clock_calibration.lock, mtx_plain);
59 
60    list_inithead(&data->rgp_queue_info.record);
61    simple_mtx_init(&data->rgp_queue_info.lock, mtx_plain);
62 
63    list_inithead(&data->rgp_queue_event.record);
64    simple_mtx_init(&data->rgp_queue_event.lock, mtx_plain);
65 }
66 
67 void
ac_sqtt_finish(struct ac_sqtt * data)68 ac_sqtt_finish(struct ac_sqtt *data)
69 {
70    assert(data->rgp_pso_correlation.record_count == 0);
71    simple_mtx_destroy(&data->rgp_pso_correlation.lock);
72 
73    assert(data->rgp_loader_events.record_count == 0);
74    simple_mtx_destroy(&data->rgp_loader_events.lock);
75 
76    assert(data->rgp_code_object.record_count == 0);
77    simple_mtx_destroy(&data->rgp_code_object.lock);
78 
79    assert(data->rgp_clock_calibration.record_count == 0);
80    simple_mtx_destroy(&data->rgp_clock_calibration.lock);
81 
82    assert(data->rgp_queue_info.record_count == 0);
83    simple_mtx_destroy(&data->rgp_queue_info.lock);
84 
85    assert(data->rgp_queue_event.record_count == 0);
86    simple_mtx_destroy(&data->rgp_queue_event.lock);
87 }
88 
89 bool
ac_is_sqtt_complete(const struct radeon_info * rad_info,const struct ac_sqtt * data,const struct ac_sqtt_data_info * info)90 ac_is_sqtt_complete(const struct radeon_info *rad_info, const struct ac_sqtt *data,
91                     const struct ac_sqtt_data_info *info)
92 {
93    if (rad_info->gfx_level >= GFX10) {
94       /* GFX10 doesn't have THREAD_TRACE_CNTR but it reports the number of
95        * dropped bytes per SE via THREAD_TRACE_DROPPED_CNTR. Though, this
96        * doesn't seem reliable because it might still report non-zero even if
97        * the SQTT buffer isn't full.
98        *
99        * The solution here is to compare the number of bytes written by the hw
100        * (in units of 32 bytes) to the SQTT buffer size. If it's equal, that
101        * means that the buffer is full and should be resized.
102        */
103       return !(info->cur_offset * 32 == data->buffer_size - 32);
104    }
105 
106    /* Otherwise, compare the current thread trace offset with the number
107     * of written bytes.
108     */
109    return info->cur_offset == info->gfx9_write_counter;
110 }
111 
112 uint32_t
ac_get_expected_buffer_size(struct radeon_info * rad_info,const struct ac_sqtt_data_info * info)113 ac_get_expected_buffer_size(struct radeon_info *rad_info, const struct ac_sqtt_data_info *info)
114 {
115    if (rad_info->gfx_level >= GFX10) {
116       uint32_t dropped_cntr_per_se = info->gfx10_dropped_cntr / rad_info->max_se;
117       return ((info->cur_offset * 32) + dropped_cntr_per_se) / 1024;
118    }
119 
120    return (info->gfx9_write_counter * 32) / 1024;
121 }
122 
123 bool
ac_sqtt_add_pso_correlation(struct ac_sqtt * sqtt,uint64_t pipeline_hash,uint64_t api_hash)124 ac_sqtt_add_pso_correlation(struct ac_sqtt *sqtt, uint64_t pipeline_hash, uint64_t api_hash)
125 {
126    struct rgp_pso_correlation *pso_correlation = &sqtt->rgp_pso_correlation;
127    struct rgp_pso_correlation_record *record;
128 
129    record = malloc(sizeof(struct rgp_pso_correlation_record));
130    if (!record)
131       return false;
132 
133    record->api_pso_hash = api_hash;
134    record->pipeline_hash[0] = pipeline_hash;
135    record->pipeline_hash[1] = pipeline_hash;
136    memset(record->api_level_obj_name, 0, sizeof(record->api_level_obj_name));
137 
138    simple_mtx_lock(&pso_correlation->lock);
139    list_addtail(&record->list, &pso_correlation->record);
140    pso_correlation->record_count++;
141    simple_mtx_unlock(&pso_correlation->lock);
142 
143    return true;
144 }
145 
146 bool
ac_sqtt_add_code_object_loader_event(struct ac_sqtt * sqtt,uint64_t pipeline_hash,uint64_t base_address)147 ac_sqtt_add_code_object_loader_event(struct ac_sqtt *sqtt, uint64_t pipeline_hash,
148                                      uint64_t base_address)
149 {
150    struct rgp_loader_events *loader_events = &sqtt->rgp_loader_events;
151    struct rgp_loader_events_record *record;
152 
153    record = malloc(sizeof(struct rgp_loader_events_record));
154    if (!record)
155       return false;
156 
157    record->loader_event_type = RGP_LOAD_TO_GPU_MEMORY;
158    record->reserved = 0;
159    record->base_address = base_address & 0xffffffffffff;
160    record->code_object_hash[0] = pipeline_hash;
161    record->code_object_hash[1] = pipeline_hash;
162    record->time_stamp = os_time_get_nano();
163 
164    simple_mtx_lock(&loader_events->lock);
165    list_addtail(&record->list, &loader_events->record);
166    loader_events->record_count++;
167    simple_mtx_unlock(&loader_events->lock);
168 
169    return true;
170 }
171 
172 bool
ac_sqtt_add_clock_calibration(struct ac_sqtt * sqtt,uint64_t cpu_timestamp,uint64_t gpu_timestamp)173 ac_sqtt_add_clock_calibration(struct ac_sqtt *sqtt, uint64_t cpu_timestamp, uint64_t gpu_timestamp)
174 {
175    struct rgp_clock_calibration *clock_calibration = &sqtt->rgp_clock_calibration;
176    struct rgp_clock_calibration_record *record;
177 
178    record = malloc(sizeof(struct rgp_clock_calibration_record));
179    if (!record)
180       return false;
181 
182    record->cpu_timestamp = cpu_timestamp;
183    record->gpu_timestamp = gpu_timestamp;
184 
185    simple_mtx_lock(&clock_calibration->lock);
186    list_addtail(&record->list, &clock_calibration->record);
187    clock_calibration->record_count++;
188    simple_mtx_unlock(&clock_calibration->lock);
189 
190    return true;
191 }
192 
193 /* See https://gitlab.freedesktop.org/mesa/mesa/-/issues/5260
194  * On some HW SQTT can hang if we're not in one of the profiling pstates. */
195 bool
ac_check_profile_state(const struct radeon_info * info)196 ac_check_profile_state(const struct radeon_info *info)
197 {
198    char path[128];
199    char data[128];
200    int n;
201 
202    if (!info->pci.valid)
203       return false; /* Unknown but optimistic. */
204 
205    snprintf(path, sizeof(path),
206             "/sys/bus/pci/devices/%04x:%02x:%02x.%x/power_dpm_force_performance_level",
207             info->pci.domain, info->pci.bus, info->pci.dev, info->pci.func);
208 
209    FILE *f = fopen(path, "r");
210    if (!f)
211       return false; /* Unknown but optimistic. */
212    n = fread(data, 1, sizeof(data) - 1, f);
213    fclose(f);
214    data[n] = 0;
215    return strstr(data, "profile") == NULL;
216 }
217 
218 union rgp_sqtt_marker_cb_id
ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt * data,enum amd_ip_type ip_type)219 ac_sqtt_get_next_cmdbuf_id(struct ac_sqtt *data, enum amd_ip_type ip_type)
220 {
221    union rgp_sqtt_marker_cb_id cb_id = {0};
222 
223    cb_id.global_cb_id.cb_index =
224       p_atomic_inc_return(&data->cmdbuf_ids_per_queue[ip_type]);
225 
226    return cb_id;
227 }
228 
229 bool
ac_sqtt_se_is_disabled(const struct radeon_info * info,unsigned se)230 ac_sqtt_se_is_disabled(const struct radeon_info *info, unsigned se)
231 {
232    /* No active CU on the SE means it is disabled. */
233    return info->cu_mask[se][0] == 0;
234 }
235 
236 uint32_t
ac_sqtt_get_active_cu(const struct radeon_info * info,unsigned se)237 ac_sqtt_get_active_cu(const struct radeon_info *info, unsigned se)
238 {
239    uint32_t cu_index;
240 
241    if (info->gfx_level >= GFX11) {
242       /* GFX11 seems to operate on the last active CU. */
243       cu_index = util_last_bit(info->cu_mask[se][0]) - 1;
244    } else {
245       /* Default to the first active CU. */
246       cu_index = ffs(info->cu_mask[se][0]);
247    }
248 
249    return cu_index;
250 }
251 
252 bool
ac_sqtt_get_trace(struct ac_sqtt * data,const struct radeon_info * info,struct ac_sqtt_trace * sqtt_trace)253 ac_sqtt_get_trace(struct ac_sqtt *data, const struct radeon_info *info,
254                   struct ac_sqtt_trace *sqtt_trace)
255 {
256    unsigned max_se = info->max_se;
257    void *ptr = data->ptr;
258 
259    memset(sqtt_trace, 0, sizeof(*sqtt_trace));
260 
261    for (unsigned se = 0; se < max_se; se++) {
262       uint64_t info_offset = ac_sqtt_get_info_offset(se);
263       uint64_t data_offset = ac_sqtt_get_data_offset(info, data, se);
264       void *info_ptr = (uint8_t *)ptr + info_offset;
265       void *data_ptr = (uint8_t *)ptr + data_offset;
266       struct ac_sqtt_data_info *trace_info = (struct ac_sqtt_data_info *)info_ptr;
267       struct ac_sqtt_data_se data_se = {0};
268       int active_cu = ac_sqtt_get_active_cu(info, se);
269 
270       if (ac_sqtt_se_is_disabled(info, se))
271          continue;
272 
273       if (!ac_is_sqtt_complete(info, data, trace_info))
274          return false;
275 
276       data_se.data_ptr = data_ptr;
277       data_se.info = *trace_info;
278       data_se.shader_engine = se;
279 
280       /* RGP seems to expect units of WGP on GFX10+. */
281       data_se.compute_unit = info->gfx_level >= GFX10 ? (active_cu / 2) : active_cu;
282 
283       sqtt_trace->traces[sqtt_trace->num_traces] = data_se;
284       sqtt_trace->num_traces++;
285    }
286 
287    sqtt_trace->rgp_code_object = &data->rgp_code_object;
288    sqtt_trace->rgp_loader_events = &data->rgp_loader_events;
289    sqtt_trace->rgp_pso_correlation = &data->rgp_pso_correlation;
290    sqtt_trace->rgp_queue_info = &data->rgp_queue_info;
291    sqtt_trace->rgp_queue_event = &data->rgp_queue_event;
292    sqtt_trace->rgp_clock_calibration = &data->rgp_clock_calibration;
293 
294    return true;
295 }
296 
297 uint32_t
ac_sqtt_get_shader_mask(const struct radeon_info * info)298 ac_sqtt_get_shader_mask(const struct radeon_info *info)
299 {
300    unsigned shader_mask = 0x7f; /* all shader stages */
301 
302    if (info->gfx_level >= GFX11) {
303       /* Disable unsupported hw shader stages */
304       shader_mask &= ~(0x02 /* VS */ | 0x08 /* ES */ | 0x20 /* LS */);
305    }
306 
307    return shader_mask;
308 }
309