1 /*
2 * Copyright © 2020-2021 Collabora, Ltd.
3 * Author: Antonio Caggiano <antonio.caggiano@collabora.com>
4 * Author: Corentin Noël <corentin.noel@collabora.com>
5 *
6 * SPDX-License-Identifier: MIT
7 */
8
9 #include "intel_pps_driver.h"
10
11 #include <dirent.h>
12 #include <fcntl.h>
13 #include <math.h>
14 #include <poll.h>
15 #include <strings.h>
16 #include <sys/ioctl.h>
17 #include <unistd.h>
18
19 #include "drm-uapi/i915_drm.h"
20
21 #include "common/intel_gem.h"
22 #include "dev/intel_device_info.h"
23 #include "perf/intel_perf.h"
24 #include "perf/intel_perf_query.h"
25
26 #include <pps/pps.h>
27 #include <pps/pps_algorithm.h>
28
29 #include "intel_pps_perf.h"
30 #include "intel_pps_priv.h"
31
32 namespace pps
33 {
34
35 // The HW sampling period is programmed using period_exponent following this
36 // formula:
37 // sample_period = timestamp_period * 2^(period_exponent + 1)
38 // So our minimum sampling period is twice the timestamp period
39
get_min_sampling_period_ns()40 uint64_t IntelDriver::get_min_sampling_period_ns()
41 {
42 return (2.f * perf->devinfo.timestamp_frequency) / 1000000000ull;
43 }
44
IntelDriver()45 IntelDriver::IntelDriver()
46 {
47 }
48
~IntelDriver()49 IntelDriver::~IntelDriver()
50 {
51 }
52
enable_counter(uint32_t counter_id)53 void IntelDriver::enable_counter(uint32_t counter_id)
54 {
55 auto &counter = counters[counter_id];
56
57 enabled_counters.emplace_back(counter);
58 }
59
enable_all_counters()60 void IntelDriver::enable_all_counters()
61 {
62 // We should only have one group
63 assert(groups.size() == 1);
64 for (uint32_t counter_id : groups[0].counters) {
65 auto &counter = counters[counter_id];
66 enabled_counters.emplace_back(counter);
67 }
68 }
69
init_perfcnt()70 bool IntelDriver::init_perfcnt()
71 {
72 /* Note: clock_id's below 128 are reserved.. for custom clock sources,
73 * using the hash of a namespaced string is the recommended approach.
74 * See: https://perfetto.dev/docs/concepts/clock-sync
75 */
76 this->clock_id = intel_pps_clock_id(drm_device.gpu_num);
77
78 assert(!perf && "Intel perf should not be initialized at this point");
79
80 perf = std::make_unique<IntelPerf>(drm_device.fd);
81
82 const char *metric_set_name = getenv("INTEL_PERFETTO_METRIC_SET");
83
84 struct intel_perf_query_info *default_query = nullptr;
85 selected_query = nullptr;
86 for (auto &query : perf->get_queries()) {
87 if (!strcmp(query->symbol_name, "RenderBasic"))
88 default_query = query;
89 if (metric_set_name && !strcmp(query->symbol_name, metric_set_name))
90 selected_query = query;
91 }
92
93 assert(default_query);
94
95 if (!selected_query) {
96 if (metric_set_name) {
97 PPS_LOG_ERROR("Available metric sets:");
98 for (auto &query : perf->get_queries())
99 PPS_LOG_ERROR(" %s", query->symbol_name);
100 PPS_LOG_FATAL("Metric set '%s' not available.", metric_set_name);
101 }
102 selected_query = default_query;
103 }
104
105 PPS_LOG("Using metric set '%s': %s",
106 selected_query->symbol_name, selected_query->name);
107
108 // Create group
109 CounterGroup group = {};
110 group.id = groups.size();
111 group.name = selected_query->symbol_name;
112
113 for (int i = 0; i < selected_query->n_counters; ++i) {
114 intel_perf_query_counter &counter = selected_query->counters[i];
115
116 // Create counter
117 Counter counter_desc = {};
118 counter_desc.id = counters.size();
119 counter_desc.name = counter.symbol_name;
120 counter_desc.group = group.id;
121 counter_desc.getter = [counter, this](
122 const Counter &c, const Driver &dri) -> Counter::Value {
123 switch (counter.data_type) {
124 case INTEL_PERF_COUNTER_DATA_TYPE_UINT64:
125 case INTEL_PERF_COUNTER_DATA_TYPE_UINT32:
126 case INTEL_PERF_COUNTER_DATA_TYPE_BOOL32:
127 return (int64_t)counter.oa_counter_read_uint64(perf->cfg,
128 selected_query,
129 &perf->result);
130 break;
131 case INTEL_PERF_COUNTER_DATA_TYPE_DOUBLE:
132 case INTEL_PERF_COUNTER_DATA_TYPE_FLOAT:
133 return counter.oa_counter_read_float(perf->cfg,
134 selected_query,
135 &perf->result);
136 break;
137 }
138
139 return {};
140 };
141
142 // Add counter id to the group
143 group.counters.emplace_back(counter_desc.id);
144
145 // Store counter
146 counters.emplace_back(std::move(counter_desc));
147 }
148
149 // Store group
150 groups.emplace_back(std::move(group));
151
152 assert(counters.size() && "Failed to query counters");
153
154 // Clear accumulations
155 intel_perf_query_result_clear(&perf->result);
156
157 return true;
158 }
159
enable_perfcnt(uint64_t sampling_period_ns)160 void IntelDriver::enable_perfcnt(uint64_t sampling_period_ns)
161 {
162 this->sampling_period_ns = sampling_period_ns;
163
164 intel_gem_read_render_timestamp(drm_device.fd, perf->devinfo.kmd_type,
165 &gpu_timestamp_udw);
166 gpu_timestamp_udw &= ~perf->cfg->oa_timestamp_mask;
167 if (!perf->open(sampling_period_ns, selected_query)) {
168 PPS_LOG_FATAL("Failed to open intel perf");
169 }
170 }
171
disable_perfcnt()172 void IntelDriver::disable_perfcnt()
173 {
174 gpu_timestamp_udw = 0;
175 perf = nullptr;
176 groups.clear();
177 counters.clear();
178 enabled_counters.clear();
179 }
180
/// @brief Some perf record durations can be really short
/// @param duration Time elapsed since the previously accepted record
/// @param sampling_period Requested sampling period, in the same unit as
///        `duration` (nanoseconds at the call site)
/// @return True if the duration is at least close to the sampling period,
///         i.e. it falls short of it by less than 100000
static bool close_enough(uint64_t duration, uint64_t sampling_period)
{
   constexpr uint64_t tolerance = 100000;

   // With a period shorter than the tolerance the unsigned subtraction below
   // would wrap around to a huge threshold and reject every single record;
   // the real threshold is negative, so any duration qualifies
   if (sampling_period < tolerance) {
      return true;
   }

   return duration > sampling_period - tolerance;
}
187
188 /// @brief Transforms the raw data received in from the driver into records
parse_perf_records(const std::vector<uint8_t> & data,const size_t byte_count)189 std::vector<PerfRecord> IntelDriver::parse_perf_records(const std::vector<uint8_t> &data,
190 const size_t byte_count)
191 {
192 std::vector<PerfRecord> records;
193 records.reserve(128);
194
195 PerfRecord record;
196 record.data.reserve(512);
197
198 const uint8_t *iter = data.data();
199 const uint8_t *end = iter + byte_count;
200
201 uint64_t prev_gpu_timestamp = last_gpu_timestamp;
202
203 while (iter < end) {
204 // Iterate a record at a time
205 auto header = reinterpret_cast<const drm_i915_perf_record_header *>(iter);
206
207 if (header->type == DRM_I915_PERF_RECORD_SAMPLE) {
208 // Report is next to the header
209 const uint32_t *report = reinterpret_cast<const uint32_t *>(header + 1);
210 uint64_t gpu_timestamp_ldw =
211 intel_perf_report_timestamp(selected_query, report);
212
213 /* Our HW only provides us with the lower 32 bits of the 36bits
214 * timestamp counter value. If we haven't captured the top bits yet,
215 * do it now. If we see a roll over the lower 32bits capture it
216 * again.
217 */
218 if (gpu_timestamp_udw == 0 ||
219 (gpu_timestamp_udw | gpu_timestamp_ldw) < last_gpu_timestamp) {
220 intel_gem_read_render_timestamp(drm_device.fd,
221 perf->devinfo.kmd_type,
222 &gpu_timestamp_udw);
223 gpu_timestamp_udw &= ~perf->cfg->oa_timestamp_mask;
224 }
225
226 uint64_t gpu_timestamp = gpu_timestamp_udw | gpu_timestamp_ldw;
227
228 auto duration = intel_device_info_timebase_scale(&perf->devinfo,
229 gpu_timestamp - prev_gpu_timestamp);
230
231 // Skip perf-records that are too short by checking
232 // the distance between last report and this one
233 if (close_enough(duration, sampling_period_ns)) {
234 prev_gpu_timestamp = gpu_timestamp;
235
236 // Add the new record to the list
237 record.timestamp = gpu_timestamp;
238 record.data.resize(header->size); // Possibly 264?
239 memcpy(record.data.data(), iter, header->size);
240 records.emplace_back(record);
241 }
242 }
243
244 // Go to the next record
245 iter += header->size;
246 }
247
248 return records;
249 }
250
251 /// @brief Read all the available data from the metric set currently in use
read_data_from_metric_set()252 void IntelDriver::read_data_from_metric_set()
253 {
254 assert(metric_buffer.size() >= 1024 && "Metric buffer should have space for reading");
255
256 ssize_t bytes_read = 0;
257 while ((bytes_read = perf->read_oa_stream(metric_buffer.data() + total_bytes_read,
258 metric_buffer.size() - total_bytes_read)) > 0 ||
259 errno == EINTR) {
260 total_bytes_read += std::max(ssize_t(0), bytes_read);
261
262 // Increase size of the buffer for the next read
263 if (metric_buffer.size() / 2 < total_bytes_read) {
264 metric_buffer.resize(metric_buffer.size() * 2);
265 }
266 }
267
268 assert(total_bytes_read < metric_buffer.size() && "Buffer not big enough");
269 }
270
dump_perfcnt()271 bool IntelDriver::dump_perfcnt()
272 {
273 if (!perf->oa_stream_ready()) {
274 return false;
275 }
276
277 read_data_from_metric_set();
278
279 auto new_records = parse_perf_records(metric_buffer, total_bytes_read);
280 if (new_records.empty()) {
281 // No new records from the GPU yet
282 return false;
283 } else {
284 // Records are parsed correctly, so we can reset the
285 // number of bytes read so far from the metric set
286 total_bytes_read = 0;
287 }
288
289 APPEND(records, new_records);
290
291 if (records.size() < 2) {
292 // Not enough records to accumulate
293 return false;
294 }
295
296 return true;
297 }
298
gpu_next()299 uint64_t IntelDriver::gpu_next()
300 {
301 if (records.size() < 2) {
302 // Not enough records to accumulate
303 return 0;
304 }
305
306 // Get first and second
307 auto record_a = reinterpret_cast<const drm_i915_perf_record_header *>(records[0].data.data());
308 auto record_b = reinterpret_cast<const drm_i915_perf_record_header *>(records[1].data.data());
309
310 intel_perf_query_result_accumulate_fields(&perf->result,
311 selected_query,
312 record_a + 1,
313 record_b + 1,
314 false /* no_oa_accumulate */);
315
316 // Get last timestamp
317 auto gpu_timestamp = records[1].timestamp;
318
319 // Consume first record
320 records.erase(std::begin(records), std::begin(records) + 1);
321
322 return intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
323 }
324
next()325 uint64_t IntelDriver::next()
326 {
327 // Reset accumulation
328 intel_perf_query_result_clear(&perf->result);
329 return gpu_next();
330 }
331
gpu_clock_id() const332 uint32_t IntelDriver::gpu_clock_id() const
333 {
334 return this->clock_id;
335 }
336
gpu_timestamp() const337 uint64_t IntelDriver::gpu_timestamp() const
338 {
339 uint64_t timestamp;
340 intel_gem_read_render_timestamp(drm_device.fd, perf->devinfo.kmd_type,
341 ×tamp);
342 return intel_device_info_timebase_scale(&perf->devinfo, timestamp);
343 }
344
cpu_gpu_timestamp(uint64_t & cpu_timestamp,uint64_t & gpu_timestamp) const345 bool IntelDriver::cpu_gpu_timestamp(uint64_t &cpu_timestamp,
346 uint64_t &gpu_timestamp) const
347 {
348 if (!intel_gem_read_correlate_cpu_gpu_timestamp(drm_device.fd,
349 perf->devinfo.kmd_type,
350 INTEL_ENGINE_CLASS_RENDER, 0,
351 CLOCK_BOOTTIME,
352 &cpu_timestamp,
353 &gpu_timestamp,
354 NULL))
355 return false;
356
357 gpu_timestamp =
358 intel_device_info_timebase_scale(&perf->devinfo, gpu_timestamp);
359 return true;
360 }
361
362 } // namespace pps
363