/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "igt.h"
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <ftw.h>
#include <inttypes.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <errno.h>
#include <assert.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <time.h>
#include <limits.h>
#include "drm.h"

#include <linux/unistd.h>

#define sigev_notify_thread_id _sigev_un._tid

static volatile int done;

struct gem_busyspin {
	pthread_t thread;
	unsigned long sz;
	unsigned long count;
	bool leak;
	bool interrupts;
};

struct sys_wait {
	pthread_t thread;
	struct igt_mean mean;
};

static void force_low_latency(void)
{
	int32_t target = 0;
	int fd = open("/dev/cpu_dma_latency", O_RDWR);

	if (fd < 0 || write(fd, &target, sizeof(target)) < 0)
		fprintf(stderr,
			"Unable to prevent CPU sleeps and force low latency using /dev/cpu_dma_latency: %s\n",
			strerror(errno));
}

#define LOCAL_I915_EXEC_NO_RELOC (1<<11)
#define LOCAL_I915_EXEC_HANDLE_LUT (1<<12)

#define LOCAL_I915_EXEC_BSD_SHIFT (13)
#define LOCAL_I915_EXEC_BSD_MASK (3 << LOCAL_I915_EXEC_BSD_SHIFT)

#define ENGINE_FLAGS (I915_EXEC_RING_MASK | LOCAL_I915_EXEC_BSD_MASK)

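/*
 * Skip the default engine selector (0) and, on devices with a second
 * video decode ring, the plain I915_EXEC_BSD alias that the kernel would
 * round-robin for us, so each physical engine appears in the engine list
 * exactly once.
 */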
static bool ignore_engine(int fd, unsigned engine)
{
	if (engine == 0)
		return true;

	if (gem_has_bsd2(fd) && engine == I915_EXEC_BSD)
		return true;

	return false;
}

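/*
 * Load generator, one thread per CPU: submit trivial batches (a bare
 * MI_BATCH_BUFFER_END, or bs->sz bytes of nops after -r calibration)
 * across all engines in random order, counting submissions. With
 * bs->leak, each batch is marked DONTNEED and replaced to add shrinker
 * pressure; with bs->interrupts (the "-i" option), a write-tracked
 * scratch object rides along in each execbuf, apparently to generate
 * extra interrupt traffic.
 */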
static void *gem_busyspin(void *arg)
{
	const uint32_t bbe = MI_BATCH_BUFFER_END;
	struct gem_busyspin *bs = arg;
	struct drm_i915_gem_execbuffer2 execbuf;
	struct drm_i915_gem_exec_object2 obj[2];
	const unsigned sz =
		bs->sz ? bs->sz + sizeof(bbe) : bs->leak ? 16 << 20 : 4 << 10;
	unsigned engines[16];
	unsigned nengine;
	unsigned engine;
	int fd;

	fd = drm_open_driver(DRIVER_INTEL);

	nengine = 0;
	for_each_engine(fd, engine)
		if (!ignore_engine(fd, engine))
			engines[nengine++] = engine;

	memset(obj, 0, sizeof(obj));
	obj[0].handle = gem_create(fd, 4096);
	obj[0].flags = EXEC_OBJECT_WRITE;
	obj[1].handle = gem_create(fd, sz);
	gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe));

	memset(&execbuf, 0, sizeof(execbuf));
	if (bs->interrupts) {
		execbuf.buffers_ptr = (uintptr_t)&obj[0];
		execbuf.buffer_count = 2;
	} else {
		execbuf.buffers_ptr = (uintptr_t)&obj[1];
		execbuf.buffer_count = 1;
	}
	execbuf.flags |= LOCAL_I915_EXEC_HANDLE_LUT;
	execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC;
	if (__gem_execbuf(fd, &execbuf)) {
		/* Fall back for kernels without HANDLE_LUT/NO_RELOC. */
		execbuf.flags = 0;
		gem_execbuf(fd, &execbuf);
	}

	while (!done) {
		/* Shuffle the engine order each pass. */
		for (int n = 0; n < nengine; n++) {
			const int m = rand() % nengine;
			unsigned int tmp = engines[n];
			engines[n] = engines[m];
			engines[m] = tmp;
		}
		for (int n = 0; n < nengine; n++) {
			execbuf.flags &= ~ENGINE_FLAGS;
			execbuf.flags |= engines[n];
			gem_execbuf(fd, &execbuf);
		}
		bs->count += nengine;
		if (bs->leak) {
			gem_madvise(fd, obj[1].handle, I915_MADV_DONTNEED);
			obj[1].handle = gem_create(fd, sz);
			gem_write(fd, obj[1].handle, bs->sz, &bbe, sizeof(bbe));
		}
	}

	close(fd);
	return NULL;
}

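/* Interval between two CLOCK_MONOTONIC samples, in nanoseconds. */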
static double elapsed(const struct timespec *a, const struct timespec *b)
{
	return 1e9 * (b->tv_sec - a->tv_sec) + (b->tv_nsec - a->tv_nsec);
}

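/*
 * Latency probe, one thread per CPU: arm an absolute CLOCK_MONOTONIC
 * timer for ~100us (plus up to 1ms of random jitter) ahead, block in
 * sigwait() for its signal, then record how far past the requested
 * expiry the wakeup actually happened.
 */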
static void *sys_wait(void *arg)
{
	struct sys_wait *w = arg;
	struct sigevent sev;
	timer_t timer;
	sigset_t mask;
	struct timespec now;
#define SIG SIGRTMIN

	sigemptyset(&mask);
	sigaddset(&mask, SIG);
	sigprocmask(SIG_SETMASK, &mask, NULL);

	sev.sigev_notify = SIGEV_SIGNAL | SIGEV_THREAD_ID;
	sev.sigev_notify_thread_id = gettid();
	sev.sigev_signo = SIG;
	timer_create(CLOCK_MONOTONIC, &sev, &timer);

	clock_gettime(CLOCK_MONOTONIC, &now);
	while (!done) {
		struct itimerspec its;
		int sigs;

		its.it_value = now;
		its.it_value.tv_nsec += 100 * 1000;
		its.it_value.tv_nsec += rand() % (NSEC_PER_SEC / 1000);
		if (its.it_value.tv_nsec >= NSEC_PER_SEC) {
			its.it_value.tv_nsec -= NSEC_PER_SEC;
			its.it_value.tv_sec += 1;
		}
		its.it_interval.tv_sec = its.it_interval.tv_nsec = 0;
		timer_settime(timer, TIMER_ABSTIME, &its, NULL);

		sigwait(&mask, &sigs);
		clock_gettime(CLOCK_MONOTONIC, &now);
		igt_mean_add(&w->mean, elapsed(&its.it_value, &now));
	}

	sigprocmask(SIG_UNBLOCK, &mask, NULL);
	timer_delete(timer);

	return NULL;
}

#define PAGE_SIZE 4096
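/*
 * Alternative system load (-m): repeatedly map, fault in and release a
 * 2MiB anonymous region hinted with MADV_HUGEPAGE, timing each cycle to
 * expose stalls caused by transparent huge page allocation.
 */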
static void *sys_thp_alloc(void *arg)
{
	struct sys_wait *w = arg;
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	while (!done) {
		const size_t sz = 2 << 20;
		const struct timespec start = now;
		void *ptr;

		ptr = mmap(NULL, sz,
			   PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
			   -1, 0);
		assert(ptr != MAP_FAILED);
		madvise(ptr, sz, MADV_HUGEPAGE);
		for (size_t page = 0; page < sz; page += PAGE_SIZE)
			*(volatile uint32_t *)((unsigned char *)ptr + page) = 0;
		munmap(ptr, sz);

		clock_gettime(CLOCK_MONOTONIC, &now);
		igt_mean_add(&w->mean, elapsed(&start, &now));
	}

	return NULL;
}

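/* Pin the next thread created with this attr to a single CPU. */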
static void bind_cpu(pthread_attr_t *attr, int cpu)
{
#ifdef __USE_GNU
	cpu_set_t mask;

	if (cpu == -1)
		return;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);

	pthread_attr_setaffinity_np(attr, sizeof(mask), &mask);
#endif
}

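/* Run the latency probes with realtime (SCHED_FIFO) priority. */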
static void rtprio(pthread_attr_t *attr, int prio)
{
#ifdef PTHREAD_EXPLICIT_SCHED
	struct sched_param param = { .sched_priority = prio };

	pthread_attr_setinheritsched(attr, PTHREAD_EXPLICIT_SCHED);
	pthread_attr_setschedpolicy(attr, SCHED_FIFO);
	pthread_attr_setschedparam(attr, &param);
#endif
}

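/*
 * Robust location estimate: use the trimean given enough samples, else
 * the median, else the plain mean, to damp the influence of outliers.
 */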
static double l_estimate(igt_stats_t *stats)
{
	if (stats->n_values > 9)
		return igt_stats_get_trimean(stats);
	else if (stats->n_values > 5)
		return igt_stats_get_median(stats);
	else
		return igt_stats_get_mean(stats);
}

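/*
 * Measure the cost of clock_gettime() itself; this baseline is later
 * subtracted from the reported latencies.
 */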
static double min_measurement_error(void)
{
	struct timespec start, end;
	int n;

	clock_gettime(CLOCK_MONOTONIC, &start);
	for (n = 0; n < 1024; n++)
		clock_gettime(CLOCK_MONOTONIC, &end);

	return elapsed(&start, &end) / n;
}

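/*
 * Background filesystem noise (-b): walk the tree from the given root,
 * mapping every readable file with MAP_POPULATE to churn the page cache
 * while the benchmark runs.
 */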
static int print_entry(const char *filepath, const struct stat *info,
		       const int typeflag, struct FTW *pathinfo)
{
	int fd;

	fd = open(filepath, O_RDONLY);
	if (fd != -1) {
		void *ptr;

		ptr = mmap(NULL, info->st_size,
			   PROT_READ, MAP_SHARED | MAP_POPULATE,
			   fd, 0);
		if (ptr != MAP_FAILED)
			munmap(ptr, info->st_size);

		close(fd);
	}

	return 0;
}

static void *background_fs(void *path)
{
	while (1)
		nftw(path, print_entry, 20, FTW_PHYS | FTW_MOUNT);
	return NULL;
}

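/*
 * Find the nop batch size that takes roughly target_us microseconds to
 * execute: time a burst of batches at the current size, rescale the size
 * linearly from the measured rate, and iterate until successive
 * estimates agree within tolerance_pct.
 */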
static unsigned long calibrate_nop(unsigned int target_us,
				   unsigned int tolerance_pct)
{
	const uint32_t bbe = MI_BATCH_BUFFER_END;
	const unsigned int loops = 100;
	struct drm_i915_gem_exec_object2 obj = {};
	struct drm_i915_gem_execbuffer2 eb =
		{ .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj };
	struct timespec t_0, t_end;
	long sz, prev;
	int fd;

	fd = drm_open_driver(DRIVER_INTEL);

	clock_gettime(CLOCK_MONOTONIC, &t_0);

	sz = 256 * 1024;
	do {
		struct timespec t_start;

		obj.handle = gem_create(fd, sz + sizeof(bbe));
		gem_write(fd, obj.handle, sz, &bbe, sizeof(bbe));
		gem_execbuf(fd, &eb);
		gem_sync(fd, obj.handle);

		clock_gettime(CLOCK_MONOTONIC, &t_start);
		for (int loop = 0; loop < loops; loop++)
			gem_execbuf(fd, &eb);
		gem_sync(fd, obj.handle);
		clock_gettime(CLOCK_MONOTONIC, &t_end);

		gem_close(fd, obj.handle);

		prev = sz;
		/* elapsed() is in ns: scale the size to hit target_us. */
		sz = loops * sz / elapsed(&t_start, &t_end) * 1e3 * target_us;
		sz = ALIGN(sz, sizeof(uint32_t));
	} while (elapsed(&t_0, &t_end) < 5e9 || /* calibrate for >= 5s */
		 labs(sz - prev) > (sz * tolerance_pct / 100));

	close(fd);

	return sz;
}

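/*
 * Per CPU, pair one GPU load thread with one realtime latency probe, run
 * for the requested time, then report how many batches were submitted
 * together with the mean and worst-case wakeup latency (corrected for
 * the cost of the measurement itself).
 */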
int main(int argc, char **argv)
{
	struct gem_busyspin *busy;
	struct sys_wait *wait;
	void *(*sys_fn)(void *) = sys_wait;
	pthread_attr_t attr;
	pthread_t bg_fs = 0;
	int ncpus = sysconf(_SC_NPROCESSORS_ONLN);
	igt_stats_t cycles, mean, max;
	double min;
	int time = 10;
	int field = -1;
	int enable_gem_sysbusy = 1;
	bool leak = false;
	bool interrupts = false;
	long batch = 0;
	int n, c;

	while ((c = getopt(argc, argv, "r:t:f:bmni1")) != -1) {
		switch (c) {
		case '1':
			ncpus = 1;
			break;
		case 'n': /* dry run, measure baseline system latency */
			enable_gem_sysbusy = 0;
			break;
		case 'i': /* interrupts ahoy! */
			interrupts = true;
			break;
		case 't':
			/* How long to run the benchmark for (seconds) */
			time = atoi(optarg);
			if (time < 0)
				time = INT_MAX;
			break;
		case 'r':
			/* Duration of each batch (microseconds) */
			batch = atoi(optarg);
			break;
		case 'f':
			/* Select an output field */
			field = atoi(optarg);
			break;
		case 'b':
			pthread_create(&bg_fs, NULL,
				       background_fs, (void *)"/");
			sleep(5); /* give the walk time to warm up */
			break;
		case 'm':
			sys_fn = sys_thp_alloc;
			leak = true;
			break;
		default:
			break;
		}
	}

	/* Prevent CPU sleeps so that busy and idle loads are consistent. */
	force_low_latency();
	min = min_measurement_error();

	if (batch > 0)
		batch = calibrate_nop(batch, 2);
	else
		batch = -batch;

	busy = calloc(ncpus, sizeof(*busy));
	pthread_attr_init(&attr);
	if (enable_gem_sysbusy) {
		for (n = 0; n < ncpus; n++) {
			bind_cpu(&attr, n);
			busy[n].sz = batch;
			busy[n].leak = leak;
			busy[n].interrupts = interrupts;
			pthread_create(&busy[n].thread, &attr,
				       gem_busyspin, &busy[n]);
		}
	}

	wait = calloc(ncpus, sizeof(*wait));
	pthread_attr_init(&attr);
	rtprio(&attr, 99);
	for (n = 0; n < ncpus; n++) {
		igt_mean_init(&wait[n].mean);
		bind_cpu(&attr, n);
		pthread_create(&wait[n].thread, &attr, sys_fn, &wait[n]);
	}

	sleep(time);
	done = 1;

	igt_stats_init_with_size(&cycles, ncpus);
	if (enable_gem_sysbusy) {
		for (n = 0; n < ncpus; n++) {
			pthread_join(busy[n].thread, NULL);
			igt_stats_push(&cycles, busy[n].count);
		}
	}

	igt_stats_init_with_size(&mean, ncpus);
	igt_stats_init_with_size(&max, ncpus);
	for (n = 0; n < ncpus; n++) {
		pthread_join(wait[n].thread, NULL);
		igt_stats_push_float(&mean, wait[n].mean.mean);
		igt_stats_push_float(&max, wait[n].mean.max);
	}
	if (bg_fs) {
		pthread_cancel(bg_fs);
		pthread_join(bg_fs, NULL);
	}

	switch (field) {
	default:
		printf("gem_syslatency: cycles=%.0f, latency mean=%.3fus max=%.0fus\n",
		       igt_stats_get_mean(&cycles),
		       (igt_stats_get_mean(&mean) - min) / 1000,
		       (l_estimate(&max) - min) / 1000);
		break;
	case 0:
		printf("%.0f\n", igt_stats_get_mean(&cycles));
		break;
	case 1:
		printf("%.3f\n", (igt_stats_get_mean(&mean) - min) / 1000);
		break;
	case 2:
		printf("%.0f\n", (l_estimate(&max) - min) / 1000);
		break;
	}

	return 0;
}