/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Chris Wilson <chris@chris-wilson.co.uk>
 *
 */
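
/*
 * Dispatch/wakeup latency benchmark: each producer thread repeatedly submits
 * a copy workload, a configurable number of nop batches and finally a batch
 * that stores the engine TIMESTAMP register into a buffer.  The producer and
 * its consumer threads then wait on that batch and compare the stored value
 * against a fresh mmio read of the register, accumulating dispatch and
 * wakeup latencies in GPU timestamp ticks.
 */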

#include <pthread.h>

#include "igt.h"
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/poll.h>
#include <sys/mman.h> /* munmap() and PROT_* for the batch mappings */
#include <sys/resource.h>
#include "drm.h"

#define LOCAL_I915_EXEC_FENCE_IN (1<<16)
#define LOCAL_I915_EXEC_FENCE_OUT (1<<17)

#define CONTEXT 0x1
#define REALTIME 0x2
#define CMDPARSER 0x4
#define FENCE_OUT 0x8

static int done;
static int fd;
static volatile uint32_t *timestamp_reg;

#define REG(x) (volatile uint32_t *)((volatile char *)igt_global_mmio + x)
#define REG_OFFSET(x) ((volatile char *)(x) - (volatile char *)igt_global_mmio)

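/*
 * The timestamp register is read directly over mmio.  The locked variant
 * below is only compiled when gen7_safe_mmio is defined, presumably to
 * serialise register reads on parts where concurrent mmio access is risky;
 * otherwise a plain volatile load is used.
 */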
#if defined(__USE_XOPEN2K) && defined(gen7_safe_mmio)
static pthread_spinlock_t timestamp_lock;

static uint32_t read_timestamp_unlocked(void)
{
	return *timestamp_reg;
}

static uint32_t (*read_timestamp)(void) = read_timestamp_unlocked;

static uint32_t read_timestamp_locked(void)
{
	uint32_t t;

	pthread_spin_lock(&timestamp_lock);
	t = *timestamp_reg;
	pthread_spin_unlock(&timestamp_lock);

	return t;
}

static int setup_timestamp_locked(void)
{
	if (pthread_spin_init(&timestamp_lock, 0))
		return 0;

	read_timestamp = read_timestamp_locked;
	return 1;
}
#else
static int setup_timestamp_locked(void)
{
	return 1;
}

static inline uint32_t read_timestamp(void)
{
	return *timestamp_reg;
}
#endif

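/*
 * Each producer drives a set of consumers.  A consumer does no submission of
 * its own: it waits for the producer's "go" signal and then blocks on the
 * same latency batch, adding extra waiters (and wakeups) to each request.
 */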
struct consumer {
	pthread_t thread;

	int go;

	struct igt_mean latency;
	struct producer *producer;
};

struct producer {
	pthread_t thread;
	uint32_t ctx;
	struct {
		struct drm_i915_gem_exec_object2 exec[1];
		struct drm_i915_gem_execbuffer2 execbuf;
	} nop_dispatch;
	struct {
		struct drm_i915_gem_exec_object2 exec[2];
		struct drm_i915_gem_execbuffer2 execbuf;
	} workload_dispatch;
	struct {
		struct drm_i915_gem_exec_object2 exec[1];
		struct drm_i915_gem_relocation_entry reloc[1];
		struct drm_i915_gem_execbuffer2 execbuf;
	} latency_dispatch;

	pthread_mutex_t lock;
	pthread_cond_t p_cond, c_cond;
	uint32_t *last_timestamp;
	int wait;
	int complete;
	int done;
	struct igt_mean latency, dispatch;

	int nop;
	int nconsumers;
	struct consumer *consumers;
};

#define LOCAL_EXEC_NO_RELOC (1<<11)
#define COPY_BLT_CMD (2<<29|0x53<<22|0x6)
#define BLT_WRITE_ALPHA (1<<21)
#define BLT_WRITE_RGB (1<<20)

#define WIDTH 1024
#define HEIGHT 1024

#define RCS_TIMESTAMP (0x2000 + 0x358)
#define BCS_TIMESTAMP (0x22000 + 0x358)
#define CYCLES_TO_NS(x) (80.*(x))
#define CYCLES_TO_US(x) (CYCLES_TO_NS(x)/1000.)

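/*
 * Build a batch of 'factor' XY_SRC_COPY blits (source and destination are
 * both the shared scratch buffer) so each submission carries a tunable
 * amount of real GPU work.
 */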
static uint32_t create_workload(int gen, int factor)
{
	const int has_64bit_reloc = gen >= 8;
	uint32_t handle = gem_create(fd, 4096);
	uint32_t *map = gem_mmap__cpu(fd, handle, 0, 4096, PROT_WRITE);
	int i = 0;

	while (factor--) {
		/* XY_SRC_COPY */
		map[i++] = COPY_BLT_CMD | BLT_WRITE_ALPHA | BLT_WRITE_RGB;
		if (has_64bit_reloc)
			map[i-1] += 2;
		map[i++] = 0xcc << 16 | 1 << 25 | 1 << 24 | (4*WIDTH);
		map[i++] = 0;
		map[i++] = HEIGHT << 16 | WIDTH;
		map[i++] = 0;
		if (has_64bit_reloc)
			map[i++] = 0;
		map[i++] = 0;
		map[i++] = 4096;
		map[i++] = 0;
		if (has_64bit_reloc)
			map[i++] = 0;
	}
	map[i++] = MI_BATCH_BUFFER_END;
	munmap(map, 4096);

	return handle;
}

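/*
 * Fill in the relocations for the workload batch (both the destination and
 * source of every blit point at the scratch buffer) and prime the execbuf
 * that submits it on the BLT ring.
 */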
static void setup_workload(struct producer *p, int gen,
			   uint32_t scratch,
			   uint32_t batch,
			   int factor,
			   unsigned flags)
{
	struct drm_i915_gem_execbuffer2 *eb;
	const int has_64bit_reloc = gen >= 8;
	struct drm_i915_gem_relocation_entry *reloc;
	int offset;

	reloc = calloc(sizeof(*reloc), 2*factor);

	p->workload_dispatch.exec[0].handle = scratch;
	p->workload_dispatch.exec[1].relocation_count = 2*factor;
	p->workload_dispatch.exec[1].relocs_ptr = (uintptr_t)reloc;
	p->workload_dispatch.exec[1].handle = batch;

	offset = 0;
	while (factor--) {
		reloc->offset = (offset+4) * sizeof(uint32_t);
		reloc->target_handle = scratch;
		reloc->read_domains = I915_GEM_DOMAIN_RENDER;
		reloc->write_domain = I915_GEM_DOMAIN_RENDER;
		reloc++;

		reloc->offset = (offset+7) * sizeof(uint32_t);
		if (has_64bit_reloc)
			reloc->offset += sizeof(uint32_t);
		reloc->target_handle = scratch;
		reloc->read_domains = I915_GEM_DOMAIN_RENDER;
		reloc++;

		offset += 8;
		if (has_64bit_reloc)
			offset += 2;
	}

	eb = memset(&p->workload_dispatch.execbuf, 0, sizeof(*eb));
	eb->buffers_ptr = (uintptr_t)p->workload_dispatch.exec;
	eb->buffer_count = 2;
	if (flags & CMDPARSER)
		eb->batch_len = 4096;
	eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
	eb->rsvd1 = p->ctx;
}

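/*
 * Build the latency probe: a batch that uses MI_STORE_REGISTER_MEM to write
 * the engine TIMESTAMP into the batch buffer itself at byte offset 4000
 * (dword 1000, which last_timestamp points at).  The mapping is kept so the
 * CPU can compare that value against a direct mmio read of the register.
 */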
static void setup_latency(struct producer *p, int gen, unsigned flags)
{
	struct drm_i915_gem_execbuffer2 *eb;
	const int has_64bit_reloc = gen >= 8;
	uint32_t handle;
	uint32_t *map;
	int i = 0;

	handle = gem_create(fd, 4096);
	if (gem_has_llc(fd))
		map = gem_mmap__cpu(fd, handle, 0, 4096, PROT_WRITE);
	else
		map = gem_mmap__gtt(fd, handle, 4096, PROT_WRITE);

	p->latency_dispatch.exec[0].relocation_count = 1;
	p->latency_dispatch.exec[0].relocs_ptr =
		(uintptr_t)p->latency_dispatch.reloc;
	p->latency_dispatch.exec[0].handle = handle;

	/* MI_STORE_REGISTER_MEM */
	map[i++] = 0x24 << 23 | 1;
	if (has_64bit_reloc)
		map[i-1]++;
	map[i++] = REG_OFFSET(timestamp_reg);
	p->latency_dispatch.reloc[0].offset = i * sizeof(uint32_t);
	p->latency_dispatch.reloc[0].delta = 4000;
	p->latency_dispatch.reloc[0].target_handle = handle;
	p->latency_dispatch.reloc[0].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
	p->latency_dispatch.reloc[0].write_domain = 0; /* We lie! */
	p->latency_dispatch.reloc[0].presumed_offset = 0;
	p->last_timestamp = &map[1000];
	map[i++] = 4000;
	if (has_64bit_reloc)
		map[i++] = 0;

	map[i++] = MI_BATCH_BUFFER_END;

	eb = memset(&p->latency_dispatch.execbuf, 0, sizeof(*eb));
	eb->buffers_ptr = (uintptr_t)p->latency_dispatch.exec;
	eb->buffer_count = 1;
	if (flags & CMDPARSER)
		eb->batch_len = sizeof(*map) * ((i + 1) & ~1);
	eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
	if (flags & FENCE_OUT)
		eb->flags |= LOCAL_I915_EXEC_FENCE_OUT;
	eb->rsvd1 = p->ctx;
}

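/*
 * The nop batch is just MI_BATCH_BUFFER_END: submitting it adds execbuf
 * contention and interrupt traffic without adding GPU work.
 */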
static uint32_t create_nop(void)
{
	uint32_t buf = MI_BATCH_BUFFER_END;
	uint32_t handle;

	handle = gem_create(fd, 4096);
	gem_write(fd, handle, 0, &buf, sizeof(buf));

	return handle;
}

static void setup_nop(struct producer *p, uint32_t batch, unsigned flags)
{
	struct drm_i915_gem_execbuffer2 *eb;

	p->nop_dispatch.exec[0].handle = batch;

	eb = memset(&p->nop_dispatch.execbuf, 0, sizeof(*eb));
	eb->buffers_ptr = (uintptr_t)p->nop_dispatch.exec;
	eb->buffer_count = 1;
	if (flags & CMDPARSER)
		eb->batch_len = 8;
	eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
	eb->rsvd1 = p->ctx;
}

static void fence_wait(int fence)
{
	struct pollfd pfd = { .fd = fence, .events = POLLIN };
	poll(&pfd, 1, -1);
}

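/*
 * Wait for the latency batch to complete, either with gem_sync on its handle
 * or by polling the output fence, then record how far the TIMESTAMP register
 * has advanced past the value the batch stored (in timestamp ticks).
 */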
static void measure_latency(struct producer *p, struct igt_mean *mean)
{
	if (!(p->latency_dispatch.execbuf.flags & LOCAL_I915_EXEC_FENCE_OUT))
		gem_sync(fd, p->latency_dispatch.exec[0].handle);
	else
		fence_wait(p->latency_dispatch.execbuf.rsvd2 >> 32);
	igt_mean_add(mean, read_timestamp() - *p->last_timestamp);
}

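/*
 * A producer round: submit the copy workload, a run of nop batches and the
 * latency probe; wake every consumer to wait on that probe; record dispatch
 * and completion latency; then wait for all consumers before going again.
 */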
static void *producer(void *arg)
{
	struct producer *p = arg;
	int n;

	while (!done) {
		uint32_t start = read_timestamp();
		int batches;

		/* Control the amount of work we do, similar to submitting
		 * empty buffers below, except this time we will load the
		 * GPU with a small amount of real work - so there is a small
		 * period between execution and interrupts.
		 */
		gem_execbuf(fd, &p->workload_dispatch.execbuf);

		/* Submitting a set of empty batches has a twofold effect:
		 * - increases contention on execbuffer, i.e. measure dispatch
		 *   latency with number of clients.
		 * - generates lots of spurious interrupts (if someone is
		 *   waiting).
		 */
		batches = p->nop;
		while (batches--)
			gem_execbuf(fd, &p->nop_dispatch.execbuf);

		/* Finally, execute a batch that just reads the current
		 * TIMESTAMP so we can measure the latency.
		 */
		if (p->latency_dispatch.execbuf.flags & LOCAL_I915_EXEC_FENCE_OUT)
			gem_execbuf_wr(fd, &p->latency_dispatch.execbuf);
		else
			gem_execbuf(fd, &p->latency_dispatch.execbuf);

		/* Wake all the associated clients to wait upon our batch */
		p->wait = p->nconsumers;
		for (n = 0; n < p->nconsumers; n++)
			p->consumers[n].go = 1;
		pthread_cond_broadcast(&p->c_cond);

		/* Wait for this batch to finish and record how long we waited,
		 * and how long it took for the batch to be submitted
		 * (including the nop delays).
		 */
		measure_latency(p, &p->latency);
		igt_mean_add(&p->dispatch, *p->last_timestamp - start);

		/* Tidy up all the extra threads before we submit again. */
		pthread_mutex_lock(&p->lock);
		while (p->wait)
			pthread_cond_wait(&p->p_cond, &p->lock);
		pthread_mutex_unlock(&p->lock);

		p->complete++;

		if (p->latency_dispatch.execbuf.flags & LOCAL_I915_EXEC_FENCE_OUT)
			close(p->latency_dispatch.execbuf.rsvd2 >> 32);
	}

	pthread_mutex_lock(&p->lock);
	p->wait = p->nconsumers;
	p->done = true;
	for (n = 0; n < p->nconsumers; n++)
		p->consumers[n].go = 1;
	pthread_cond_broadcast(&p->c_cond);
	pthread_mutex_unlock(&p->lock);

	return NULL;
}

static void *consumer(void *arg)
{
	struct consumer *c = arg;
	struct producer *p = c->producer;

	/* Sit around waiting for the "go" signal from the producer, then
	 * wait upon the batch to finish. This is to add extra waiters to
	 * the same request - increasing wakeup contention.
	 */
	do {
		pthread_mutex_lock(&p->lock);
		if (--p->wait == 0)
			pthread_cond_signal(&p->p_cond);
		while (!c->go)
			pthread_cond_wait(&p->c_cond, &p->lock);
		c->go = 0;
		pthread_mutex_unlock(&p->lock);
		if (p->done)
			return NULL;

		measure_latency(p, &c->latency);
	} while (1);
}

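/*
 * Robust location estimate: trimean for larger samples, median for small
 * ones, falling back to the mean when very few values were collected.
 */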
static double l_estimate(igt_stats_t *stats)
{
	if (stats->n_values > 9)
		return igt_stats_get_trimean(stats);
	else if (stats->n_values > 5)
		return igt_stats_get_median(stats);
	else
		return igt_stats_get_mean(stats);
}

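/* Total user + system CPU time consumed by the process, in microseconds. */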
static double cpu_time(const struct rusage *r)
{
	return 1e6*(r->ru_utime.tv_sec + r->ru_stime.tv_sec) +
		(r->ru_utime.tv_usec + r->ru_stime.tv_usec);
}

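/*
 * Set up the mmio timestamp source, spawn the producer/consumer threads, let
 * them run for 'seconds', then gather the per-thread means and print either
 * a summary line or the single field selected via -f.
 */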
static int run(int seconds,
	       int nproducers,
	       int nconsumers,
	       int nop,
	       int workload,
	       unsigned flags)
{
	pthread_attr_t attr;
	struct producer *p;
	igt_stats_t platency, latency, dispatch;
	struct rusage rused;
	uint32_t nop_batch;
	uint32_t workload_batch;
	uint32_t scratch;
	int gen, n, m;
	int complete;
	int nrun;

#if 0
	printf("producers=%d, consumers=%d, nop=%d, workload=%d, flags=%x\n",
	       nproducers, nconsumers, nop, workload, flags);
#endif

	fd = drm_open_driver(DRIVER_INTEL);
	gen = intel_gen(intel_get_drm_devid(fd));
	if (gen < 6)
		return IGT_EXIT_SKIP; /* Needs BCS timestamp */

	intel_register_access_init(intel_get_pci_device(), false, fd);

	if (gen == 6)
		timestamp_reg = REG(RCS_TIMESTAMP);
	else
		timestamp_reg = REG(BCS_TIMESTAMP);

	if (gen < 8 && !setup_timestamp_locked())
		return IGT_EXIT_SKIP;

	nrun = read_timestamp();
	usleep(1);
	if (read_timestamp() == nrun)
		return IGT_EXIT_SKIP;

	scratch = gem_create(fd, 4*WIDTH*HEIGHT);
	nop_batch = create_nop();
	workload_batch = create_workload(gen, workload);

	p = calloc(nproducers, sizeof(*p));
	for (n = 0; n < nproducers; n++) {
		if (flags & CONTEXT)
			p[n].ctx = gem_context_create(fd);

		setup_nop(&p[n], nop_batch, flags);
		setup_workload(&p[n], gen, scratch, workload_batch, workload, flags);
		setup_latency(&p[n], gen, flags);

		pthread_mutex_init(&p[n].lock, NULL);
		pthread_cond_init(&p[n].p_cond, NULL);
		pthread_cond_init(&p[n].c_cond, NULL);

		igt_mean_init(&p[n].latency);
		igt_mean_init(&p[n].dispatch);
		p[n].wait = nconsumers;
		p[n].nop = nop;
		p[n].nconsumers = nconsumers;
		p[n].consumers = calloc(nconsumers, sizeof(struct consumer));
		for (m = 0; m < nconsumers; m++) {
			p[n].consumers[m].producer = &p[n];
			igt_mean_init(&p[n].consumers[m].latency);
			pthread_create(&p[n].consumers[m].thread, NULL,
				       consumer, &p[n].consumers[m]);
		}
		pthread_mutex_lock(&p[n].lock);
		while (p[n].wait)
			pthread_cond_wait(&p[n].p_cond, &p[n].lock);
		pthread_mutex_unlock(&p[n].lock);
	}

	pthread_attr_init(&attr);
	if (flags & REALTIME) {
#ifdef PTHREAD_EXPLICIT_SCHED
		struct sched_param param = { .sched_priority = 99 };
		pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
		pthread_attr_setschedpolicy(&attr, SCHED_FIFO);
		pthread_attr_setschedparam(&attr, &param);
#else
		return IGT_EXIT_SKIP;
#endif
	}
	for (n = 0; n < nproducers; n++)
		pthread_create(&p[n].thread, &attr, producer, &p[n]);

	sleep(seconds);
	done = true;

	nrun = complete = 0;
	igt_stats_init_with_size(&dispatch, nproducers);
	igt_stats_init_with_size(&platency, nproducers);
	igt_stats_init_with_size(&latency, nconsumers*nproducers);
	for (n = 0; n < nproducers; n++) {
		pthread_join(p[n].thread, NULL);

		if (!p[n].complete)
			continue;

		nrun++;
		complete += p[n].complete;
		igt_stats_push_float(&latency, p[n].latency.mean);
		igt_stats_push_float(&platency, p[n].latency.mean);
		igt_stats_push_float(&dispatch, p[n].dispatch.mean);

		for (m = 0; m < nconsumers; m++) {
			pthread_join(p[n].consumers[m].thread, NULL);
			igt_stats_push_float(&latency,
					     p[n].consumers[m].latency.mean);
		}
	}

	getrusage(RUSAGE_SELF, &rused);

	switch ((flags >> 8) & 0xf) {
	default:
		printf("%d/%d: %7.3fus %7.3fus %7.3fus %7.3fus\n",
		       complete, nrun,
		       CYCLES_TO_US(l_estimate(&dispatch)),
		       CYCLES_TO_US(l_estimate(&latency)),
		       CYCLES_TO_US(l_estimate(&platency)),
		       cpu_time(&rused) / complete);
		break;
	case 1:
		printf("%f\n", CYCLES_TO_US(l_estimate(&dispatch)));
		break;
	case 2:
		printf("%f\n", CYCLES_TO_US(l_estimate(&latency)));
		break;
	case 3:
		printf("%f\n", CYCLES_TO_US(l_estimate(&platency)));
		break;
	case 4:
		printf("%f\n", cpu_time(&rused) / complete);
		break;
	case 5:
		printf("%d\n", complete);
		break;
	}

	return 0;
}

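/*
 * Options:
 *   -p N  number of producer threads
 *   -c N  number of consumer threads per producer
 *   -n N  number of extra nop batches submitted each round
 *   -w N  number of copy blits in the workload batch (clamped to 0..100)
 *   -t N  run time in seconds
 *   -f N  print only the selected output field
 *   -s    give each producer its own context
 *   -R    run producers at realtime (SCHED_FIFO) priority
 *   -C    don't hide batches from the command parser (gen7)
 *   -F    use an output fence instead of gem_sync for waits
 */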
int main(int argc, char **argv)
{
	int time = 10;
	int producers = 1;
	int consumers = 0;
	int nop = 0;
	int workload = 0;
	unsigned flags = 0;
	int c;

	while ((c = getopt(argc, argv, "Cp:c:n:w:t:f:sRF")) != -1) {
		switch (c) {
		case 'p':
			/* How many threads generate work? */
			producers = atoi(optarg);
			if (producers < 1)
				producers = 1;
			break;

		case 'c':
			/* How many threads wait upon each piece of work? */
			consumers = atoi(optarg);
			if (consumers < 0)
				consumers = 0;
			break;

		case 'n':
			/* Extra dispatch contention + interrupts */
			nop = atoi(optarg);
			if (nop < 0)
				nop = 0;
			break;

		case 'w':
			/* Control the amount of real work done */
			workload = atoi(optarg);
			if (workload < 0)
				workload = 0;
			if (workload > 100)
				workload = 100;
			break;

		case 't':
			/* How long to run the benchmark for (seconds) */
			time = atoi(optarg);
			if (time < 0)
				time = INT_MAX;
			break;

		case 'f':
			/* Select an output field */
			flags |= atoi(optarg) << 8;
			break;

		case 's':
			/* Assign each producer to its own context, adding
			 * context switching into the mix (e.g. execlists
			 * can amalgamate requests from one context, so
			 * having each producer submit in different contexts
			 * should force more execlist interrupts).
			 */
			flags |= CONTEXT;
			break;

		case 'R':
			/* Run the producers at RealTime priority */
			flags |= REALTIME;
			break;

		case 'C':
			/* Don't hide from the command parser (gen7) */
			flags |= CMDPARSER;
			break;

		case 'F':
			flags |= FENCE_OUT;
			break;

		default:
			break;
		}
	}

	return run(time, producers, consumers, nop, workload, flags);
}