1 /*
2 * Copyright © 2016 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 */
24
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28
29 #include "gem/i915_gem_internal.h"
30 #include "gem/i915_gem_pm.h"
31 #include "gem/selftests/mock_context.h"
32
33 #include "gt/intel_engine_heartbeat.h"
34 #include "gt/intel_engine_pm.h"
35 #include "gt/intel_engine_user.h"
36 #include "gt/intel_gt.h"
37 #include "gt/intel_gt_clock_utils.h"
38 #include "gt/intel_gt_requests.h"
39 #include "gt/selftest_engine_heartbeat.h"
40
41 #include "i915_random.h"
42 #include "i915_selftest.h"
43 #include "igt_flush_test.h"
44 #include "igt_live_test.h"
45 #include "igt_spinner.h"
46 #include "lib_sw_fence.h"
47
48 #include "mock_drm.h"
49 #include "mock_gem_device.h"
50
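/* Count the engines exposed to userspace (uabi) on this device. */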
static unsigned int num_uabi_engines(struct drm_i915_private *i915)
52 {
53 struct intel_engine_cs *engine;
54 unsigned int count;
55
56 count = 0;
57 for_each_uabi_engine(engine, i915)
58 count++;
59
60 return count;
61 }
62
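/* Find the first render engine exposed to userspace. */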
static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
64 {
65 return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
66 }
67
static int igt_add_request(void *arg)
69 {
70 struct drm_i915_private *i915 = arg;
71 struct i915_request *request;
72
73 /* Basic preliminary test to create a request and let it loose! */
74
75 request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
76 if (!request)
77 return -ENOMEM;
78
79 i915_request_add(request);
80
81 return 0;
82 }
83
static int igt_wait_request(void *arg)
85 {
86 const long T = HZ / 4;
87 struct drm_i915_private *i915 = arg;
88 struct i915_request *request;
89 int err = -EINVAL;
90
91 /* Submit a request, then wait upon it */
92
93 request = mock_request(rcs0(i915)->kernel_context, T);
94 if (!request)
95 return -ENOMEM;
96
97 i915_request_get(request);
98
99 if (i915_request_wait(request, 0, 0) != -ETIME) {
100 pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
101 goto out_request;
102 }
103
104 if (i915_request_wait(request, 0, T) != -ETIME) {
105 pr_err("request wait succeeded (expected timeout before submit!)\n");
106 goto out_request;
107 }
108
109 if (i915_request_completed(request)) {
110 pr_err("request completed before submit!!\n");
111 goto out_request;
112 }
113
114 i915_request_add(request);
115
116 if (i915_request_wait(request, 0, 0) != -ETIME) {
117 pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
118 goto out_request;
119 }
120
121 if (i915_request_completed(request)) {
122 pr_err("request completed immediately!\n");
123 goto out_request;
124 }
125
126 if (i915_request_wait(request, 0, T / 2) != -ETIME) {
127 pr_err("request wait succeeded (expected timeout!)\n");
128 goto out_request;
129 }
130
131 if (i915_request_wait(request, 0, T) == -ETIME) {
132 pr_err("request wait timed out!\n");
133 goto out_request;
134 }
135
136 if (!i915_request_completed(request)) {
137 pr_err("request not complete after waiting!\n");
138 goto out_request;
139 }
140
141 if (i915_request_wait(request, 0, T) == -ETIME) {
142 pr_err("request wait timed out when already complete!\n");
143 goto out_request;
144 }
145
146 err = 0;
147 out_request:
148 i915_request_put(request);
149 mock_device_flush(i915);
150 return err;
151 }
152
static int igt_fence_wait(void *arg)
154 {
155 const long T = HZ / 4;
156 struct drm_i915_private *i915 = arg;
157 struct i915_request *request;
158 int err = -EINVAL;
159
160 /* Submit a request, treat it as a fence and wait upon it */
161
162 request = mock_request(rcs0(i915)->kernel_context, T);
163 if (!request)
164 return -ENOMEM;
165
166 if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
167 pr_err("fence wait success before submit (expected timeout)!\n");
168 goto out;
169 }
170
171 i915_request_add(request);
172
173 if (dma_fence_is_signaled(&request->fence)) {
174 pr_err("fence signaled immediately!\n");
175 goto out;
176 }
177
178 if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
179 pr_err("fence wait success after submit (expected timeout)!\n");
180 goto out;
181 }
182
183 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
184 pr_err("fence wait timed out (expected success)!\n");
185 goto out;
186 }
187
188 if (!dma_fence_is_signaled(&request->fence)) {
189 pr_err("fence unsignaled after waiting!\n");
190 goto out;
191 }
192
193 if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
194 pr_err("fence wait timed out when complete (expected success)!\n");
195 goto out;
196 }
197
198 err = 0;
199 out:
200 mock_device_flush(i915);
201 return err;
202 }
203
static int igt_request_rewind(void *arg)
205 {
206 struct drm_i915_private *i915 = arg;
207 struct i915_request *request, *vip;
208 struct i915_gem_context *ctx[2];
209 struct intel_context *ce;
210 int err = -EINVAL;
211
212 ctx[0] = mock_context(i915, "A");
213 if (!ctx[0]) {
214 err = -ENOMEM;
215 goto err_ctx_0;
216 }
217
218 ce = i915_gem_context_get_engine(ctx[0], RCS0);
219 GEM_BUG_ON(IS_ERR(ce));
220 request = mock_request(ce, 2 * HZ);
221 intel_context_put(ce);
222 if (!request) {
223 err = -ENOMEM;
224 goto err_context_0;
225 }
226
227 i915_request_get(request);
228 i915_request_add(request);
229
230 ctx[1] = mock_context(i915, "B");
231 if (!ctx[1]) {
232 err = -ENOMEM;
233 goto err_ctx_1;
234 }
235
236 ce = i915_gem_context_get_engine(ctx[1], RCS0);
237 GEM_BUG_ON(IS_ERR(ce));
238 vip = mock_request(ce, 0);
239 intel_context_put(ce);
240 if (!vip) {
241 err = -ENOMEM;
242 goto err_context_1;
243 }
244
245 /* Simulate preemption by manual reordering */
246 if (!mock_cancel_request(request)) {
247 pr_err("failed to cancel request (already executed)!\n");
248 i915_request_add(vip);
249 goto err_context_1;
250 }
251 i915_request_get(vip);
252 i915_request_add(vip);
253 rcu_read_lock();
254 request->engine->submit_request(request);
	rcu_read_unlock();

	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
259 pr_err("timed out waiting for high priority request\n");
260 goto err;
261 }
262
263 if (i915_request_completed(request)) {
264 pr_err("low priority request already completed\n");
265 goto err;
266 }
267
268 err = 0;
269 err:
270 i915_request_put(vip);
271 err_context_1:
272 mock_context_close(ctx[1]);
273 err_ctx_1:
274 i915_request_put(request);
275 err_context_0:
276 mock_context_close(ctx[0]);
277 err_ctx_0:
278 mock_device_flush(i915);
279 return err;
280 }
281
282 struct smoketest {
283 struct intel_engine_cs *engine;
284 struct i915_gem_context **contexts;
285 atomic_long_t num_waits, num_fences;
286 int ncontexts, max_batch;
287 struct i915_request *(*request_alloc)(struct intel_context *ce);
288 };
289
290 static struct i915_request *
__mock_request_alloc(struct intel_context *ce)
292 {
293 return mock_request(ce, 0);
294 }
295
296 static struct i915_request *
__live_request_alloc(struct intel_context *ce)
298 {
299 return intel_context_create_request(ce);
300 }
301
302 struct smoke_thread {
303 struct kthread_worker *worker;
304 struct kthread_work work;
305 struct smoketest *t;
306 bool stop;
307 int result;
308 };
309
static void __igt_breadcrumbs_smoketest(struct kthread_work *work)
311 {
312 struct smoke_thread *thread = container_of(work, typeof(*thread), work);
313 struct smoketest *t = thread->t;
314 const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
315 const unsigned int total = 4 * t->ncontexts + 1;
316 unsigned int num_waits = 0, num_fences = 0;
317 struct i915_request **requests;
318 I915_RND_STATE(prng);
319 unsigned int *order;
320 int err = 0;
321
322 /*
323 * A very simple test to catch the most egregious of list handling bugs.
324 *
325 * At its heart, we simply create oodles of requests running across
326 * multiple kthreads and enable signaling on them, for the sole purpose
327 * of stressing our breadcrumb handling. The only inspection we do is
328 * that the fences were marked as signaled.
329 */
330
331 requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
332 if (!requests) {
333 thread->result = -ENOMEM;
334 return;
335 }
336
337 order = i915_random_order(total, &prng);
338 if (!order) {
339 err = -ENOMEM;
340 goto out_requests;
341 }
342
343 while (!READ_ONCE(thread->stop)) {
344 struct i915_sw_fence *submit, *wait;
345 unsigned int n, count;
346
347 submit = heap_fence_create(GFP_KERNEL);
348 if (!submit) {
349 err = -ENOMEM;
350 break;
351 }
352
353 wait = heap_fence_create(GFP_KERNEL);
354 if (!wait) {
355 i915_sw_fence_commit(submit);
356 heap_fence_put(submit);
357 err = -ENOMEM;
358 break;
359 }
360
361 i915_random_reorder(order, total, &prng);
362 count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
363
364 for (n = 0; n < count; n++) {
365 struct i915_gem_context *ctx =
366 t->contexts[order[n] % t->ncontexts];
367 struct i915_request *rq;
368 struct intel_context *ce;
369
370 ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
371 GEM_BUG_ON(IS_ERR(ce));
372 rq = t->request_alloc(ce);
373 intel_context_put(ce);
374 if (IS_ERR(rq)) {
375 err = PTR_ERR(rq);
376 count = n;
377 break;
378 }
379
380 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
381 submit,
382 GFP_KERNEL);
383
384 requests[n] = i915_request_get(rq);
385 i915_request_add(rq);
386
387 if (err >= 0)
388 err = i915_sw_fence_await_dma_fence(wait,
389 &rq->fence,
390 0,
391 GFP_KERNEL);
392
393 if (err < 0) {
394 i915_request_put(rq);
395 count = n;
396 break;
397 }
398 }
399
400 i915_sw_fence_commit(submit);
401 i915_sw_fence_commit(wait);
402
403 if (!wait_event_timeout(wait->wait,
404 i915_sw_fence_done(wait),
405 5 * HZ)) {
406 struct i915_request *rq = requests[count - 1];
407
408 pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
409 atomic_read(&wait->pending), count,
410 rq->fence.context, rq->fence.seqno,
411 t->engine->name);
412 GEM_TRACE_DUMP();
413
414 intel_gt_set_wedged(t->engine->gt);
415 GEM_BUG_ON(!i915_request_completed(rq));
416 i915_sw_fence_wait(wait);
417 err = -EIO;
418 }
419
420 for (n = 0; n < count; n++) {
421 struct i915_request *rq = requests[n];
422
423 if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
424 &rq->fence.flags)) {
425 pr_err("%llu:%llu was not signaled!\n",
426 rq->fence.context, rq->fence.seqno);
427 err = -EINVAL;
428 }
429
430 i915_request_put(rq);
431 }
432
433 heap_fence_put(wait);
434 heap_fence_put(submit);
435
436 if (err < 0)
437 break;
438
439 num_fences += count;
440 num_waits++;
441
442 cond_resched();
443 }
444
445 atomic_long_add(num_fences, &t->num_fences);
446 atomic_long_add(num_waits, &t->num_waits);
447
448 kfree(order);
449 out_requests:
450 kfree(requests);
451 thread->result = err;
452 }
453
static int mock_breadcrumbs_smoketest(void *arg)
455 {
456 struct drm_i915_private *i915 = arg;
457 struct smoketest t = {
458 .engine = rcs0(i915),
459 .ncontexts = 1024,
460 .max_batch = 1024,
461 .request_alloc = __mock_request_alloc
462 };
463 unsigned int ncpus = num_online_cpus();
464 struct smoke_thread *threads;
465 unsigned int n;
466 int ret = 0;
467
468 /*
469 * Smoketest our breadcrumb/signal handling for requests across multiple
470 * threads. A very simple test to only catch the most egregious of bugs.
471 * See __igt_breadcrumbs_smoketest();
472 */
473
474 threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
475 if (!threads)
476 return -ENOMEM;
477
478 t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
479 if (!t.contexts) {
480 ret = -ENOMEM;
481 goto out_threads;
482 }
483
484 for (n = 0; n < t.ncontexts; n++) {
485 t.contexts[n] = mock_context(t.engine->i915, "mock");
486 if (!t.contexts[n]) {
487 ret = -ENOMEM;
488 goto out_contexts;
489 }
490 }
491
492 for (n = 0; n < ncpus; n++) {
493 struct kthread_worker *worker;
494
495 worker = kthread_create_worker(0, "igt/%d", n);
496 if (IS_ERR(worker)) {
497 ret = PTR_ERR(worker);
498 ncpus = n;
499 break;
500 }
501
502 threads[n].worker = worker;
503 threads[n].t = &t;
504 threads[n].stop = false;
505 threads[n].result = 0;
506
507 kthread_init_work(&threads[n].work,
508 __igt_breadcrumbs_smoketest);
509 kthread_queue_work(worker, &threads[n].work);
510 }
511
512 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
513
514 for (n = 0; n < ncpus; n++) {
515 int err;
516
517 WRITE_ONCE(threads[n].stop, true);
518 kthread_flush_work(&threads[n].work);
519 err = READ_ONCE(threads[n].result);
520 if (err < 0 && !ret)
521 ret = err;
522
523 kthread_destroy_worker(threads[n].worker);
524 }
525 pr_info("Completed %lu waits for %lu fence across %d cpus\n",
526 atomic_long_read(&t.num_waits),
527 atomic_long_read(&t.num_fences),
528 ncpus);
529
530 out_contexts:
531 for (n = 0; n < t.ncontexts; n++) {
532 if (!t.contexts[n])
533 break;
534 mock_context_close(t.contexts[n]);
535 }
536 kfree(t.contexts);
537 out_threads:
538 kfree(threads);
539 return ret;
540 }
541
int i915_request_mock_selftests(void)
543 {
544 static const struct i915_subtest tests[] = {
545 SUBTEST(igt_add_request),
546 SUBTEST(igt_wait_request),
547 SUBTEST(igt_fence_wait),
548 SUBTEST(igt_request_rewind),
549 SUBTEST(mock_breadcrumbs_smoketest),
550 };
551 struct drm_i915_private *i915;
552 intel_wakeref_t wakeref;
553 int err = 0;
554
555 i915 = mock_gem_device();
556 if (!i915)
557 return -ENOMEM;
558
559 with_intel_runtime_pm(&i915->runtime_pm, wakeref)
560 err = i915_subtests(tests, i915);
561
562 mock_destroy_device(i915);
563
564 return err;
565 }
566
static int live_nop_request(void *arg)
568 {
569 struct drm_i915_private *i915 = arg;
570 struct intel_engine_cs *engine;
571 struct igt_live_test t;
572 int err = -ENODEV;
573
574 /*
575 * Submit various sized batches of empty requests, to each engine
576 * (individually), and wait for the batch to complete. We can check
577 * the overhead of submitting requests to the hardware.
578 */
579
580 for_each_uabi_engine(engine, i915) {
581 unsigned long n, prime;
582 IGT_TIMEOUT(end_time);
583 ktime_t times[2] = {};
584
585 err = igt_live_test_begin(&t, i915, __func__, engine->name);
586 if (err)
587 return err;
588
589 intel_engine_pm_get(engine);
590 for_each_prime_number_from(prime, 1, 8192) {
591 struct i915_request *request = NULL;
592
593 times[1] = ktime_get_raw();
594
595 for (n = 0; n < prime; n++) {
596 i915_request_put(request);
597 request = i915_request_create(engine->kernel_context);
598 if (IS_ERR(request))
599 return PTR_ERR(request);
600
601 /*
602 * This space is left intentionally blank.
603 *
604 * We do not actually want to perform any
605 * action with this request, we just want
606 * to measure the latency in allocation
607 * and submission of our breadcrumbs -
608 * ensuring that the bare request is sufficient
609 * for the system to work (i.e. proper HEAD
610 * tracking of the rings, interrupt handling,
611 * etc). It also gives us the lowest bounds
612 * for latency.
613 */
614
615 i915_request_get(request);
616 i915_request_add(request);
617 }
618 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
619 i915_request_put(request);
620
621 times[1] = ktime_sub(ktime_get_raw(), times[1]);
622 if (prime == 1)
623 times[0] = times[1];
624
625 if (__igt_timeout(end_time, NULL))
626 break;
627 }
628 intel_engine_pm_put(engine);
629
630 err = igt_live_test_end(&t);
631 if (err)
632 return err;
633
634 pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
635 engine->name,
636 ktime_to_ns(times[0]),
637 prime, div64_u64(ktime_to_ns(times[1]), prime));
638 }
639
640 return err;
641 }
642
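/*
 * Cancel a spinner request before it has been submitted to the HW and check
 * that it completes with fence error -EINTR.
 */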
static int __cancel_inactive(struct intel_engine_cs *engine)
644 {
645 struct intel_context *ce;
646 struct igt_spinner spin;
647 struct i915_request *rq;
648 int err = 0;
649
650 if (igt_spinner_init(&spin, engine->gt))
651 return -ENOMEM;
652
653 ce = intel_context_create(engine);
654 if (IS_ERR(ce)) {
655 err = PTR_ERR(ce);
656 goto out_spin;
657 }
658
659 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
660 if (IS_ERR(rq)) {
661 err = PTR_ERR(rq);
662 goto out_ce;
663 }
664
665 pr_debug("%s: Cancelling inactive request\n", engine->name);
666 i915_request_cancel(rq, -EINTR);
667 i915_request_get(rq);
668 i915_request_add(rq);
669
670 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
671 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
672
673 pr_err("%s: Failed to cancel inactive request\n", engine->name);
674 intel_engine_dump(engine, &p, "%s\n", engine->name);
675 err = -ETIME;
676 goto out_rq;
677 }
678
679 if (rq->fence.error != -EINTR) {
680 pr_err("%s: fence not cancelled (%u)\n",
681 engine->name, rq->fence.error);
682 err = -EINVAL;
683 }
684
685 out_rq:
686 i915_request_put(rq);
687 out_ce:
688 intel_context_put(ce);
689 out_spin:
690 igt_spinner_fini(&spin);
691 if (err)
692 pr_err("%s: %s error %d\n", __func__, engine->name, err);
693 return err;
694 }
695
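/*
 * Cancel a spinner request after it has started executing on the HW and check
 * that it is stopped with fence error -EINTR.
 */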
static int __cancel_active(struct intel_engine_cs *engine)
697 {
698 struct intel_context *ce;
699 struct igt_spinner spin;
700 struct i915_request *rq;
701 int err = 0;
702
703 if (igt_spinner_init(&spin, engine->gt))
704 return -ENOMEM;
705
706 ce = intel_context_create(engine);
707 if (IS_ERR(ce)) {
708 err = PTR_ERR(ce);
709 goto out_spin;
710 }
711
712 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
713 if (IS_ERR(rq)) {
714 err = PTR_ERR(rq);
715 goto out_ce;
716 }
717
718 pr_debug("%s: Cancelling active request\n", engine->name);
719 i915_request_get(rq);
720 i915_request_add(rq);
721 if (!igt_wait_for_spinner(&spin, rq)) {
722 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
723
724 pr_err("Failed to start spinner on %s\n", engine->name);
725 intel_engine_dump(engine, &p, "%s\n", engine->name);
726 err = -ETIME;
727 goto out_rq;
728 }
729 i915_request_cancel(rq, -EINTR);
730
731 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
732 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
733
734 pr_err("%s: Failed to cancel active request\n", engine->name);
735 intel_engine_dump(engine, &p, "%s\n", engine->name);
736 err = -ETIME;
737 goto out_rq;
738 }
739
740 if (rq->fence.error != -EINTR) {
741 pr_err("%s: fence not cancelled (%u)\n",
742 engine->name, rq->fence.error);
743 err = -EINVAL;
744 }
745
746 out_rq:
747 i915_request_put(rq);
748 out_ce:
749 intel_context_put(ce);
750 out_spin:
751 igt_spinner_fini(&spin);
752 if (err)
753 pr_err("%s: %s error %d\n", __func__, engine->name, err);
754 return err;
755 }
756
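/*
 * Cancel a request that has already completed; the cancellation should be a
 * no-op and must not set a fence error.
 */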
static int __cancel_completed(struct intel_engine_cs *engine)
758 {
759 struct intel_context *ce;
760 struct igt_spinner spin;
761 struct i915_request *rq;
762 int err = 0;
763
764 if (igt_spinner_init(&spin, engine->gt))
765 return -ENOMEM;
766
767 ce = intel_context_create(engine);
768 if (IS_ERR(ce)) {
769 err = PTR_ERR(ce);
770 goto out_spin;
771 }
772
773 rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
774 if (IS_ERR(rq)) {
775 err = PTR_ERR(rq);
776 goto out_ce;
777 }
778 igt_spinner_end(&spin);
779 i915_request_get(rq);
780 i915_request_add(rq);
781
782 if (i915_request_wait(rq, 0, HZ / 5) < 0) {
783 err = -ETIME;
784 goto out_rq;
785 }
786
787 pr_debug("%s: Cancelling completed request\n", engine->name);
788 i915_request_cancel(rq, -EINTR);
789 if (rq->fence.error) {
790 pr_err("%s: fence not cancelled (%u)\n",
791 engine->name, rq->fence.error);
792 err = -EINVAL;
793 }
794
795 out_rq:
796 i915_request_put(rq);
797 out_ce:
798 intel_context_put(ce);
799 out_spin:
800 igt_spinner_fini(&spin);
801 if (err)
802 pr_err("%s: %s error %d\n", __func__, engine->name, err);
803 return err;
804 }
805
/*
 * Test to prove a non-preemptible request can be cancelled and a subsequent
 * request on the same context can successfully complete after cancellation.
 *
 * Testing methodology is to create a non-preemptible request and submit it,
 * wait for the spinner to start, create a NOP request and submit it, cancel
 * the spinner, wait for the spinner to complete and verify it failed with an
 * error, and finally wait for the NOP request to complete and verify it
 * succeeded without an error. The preemption timeout is also reduced and
 * restored so the test runs in a timely manner.
 */
static int __cancel_reset(struct drm_i915_private *i915,
818 struct intel_engine_cs *engine)
819 {
820 struct intel_context *ce;
821 struct igt_spinner spin;
822 struct i915_request *rq, *nop;
823 unsigned long preempt_timeout_ms;
824 int err = 0;
825
826 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT ||
827 !intel_has_reset_engine(engine->gt))
828 return 0;
829
830 preempt_timeout_ms = engine->props.preempt_timeout_ms;
831 engine->props.preempt_timeout_ms = 100;
832
	err = igt_spinner_init(&spin, engine->gt);
	if (err)
		goto out_restore;
835
836 ce = intel_context_create(engine);
837 if (IS_ERR(ce)) {
838 err = PTR_ERR(ce);
839 goto out_spin;
840 }
841
842 rq = igt_spinner_create_request(&spin, ce, MI_NOOP);
843 if (IS_ERR(rq)) {
844 err = PTR_ERR(rq);
845 goto out_ce;
846 }
847
848 pr_debug("%s: Cancelling active non-preemptable request\n",
849 engine->name);
850 i915_request_get(rq);
851 i915_request_add(rq);
852 if (!igt_wait_for_spinner(&spin, rq)) {
853 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
854
855 pr_err("Failed to start spinner on %s\n", engine->name);
856 intel_engine_dump(engine, &p, "%s\n", engine->name);
857 err = -ETIME;
858 goto out_rq;
859 }
860
	nop = intel_context_create_request(ce);
	if (IS_ERR(nop)) {
		err = PTR_ERR(nop);
		goto out_rq;
	}
864 i915_request_get(nop);
865 i915_request_add(nop);
866
867 i915_request_cancel(rq, -EINTR);
868
869 if (i915_request_wait(rq, 0, HZ) < 0) {
870 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
871
872 pr_err("%s: Failed to cancel hung request\n", engine->name);
873 intel_engine_dump(engine, &p, "%s\n", engine->name);
874 err = -ETIME;
875 goto out_nop;
876 }
877
878 if (rq->fence.error != -EINTR) {
879 pr_err("%s: fence not cancelled (%u)\n",
880 engine->name, rq->fence.error);
881 err = -EINVAL;
882 goto out_nop;
883 }
884
885 if (i915_request_wait(nop, 0, HZ) < 0) {
886 struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
887
888 pr_err("%s: Failed to complete nop request\n", engine->name);
889 intel_engine_dump(engine, &p, "%s\n", engine->name);
890 err = -ETIME;
891 goto out_nop;
892 }
893
894 if (nop->fence.error != 0) {
895 pr_err("%s: Nop request errored (%u)\n",
896 engine->name, nop->fence.error);
897 err = -EINVAL;
898 }
899
900 out_nop:
901 i915_request_put(nop);
902 out_rq:
903 i915_request_put(rq);
904 out_ce:
905 intel_context_put(ce);
906 out_spin:
907 igt_spinner_fini(&spin);
908 out_restore:
909 engine->props.preempt_timeout_ms = preempt_timeout_ms;
910 if (err)
911 pr_err("%s: %s error %d\n", __func__, engine->name, err);
912 return err;
913 }
914
static int live_cancel_request(void *arg)
916 {
917 struct drm_i915_private *i915 = arg;
918 struct intel_engine_cs *engine;
919
920 /*
921 * Check cancellation of requests. We expect to be able to immediately
922 * cancel active requests, even if they are currently on the GPU.
923 */
924
925 for_each_uabi_engine(engine, i915) {
926 struct igt_live_test t;
927 int err, err2;
928
929 if (!intel_engine_has_preemption(engine))
930 continue;
931
932 err = igt_live_test_begin(&t, i915, __func__, engine->name);
933 if (err)
934 return err;
935
936 err = __cancel_inactive(engine);
937 if (err == 0)
938 err = __cancel_active(engine);
939 if (err == 0)
940 err = __cancel_completed(engine);
941
942 err2 = igt_live_test_end(&t);
943 if (err)
944 return err;
945 if (err2)
946 return err2;
947
948 /* Expects reset so call outside of igt_live_test_* */
949 err = __cancel_reset(i915, engine);
950 if (err)
951 return err;
952
953 if (igt_flush_test(i915))
954 return -EIO;
955 }
956
957 return 0;
958 }
959
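/*
 * Create and pin a single-page batch containing just MI_BATCH_BUFFER_END,
 * used to measure the bare overhead of submitting and executing a batch.
 */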
static struct i915_vma *empty_batch(struct intel_gt *gt)
961 {
962 struct drm_i915_gem_object *obj;
963 struct i915_vma *vma;
964 u32 *cmd;
965 int err;
966
967 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
968 if (IS_ERR(obj))
969 return ERR_CAST(obj);
970
971 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
972 if (IS_ERR(cmd)) {
973 err = PTR_ERR(cmd);
974 goto err;
975 }
976
977 *cmd = MI_BATCH_BUFFER_END;
978
979 __i915_gem_object_flush_map(obj, 0, 64);
980 i915_gem_object_unpin_map(obj);
981
982 intel_gt_chipset_flush(gt);
983
984 vma = i915_vma_instance(obj, gt->vm, NULL);
985 if (IS_ERR(vma)) {
986 err = PTR_ERR(vma);
987 goto err;
988 }
989
990 err = i915_vma_pin(vma, 0, 0, PIN_USER);
991 if (err)
992 goto err;
993
994 /* Force the wait now to avoid including it in the benchmark */
995 err = i915_vma_sync(vma);
996 if (err)
997 goto err_pin;
998
999 return vma;
1000
1001 err_pin:
1002 i915_vma_unpin(vma);
1003 err:
1004 i915_gem_object_put(obj);
1005 return ERR_PTR(err);
1006 }
1007
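/* Emit a batch-buffer start for @batch using the request's engine vfunc. */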
static int emit_bb_start(struct i915_request *rq, struct i915_vma *batch)
1009 {
1010 return rq->engine->emit_bb_start(rq,
1011 i915_vma_offset(batch),
1012 i915_vma_size(batch),
1013 0);
1014 }
1015
1016 static struct i915_request *
empty_request(struct intel_engine_cs *engine,
1018 struct i915_vma *batch)
1019 {
1020 struct i915_request *request;
1021 int err;
1022
1023 request = i915_request_create(engine->kernel_context);
1024 if (IS_ERR(request))
1025 return request;
1026
1027 err = emit_bb_start(request, batch);
1028 if (err)
1029 goto out_request;
1030
1031 i915_request_get(request);
1032 out_request:
1033 i915_request_add(request);
1034 return err ? ERR_PTR(err) : request;
1035 }
1036
static int live_empty_request(void *arg)
1038 {
1039 struct drm_i915_private *i915 = arg;
1040 struct intel_engine_cs *engine;
1041 struct igt_live_test t;
1042 int err;
1043
1044 /*
1045 * Submit various sized batches of empty requests, to each engine
1046 * (individually), and wait for the batch to complete. We can check
1047 * the overhead of submitting requests to the hardware.
1048 */
1049
1050 for_each_uabi_engine(engine, i915) {
1051 IGT_TIMEOUT(end_time);
1052 struct i915_request *request;
1053 struct i915_vma *batch;
1054 unsigned long n, prime;
1055 ktime_t times[2] = {};
1056
1057 batch = empty_batch(engine->gt);
1058 if (IS_ERR(batch))
1059 return PTR_ERR(batch);
1060
1061 err = igt_live_test_begin(&t, i915, __func__, engine->name);
1062 if (err)
1063 goto out_batch;
1064
1065 intel_engine_pm_get(engine);
1066
1067 /* Warmup / preload */
1068 request = empty_request(engine, batch);
1069 if (IS_ERR(request)) {
1070 err = PTR_ERR(request);
1071 intel_engine_pm_put(engine);
1072 goto out_batch;
1073 }
1074 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1075
1076 for_each_prime_number_from(prime, 1, 8192) {
1077 times[1] = ktime_get_raw();
1078
1079 for (n = 0; n < prime; n++) {
1080 i915_request_put(request);
1081 request = empty_request(engine, batch);
1082 if (IS_ERR(request)) {
1083 err = PTR_ERR(request);
1084 intel_engine_pm_put(engine);
1085 goto out_batch;
1086 }
1087 }
1088 i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
1089
1090 times[1] = ktime_sub(ktime_get_raw(), times[1]);
1091 if (prime == 1)
1092 times[0] = times[1];
1093
1094 if (__igt_timeout(end_time, NULL))
1095 break;
1096 }
1097 i915_request_put(request);
1098 intel_engine_pm_put(engine);
1099
1100 err = igt_live_test_end(&t);
1101 if (err)
1102 goto out_batch;
1103
1104 pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
1105 engine->name,
1106 ktime_to_ns(times[0]),
1107 prime, div64_u64(ktime_to_ns(times[1]), prime));
1108 out_batch:
1109 i915_vma_unpin(batch);
1110 i915_vma_put(batch);
1111 if (err)
1112 break;
1113 }
1114
1115 return err;
1116 }
1117
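/*
 * Build a batch that branches back to its own start, spinning forever until
 * recursive_batch_resolve() overwrites the first instruction with
 * MI_BATCH_BUFFER_END.
 */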
static struct i915_vma *recursive_batch(struct intel_gt *gt)
1119 {
1120 struct drm_i915_gem_object *obj;
1121 const int ver = GRAPHICS_VER(gt->i915);
1122 struct i915_vma *vma;
1123 u32 *cmd;
1124 int err;
1125
1126 obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
1127 if (IS_ERR(obj))
1128 return ERR_CAST(obj);
1129
1130 vma = i915_vma_instance(obj, gt->vm, NULL);
1131 if (IS_ERR(vma)) {
1132 err = PTR_ERR(vma);
1133 goto err;
1134 }
1135
1136 err = i915_vma_pin(vma, 0, 0, PIN_USER);
1137 if (err)
1138 goto err;
1139
1140 cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
1141 if (IS_ERR(cmd)) {
1142 err = PTR_ERR(cmd);
1143 goto err;
1144 }
1145
1146 if (ver >= 8) {
1147 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
1148 *cmd++ = lower_32_bits(i915_vma_offset(vma));
1149 *cmd++ = upper_32_bits(i915_vma_offset(vma));
1150 } else if (ver >= 6) {
1151 *cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
1152 *cmd++ = lower_32_bits(i915_vma_offset(vma));
1153 } else {
1154 *cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1155 *cmd++ = lower_32_bits(i915_vma_offset(vma));
1156 }
1157 *cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1158
1159 __i915_gem_object_flush_map(obj, 0, 64);
1160 i915_gem_object_unpin_map(obj);
1161
1162 intel_gt_chipset_flush(gt);
1163
1164 return vma;
1165
1166 err:
1167 i915_gem_object_put(obj);
1168 return ERR_PTR(err);
1169 }
1170
static int recursive_batch_resolve(struct i915_vma *batch)
1172 {
1173 u32 *cmd;
1174
1175 cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1176 if (IS_ERR(cmd))
1177 return PTR_ERR(cmd);
1178
1179 *cmd = MI_BATCH_BUFFER_END;
1180
1181 __i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1182 i915_gem_object_unpin_map(batch->obj);
1183
1184 intel_gt_chipset_flush(batch->vm->gt);
1185
1186 return 0;
1187 }
1188
static int live_all_engines(void *arg)
1190 {
1191 struct drm_i915_private *i915 = arg;
1192 const unsigned int nengines = num_uabi_engines(i915);
1193 struct intel_engine_cs *engine;
1194 struct i915_request **request;
1195 struct igt_live_test t;
1196 unsigned int idx;
1197 int err;
1198
1199 /*
1200 * Check we can submit requests to all engines simultaneously. We
1201 * send a recursive batch to each engine - checking that we don't
1202 * block doing so, and that they don't complete too soon.
1203 */
1204
1205 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1206 if (!request)
1207 return -ENOMEM;
1208
1209 err = igt_live_test_begin(&t, i915, __func__, "");
1210 if (err)
1211 goto out_free;
1212
1213 idx = 0;
1214 for_each_uabi_engine(engine, i915) {
1215 struct i915_vma *batch;
1216
1217 batch = recursive_batch(engine->gt);
1218 if (IS_ERR(batch)) {
1219 err = PTR_ERR(batch);
1220 pr_err("%s: Unable to create batch, err=%d\n",
1221 __func__, err);
1222 goto out_free;
1223 }
1224
1225 i915_vma_lock(batch);
1226 request[idx] = intel_engine_create_kernel_request(engine);
1227 if (IS_ERR(request[idx])) {
1228 err = PTR_ERR(request[idx]);
1229 pr_err("%s: Request allocation failed with err=%d\n",
1230 __func__, err);
1231 goto out_unlock;
1232 }
1233 GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1234
1235 err = i915_vma_move_to_active(batch, request[idx], 0);
1236 GEM_BUG_ON(err);
1237
1238 err = emit_bb_start(request[idx], batch);
1239 GEM_BUG_ON(err);
1240 request[idx]->batch = batch;
1241
1242 i915_request_get(request[idx]);
1243 i915_request_add(request[idx]);
1244 idx++;
1245 out_unlock:
1246 i915_vma_unlock(batch);
1247 if (err)
1248 goto out_request;
1249 }
1250
1251 idx = 0;
1252 for_each_uabi_engine(engine, i915) {
1253 if (i915_request_completed(request[idx])) {
1254 pr_err("%s(%s): request completed too early!\n",
1255 __func__, engine->name);
1256 err = -EINVAL;
1257 goto out_request;
1258 }
1259 idx++;
1260 }
1261
1262 idx = 0;
1263 for_each_uabi_engine(engine, i915) {
1264 err = recursive_batch_resolve(request[idx]->batch);
1265 if (err) {
1266 pr_err("%s: failed to resolve batch, err=%d\n",
1267 __func__, err);
1268 goto out_request;
1269 }
1270 idx++;
1271 }
1272
1273 idx = 0;
1274 for_each_uabi_engine(engine, i915) {
1275 struct i915_request *rq = request[idx];
1276 long timeout;
1277
1278 timeout = i915_request_wait(rq, 0,
1279 MAX_SCHEDULE_TIMEOUT);
1280 if (timeout < 0) {
1281 err = timeout;
1282 pr_err("%s: error waiting for request on %s, err=%d\n",
1283 __func__, engine->name, err);
1284 goto out_request;
1285 }
1286
1287 GEM_BUG_ON(!i915_request_completed(rq));
1288 i915_vma_unpin(rq->batch);
1289 i915_vma_put(rq->batch);
1290 i915_request_put(rq);
1291 request[idx] = NULL;
1292 idx++;
1293 }
1294
1295 err = igt_live_test_end(&t);
1296
1297 out_request:
1298 idx = 0;
1299 for_each_uabi_engine(engine, i915) {
1300 struct i915_request *rq = request[idx];
1301
1302 if (!rq)
1303 continue;
1304
1305 if (rq->batch) {
1306 i915_vma_unpin(rq->batch);
1307 i915_vma_put(rq->batch);
1308 }
1309 i915_request_put(rq);
1310 idx++;
1311 }
1312 out_free:
1313 kfree(request);
1314 return err;
1315 }
1316
static int live_sequential_engines(void *arg)
1318 {
1319 struct drm_i915_private *i915 = arg;
1320 const unsigned int nengines = num_uabi_engines(i915);
1321 struct i915_request **request;
1322 struct i915_request *prev = NULL;
1323 struct intel_engine_cs *engine;
1324 struct igt_live_test t;
1325 unsigned int idx;
1326 int err;
1327
1328 /*
1329 * Check we can submit requests to all engines sequentially, such
1330 * that each successive request waits for the earlier ones. This
1331 * tests that we don't execute requests out of order, even though
1332 * they are running on independent engines.
1333 */
1334
1335 request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1336 if (!request)
1337 return -ENOMEM;
1338
1339 err = igt_live_test_begin(&t, i915, __func__, "");
1340 if (err)
1341 goto out_free;
1342
1343 idx = 0;
1344 for_each_uabi_engine(engine, i915) {
1345 struct i915_vma *batch;
1346
1347 batch = recursive_batch(engine->gt);
1348 if (IS_ERR(batch)) {
1349 err = PTR_ERR(batch);
1350 pr_err("%s: Unable to create batch for %s, err=%d\n",
1351 __func__, engine->name, err);
1352 goto out_free;
1353 }
1354
1355 i915_vma_lock(batch);
1356 request[idx] = intel_engine_create_kernel_request(engine);
1357 if (IS_ERR(request[idx])) {
1358 err = PTR_ERR(request[idx]);
1359 pr_err("%s: Request allocation failed for %s with err=%d\n",
1360 __func__, engine->name, err);
1361 goto out_unlock;
1362 }
1363 GEM_BUG_ON(request[idx]->context->vm != batch->vm);
1364
1365 if (prev) {
1366 err = i915_request_await_dma_fence(request[idx],
1367 &prev->fence);
1368 if (err) {
1369 i915_request_add(request[idx]);
1370 pr_err("%s: Request await failed for %s with err=%d\n",
1371 __func__, engine->name, err);
1372 goto out_unlock;
1373 }
1374 }
1375
1376 err = i915_vma_move_to_active(batch, request[idx], 0);
1377 GEM_BUG_ON(err);
1378
1379 err = emit_bb_start(request[idx], batch);
1380 GEM_BUG_ON(err);
1381 request[idx]->batch = batch;
1382
1383 i915_request_get(request[idx]);
1384 i915_request_add(request[idx]);
1385
1386 prev = request[idx];
1387 idx++;
1388
1389 out_unlock:
1390 i915_vma_unlock(batch);
1391 if (err)
1392 goto out_request;
1393 }
1394
1395 idx = 0;
1396 for_each_uabi_engine(engine, i915) {
1397 long timeout;
1398
1399 if (i915_request_completed(request[idx])) {
1400 pr_err("%s(%s): request completed too early!\n",
1401 __func__, engine->name);
1402 err = -EINVAL;
1403 goto out_request;
1404 }
1405
1406 err = recursive_batch_resolve(request[idx]->batch);
1407 if (err) {
1408 pr_err("%s: failed to resolve batch, err=%d\n",
1409 __func__, err);
1410 goto out_request;
1411 }
1412
1413 timeout = i915_request_wait(request[idx], 0,
1414 MAX_SCHEDULE_TIMEOUT);
1415 if (timeout < 0) {
1416 err = timeout;
1417 pr_err("%s: error waiting for request on %s, err=%d\n",
1418 __func__, engine->name, err);
1419 goto out_request;
1420 }
1421
1422 GEM_BUG_ON(!i915_request_completed(request[idx]));
1423 idx++;
1424 }
1425
1426 err = igt_live_test_end(&t);
1427
1428 out_request:
1429 idx = 0;
1430 for_each_uabi_engine(engine, i915) {
1431 u32 *cmd;
1432
1433 if (!request[idx])
1434 break;
1435
1436 cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1437 I915_MAP_WC);
1438 if (!IS_ERR(cmd)) {
1439 *cmd = MI_BATCH_BUFFER_END;
1440
1441 __i915_gem_object_flush_map(request[idx]->batch->obj,
1442 0, sizeof(*cmd));
1443 i915_gem_object_unpin_map(request[idx]->batch->obj);
1444
1445 intel_gt_chipset_flush(engine->gt);
1446 }
1447
1448 i915_vma_put(request[idx]->batch);
1449 i915_request_put(request[idx]);
1450 idx++;
1451 }
1452 out_free:
1453 kfree(request);
1454 return err;
1455 }
1456
1457 struct parallel_thread {
1458 struct kthread_worker *worker;
1459 struct kthread_work work;
1460 struct intel_engine_cs *engine;
1461 int result;
1462 };
1463
static void __live_parallel_engine1(struct kthread_work *work)
1465 {
1466 struct parallel_thread *thread =
1467 container_of(work, typeof(*thread), work);
1468 struct intel_engine_cs *engine = thread->engine;
1469 IGT_TIMEOUT(end_time);
1470 unsigned long count;
1471 int err = 0;
1472
1473 count = 0;
1474 intel_engine_pm_get(engine);
1475 do {
1476 struct i915_request *rq;
1477
1478 rq = i915_request_create(engine->kernel_context);
1479 if (IS_ERR(rq)) {
1480 err = PTR_ERR(rq);
1481 break;
1482 }
1483
1484 i915_request_get(rq);
1485 i915_request_add(rq);
1486
1487 err = 0;
1488 if (i915_request_wait(rq, 0, HZ) < 0)
1489 err = -ETIME;
1490 i915_request_put(rq);
1491 if (err)
1492 break;
1493
1494 count++;
1495 } while (!__igt_timeout(end_time, NULL));
1496 intel_engine_pm_put(engine);
1497
1498 pr_info("%s: %lu request + sync\n", engine->name, count);
1499 thread->result = err;
1500 }
1501
static void __live_parallel_engineN(struct kthread_work *work)
1503 {
1504 struct parallel_thread *thread =
1505 container_of(work, typeof(*thread), work);
1506 struct intel_engine_cs *engine = thread->engine;
1507 IGT_TIMEOUT(end_time);
1508 unsigned long count;
1509 int err = 0;
1510
1511 count = 0;
1512 intel_engine_pm_get(engine);
1513 do {
1514 struct i915_request *rq;
1515
1516 rq = i915_request_create(engine->kernel_context);
1517 if (IS_ERR(rq)) {
1518 err = PTR_ERR(rq);
1519 break;
1520 }
1521
1522 i915_request_add(rq);
1523 count++;
1524 } while (!__igt_timeout(end_time, NULL));
1525 intel_engine_pm_put(engine);
1526
1527 pr_info("%s: %lu requests\n", engine->name, count);
1528 thread->result = err;
1529 }
1530
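/*
 * Simple barrier over i915->selftest.counter: each parallel worker checks in
 * with wake_all(), and wait_for_all() blocks until every worker has done so
 * (or the selftest timeout expires).
 */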
static bool wake_all(struct drm_i915_private *i915)
1532 {
1533 if (atomic_dec_and_test(&i915->selftest.counter)) {
1534 wake_up_var(&i915->selftest.counter);
1535 return true;
1536 }
1537
1538 return false;
1539 }
1540
static int wait_for_all(struct drm_i915_private *i915)
1542 {
1543 if (wake_all(i915))
1544 return 0;
1545
1546 if (wait_var_event_timeout(&i915->selftest.counter,
1547 !atomic_read(&i915->selftest.counter),
1548 i915_selftest.timeout_jiffies))
1549 return 0;
1550
1551 return -ETIME;
1552 }
1553
static void __live_parallel_spin(struct kthread_work *work)
1555 {
1556 struct parallel_thread *thread =
1557 container_of(work, typeof(*thread), work);
1558 struct intel_engine_cs *engine = thread->engine;
1559 struct igt_spinner spin;
1560 struct i915_request *rq;
1561 int err = 0;
1562
1563 /*
1564 * Create a spinner running for eternity on each engine. If a second
1565 * spinner is incorrectly placed on the same engine, it will not be
1566 * able to start in time.
1567 */
1568
1569 if (igt_spinner_init(&spin, engine->gt)) {
1570 wake_all(engine->i915);
1571 thread->result = -ENOMEM;
1572 return;
1573 }
1574
1575 intel_engine_pm_get(engine);
1576 rq = igt_spinner_create_request(&spin,
1577 engine->kernel_context,
1578 MI_NOOP); /* no preemption */
1579 intel_engine_pm_put(engine);
1580 if (IS_ERR(rq)) {
1581 err = PTR_ERR(rq);
1582 if (err == -ENODEV)
1583 err = 0;
1584 wake_all(engine->i915);
1585 goto out_spin;
1586 }
1587
1588 i915_request_get(rq);
1589 i915_request_add(rq);
1590 if (igt_wait_for_spinner(&spin, rq)) {
1591 /* Occupy this engine for the whole test */
1592 err = wait_for_all(engine->i915);
1593 } else {
1594 pr_err("Failed to start spinner on %s\n", engine->name);
1595 err = -EINVAL;
1596 }
1597 igt_spinner_end(&spin);
1598
1599 if (err == 0 && i915_request_wait(rq, 0, HZ) < 0)
1600 err = -EIO;
1601 i915_request_put(rq);
1602
1603 out_spin:
1604 igt_spinner_fini(&spin);
1605 thread->result = err;
1606 }
1607
static int live_parallel_engines(void *arg)
1609 {
1610 struct drm_i915_private *i915 = arg;
1611 static void (* const func[])(struct kthread_work *) = {
1612 __live_parallel_engine1,
1613 __live_parallel_engineN,
1614 __live_parallel_spin,
1615 NULL,
1616 };
1617 const unsigned int nengines = num_uabi_engines(i915);
1618 struct parallel_thread *threads;
1619 struct intel_engine_cs *engine;
1620 void (* const *fn)(struct kthread_work *);
1621 int err = 0;
1622
1623 /*
1624 * Check we can submit requests to all engines concurrently. This
1625 * tests that we load up the system maximally.
1626 */
1627
1628 threads = kcalloc(nengines, sizeof(*threads), GFP_KERNEL);
1629 if (!threads)
1630 return -ENOMEM;
1631
1632 for (fn = func; !err && *fn; fn++) {
1633 char name[KSYM_NAME_LEN];
1634 struct igt_live_test t;
1635 unsigned int idx;
1636
1637 snprintf(name, sizeof(name), "%ps", *fn);
1638 err = igt_live_test_begin(&t, i915, __func__, name);
1639 if (err)
1640 break;
1641
1642 atomic_set(&i915->selftest.counter, nengines);
1643
1644 idx = 0;
1645 for_each_uabi_engine(engine, i915) {
1646 struct kthread_worker *worker;
1647
1648 worker = kthread_create_worker(0, "igt/parallel:%s",
1649 engine->name);
1650 if (IS_ERR(worker)) {
1651 err = PTR_ERR(worker);
1652 break;
1653 }
1654
1655 threads[idx].worker = worker;
1656 threads[idx].result = 0;
1657 threads[idx].engine = engine;
1658
1659 kthread_init_work(&threads[idx].work, *fn);
1660 kthread_queue_work(worker, &threads[idx].work);
1661 idx++;
1662 }
1663
1664 idx = 0;
1665 for_each_uabi_engine(engine, i915) {
1666 int status;
1667
1668 if (!threads[idx].worker)
1669 break;
1670
1671 kthread_flush_work(&threads[idx].work);
1672 status = READ_ONCE(threads[idx].result);
1673 if (status && !err)
1674 err = status;
1675
1676 kthread_destroy_worker(threads[idx++].worker);
1677 }
1678
1679 if (igt_live_test_end(&t))
1680 err = -EIO;
1681 }
1682
1683 kfree(threads);
1684 return err;
1685 }
1686
1687 static int
max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1689 {
1690 struct i915_request *rq;
1691 int ret;
1692
1693 /*
1694 * Before execlists, all contexts share the same ringbuffer. With
1695 * execlists, each context/engine has a separate ringbuffer and
1696 * for the purposes of this test, inexhaustible.
1697 *
1698 * For the global ringbuffer though, we have to be very careful
1699 * that we do not wrap while preventing the execution of requests
	 * with an unsignaled fence.
1701 */
1702 if (HAS_EXECLISTS(ctx->i915))
1703 return INT_MAX;
1704
1705 rq = igt_request_alloc(ctx, engine);
1706 if (IS_ERR(rq)) {
1707 ret = PTR_ERR(rq);
1708 } else {
1709 int sz;
1710
1711 ret = rq->ring->size - rq->reserved_space;
1712 i915_request_add(rq);
1713
1714 sz = rq->ring->emit - rq->head;
1715 if (sz < 0)
1716 sz += rq->ring->size;
1717 ret /= sz;
1718 ret /= 2; /* leave half spare, in case of emergency! */
1719 }
1720
1721 return ret;
1722 }
1723
static int live_breadcrumbs_smoketest(void *arg)
1725 {
1726 struct drm_i915_private *i915 = arg;
1727 const unsigned int nengines = num_uabi_engines(i915);
1728 const unsigned int ncpus = /* saturate with nengines * ncpus */
1729 max_t(int, 2, DIV_ROUND_UP(num_online_cpus(), nengines));
1730 unsigned long num_waits, num_fences;
1731 struct intel_engine_cs *engine;
1732 struct smoke_thread *threads;
1733 struct igt_live_test live;
1734 intel_wakeref_t wakeref;
1735 struct smoketest *smoke;
1736 unsigned int n, idx;
1737 struct file *file;
1738 int ret = 0;
1739
1740 /*
1741 * Smoketest our breadcrumb/signal handling for requests across multiple
1742 * threads. A very simple test to only catch the most egregious of bugs.
1743 * See __igt_breadcrumbs_smoketest();
1744 *
1745 * On real hardware this time.
1746 */
1747
1748 wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1749
1750 file = mock_file(i915);
1751 if (IS_ERR(file)) {
1752 ret = PTR_ERR(file);
1753 goto out_rpm;
1754 }
1755
1756 smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1757 if (!smoke) {
1758 ret = -ENOMEM;
1759 goto out_file;
1760 }
1761
1762 threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1763 if (!threads) {
1764 ret = -ENOMEM;
1765 goto out_smoke;
1766 }
1767
1768 smoke[0].request_alloc = __live_request_alloc;
1769 smoke[0].ncontexts = 64;
1770 smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1771 sizeof(*smoke[0].contexts),
1772 GFP_KERNEL);
1773 if (!smoke[0].contexts) {
1774 ret = -ENOMEM;
1775 goto out_threads;
1776 }
1777
1778 for (n = 0; n < smoke[0].ncontexts; n++) {
1779 smoke[0].contexts[n] = live_context(i915, file);
1780 if (IS_ERR(smoke[0].contexts[n])) {
1781 ret = PTR_ERR(smoke[0].contexts[n]);
1782 goto out_contexts;
1783 }
1784 }
1785
1786 ret = igt_live_test_begin(&live, i915, __func__, "");
1787 if (ret)
1788 goto out_contexts;
1789
1790 idx = 0;
1791 for_each_uabi_engine(engine, i915) {
1792 smoke[idx] = smoke[0];
1793 smoke[idx].engine = engine;
1794 smoke[idx].max_batch =
1795 max_batches(smoke[0].contexts[0], engine);
1796 if (smoke[idx].max_batch < 0) {
1797 ret = smoke[idx].max_batch;
1798 goto out_flush;
1799 }
1800 /* One ring interleaved between requests from all cpus */
1801 smoke[idx].max_batch /= ncpus + 1;
1802 pr_debug("Limiting batches to %d requests on %s\n",
1803 smoke[idx].max_batch, engine->name);
1804
1805 for (n = 0; n < ncpus; n++) {
1806 unsigned int i = idx * ncpus + n;
1807 struct kthread_worker *worker;
1808
1809 worker = kthread_create_worker(0, "igt/%d.%d", idx, n);
1810 if (IS_ERR(worker)) {
1811 ret = PTR_ERR(worker);
1812 goto out_flush;
1813 }
1814
1815 threads[i].worker = worker;
1816 threads[i].t = &smoke[idx];
1817
1818 kthread_init_work(&threads[i].work,
1819 __igt_breadcrumbs_smoketest);
1820 kthread_queue_work(worker, &threads[i].work);
1821 }
1822
1823 idx++;
1824 }
1825
1826 msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1827
1828 out_flush:
1829 idx = 0;
1830 num_waits = 0;
1831 num_fences = 0;
1832 for_each_uabi_engine(engine, i915) {
1833 for (n = 0; n < ncpus; n++) {
1834 unsigned int i = idx * ncpus + n;
1835 int err;
1836
1837 if (!threads[i].worker)
1838 continue;
1839
1840 WRITE_ONCE(threads[i].stop, true);
1841 kthread_flush_work(&threads[i].work);
1842 err = READ_ONCE(threads[i].result);
1843 if (err < 0 && !ret)
1844 ret = err;
1845
1846 kthread_destroy_worker(threads[i].worker);
1847 }
1848
1849 num_waits += atomic_long_read(&smoke[idx].num_waits);
1850 num_fences += atomic_long_read(&smoke[idx].num_fences);
1851 idx++;
1852 }
1853 pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1854 num_waits, num_fences, idx, ncpus);
1855
1856 ret = igt_live_test_end(&live) ?: ret;
1857 out_contexts:
1858 kfree(smoke[0].contexts);
1859 out_threads:
1860 kfree(threads);
1861 out_smoke:
1862 kfree(smoke);
1863 out_file:
1864 fput(file);
1865 out_rpm:
1866 intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1867
1868 return ret;
1869 }
1870
int i915_request_live_selftests(struct drm_i915_private *i915)
1872 {
1873 static const struct i915_subtest tests[] = {
1874 SUBTEST(live_nop_request),
1875 SUBTEST(live_all_engines),
1876 SUBTEST(live_sequential_engines),
1877 SUBTEST(live_parallel_engines),
1878 SUBTEST(live_empty_request),
1879 SUBTEST(live_cancel_request),
1880 SUBTEST(live_breadcrumbs_smoketest),
1881 };
1882
1883 if (intel_gt_is_wedged(to_gt(i915)))
1884 return 0;
1885
1886 return i915_live_subtests(tests, i915);
1887 }
1888
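/*
 * Flush the context by submitting a kernel request that waits upon its last
 * request, then wait for the engine to idle, preserving any earlier error.
 */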
static int switch_to_kernel_sync(struct intel_context *ce, int err)
1890 {
1891 struct i915_request *rq;
1892 struct dma_fence *fence;
1893
1894 rq = intel_engine_create_kernel_request(ce->engine);
1895 if (IS_ERR(rq))
1896 return PTR_ERR(rq);
1897
1898 fence = i915_active_fence_get(&ce->timeline->last_request);
1899 if (fence) {
1900 i915_request_await_dma_fence(rq, fence);
1901 dma_fence_put(fence);
1902 }
1903
1904 rq = i915_request_get(rq);
1905 i915_request_add(rq);
1906 if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1907 err = -ETIME;
1908 i915_request_put(rq);
1909
1910 while (!err && !intel_engine_is_idle(ce->engine))
1911 intel_engine_flush_submission(ce->engine);
1912
1913 return err;
1914 }
1915
1916 struct perf_stats {
1917 struct intel_engine_cs *engine;
1918 unsigned long count;
1919 ktime_t time;
1920 ktime_t busy;
1921 u64 runtime;
1922 };
1923
1924 struct perf_series {
1925 struct drm_i915_private *i915;
1926 unsigned int nengines;
1927 struct intel_context *ce[];
1928 };
1929
static int cmp_u32(const void *A, const void *B)
1931 {
1932 const u32 *a = A, *b = B;
1933
1934 return *a - *b;
1935 }
1936
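/*
 * Smooth a set of TF_COUNT samples: sort them and return twice the median
 * plus its two neighbours, i.e. roughly 4x a filtered median. Callers undo
 * the scaling by shifting down by TF_BIAS.
 */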
static u32 trifilter(u32 *a)
1938 {
1939 u64 sum;
1940
1941 #define TF_COUNT 5
1942 sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1943
1944 sum = mul_u32_u32(a[2], 2);
1945 sum += a[1];
1946 sum += a[3];
1947
1948 GEM_BUG_ON(sum > U32_MAX);
1949 return sum;
1950 #define TF_BIAS 2
1951 }
1952
static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1954 {
1955 u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1956
1957 return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1958 }
1959
static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1961 {
1962 *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1963 *cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1964 *cs++ = offset;
1965 *cs++ = 0;
1966
1967 return cs;
1968 }
1969
static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1971 {
1972 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1973 *cs++ = offset;
1974 *cs++ = 0;
1975 *cs++ = value;
1976
1977 return cs;
1978 }
1979
static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1981 {
1982 *cs++ = MI_SEMAPHORE_WAIT |
1983 MI_SEMAPHORE_GLOBAL_GTT |
1984 MI_SEMAPHORE_POLL |
1985 mode;
1986 *cs++ = value;
1987 *cs++ = offset;
1988 *cs++ = 0;
1989
1990 return cs;
1991 }
1992
static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1994 {
1995 return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1996 }
1997
static void semaphore_set(u32 *sema, u32 value)
1999 {
2000 WRITE_ONCE(*sema, value);
2001 wmb(); /* flush the update to the cache, and beyond */
2002 }
2003
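/*
 * Carve out a small scratch area in the engine's status page (HWSP), clear of
 * the dwords used by the driver, for semaphores and timestamp writes.
 */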
static u32 *hwsp_scratch(const struct intel_context *ce)
2005 {
2006 return memset32(ce->engine->status_page.addr + 1000, 0, 21);
2007 }
2008
static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
2010 {
2011 return (i915_ggtt_offset(ce->engine->status_page.vma) +
2012 offset_in_page(dw));
2013 }
2014
static int measure_semaphore_response(struct intel_context *ce)
2016 {
2017 u32 *sema = hwsp_scratch(ce);
2018 const u32 offset = hwsp_offset(ce, sema);
2019 u32 elapsed[TF_COUNT], cycles;
2020 struct i915_request *rq;
2021 u32 *cs;
2022 int err;
2023 int i;
2024
2025 /*
2026 * Measure how many cycles it takes for the HW to detect the change
2027 * in a semaphore value.
2028 *
2029 * A: read CS_TIMESTAMP from CPU
2030 * poke semaphore
2031 * B: read CS_TIMESTAMP on GPU
2032 *
2033 * Semaphore latency: B - A
2034 */
2035
2036 semaphore_set(sema, -1);
2037
2038 rq = i915_request_create(ce);
2039 if (IS_ERR(rq))
2040 return PTR_ERR(rq);
2041
2042 cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
2043 if (IS_ERR(cs)) {
2044 i915_request_add(rq);
2045 err = PTR_ERR(cs);
2046 goto err;
2047 }
2048
2049 cs = emit_store_dw(cs, offset, 0);
2050 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2051 cs = emit_semaphore_poll_until(cs, offset, i);
2052 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2053 cs = emit_store_dw(cs, offset, 0);
2054 }
2055
2056 intel_ring_advance(rq, cs);
2057 i915_request_add(rq);
2058
2059 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2060 err = -EIO;
2061 goto err;
2062 }
2063
2064 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2065 preempt_disable();
2066 cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2067 semaphore_set(sema, i);
2068 preempt_enable();
2069
2070 if (wait_for(READ_ONCE(*sema) == 0, 50)) {
2071 err = -EIO;
2072 goto err;
2073 }
2074
2075 elapsed[i - 1] = sema[i] - cycles;
2076 }
2077
2078 cycles = trifilter(elapsed);
2079 pr_info("%s: semaphore response %d cycles, %lluns\n",
2080 ce->engine->name, cycles >> TF_BIAS,
2081 cycles_to_ns(ce->engine, cycles));
2082
2083 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2084
2085 err:
2086 intel_gt_set_wedged(ce->engine->gt);
2087 return err;
2088 }
2089
static int measure_idle_dispatch(struct intel_context *ce)
2091 {
2092 u32 *sema = hwsp_scratch(ce);
2093 const u32 offset = hwsp_offset(ce, sema);
2094 u32 elapsed[TF_COUNT], cycles;
2095 u32 *cs;
2096 int err;
2097 int i;
2098
2099 /*
2100 * Measure how long it takes for us to submit a request while the
2101 * engine is idle, but is resting in our context.
2102 *
2103 * A: read CS_TIMESTAMP from CPU
2104 * submit request
2105 * B: read CS_TIMESTAMP on GPU
2106 *
2107 * Submission latency: B - A
2108 */
2109
2110 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2111 struct i915_request *rq;
2112
2113 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2114 if (err)
2115 return err;
2116
2117 rq = i915_request_create(ce);
2118 if (IS_ERR(rq)) {
2119 err = PTR_ERR(rq);
2120 goto err;
2121 }
2122
2123 cs = intel_ring_begin(rq, 4);
2124 if (IS_ERR(cs)) {
2125 i915_request_add(rq);
2126 err = PTR_ERR(cs);
2127 goto err;
2128 }
2129
2130 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2131
2132 intel_ring_advance(rq, cs);
2133
2134 preempt_disable();
2135 local_bh_disable();
2136 elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2137 i915_request_add(rq);
2138 local_bh_enable();
2139 preempt_enable();
2140 }
2141
2142 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2143 if (err)
2144 goto err;
2145
2146 for (i = 0; i < ARRAY_SIZE(elapsed); i++)
2147 elapsed[i] = sema[i] - elapsed[i];
2148
2149 cycles = trifilter(elapsed);
2150 pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
2151 ce->engine->name, cycles >> TF_BIAS,
2152 cycles_to_ns(ce->engine, cycles));
2153
2154 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155
2156 err:
2157 intel_gt_set_wedged(ce->engine->gt);
2158 return err;
2159 }
2160
measure_busy_dispatch(struct intel_context * ce)2161 static int measure_busy_dispatch(struct intel_context *ce)
2162 {
2163 u32 *sema = hwsp_scratch(ce);
2164 const u32 offset = hwsp_offset(ce, sema);
2165 u32 elapsed[TF_COUNT + 1], cycles;
2166 u32 *cs;
2167 int err;
2168 int i;
2169
2170 /*
2171 * Measure how long it takes for us to submit a request while the
2172 * engine is busy, polling on a semaphore in our context. With
2173 * direct submission, this will include the cost of a lite restore.
2174 *
2175 * A: read CS_TIMESTAMP from CPU
2176 * submit request
2177 * B: read CS_TIMESTAMP on GPU
2178 *
2179 * Submission latency: B - A
2180 */
2181
2182 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2183 struct i915_request *rq;
2184
2185 rq = i915_request_create(ce);
2186 if (IS_ERR(rq)) {
2187 err = PTR_ERR(rq);
2188 goto err;
2189 }
2190
2191 cs = intel_ring_begin(rq, 12);
2192 if (IS_ERR(cs)) {
2193 i915_request_add(rq);
2194 err = PTR_ERR(cs);
2195 goto err;
2196 }
2197
2198 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2199 cs = emit_semaphore_poll_until(cs, offset, i);
2200 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2201
2202 intel_ring_advance(rq, cs);
2203
2204 if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2205 err = -EIO;
2206 goto err;
2207 }
2208
2209 preempt_disable();
2210 local_bh_disable();
2211 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2212 i915_request_add(rq);
2213 local_bh_enable();
2214 semaphore_set(sema, i - 1);
2215 preempt_enable();
2216 }
2217
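/*
 * The last request is still spinning on the semaphore: wait for it to
 * start executing (it writes -1 into its result slot) and then release
 * it so the engine can idle.
 */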
2218 wait_for(READ_ONCE(sema[i - 1]), 500);
2219 semaphore_set(sema, i - 1);
2220
2221 for (i = 1; i <= TF_COUNT; i++) {
2222 GEM_BUG_ON(sema[i] == -1);
2223 elapsed[i - 1] = sema[i] - elapsed[i];
2224 }
2225
2226 cycles = trifilter(elapsed);
2227 pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2228 ce->engine->name, cycles >> TF_BIAS,
2229 cycles_to_ns(ce->engine, cycles));
2230
2231 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2232
2233 err:
2234 intel_gt_set_wedged(ce->engine->gt);
2235 return err;
2236 }
2237
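/*
 * plug() keeps the engine busy: it submits a kernel-context request that
 * spins on a semaphore in the status page until the caller releases it
 * with semaphore_set().
 */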
plug(struct intel_engine_cs * engine,u32 * sema,u32 mode,int value)2238 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2239 {
2240 const u32 offset =
2241 i915_ggtt_offset(engine->status_page.vma) +
2242 offset_in_page(sema);
2243 struct i915_request *rq;
2244 u32 *cs;
2245
2246 rq = i915_request_create(engine->kernel_context);
2247 if (IS_ERR(rq))
2248 return PTR_ERR(rq);
2249
2250 cs = intel_ring_begin(rq, 4);
2251 if (IS_ERR(cs)) {
2252 i915_request_add(rq);
2253 return PTR_ERR(cs);
2254 }
2255
2256 cs = emit_semaphore_poll(cs, mode, value, offset);
2257
2258 intel_ring_advance(rq, cs);
2259 i915_request_add(rq);
2260
2261 return 0;
2262 }
2263
measure_inter_request(struct intel_context * ce)2264 static int measure_inter_request(struct intel_context *ce)
2265 {
2266 u32 *sema = hwsp_scratch(ce);
2267 const u32 offset = hwsp_offset(ce, sema);
2268 u32 elapsed[TF_COUNT + 1], cycles;
2269 struct i915_sw_fence *submit;
2270 int i, err;
2271
2272 /*
2273 * Measure how long it takes to advance from one request into the
2274 * next. Between each request we flush the GPU caches to memory,
2275 * update the breadcrumbs, and then invalidate those caches.
2276 * We queue up all the requests to be submitted in one batch so
2277 * it should be one set of contiguous measurements.
2278 *
2279 * A: read CS_TIMESTAMP on GPU
2280 * advance request
2281 * B: read CS_TIMESTAMP on GPU
2282 *
2283 * Request latency: B - A
2284 */
2285
2286 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2287 if (err)
2288 return err;
2289
2290 submit = heap_fence_create(GFP_KERNEL);
2291 if (!submit) {
2292 semaphore_set(sema, 1);
2293 return -ENOMEM;
2294 }
2295
2296 intel_engine_flush_submission(ce->engine);
2297 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2298 struct i915_request *rq;
2299 u32 *cs;
2300
2301 rq = i915_request_create(ce);
2302 if (IS_ERR(rq)) {
2303 err = PTR_ERR(rq);
2304 goto err_submit;
2305 }
2306
2307 err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2308 submit,
2309 GFP_KERNEL);
2310 if (err < 0) {
2311 i915_request_add(rq);
2312 goto err_submit;
2313 }
2314
2315 cs = intel_ring_begin(rq, 4);
2316 if (IS_ERR(cs)) {
2317 i915_request_add(rq);
2318 err = PTR_ERR(cs);
2319 goto err_submit;
2320 }
2321
2322 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2323
2324 intel_ring_advance(rq, cs);
2325 i915_request_add(rq);
2326 }
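/*
 * Everything is now queued behind the submit fence; commit it so the
 * requests are submitted back-to-back, then unblock the plug so the
 * engine can chew through them.
 */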
2327 i915_sw_fence_commit(submit);
2328 intel_engine_flush_submission(ce->engine);
2329 heap_fence_put(submit);
2330
2331 semaphore_set(sema, 1);
2332 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2333 if (err)
2334 goto err;
2335
2336 for (i = 1; i <= TF_COUNT; i++)
2337 elapsed[i - 1] = sema[i + 1] - sema[i];
2338
2339 cycles = trifilter(elapsed);
2340 pr_info("%s: inter-request latency %d cycles, %lluns\n",
2341 ce->engine->name, cycles >> TF_BIAS,
2342 cycles_to_ns(ce->engine, cycles));
2343
2344 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2345
2346 err_submit:
2347 i915_sw_fence_commit(submit);
2348 heap_fence_put(submit);
2349 semaphore_set(sema, 1);
2350 err:
2351 intel_gt_set_wedged(ce->engine->gt);
2352 return err;
2353 }
2354
measure_context_switch(struct intel_context * ce)2355 static int measure_context_switch(struct intel_context *ce)
2356 {
2357 u32 *sema = hwsp_scratch(ce);
2358 const u32 offset = hwsp_offset(ce, sema);
2359 struct i915_request *fence = NULL;
2360 u32 elapsed[TF_COUNT + 1], cycles;
2361 int i, j, err;
2362 u32 *cs;
2363
2364 /*
2365 * Measure how long it takes to advance from one request in one
2366 * context to a request in another context. This allows us to
2367 * measure how long the context save/restore takes, along with all
2368 * the inter-context setup we require.
2369 *
2370 * A: read CS_TIMESTAMP on GPU
2371 * switch context
2372 * B: read CS_TIMESTAMP on GPU
2373 *
2374 * Context switch latency: B - A
2375 */
2376
2377 err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2378 if (err)
2379 return err;
2380
2381 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2382 struct intel_context *arr[] = {
2383 ce, ce->engine->kernel_context
2384 };
2385 u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2386
2387 for (j = 0; j < ARRAY_SIZE(arr); j++) {
2388 struct i915_request *rq;
2389
2390 rq = i915_request_create(arr[j]);
2391 if (IS_ERR(rq)) {
2392 err = PTR_ERR(rq);
2393 goto err_fence;
2394 }
2395
2396 if (fence) {
2397 err = i915_request_await_dma_fence(rq,
2398 &fence->fence);
2399 if (err) {
2400 i915_request_add(rq);
2401 goto err_fence;
2402 }
2403 }
2404
2405 cs = intel_ring_begin(rq, 4);
2406 if (IS_ERR(cs)) {
2407 i915_request_add(rq);
2408 err = PTR_ERR(cs);
2409 goto err_fence;
2410 }
2411
2412 cs = emit_timestamp_store(cs, ce, addr);
2413 addr += sizeof(u32);
2414
2415 intel_ring_advance(rq, cs);
2416
2417 i915_request_put(fence);
2418 fence = i915_request_get(rq);
2419
2420 i915_request_add(rq);
2421 }
2422 }
2423 i915_request_put(fence);
2424 intel_engine_flush_submission(ce->engine);
2425
2426 semaphore_set(sema, 1);
2427 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2428 if (err)
2429 goto err;
2430
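/*
 * Iteration i stored the ce timestamp at sema[2*i] and the kernel-context
 * timestamp at sema[2*i + 1]; the switch latency is measured from the
 * kernel-context stamp to the ce stamp of the next iteration.
 */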
2431 for (i = 1; i <= TF_COUNT; i++)
2432 elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2433
2434 cycles = trifilter(elapsed);
2435 pr_info("%s: context switch latency %d cycles, %lluns\n",
2436 ce->engine->name, cycles >> TF_BIAS,
2437 cycles_to_ns(ce->engine, cycles));
2438
2439 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2440
2441 err_fence:
2442 i915_request_put(fence);
2443 semaphore_set(sema, 1);
2444 err:
2445 intel_gt_set_wedged(ce->engine->gt);
2446 return err;
2447 }
2448
measure_preemption(struct intel_context * ce)2449 static int measure_preemption(struct intel_context *ce)
2450 {
2451 u32 *sema = hwsp_scratch(ce);
2452 const u32 offset = hwsp_offset(ce, sema);
2453 u32 elapsed[TF_COUNT], cycles;
2454 u32 *cs;
2455 int err;
2456 int i;
2457
2458 /*
2459 * We measure two latencies while triggering preemption. The first
2460 * latency is how long it takes for us to submit a preempting request.
2461 * The second latency is how long it takes for us to return from the
2462 * preemption back to the original context.
2463 *
2464 * A: read CS_TIMESTAMP from CPU
2465 * submit preemption
2466 * B: read CS_TIMESTAMP on GPU (in preempting context)
2467 * context switch
2468 * C: read CS_TIMESTAMP on GPU (in original context)
2469 *
2470 * Preemption dispatch latency: B - A
2471 * Preemption switch latency: C - B
2472 */
2473
2474 if (!intel_engine_has_preemption(ce->engine))
2475 return 0;
2476
2477 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2478 u32 addr = offset + 2 * i * sizeof(u32);
2479 struct i915_request *rq;
2480
2481 rq = i915_request_create(ce);
2482 if (IS_ERR(rq)) {
2483 err = PTR_ERR(rq);
2484 goto err;
2485 }
2486
2487 cs = intel_ring_begin(rq, 12);
2488 if (IS_ERR(cs)) {
2489 i915_request_add(rq);
2490 err = PTR_ERR(cs);
2491 goto err;
2492 }
2493
2494 cs = emit_store_dw(cs, addr, -1);
2495 cs = emit_semaphore_poll_until(cs, offset, i);
2496 cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2497
2498 intel_ring_advance(rq, cs);
2499 i915_request_add(rq);
2500
2501 if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2502 err = -EIO;
2503 goto err;
2504 }
2505
2506 rq = i915_request_create(ce->engine->kernel_context);
2507 if (IS_ERR(rq)) {
2508 err = PTR_ERR(rq);
2509 goto err;
2510 }
2511
2512 cs = intel_ring_begin(rq, 8);
2513 if (IS_ERR(cs)) {
2514 i915_request_add(rq);
2515 err = PTR_ERR(cs);
2516 goto err;
2517 }
2518
2519 cs = emit_timestamp_store(cs, ce, addr);
2520 cs = emit_store_dw(cs, offset, i);
2521
2522 intel_ring_advance(rq, cs);
2523 rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2524
2525 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2526 i915_request_add(rq);
2527 }
2528
2529 if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2530 err = -EIO;
2531 goto err;
2532 }
2533
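/*
 * sema[2*i] holds the timestamp written by the preempting kernel-context
 * request and sema[2*i + 1] the one written by the original context once
 * it resumes; elapsed[] holds the CPU timestamp taken just before
 * submitting each preempting request.
 */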
2534 for (i = 1; i <= TF_COUNT; i++)
2535 elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2536
2537 cycles = trifilter(elapsed);
2538 pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2539 ce->engine->name, cycles >> TF_BIAS,
2540 cycles_to_ns(ce->engine, cycles));
2541
2542 for (i = 1; i <= TF_COUNT; i++)
2543 elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2544
2545 cycles = trifilter(elapsed);
2546 pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2547 ce->engine->name, cycles >> TF_BIAS,
2548 cycles_to_ns(ce->engine, cycles));
2549
2550 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2551
2552 err:
2553 intel_gt_set_wedged(ce->engine->gt);
2554 return err;
2555 }
2556
2557 struct signal_cb {
2558 struct dma_fence_cb base;
2559 bool seen;
2560 };
2561
signal_cb(struct dma_fence * fence,struct dma_fence_cb * cb)2562 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2563 {
2564 struct signal_cb *s = container_of(cb, typeof(*s), base);
2565
2566 smp_store_mb(s->seen, true); /* be safe, be strong */
2567 }
2568
measure_completion(struct intel_context * ce)2569 static int measure_completion(struct intel_context *ce)
2570 {
2571 u32 *sema = hwsp_scratch(ce);
2572 const u32 offset = hwsp_offset(ce, sema);
2573 u32 elapsed[TF_COUNT], cycles;
2574 u32 *cs;
2575 int err;
2576 int i;
2577
2578 /*
2579 * Measure how long it takes for the signal (interrupt) sent
2580 * from the GPU to be processed by the CPU.
2581 *
2582 * A: read CS_TIMESTAMP on GPU
2583 * signal
2584 * B: read CS_TIMESTAMP from CPU
2585 *
2586 * Completion latency: B - A
2587 */
2588
2589 for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2590 struct signal_cb cb = { .seen = false };
2591 struct i915_request *rq;
2592
2593 rq = i915_request_create(ce);
2594 if (IS_ERR(rq)) {
2595 err = PTR_ERR(rq);
2596 goto err;
2597 }
2598
2599 cs = intel_ring_begin(rq, 12);
2600 if (IS_ERR(cs)) {
2601 i915_request_add(rq);
2602 err = PTR_ERR(cs);
2603 goto err;
2604 }
2605
2606 cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2607 cs = emit_semaphore_poll_until(cs, offset, i);
2608 cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2609
2610 intel_ring_advance(rq, cs);
2611
2612 dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
2613 i915_request_add(rq);
2614
2615 intel_engine_flush_submission(ce->engine);
2616 if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
2617 err = -EIO;
2618 goto err;
2619 }
2620
2621 preempt_disable();
2622 semaphore_set(sema, i);
2623 while (!READ_ONCE(cb.seen))
2624 cpu_relax();
2625
2626 elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2627 preempt_enable();
2628 }
2629
2630 err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2631 if (err)
2632 goto err;
2633
2634 for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
2635 GEM_BUG_ON(sema[i + 1] == -1);
2636 elapsed[i] = elapsed[i] - sema[i + 1];
2637 }
2638
2639 cycles = trifilter(elapsed);
2640 pr_info("%s: completion latency %d cycles, %lluns\n",
2641 ce->engine->name, cycles >> TF_BIAS,
2642 cycles_to_ns(ce->engine, cycles));
2643
2644 return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2645
2646 err:
2647 intel_gt_set_wedged(ce->engine->gt);
2648 return err;
2649 }
2650
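/*
 * Pin the GPU to its maximum frequency (and hold forcewake) for the
 * duration of the measurements, so the results are not skewed by RPS
 * ramping the clocks up and down mid-test.
 */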
rps_pin(struct intel_gt * gt)2651 static void rps_pin(struct intel_gt *gt)
2652 {
2653 /* Pin the frequency to max */
2654 atomic_inc(&gt->rps.num_waiters);
2655 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2656
2657 mutex_lock(&gt->rps.lock);
2658 intel_rps_set(&gt->rps, gt->rps.max_freq);
2659 mutex_unlock(&gt->rps.lock);
2660 }
2661
rps_unpin(struct intel_gt * gt)2662 static void rps_unpin(struct intel_gt *gt)
2663 {
2664 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2665 atomic_dec(&gt->rps.num_waiters);
2666 }
2667
perf_request_latency(void * arg)2668 static int perf_request_latency(void *arg)
2669 {
2670 struct drm_i915_private *i915 = arg;
2671 struct intel_engine_cs *engine;
2672 struct pm_qos_request qos;
2673 int err = 0;
2674
2675 if (GRAPHICS_VER(i915) < 8) /* per-engine CS timestamp, semaphores */
2676 return 0;
2677
2678 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2679
2680 for_each_uabi_engine(engine, i915) {
2681 struct intel_context *ce;
2682
2683 ce = intel_context_create(engine);
2684 if (IS_ERR(ce)) {
2685 err = PTR_ERR(ce);
2686 goto out;
2687 }
2688
2689 err = intel_context_pin(ce);
2690 if (err) {
2691 intel_context_put(ce);
2692 goto out;
2693 }
2694
2695 st_engine_heartbeat_disable(engine);
2696 rps_pin(engine->gt);
2697
2698 if (err == 0)
2699 err = measure_semaphore_response(ce);
2700 if (err == 0)
2701 err = measure_idle_dispatch(ce);
2702 if (err == 0)
2703 err = measure_busy_dispatch(ce);
2704 if (err == 0)
2705 err = measure_inter_request(ce);
2706 if (err == 0)
2707 err = measure_context_switch(ce);
2708 if (err == 0)
2709 err = measure_preemption(ce);
2710 if (err == 0)
2711 err = measure_completion(ce);
2712
2713 rps_unpin(engine->gt);
2714 st_engine_heartbeat_enable(engine);
2715
2716 intel_context_unpin(ce);
2717 intel_context_put(ce);
2718 if (err)
2719 goto out;
2720 }
2721
2722 out:
2723 if (igt_flush_test(i915))
2724 err = -EIO;
2725
2726 cpu_latency_qos_remove_request(&qos);
2727 return err;
2728 }
2729
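/* Series, synchronous: submit to each engine in turn, waiting for each request before moving on. */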
s_sync0(void * arg)2730 static int s_sync0(void *arg)
2731 {
2732 struct perf_series *ps = arg;
2733 IGT_TIMEOUT(end_time);
2734 unsigned int idx = 0;
2735 int err = 0;
2736
2737 GEM_BUG_ON(!ps->nengines);
2738 do {
2739 struct i915_request *rq;
2740
2741 rq = i915_request_create(ps->ce[idx]);
2742 if (IS_ERR(rq)) {
2743 err = PTR_ERR(rq);
2744 break;
2745 }
2746
2747 i915_request_get(rq);
2748 i915_request_add(rq);
2749
2750 if (i915_request_wait(rq, 0, HZ / 5) < 0)
2751 err = -ETIME;
2752 i915_request_put(rq);
2753 if (err)
2754 break;
2755
2756 if (++idx == ps->nengines)
2757 idx = 0;
2758 } while (!__igt_timeout(end_time, NULL));
2759
2760 return err;
2761 }
2762
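/* Series, pipelined: keep one request in flight, waiting on the previous request only after submitting the next. */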
s_sync1(void * arg)2763 static int s_sync1(void *arg)
2764 {
2765 struct perf_series *ps = arg;
2766 struct i915_request *prev = NULL;
2767 IGT_TIMEOUT(end_time);
2768 unsigned int idx = 0;
2769 int err = 0;
2770
2771 GEM_BUG_ON(!ps->nengines);
2772 do {
2773 struct i915_request *rq;
2774
2775 rq = i915_request_create(ps->ce[idx]);
2776 if (IS_ERR(rq)) {
2777 err = PTR_ERR(rq);
2778 break;
2779 }
2780
2781 i915_request_get(rq);
2782 i915_request_add(rq);
2783
2784 if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2785 err = -ETIME;
2786 i915_request_put(prev);
2787 prev = rq;
2788 if (err)
2789 break;
2790
2791 if (++idx == ps->nengines)
2792 idx = 0;
2793 } while (!__igt_timeout(end_time, NULL));
2794 i915_request_put(prev);
2795
2796 return err;
2797 }
2798
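/* Series, throughput: submit round-robin across all engines without waiting for completion. */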
s_many(void * arg)2799 static int s_many(void *arg)
2800 {
2801 struct perf_series *ps = arg;
2802 IGT_TIMEOUT(end_time);
2803 unsigned int idx = 0;
2804
2805 GEM_BUG_ON(!ps->nengines);
2806 do {
2807 struct i915_request *rq;
2808
2809 rq = i915_request_create(ps->ce[idx]);
2810 if (IS_ERR(rq))
2811 return PTR_ERR(rq);
2812
2813 i915_request_add(rq);
2814
2815 if (++idx == ps->nengines)
2816 idx = 0;
2817 } while (!__igt_timeout(end_time, NULL));
2818
2819 return 0;
2820 }
2821
perf_series_engines(void * arg)2822 static int perf_series_engines(void *arg)
2823 {
2824 struct drm_i915_private *i915 = arg;
2825 static int (* const func[])(void *arg) = {
2826 s_sync0,
2827 s_sync1,
2828 s_many,
2829 NULL,
2830 };
2831 const unsigned int nengines = num_uabi_engines(i915);
2832 struct intel_engine_cs *engine;
2833 int (* const *fn)(void *arg);
2834 struct pm_qos_request qos;
2835 struct perf_stats *stats;
2836 struct perf_series *ps;
2837 unsigned int idx;
2838 int err = 0;
2839
2840 stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
2841 if (!stats)
2842 return -ENOMEM;
2843
2844 ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
2845 if (!ps) {
2846 kfree(stats);
2847 return -ENOMEM;
2848 }
2849
2850 cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2851
2852 ps->i915 = i915;
2853 ps->nengines = nengines;
2854
2855 idx = 0;
2856 for_each_uabi_engine(engine, i915) {
2857 struct intel_context *ce;
2858
2859 ce = intel_context_create(engine);
2860 if (IS_ERR(ce)) {
2861 err = PTR_ERR(ce);
2862 goto out;
2863 }
2864
2865 err = intel_context_pin(ce);
2866 if (err) {
2867 intel_context_put(ce);
2868 goto out;
2869 }
2870
2871 ps->ce[idx++] = ce;
2872 }
2873 GEM_BUG_ON(idx != ps->nengines);
2874
2875 for (fn = func; *fn && !err; fn++) {
2876 char name[KSYM_NAME_LEN];
2877 struct igt_live_test t;
2878
2879 snprintf(name, sizeof(name), "%ps", *fn);
2880 err = igt_live_test_begin(&t, i915, __func__, name);
2881 if (err)
2882 break;
2883
2884 for (idx = 0; idx < nengines; idx++) {
2885 struct perf_stats *p =
2886 memset(&stats[idx], 0, sizeof(stats[idx]));
2887 struct intel_context *ce = ps->ce[idx];
2888
2889 p->engine = ps->ce[idx]->engine;
2890 intel_engine_pm_get(p->engine);
2891
2892 if (intel_engine_supports_stats(p->engine))
2893 p->busy = intel_engine_get_busy_time(p->engine,
2894 &p->time) + 1;
2895 else
2896 p->time = ktime_get();
2897 p->runtime = -intel_context_get_total_runtime_ns(ce);
2898 }
2899
2900 err = (*fn)(ps);
2901 if (igt_live_test_end(&t))
2902 err = -EIO;
2903
2904 for (idx = 0; idx < nengines; idx++) {
2905 struct perf_stats *p = &stats[idx];
2906 struct intel_context *ce = ps->ce[idx];
2907 int integer, decimal;
2908 u64 busy, dt, now;
2909
2910 if (p->busy)
2911 p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
2912 &now),
2913 p->busy - 1);
2914 else
2915 now = ktime_get();
2916 p->time = ktime_sub(now, p->time);
2917
2918 err = switch_to_kernel_sync(ce, err);
2919 p->runtime += intel_context_get_total_runtime_ns(ce);
2920 intel_engine_pm_put(p->engine);
2921
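/* Express busyness as a percentage with two decimal places. */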
2922 busy = 100 * ktime_to_ns(p->busy);
2923 dt = ktime_to_ns(p->time);
2924 if (dt) {
2925 integer = div64_u64(busy, dt);
2926 busy -= integer * dt;
2927 decimal = div64_u64(100 * busy, dt);
2928 } else {
2929 integer = 0;
2930 decimal = 0;
2931 }
2932
2933 pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
2934 name, p->engine->name, ce->timeline->seqno,
2935 integer, decimal,
2936 div_u64(p->runtime, 1000 * 1000),
2937 div_u64(ktime_to_ns(p->time), 1000 * 1000));
2938 }
2939 }
2940
2941 out:
2942 for (idx = 0; idx < nengines; idx++) {
2943 if (IS_ERR_OR_NULL(ps->ce[idx]))
2944 break;
2945
2946 intel_context_unpin(ps->ce[idx]);
2947 intel_context_put(ps->ce[idx]);
2948 }
2949 kfree(ps);
2950
2951 cpu_latency_qos_remove_request(&qos);
2952 kfree(stats);
2953 return err;
2954 }
2955
2956 struct p_thread {
2957 struct perf_stats p;
2958 struct kthread_worker *worker;
2959 struct kthread_work work;
2960 struct intel_engine_cs *engine;
2961 int result;
2962 };
2963
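/* Per-engine worker, synchronous: submit one request at a time and wait for it to complete. */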
p_sync0(struct kthread_work * work)2964 static void p_sync0(struct kthread_work *work)
2965 {
2966 struct p_thread *thread = container_of(work, typeof(*thread), work);
2967 struct perf_stats *p = &thread->p;
2968 struct intel_engine_cs *engine = p->engine;
2969 struct intel_context *ce;
2970 IGT_TIMEOUT(end_time);
2971 unsigned long count;
2972 bool busy;
2973 int err = 0;
2974
2975 ce = intel_context_create(engine);
2976 if (IS_ERR(ce)) {
2977 thread->result = PTR_ERR(ce);
2978 return;
2979 }
2980
2981 err = intel_context_pin(ce);
2982 if (err) {
2983 intel_context_put(ce);
2984 thread->result = err;
2985 return;
2986 }
2987
2988 if (intel_engine_supports_stats(engine)) {
2989 p->busy = intel_engine_get_busy_time(engine, &p->time);
2990 busy = true;
2991 } else {
2992 p->time = ktime_get();
2993 busy = false;
2994 }
2995
2996 count = 0;
2997 do {
2998 struct i915_request *rq;
2999
3000 rq = i915_request_create(ce);
3001 if (IS_ERR(rq)) {
3002 err = PTR_ERR(rq);
3003 break;
3004 }
3005
3006 i915_request_get(rq);
3007 i915_request_add(rq);
3008
3009 err = 0;
3010 if (i915_request_wait(rq, 0, HZ) < 0)
3011 err = -ETIME;
3012 i915_request_put(rq);
3013 if (err)
3014 break;
3015
3016 count++;
3017 } while (!__igt_timeout(end_time, NULL));
3018
3019 if (busy) {
3020 ktime_t now;
3021
3022 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3023 p->busy);
3024 p->time = ktime_sub(now, p->time);
3025 } else {
3026 p->time = ktime_sub(ktime_get(), p->time);
3027 }
3028
3029 err = switch_to_kernel_sync(ce, err);
3030 p->runtime = intel_context_get_total_runtime_ns(ce);
3031 p->count = count;
3032
3033 intel_context_unpin(ce);
3034 intel_context_put(ce);
3035 thread->result = err;
3036 }
3037
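/* Per-engine worker, pipelined: wait on the previous request while the next is already queued. */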
p_sync1(struct kthread_work * work)3038 static void p_sync1(struct kthread_work *work)
3039 {
3040 struct p_thread *thread = container_of(work, typeof(*thread), work);
3041 struct perf_stats *p = &thread->p;
3042 struct intel_engine_cs *engine = p->engine;
3043 struct i915_request *prev = NULL;
3044 struct intel_context *ce;
3045 IGT_TIMEOUT(end_time);
3046 unsigned long count;
3047 bool busy;
3048 int err = 0;
3049
3050 ce = intel_context_create(engine);
3051 if (IS_ERR(ce)) {
3052 thread->result = PTR_ERR(ce);
3053 return;
3054 }
3055
3056 err = intel_context_pin(ce);
3057 if (err) {
3058 intel_context_put(ce);
3059 thread->result = err;
3060 return;
3061 }
3062
3063 if (intel_engine_supports_stats(engine)) {
3064 p->busy = intel_engine_get_busy_time(engine, &p->time);
3065 busy = true;
3066 } else {
3067 p->time = ktime_get();
3068 busy = false;
3069 }
3070
3071 count = 0;
3072 do {
3073 struct i915_request *rq;
3074
3075 rq = i915_request_create(ce);
3076 if (IS_ERR(rq)) {
3077 err = PTR_ERR(rq);
3078 break;
3079 }
3080
3081 i915_request_get(rq);
3082 i915_request_add(rq);
3083
3084 err = 0;
3085 if (prev && i915_request_wait(prev, 0, HZ) < 0)
3086 err = -ETIME;
3087 i915_request_put(prev);
3088 prev = rq;
3089 if (err)
3090 break;
3091
3092 count++;
3093 } while (!__igt_timeout(end_time, NULL));
3094 i915_request_put(prev);
3095
3096 if (busy) {
3097 ktime_t now;
3098
3099 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3100 p->busy);
3101 p->time = ktime_sub(now, p->time);
3102 } else {
3103 p->time = ktime_sub(ktime_get(), p->time);
3104 }
3105
3106 err = switch_to_kernel_sync(ce, err);
3107 p->runtime = intel_context_get_total_runtime_ns(ce);
3108 p->count = count;
3109
3110 intel_context_unpin(ce);
3111 intel_context_put(ce);
3112 thread->result = err;
3113 }
3114
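/* Per-engine worker, throughput: submit as many requests as possible without waiting. */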
p_many(struct kthread_work * work)3115 static void p_many(struct kthread_work *work)
3116 {
3117 struct p_thread *thread = container_of(work, typeof(*thread), work);
3118 struct perf_stats *p = &thread->p;
3119 struct intel_engine_cs *engine = p->engine;
3120 struct intel_context *ce;
3121 IGT_TIMEOUT(end_time);
3122 unsigned long count;
3123 int err = 0;
3124 bool busy;
3125
3126 ce = intel_context_create(engine);
3127 if (IS_ERR(ce)) {
3128 thread->result = PTR_ERR(ce);
3129 return;
3130 }
3131
3132 err = intel_context_pin(ce);
3133 if (err) {
3134 intel_context_put(ce);
3135 thread->result = err;
3136 return;
3137 }
3138
3139 if (intel_engine_supports_stats(engine)) {
3140 p->busy = intel_engine_get_busy_time(engine, &p->time);
3141 busy = true;
3142 } else {
3143 p->time = ktime_get();
3144 busy = false;
3145 }
3146
3147 count = 0;
3148 do {
3149 struct i915_request *rq;
3150
3151 rq = i915_request_create(ce);
3152 if (IS_ERR(rq)) {
3153 err = PTR_ERR(rq);
3154 break;
3155 }
3156
3157 i915_request_add(rq);
3158 count++;
3159 } while (!__igt_timeout(end_time, NULL));
3160
3161 if (busy) {
3162 ktime_t now;
3163
3164 p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
3165 p->busy);
3166 p->time = ktime_sub(now, p->time);
3167 } else {
3168 p->time = ktime_sub(ktime_get(), p->time);
3169 }
3170
3171 err = switch_to_kernel_sync(ce, err);
3172 p->runtime = intel_context_get_total_runtime_ns(ce);
3173 p->count = count;
3174
3175 intel_context_unpin(ce);
3176 intel_context_put(ce);
3177 thread->result = err;
3178 }
3179
perf_parallel_engines(void * arg)3180 static int perf_parallel_engines(void *arg)
3181 {
3182 struct drm_i915_private *i915 = arg;
3183 static void (* const func[])(struct kthread_work *) = {
3184 p_sync0,
3185 p_sync1,
3186 p_many,
3187 NULL,
3188 };
3189 const unsigned int nengines = num_uabi_engines(i915);
3190 void (* const *fn)(struct kthread_work *);
3191 struct intel_engine_cs *engine;
3192 struct pm_qos_request qos;
3193 struct p_thread *engines;
3194 int err = 0;
3195
3196 engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
3197 if (!engines)
3198 return -ENOMEM;
3199
3200 cpu_latency_qos_add_request(&qos, 0);
3201
3202 for (fn = func; *fn; fn++) {
3203 char name[KSYM_NAME_LEN];
3204 struct igt_live_test t;
3205 unsigned int idx;
3206
3207 snprintf(name, sizeof(name), "%ps", *fn);
3208 err = igt_live_test_begin(&t, i915, __func__, name);
3209 if (err)
3210 break;
3211
3212 atomic_set(&i915->selftest.counter, nengines);
3213
3214 idx = 0;
3215 for_each_uabi_engine(engine, i915) {
3216 struct kthread_worker *worker;
3217
3218 intel_engine_pm_get(engine);
3219
3220 memset(&engines[idx].p, 0, sizeof(engines[idx].p));
3221
3222 worker = kthread_create_worker(0, "igt:%s",
3223 engine->name);
3224 if (IS_ERR(worker)) {
3225 err = PTR_ERR(worker);
3226 intel_engine_pm_put(engine);
3227 break;
3228 }
3229 engines[idx].worker = worker;
3230 engines[idx].result = 0;
3231 engines[idx].p.engine = engine;
3232 engines[idx].engine = engine;
3233
3234 kthread_init_work(&engines[idx].work, *fn);
3235 kthread_queue_work(worker, &engines[idx].work);
3236 idx++;
3237 }
3238
3239 idx = 0;
3240 for_each_uabi_engine(engine, i915) {
3241 int status;
3242
3243 if (!engines[idx].worker)
3244 break;
3245
3246 kthread_flush_work(&engines[idx].work);
3247 status = READ_ONCE(engines[idx].result);
3248 if (status && !err)
3249 err = status;
3250
3251 intel_engine_pm_put(engine);
3252
3253 kthread_destroy_worker(engines[idx].worker);
3254 idx++;
3255 }
3256
3257 if (igt_live_test_end(&t))
3258 err = -EIO;
3259 if (err)
3260 break;
3261
3262 idx = 0;
3263 for_each_uabi_engine(engine, i915) {
3264 struct perf_stats *p = &engines[idx].p;
3265 u64 busy = 100 * ktime_to_ns(p->busy);
3266 u64 dt = ktime_to_ns(p->time);
3267 int integer, decimal;
3268
3269 if (dt) {
3270 integer = div64_u64(busy, dt);
3271 busy -= integer * dt;
3272 decimal = div64_u64(100 * busy, dt);
3273 } else {
3274 integer = 0;
3275 decimal = 0;
3276 }
3277
3278 GEM_BUG_ON(engine != p->engine);
3279 pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
3280 name, engine->name, p->count, integer, decimal,
3281 div_u64(p->runtime, 1000 * 1000),
3282 div_u64(ktime_to_ns(p->time), 1000 * 1000));
3283 idx++;
3284 }
3285 }
3286
3287 cpu_latency_qos_remove_request(&qos);
3288 kfree(engines);
3289 return err;
3290 }
3291
i915_request_perf_selftests(struct drm_i915_private * i915)3292 int i915_request_perf_selftests(struct drm_i915_private *i915)
3293 {
3294 static const struct i915_subtest tests[] = {
3295 SUBTEST(perf_request_latency),
3296 SUBTEST(perf_series_engines),
3297 SUBTEST(perf_parallel_engines),
3298 };
3299
3300 if (intel_gt_is_wedged(to_gt(i915)))
3301 return 0;
3302
3303 return i915_subtests(tests, i915);
3304 }
3305