1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2020 Intel Corporation
4 */
5
6 #include <linux/pm_qos.h>
7 #include <linux/sort.h>
8
9 #include "intel_engine_heartbeat.h"
10 #include "intel_engine_pm.h"
11 #include "intel_gpu_commands.h"
12 #include "intel_gt_clock_utils.h"
13 #include "intel_gt_pm.h"
14 #include "intel_rc6.h"
15 #include "selftest_engine_heartbeat.h"
16 #include "selftest_rps.h"
17 #include "selftests/igt_flush_test.h"
18 #include "selftests/igt_spinner.h"
19 #include "selftests/librapl.h"
20
/*
 * Try to isolate the impact of cstates from determining frequency response.
 * When CPU_LATENCY is >= 0 the frequency tests pass it to
 * cpu_latency_qos_add_request() for the duration of the test.
 */
#define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */
23
/*
 * Do-nothing work handler. The live tests in this file install it as
 * rps->work.func while they run and restore the saved handler afterwards.
 */
static void dummy_rps_work(struct work_struct *wrk)
{
	/* Deliberately empty. */
}
27
/*
 * Comparator for sort(): orders two u64 values ascending.
 * Returns -1 if *A < *B, 1 if *A > *B, and 0 if they are equal.
 */
static int cmp_u64(const void *A, const void *B)
{
	const u64 lhs = *(const u64 *)A;
	const u64 rhs = *(const u64 *)B;

	return (lhs > rhs) - (lhs < rhs);
}
39
/*
 * Comparator for sort(): orders two u32 values ascending.
 * Returns -1 if *A < *B, 1 if *A > *B, and 0 if they are equal.
 */
static int cmp_u32(const void *A, const void *B)
{
	const u32 *lhs = A, *rhs = B;

	if (*lhs == *rhs)
		return 0;

	return *lhs < *rhs ? -1 : 1;
}
51
52 static struct i915_vma *
create_spin_counter(struct intel_engine_cs * engine,struct i915_address_space * vm,bool srm,u32 ** cancel,u32 ** counter)53 create_spin_counter(struct intel_engine_cs *engine,
54 struct i915_address_space *vm,
55 bool srm,
56 u32 **cancel,
57 u32 **counter)
58 {
59 enum {
60 COUNT,
61 INC,
62 __NGPR__,
63 };
64 #define CS_GPR(x) GEN8_RING_CS_GPR(engine->mmio_base, x)
65 struct drm_i915_gem_object *obj;
66 struct i915_vma *vma;
67 unsigned long end;
68 u32 *base, *cs;
69 int loop, i;
70 int err;
71
72 obj = i915_gem_object_create_internal(vm->i915, 64 << 10);
73 if (IS_ERR(obj))
74 return ERR_CAST(obj);
75
76 end = obj->base.size / sizeof(u32) - 1;
77
78 vma = i915_vma_instance(obj, vm, NULL);
79 if (IS_ERR(vma)) {
80 err = PTR_ERR(vma);
81 goto err_put;
82 }
83
84 err = i915_vma_pin(vma, 0, 0, PIN_USER);
85 if (err)
86 goto err_unlock;
87
88 i915_vma_lock(vma);
89
90 base = i915_gem_object_pin_map(obj, I915_MAP_WC);
91 if (IS_ERR(base)) {
92 err = PTR_ERR(base);
93 goto err_unpin;
94 }
95 cs = base;
96
97 *cs++ = MI_LOAD_REGISTER_IMM(__NGPR__ * 2);
98 for (i = 0; i < __NGPR__; i++) {
99 *cs++ = i915_mmio_reg_offset(CS_GPR(i));
100 *cs++ = 0;
101 *cs++ = i915_mmio_reg_offset(CS_GPR(i)) + 4;
102 *cs++ = 0;
103 }
104
105 *cs++ = MI_LOAD_REGISTER_IMM(1);
106 *cs++ = i915_mmio_reg_offset(CS_GPR(INC));
107 *cs++ = 1;
108
109 loop = cs - base;
110
111 /* Unroll the loop to avoid MI_BB_START stalls impacting measurements */
112 for (i = 0; i < 1024; i++) {
113 *cs++ = MI_MATH(4);
114 *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(COUNT));
115 *cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(INC));
116 *cs++ = MI_MATH_ADD;
117 *cs++ = MI_MATH_STORE(MI_MATH_REG(COUNT), MI_MATH_REG_ACCU);
118
119 if (srm) {
120 *cs++ = MI_STORE_REGISTER_MEM_GEN8;
121 *cs++ = i915_mmio_reg_offset(CS_GPR(COUNT));
122 *cs++ = lower_32_bits(vma->node.start + end * sizeof(*cs));
123 *cs++ = upper_32_bits(vma->node.start + end * sizeof(*cs));
124 }
125 }
126
127 *cs++ = MI_BATCH_BUFFER_START_GEN8;
128 *cs++ = lower_32_bits(vma->node.start + loop * sizeof(*cs));
129 *cs++ = upper_32_bits(vma->node.start + loop * sizeof(*cs));
130 GEM_BUG_ON(cs - base > end);
131
132 i915_gem_object_flush_map(obj);
133
134 *cancel = base + loop;
135 *counter = srm ? memset32(base + end, 0, 1) : NULL;
136 return vma;
137
138 err_unpin:
139 i915_vma_unpin(vma);
140 err_unlock:
141 i915_vma_unlock(vma);
142 err_put:
143 i915_gem_object_put(obj);
144 return ERR_PTR(err);
145 }
146
/*
 * Poll the actual frequency (read_cagf()) until it reaches @freq, stops
 * changing, or @timeout_ms has elapsed, and return the last value read.
 *
 * "Stops changing" means the newest sample matches every entry of a
 * 64-sample history ring, which is seeded with @freq. The delay between
 * samples starts at 20 and doubles each iteration, capped at
 * 20 * @timeout_ms.
 */
static u8 wait_for_freq(struct intel_rps *rps, u8 freq, int timeout_ms)
{
	const int max_delay = timeout_ms * 20;
	unsigned long deadline;
	u8 samples[64];
	int delay = 20;
	u8 slot = 0;

	memset(samples, freq, sizeof(samples));

	/* The PCU does not change instantly, but drifts towards the goal? */
	deadline = jiffies + msecs_to_jiffies(timeout_ms);
	for (;;) {
		const u8 cur = read_cagf(rps);

		/*
		 * Stop on timeout, on reaching the target, or when no sample
		 * in the history window differs from the current value.
		 */
		if (time_after(jiffies, deadline) ||
		    cur == freq ||
		    !memchr_inv(samples, cur, sizeof(samples)))
			return cur;

		samples[slot] = cur;
		slot = (slot + 1) % ARRAY_SIZE(samples);

		usleep_range(delay, 2 * delay);

		delay *= 2;
		if (delay > max_delay)
			delay = max_delay;
	}
}
183
/*
 * Request @freq through intel_rps_set() under rps->lock, then wait for the
 * actual frequency to follow.
 *
 * Returns 0 if wait_for() reports that the request did not succeed within
 * its 50 timeout, otherwise whatever wait_for_freq() observed.
 */
static u8 rps_set_check(struct intel_rps *rps, u8 freq)
{
	int ret;

	mutex_lock(&rps->lock);
	GEM_BUG_ON(!intel_rps_is_active(rps));
	ret = wait_for(!intel_rps_set(rps, freq), 50);
	if (!ret) {
		/* The request was accepted, so it must now be recorded. */
		GEM_BUG_ON(rps->last_freq != freq);
	}
	mutex_unlock(&rps->lock);

	if (ret)
		return 0;

	return wait_for_freq(rps, freq, 50);
}
197
show_pstate_limits(struct intel_rps * rps)198 static void show_pstate_limits(struct intel_rps *rps)
199 {
200 struct drm_i915_private *i915 = rps_to_i915(rps);
201
202 if (IS_BROXTON(i915)) {
203 pr_info("P_STATE_CAP[%x]: 0x%08x\n",
204 i915_mmio_reg_offset(BXT_RP_STATE_CAP),
205 intel_uncore_read(rps_to_uncore(rps),
206 BXT_RP_STATE_CAP));
207 } else if (GRAPHICS_VER(i915) == 9) {
208 pr_info("P_STATE_LIMITS[%x]: 0x%08x\n",
209 i915_mmio_reg_offset(GEN9_RP_STATE_LIMITS),
210 intel_uncore_read(rps_to_uncore(rps),
211 GEN9_RP_STATE_LIMITS));
212 }
213 }
214
live_rps_clock_interval(void * arg)215 int live_rps_clock_interval(void *arg)
216 {
217 struct intel_gt *gt = arg;
218 struct intel_rps *rps = >->rps;
219 void (*saved_work)(struct work_struct *wrk);
220 struct intel_engine_cs *engine;
221 enum intel_engine_id id;
222 struct igt_spinner spin;
223 int err = 0;
224
225 if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
226 return 0;
227
228 if (igt_spinner_init(&spin, gt))
229 return -ENOMEM;
230
231 intel_gt_pm_wait_for_idle(gt);
232 saved_work = rps->work.func;
233 rps->work.func = dummy_rps_work;
234
235 intel_gt_pm_get(gt);
236 intel_rps_disable(>->rps);
237
238 intel_gt_check_clock_frequency(gt);
239
240 for_each_engine(engine, gt, id) {
241 struct i915_request *rq;
242 u32 cycles;
243 u64 dt;
244
245 if (!intel_engine_can_store_dword(engine))
246 continue;
247
248 st_engine_heartbeat_disable(engine);
249
250 rq = igt_spinner_create_request(&spin,
251 engine->kernel_context,
252 MI_NOOP);
253 if (IS_ERR(rq)) {
254 st_engine_heartbeat_enable(engine);
255 err = PTR_ERR(rq);
256 break;
257 }
258
259 i915_request_add(rq);
260
261 if (!igt_wait_for_spinner(&spin, rq)) {
262 pr_err("%s: RPS spinner did not start\n",
263 engine->name);
264 igt_spinner_end(&spin);
265 st_engine_heartbeat_enable(engine);
266 intel_gt_set_wedged(engine->gt);
267 err = -EIO;
268 break;
269 }
270
271 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
272
273 intel_uncore_write_fw(gt->uncore, GEN6_RP_CUR_UP_EI, 0);
274
275 /* Set the evaluation interval to infinity! */
276 intel_uncore_write_fw(gt->uncore,
277 GEN6_RP_UP_EI, 0xffffffff);
278 intel_uncore_write_fw(gt->uncore,
279 GEN6_RP_UP_THRESHOLD, 0xffffffff);
280
281 intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL,
282 GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG);
283
284 if (wait_for(intel_uncore_read_fw(gt->uncore,
285 GEN6_RP_CUR_UP_EI),
286 10)) {
287 /* Just skip the test; assume lack of HW support */
288 pr_notice("%s: rps evaluation interval not ticking\n",
289 engine->name);
290 err = -ENODEV;
291 } else {
292 ktime_t dt_[5];
293 u32 cycles_[5];
294 int i;
295
296 for (i = 0; i < 5; i++) {
297 preempt_disable();
298
299 dt_[i] = ktime_get();
300 cycles_[i] = -intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);
301
302 udelay(1000);
303
304 dt_[i] = ktime_sub(ktime_get(), dt_[i]);
305 cycles_[i] += intel_uncore_read_fw(gt->uncore, GEN6_RP_CUR_UP_EI);
306
307 preempt_enable();
308 }
309
310 /* Use the median of both cycle/dt; close enough */
311 sort(cycles_, 5, sizeof(*cycles_), cmp_u32, NULL);
312 cycles = (cycles_[1] + 2 * cycles_[2] + cycles_[3]) / 4;
313 sort(dt_, 5, sizeof(*dt_), cmp_u64, NULL);
314 dt = div_u64(dt_[1] + 2 * dt_[2] + dt_[3], 4);
315 }
316
317 intel_uncore_write_fw(gt->uncore, GEN6_RP_CONTROL, 0);
318 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
319
320 igt_spinner_end(&spin);
321 st_engine_heartbeat_enable(engine);
322
323 if (err == 0) {
324 u64 time = intel_gt_pm_interval_to_ns(gt, cycles);
325 u32 expected =
326 intel_gt_ns_to_pm_interval(gt, dt);
327
328 pr_info("%s: rps counted %d C0 cycles [%lldns] in %lldns [%d cycles], using GT clock frequency of %uKHz\n",
329 engine->name, cycles, time, dt, expected,
330 gt->clock_frequency / 1000);
331
332 if (10 * time < 8 * dt ||
333 8 * time > 10 * dt) {
334 pr_err("%s: rps clock time does not match walltime!\n",
335 engine->name);
336 err = -EINVAL;
337 }
338
339 if (10 * expected < 8 * cycles ||
340 8 * expected > 10 * cycles) {
341 pr_err("%s: walltime does not match rps clock ticks!\n",
342 engine->name);
343 err = -EINVAL;
344 }
345 }
346
347 if (igt_flush_test(gt->i915))
348 err = -EIO;
349
350 break; /* once is enough */
351 }
352
353 intel_rps_enable(>->rps);
354 intel_gt_pm_put(gt);
355
356 igt_spinner_fini(&spin);
357
358 intel_gt_pm_wait_for_idle(gt);
359 rps->work.func = saved_work;
360
361 if (err == -ENODEV) /* skipped, don't report a fail */
362 err = 0;
363
364 return err;
365 }
366
live_rps_control(void * arg)367 int live_rps_control(void *arg)
368 {
369 struct intel_gt *gt = arg;
370 struct intel_rps *rps = >->rps;
371 void (*saved_work)(struct work_struct *wrk);
372 struct intel_engine_cs *engine;
373 enum intel_engine_id id;
374 struct igt_spinner spin;
375 int err = 0;
376
377 /*
378 * Check that the actual frequency matches our requested frequency,
379 * to verify our control mechanism. We have to be careful that the
380 * PCU may throttle the GPU in which case the actual frequency used
381 * will be lowered than requested.
382 */
383
384 if (!intel_rps_is_enabled(rps))
385 return 0;
386
387 if (IS_CHERRYVIEW(gt->i915)) /* XXX fragile PCU */
388 return 0;
389
390 if (igt_spinner_init(&spin, gt))
391 return -ENOMEM;
392
393 intel_gt_pm_wait_for_idle(gt);
394 saved_work = rps->work.func;
395 rps->work.func = dummy_rps_work;
396
397 intel_gt_pm_get(gt);
398 for_each_engine(engine, gt, id) {
399 struct i915_request *rq;
400 ktime_t min_dt, max_dt;
401 int f, limit;
402 int min, max;
403
404 if (!intel_engine_can_store_dword(engine))
405 continue;
406
407 st_engine_heartbeat_disable(engine);
408
409 rq = igt_spinner_create_request(&spin,
410 engine->kernel_context,
411 MI_NOOP);
412 if (IS_ERR(rq)) {
413 err = PTR_ERR(rq);
414 break;
415 }
416
417 i915_request_add(rq);
418
419 if (!igt_wait_for_spinner(&spin, rq)) {
420 pr_err("%s: RPS spinner did not start\n",
421 engine->name);
422 igt_spinner_end(&spin);
423 st_engine_heartbeat_enable(engine);
424 intel_gt_set_wedged(engine->gt);
425 err = -EIO;
426 break;
427 }
428
429 if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
430 pr_err("%s: could not set minimum frequency [%x], only %x!\n",
431 engine->name, rps->min_freq, read_cagf(rps));
432 igt_spinner_end(&spin);
433 st_engine_heartbeat_enable(engine);
434 show_pstate_limits(rps);
435 err = -EINVAL;
436 break;
437 }
438
439 for (f = rps->min_freq + 1; f < rps->max_freq; f++) {
440 if (rps_set_check(rps, f) < f)
441 break;
442 }
443
444 limit = rps_set_check(rps, f);
445
446 if (rps_set_check(rps, rps->min_freq) != rps->min_freq) {
447 pr_err("%s: could not restore minimum frequency [%x], only %x!\n",
448 engine->name, rps->min_freq, read_cagf(rps));
449 igt_spinner_end(&spin);
450 st_engine_heartbeat_enable(engine);
451 show_pstate_limits(rps);
452 err = -EINVAL;
453 break;
454 }
455
456 max_dt = ktime_get();
457 max = rps_set_check(rps, limit);
458 max_dt = ktime_sub(ktime_get(), max_dt);
459
460 min_dt = ktime_get();
461 min = rps_set_check(rps, rps->min_freq);
462 min_dt = ktime_sub(ktime_get(), min_dt);
463
464 igt_spinner_end(&spin);
465 st_engine_heartbeat_enable(engine);
466
467 pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n",
468 engine->name,
469 rps->min_freq, intel_gpu_freq(rps, rps->min_freq),
470 rps->max_freq, intel_gpu_freq(rps, rps->max_freq),
471 limit, intel_gpu_freq(rps, limit),
472 min, max, ktime_to_ns(min_dt), ktime_to_ns(max_dt));
473
474 if (limit == rps->min_freq) {
475 pr_err("%s: GPU throttled to minimum!\n",
476 engine->name);
477 show_pstate_limits(rps);
478 err = -ENODEV;
479 break;
480 }
481
482 if (igt_flush_test(gt->i915)) {
483 err = -EIO;
484 break;
485 }
486 }
487 intel_gt_pm_put(gt);
488
489 igt_spinner_fini(&spin);
490
491 intel_gt_pm_wait_for_idle(gt);
492 rps->work.func = saved_work;
493
494 return err;
495 }
496
show_pcu_config(struct intel_rps * rps)497 static void show_pcu_config(struct intel_rps *rps)
498 {
499 struct drm_i915_private *i915 = rps_to_i915(rps);
500 unsigned int max_gpu_freq, min_gpu_freq;
501 intel_wakeref_t wakeref;
502 int gpu_freq;
503
504 if (!HAS_LLC(i915))
505 return;
506
507 min_gpu_freq = rps->min_freq;
508 max_gpu_freq = rps->max_freq;
509 if (GRAPHICS_VER(i915) >= 9) {
510 /* Convert GT frequency to 50 HZ units */
511 min_gpu_freq /= GEN9_FREQ_SCALER;
512 max_gpu_freq /= GEN9_FREQ_SCALER;
513 }
514
515 wakeref = intel_runtime_pm_get(rps_to_uncore(rps)->rpm);
516
517 pr_info("%5s %5s %5s\n", "GPU", "eCPU", "eRing");
518 for (gpu_freq = min_gpu_freq; gpu_freq <= max_gpu_freq; gpu_freq++) {
519 int ia_freq = gpu_freq;
520
521 sandybridge_pcode_read(i915,
522 GEN6_PCODE_READ_MIN_FREQ_TABLE,
523 &ia_freq, NULL);
524
525 pr_info("%5d %5d %5d\n",
526 gpu_freq * 50,
527 ((ia_freq >> 0) & 0xff) * 100,
528 ((ia_freq >> 8) & 0xff) * 100);
529 }
530
531 intel_runtime_pm_put(rps_to_uncore(rps)->rpm, wakeref);
532 }
533
__measure_frequency(u32 * cntr,int duration_ms)534 static u64 __measure_frequency(u32 *cntr, int duration_ms)
535 {
536 u64 dc, dt;
537
538 dt = ktime_get();
539 dc = READ_ONCE(*cntr);
540 usleep_range(1000 * duration_ms, 2000 * duration_ms);
541 dc = READ_ONCE(*cntr) - dc;
542 dt = ktime_get() - dt;
543
544 return div64_u64(1000 * 1000 * dc, dt);
545 }
546
measure_frequency_at(struct intel_rps * rps,u32 * cntr,int * freq)547 static u64 measure_frequency_at(struct intel_rps *rps, u32 *cntr, int *freq)
548 {
549 u64 x[5];
550 int i;
551
552 *freq = rps_set_check(rps, *freq);
553 for (i = 0; i < 5; i++)
554 x[i] = __measure_frequency(cntr, 2);
555 *freq = (*freq + read_cagf(rps)) / 2;
556
557 /* A simple triangle filter for better result stability */
558 sort(x, 5, sizeof(*x), cmp_u64, NULL);
559 return div_u64(x[1] + 2 * x[2] + x[3], 4);
560 }
561
__measure_cs_frequency(struct intel_engine_cs * engine,int duration_ms)562 static u64 __measure_cs_frequency(struct intel_engine_cs *engine,
563 int duration_ms)
564 {
565 u64 dc, dt;
566
567 dt = ktime_get();
568 dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0));
569 usleep_range(1000 * duration_ms, 2000 * duration_ms);
570 dc = intel_uncore_read_fw(engine->uncore, CS_GPR(0)) - dc;
571 dt = ktime_get() - dt;
572
573 return div64_u64(1000 * 1000 * dc, dt);
574 }
575
measure_cs_frequency_at(struct intel_rps * rps,struct intel_engine_cs * engine,int * freq)576 static u64 measure_cs_frequency_at(struct intel_rps *rps,
577 struct intel_engine_cs *engine,
578 int *freq)
579 {
580 u64 x[5];
581 int i;
582
583 *freq = rps_set_check(rps, *freq);
584 for (i = 0; i < 5; i++)
585 x[i] = __measure_cs_frequency(engine, 2);
586 *freq = (*freq + read_cagf(rps)) / 2;
587
588 /* A simple triangle filter for better result stability */
589 sort(x, 5, sizeof(*x), cmp_u64, NULL);
590 return div_u64(x[1] + 2 * x[2] + x[3], 4);
591 }
592
/*
 * Return true when x / y lies strictly between f_n / f_d and f_d / f_n,
 * evaluated by cross-multiplication so that no division is needed.
 */
static bool scaled_within(u64 x, u64 y, u32 f_n, u32 f_d)
{
	/* Lower bound: x / y must exceed f_n / f_d. */
	if (f_d * x <= f_n * y)
		return false;

	/* Upper bound: x / y must stay below f_d / f_n. */
	return f_n * x < f_d * y;
}
597
live_rps_frequency_cs(void * arg)598 int live_rps_frequency_cs(void *arg)
599 {
600 void (*saved_work)(struct work_struct *wrk);
601 struct intel_gt *gt = arg;
602 struct intel_rps *rps = >->rps;
603 struct intel_engine_cs *engine;
604 struct pm_qos_request qos;
605 enum intel_engine_id id;
606 int err = 0;
607
608 /*
609 * The premise is that the GPU does change frequency at our behest.
610 * Let's check there is a correspondence between the requested
611 * frequency, the actual frequency, and the observed clock rate.
612 */
613
614 if (!intel_rps_is_enabled(rps))
615 return 0;
616
617 if (GRAPHICS_VER(gt->i915) < 8) /* for CS simplicity */
618 return 0;
619
620 if (CPU_LATENCY >= 0)
621 cpu_latency_qos_add_request(&qos, CPU_LATENCY);
622
623 intel_gt_pm_wait_for_idle(gt);
624 saved_work = rps->work.func;
625 rps->work.func = dummy_rps_work;
626
627 for_each_engine(engine, gt, id) {
628 struct i915_request *rq;
629 struct i915_vma *vma;
630 u32 *cancel, *cntr;
631 struct {
632 u64 count;
633 int freq;
634 } min, max;
635
636 st_engine_heartbeat_disable(engine);
637
638 vma = create_spin_counter(engine,
639 engine->kernel_context->vm, false,
640 &cancel, &cntr);
641 if (IS_ERR(vma)) {
642 err = PTR_ERR(vma);
643 st_engine_heartbeat_enable(engine);
644 break;
645 }
646
647 rq = intel_engine_create_kernel_request(engine);
648 if (IS_ERR(rq)) {
649 err = PTR_ERR(rq);
650 goto err_vma;
651 }
652
653 err = i915_request_await_object(rq, vma->obj, false);
654 if (!err)
655 err = i915_vma_move_to_active(vma, rq, 0);
656 if (!err)
657 err = rq->engine->emit_bb_start(rq,
658 vma->node.start,
659 PAGE_SIZE, 0);
660 i915_request_add(rq);
661 if (err)
662 goto err_vma;
663
664 if (wait_for(intel_uncore_read(engine->uncore, CS_GPR(0)),
665 10)) {
666 pr_err("%s: timed loop did not start\n",
667 engine->name);
668 goto err_vma;
669 }
670
671 min.freq = rps->min_freq;
672 min.count = measure_cs_frequency_at(rps, engine, &min.freq);
673
674 max.freq = rps->max_freq;
675 max.count = measure_cs_frequency_at(rps, engine, &max.freq);
676
677 pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
678 engine->name,
679 min.count, intel_gpu_freq(rps, min.freq),
680 max.count, intel_gpu_freq(rps, max.freq),
681 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
682 max.freq * min.count));
683
684 if (!scaled_within(max.freq * min.count,
685 min.freq * max.count,
686 2, 3)) {
687 int f;
688
689 pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
690 engine->name,
691 max.freq * min.count,
692 min.freq * max.count);
693 show_pcu_config(rps);
694
695 for (f = min.freq + 1; f <= rps->max_freq; f++) {
696 int act = f;
697 u64 count;
698
699 count = measure_cs_frequency_at(rps, engine, &act);
700 if (act < f)
701 break;
702
703 pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
704 engine->name,
705 act, intel_gpu_freq(rps, act), count,
706 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
707 act * min.count));
708
709 f = act; /* may skip ahead [pcu granularity] */
710 }
711
712 err = -EINTR; /* ignore error, continue on with test */
713 }
714
715 err_vma:
716 *cancel = MI_BATCH_BUFFER_END;
717 i915_gem_object_flush_map(vma->obj);
718 i915_gem_object_unpin_map(vma->obj);
719 i915_vma_unpin(vma);
720 i915_vma_unlock(vma);
721 i915_vma_put(vma);
722
723 st_engine_heartbeat_enable(engine);
724 if (igt_flush_test(gt->i915))
725 err = -EIO;
726 if (err)
727 break;
728 }
729
730 intel_gt_pm_wait_for_idle(gt);
731 rps->work.func = saved_work;
732
733 if (CPU_LATENCY >= 0)
734 cpu_latency_qos_remove_request(&qos);
735
736 return err;
737 }
738
live_rps_frequency_srm(void * arg)739 int live_rps_frequency_srm(void *arg)
740 {
741 void (*saved_work)(struct work_struct *wrk);
742 struct intel_gt *gt = arg;
743 struct intel_rps *rps = >->rps;
744 struct intel_engine_cs *engine;
745 struct pm_qos_request qos;
746 enum intel_engine_id id;
747 int err = 0;
748
749 /*
750 * The premise is that the GPU does change frequency at our behest.
751 * Let's check there is a correspondence between the requested
752 * frequency, the actual frequency, and the observed clock rate.
753 */
754
755 if (!intel_rps_is_enabled(rps))
756 return 0;
757
758 if (GRAPHICS_VER(gt->i915) < 8) /* for CS simplicity */
759 return 0;
760
761 if (CPU_LATENCY >= 0)
762 cpu_latency_qos_add_request(&qos, CPU_LATENCY);
763
764 intel_gt_pm_wait_for_idle(gt);
765 saved_work = rps->work.func;
766 rps->work.func = dummy_rps_work;
767
768 for_each_engine(engine, gt, id) {
769 struct i915_request *rq;
770 struct i915_vma *vma;
771 u32 *cancel, *cntr;
772 struct {
773 u64 count;
774 int freq;
775 } min, max;
776
777 st_engine_heartbeat_disable(engine);
778
779 vma = create_spin_counter(engine,
780 engine->kernel_context->vm, true,
781 &cancel, &cntr);
782 if (IS_ERR(vma)) {
783 err = PTR_ERR(vma);
784 st_engine_heartbeat_enable(engine);
785 break;
786 }
787
788 rq = intel_engine_create_kernel_request(engine);
789 if (IS_ERR(rq)) {
790 err = PTR_ERR(rq);
791 goto err_vma;
792 }
793
794 err = i915_request_await_object(rq, vma->obj, false);
795 if (!err)
796 err = i915_vma_move_to_active(vma, rq, 0);
797 if (!err)
798 err = rq->engine->emit_bb_start(rq,
799 vma->node.start,
800 PAGE_SIZE, 0);
801 i915_request_add(rq);
802 if (err)
803 goto err_vma;
804
805 if (wait_for(READ_ONCE(*cntr), 10)) {
806 pr_err("%s: timed loop did not start\n",
807 engine->name);
808 goto err_vma;
809 }
810
811 min.freq = rps->min_freq;
812 min.count = measure_frequency_at(rps, cntr, &min.freq);
813
814 max.freq = rps->max_freq;
815 max.count = measure_frequency_at(rps, cntr, &max.freq);
816
817 pr_info("%s: min:%lluKHz @ %uMHz, max:%lluKHz @ %uMHz [%d%%]\n",
818 engine->name,
819 min.count, intel_gpu_freq(rps, min.freq),
820 max.count, intel_gpu_freq(rps, max.freq),
821 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * max.count,
822 max.freq * min.count));
823
824 if (!scaled_within(max.freq * min.count,
825 min.freq * max.count,
826 1, 2)) {
827 int f;
828
829 pr_err("%s: CS did not scale with frequency! scaled min:%llu, max:%llu\n",
830 engine->name,
831 max.freq * min.count,
832 min.freq * max.count);
833 show_pcu_config(rps);
834
835 for (f = min.freq + 1; f <= rps->max_freq; f++) {
836 int act = f;
837 u64 count;
838
839 count = measure_frequency_at(rps, cntr, &act);
840 if (act < f)
841 break;
842
843 pr_info("%s: %x:%uMHz: %lluKHz [%d%%]\n",
844 engine->name,
845 act, intel_gpu_freq(rps, act), count,
846 (int)DIV64_U64_ROUND_CLOSEST(100 * min.freq * count,
847 act * min.count));
848
849 f = act; /* may skip ahead [pcu granularity] */
850 }
851
852 err = -EINTR; /* ignore error, continue on with test */
853 }
854
855 err_vma:
856 *cancel = MI_BATCH_BUFFER_END;
857 i915_gem_object_flush_map(vma->obj);
858 i915_gem_object_unpin_map(vma->obj);
859 i915_vma_unpin(vma);
860 i915_vma_unlock(vma);
861 i915_vma_put(vma);
862
863 st_engine_heartbeat_enable(engine);
864 if (igt_flush_test(gt->i915))
865 err = -EIO;
866 if (err)
867 break;
868 }
869
870 intel_gt_pm_wait_for_idle(gt);
871 rps->work.func = saved_work;
872
873 if (CPU_LATENCY >= 0)
874 cpu_latency_qos_remove_request(&qos);
875
876 return err;
877 }
878
sleep_for_ei(struct intel_rps * rps,int timeout_us)879 static void sleep_for_ei(struct intel_rps *rps, int timeout_us)
880 {
881 /* Flush any previous EI */
882 usleep_range(timeout_us, 2 * timeout_us);
883
884 /* Reset the interrupt status */
885 rps_disable_interrupts(rps);
886 GEM_BUG_ON(rps->pm_iir);
887 rps_enable_interrupts(rps);
888
889 /* And then wait for the timeout, for real this time */
890 usleep_range(2 * timeout_us, 3 * timeout_us);
891 }
892
__rps_up_interrupt(struct intel_rps * rps,struct intel_engine_cs * engine,struct igt_spinner * spin)893 static int __rps_up_interrupt(struct intel_rps *rps,
894 struct intel_engine_cs *engine,
895 struct igt_spinner *spin)
896 {
897 struct intel_uncore *uncore = engine->uncore;
898 struct i915_request *rq;
899 u32 timeout;
900
901 if (!intel_engine_can_store_dword(engine))
902 return 0;
903
904 rps_set_check(rps, rps->min_freq);
905
906 rq = igt_spinner_create_request(spin, engine->kernel_context, MI_NOOP);
907 if (IS_ERR(rq))
908 return PTR_ERR(rq);
909
910 i915_request_get(rq);
911 i915_request_add(rq);
912
913 if (!igt_wait_for_spinner(spin, rq)) {
914 pr_err("%s: RPS spinner did not start\n",
915 engine->name);
916 i915_request_put(rq);
917 intel_gt_set_wedged(engine->gt);
918 return -EIO;
919 }
920
921 if (!intel_rps_is_active(rps)) {
922 pr_err("%s: RPS not enabled on starting spinner\n",
923 engine->name);
924 igt_spinner_end(spin);
925 i915_request_put(rq);
926 return -EINVAL;
927 }
928
929 if (!(rps->pm_events & GEN6_PM_RP_UP_THRESHOLD)) {
930 pr_err("%s: RPS did not register UP interrupt\n",
931 engine->name);
932 i915_request_put(rq);
933 return -EINVAL;
934 }
935
936 if (rps->last_freq != rps->min_freq) {
937 pr_err("%s: RPS did not program min frequency\n",
938 engine->name);
939 i915_request_put(rq);
940 return -EINVAL;
941 }
942
943 timeout = intel_uncore_read(uncore, GEN6_RP_UP_EI);
944 timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
945 timeout = DIV_ROUND_UP(timeout, 1000);
946
947 sleep_for_ei(rps, timeout);
948 GEM_BUG_ON(i915_request_completed(rq));
949
950 igt_spinner_end(spin);
951 i915_request_put(rq);
952
953 if (rps->cur_freq != rps->min_freq) {
954 pr_err("%s: Frequency unexpectedly changed [up], now %d!\n",
955 engine->name, intel_rps_read_actual_frequency(rps));
956 return -EINVAL;
957 }
958
959 if (!(rps->pm_iir & GEN6_PM_RP_UP_THRESHOLD)) {
960 pr_err("%s: UP interrupt not recorded for spinner, pm_iir:%x, prev_up:%x, up_threshold:%x, up_ei:%x\n",
961 engine->name, rps->pm_iir,
962 intel_uncore_read(uncore, GEN6_RP_PREV_UP),
963 intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
964 intel_uncore_read(uncore, GEN6_RP_UP_EI));
965 return -EINVAL;
966 }
967
968 return 0;
969 }
970
__rps_down_interrupt(struct intel_rps * rps,struct intel_engine_cs * engine)971 static int __rps_down_interrupt(struct intel_rps *rps,
972 struct intel_engine_cs *engine)
973 {
974 struct intel_uncore *uncore = engine->uncore;
975 u32 timeout;
976
977 rps_set_check(rps, rps->max_freq);
978
979 if (!(rps->pm_events & GEN6_PM_RP_DOWN_THRESHOLD)) {
980 pr_err("%s: RPS did not register DOWN interrupt\n",
981 engine->name);
982 return -EINVAL;
983 }
984
985 if (rps->last_freq != rps->max_freq) {
986 pr_err("%s: RPS did not program max frequency\n",
987 engine->name);
988 return -EINVAL;
989 }
990
991 timeout = intel_uncore_read(uncore, GEN6_RP_DOWN_EI);
992 timeout = intel_gt_pm_interval_to_ns(engine->gt, timeout);
993 timeout = DIV_ROUND_UP(timeout, 1000);
994
995 sleep_for_ei(rps, timeout);
996
997 if (rps->cur_freq != rps->max_freq) {
998 pr_err("%s: Frequency unexpectedly changed [down], now %d!\n",
999 engine->name,
1000 intel_rps_read_actual_frequency(rps));
1001 return -EINVAL;
1002 }
1003
1004 if (!(rps->pm_iir & (GEN6_PM_RP_DOWN_THRESHOLD | GEN6_PM_RP_DOWN_TIMEOUT))) {
1005 pr_err("%s: DOWN interrupt not recorded for idle, pm_iir:%x, prev_down:%x, down_threshold:%x, down_ei:%x [prev_up:%x, up_threshold:%x, up_ei:%x]\n",
1006 engine->name, rps->pm_iir,
1007 intel_uncore_read(uncore, GEN6_RP_PREV_DOWN),
1008 intel_uncore_read(uncore, GEN6_RP_DOWN_THRESHOLD),
1009 intel_uncore_read(uncore, GEN6_RP_DOWN_EI),
1010 intel_uncore_read(uncore, GEN6_RP_PREV_UP),
1011 intel_uncore_read(uncore, GEN6_RP_UP_THRESHOLD),
1012 intel_uncore_read(uncore, GEN6_RP_UP_EI));
1013 return -EINVAL;
1014 }
1015
1016 return 0;
1017 }
1018
live_rps_interrupt(void * arg)1019 int live_rps_interrupt(void *arg)
1020 {
1021 struct intel_gt *gt = arg;
1022 struct intel_rps *rps = >->rps;
1023 void (*saved_work)(struct work_struct *wrk);
1024 struct intel_engine_cs *engine;
1025 enum intel_engine_id id;
1026 struct igt_spinner spin;
1027 u32 pm_events;
1028 int err = 0;
1029
1030 /*
1031 * First, let's check whether or not we are receiving interrupts.
1032 */
1033
1034 if (!intel_rps_has_interrupts(rps) || GRAPHICS_VER(gt->i915) < 6)
1035 return 0;
1036
1037 intel_gt_pm_get(gt);
1038 pm_events = rps->pm_events;
1039 intel_gt_pm_put(gt);
1040 if (!pm_events) {
1041 pr_err("No RPS PM events registered, but RPS is enabled?\n");
1042 return -ENODEV;
1043 }
1044
1045 if (igt_spinner_init(&spin, gt))
1046 return -ENOMEM;
1047
1048 intel_gt_pm_wait_for_idle(gt);
1049 saved_work = rps->work.func;
1050 rps->work.func = dummy_rps_work;
1051
1052 for_each_engine(engine, gt, id) {
1053 /* Keep the engine busy with a spinner; expect an UP! */
1054 if (pm_events & GEN6_PM_RP_UP_THRESHOLD) {
1055 intel_gt_pm_wait_for_idle(engine->gt);
1056 GEM_BUG_ON(intel_rps_is_active(rps));
1057
1058 st_engine_heartbeat_disable(engine);
1059
1060 err = __rps_up_interrupt(rps, engine, &spin);
1061
1062 st_engine_heartbeat_enable(engine);
1063 if (err)
1064 goto out;
1065
1066 intel_gt_pm_wait_for_idle(engine->gt);
1067 }
1068
1069 /* Keep the engine awake but idle and check for DOWN */
1070 if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) {
1071 st_engine_heartbeat_disable(engine);
1072 intel_rc6_disable(>->rc6);
1073
1074 err = __rps_down_interrupt(rps, engine);
1075
1076 intel_rc6_enable(>->rc6);
1077 st_engine_heartbeat_enable(engine);
1078 if (err)
1079 goto out;
1080 }
1081 }
1082
1083 out:
1084 if (igt_flush_test(gt->i915))
1085 err = -EIO;
1086
1087 igt_spinner_fini(&spin);
1088
1089 intel_gt_pm_wait_for_idle(gt);
1090 rps->work.func = saved_work;
1091
1092 return err;
1093 }
1094
__measure_power(int duration_ms)1095 static u64 __measure_power(int duration_ms)
1096 {
1097 u64 dE, dt;
1098
1099 dt = ktime_get();
1100 dE = librapl_energy_uJ();
1101 usleep_range(1000 * duration_ms, 2000 * duration_ms);
1102 dE = librapl_energy_uJ() - dE;
1103 dt = ktime_get() - dt;
1104
1105 return div64_u64(1000 * 1000 * dE, dt);
1106 }
1107
measure_power_at(struct intel_rps * rps,int * freq)1108 static u64 measure_power_at(struct intel_rps *rps, int *freq)
1109 {
1110 u64 x[5];
1111 int i;
1112
1113 *freq = rps_set_check(rps, *freq);
1114 for (i = 0; i < 5; i++)
1115 x[i] = __measure_power(5);
1116 *freq = (*freq + read_cagf(rps)) / 2;
1117
1118 /* A simple triangle filter for better result stability */
1119 sort(x, 5, sizeof(*x), cmp_u64, NULL);
1120 return div_u64(x[1] + 2 * x[2] + x[3], 4);
1121 }
1122
live_rps_power(void * arg)1123 int live_rps_power(void *arg)
1124 {
1125 struct intel_gt *gt = arg;
1126 struct intel_rps *rps = >->rps;
1127 void (*saved_work)(struct work_struct *wrk);
1128 struct intel_engine_cs *engine;
1129 enum intel_engine_id id;
1130 struct igt_spinner spin;
1131 int err = 0;
1132
1133 /*
1134 * Our fundamental assumption is that running at lower frequency
1135 * actually saves power. Let's see if our RAPL measurement support
1136 * that theory.
1137 */
1138
1139 if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
1140 return 0;
1141
1142 if (!librapl_supported(gt->i915))
1143 return 0;
1144
1145 if (igt_spinner_init(&spin, gt))
1146 return -ENOMEM;
1147
1148 intel_gt_pm_wait_for_idle(gt);
1149 saved_work = rps->work.func;
1150 rps->work.func = dummy_rps_work;
1151
1152 for_each_engine(engine, gt, id) {
1153 struct i915_request *rq;
1154 struct {
1155 u64 power;
1156 int freq;
1157 } min, max;
1158
1159 if (!intel_engine_can_store_dword(engine))
1160 continue;
1161
1162 st_engine_heartbeat_disable(engine);
1163
1164 rq = igt_spinner_create_request(&spin,
1165 engine->kernel_context,
1166 MI_NOOP);
1167 if (IS_ERR(rq)) {
1168 st_engine_heartbeat_enable(engine);
1169 err = PTR_ERR(rq);
1170 break;
1171 }
1172
1173 i915_request_add(rq);
1174
1175 if (!igt_wait_for_spinner(&spin, rq)) {
1176 pr_err("%s: RPS spinner did not start\n",
1177 engine->name);
1178 igt_spinner_end(&spin);
1179 st_engine_heartbeat_enable(engine);
1180 intel_gt_set_wedged(engine->gt);
1181 err = -EIO;
1182 break;
1183 }
1184
1185 max.freq = rps->max_freq;
1186 max.power = measure_power_at(rps, &max.freq);
1187
1188 min.freq = rps->min_freq;
1189 min.power = measure_power_at(rps, &min.freq);
1190
1191 igt_spinner_end(&spin);
1192 st_engine_heartbeat_enable(engine);
1193
1194 pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n",
1195 engine->name,
1196 min.power, intel_gpu_freq(rps, min.freq),
1197 max.power, intel_gpu_freq(rps, max.freq));
1198
1199 if (10 * min.freq >= 9 * max.freq) {
1200 pr_notice("Could not control frequency, ran at [%d:%uMHz, %d:%uMhz]\n",
1201 min.freq, intel_gpu_freq(rps, min.freq),
1202 max.freq, intel_gpu_freq(rps, max.freq));
1203 continue;
1204 }
1205
1206 if (11 * min.power > 10 * max.power) {
1207 pr_err("%s: did not conserve power when setting lower frequency!\n",
1208 engine->name);
1209 err = -EINVAL;
1210 break;
1211 }
1212
1213 if (igt_flush_test(gt->i915)) {
1214 err = -EIO;
1215 break;
1216 }
1217 }
1218
1219 igt_spinner_fini(&spin);
1220
1221 intel_gt_pm_wait_for_idle(gt);
1222 rps->work.func = saved_work;
1223
1224 return err;
1225 }
1226
live_rps_dynamic(void * arg)1227 int live_rps_dynamic(void *arg)
1228 {
1229 struct intel_gt *gt = arg;
1230 struct intel_rps *rps = >->rps;
1231 struct intel_engine_cs *engine;
1232 enum intel_engine_id id;
1233 struct igt_spinner spin;
1234 int err = 0;
1235
1236 /*
1237 * We've looked at the bascs, and have established that we
1238 * can change the clock frequency and that the HW will generate
1239 * interrupts based on load. Now we check how we integrate those
1240 * moving parts into dynamic reclocking based on load.
1241 */
1242
1243 if (!intel_rps_is_enabled(rps) || GRAPHICS_VER(gt->i915) < 6)
1244 return 0;
1245
1246 if (igt_spinner_init(&spin, gt))
1247 return -ENOMEM;
1248
1249 if (intel_rps_has_interrupts(rps))
1250 pr_info("RPS has interrupt support\n");
1251 if (intel_rps_uses_timer(rps))
1252 pr_info("RPS has timer support\n");
1253
1254 for_each_engine(engine, gt, id) {
1255 struct i915_request *rq;
1256 struct {
1257 ktime_t dt;
1258 u8 freq;
1259 } min, max;
1260
1261 if (!intel_engine_can_store_dword(engine))
1262 continue;
1263
1264 intel_gt_pm_wait_for_idle(gt);
1265 GEM_BUG_ON(intel_rps_is_active(rps));
1266 rps->cur_freq = rps->min_freq;
1267
1268 intel_engine_pm_get(engine);
1269 intel_rc6_disable(>->rc6);
1270 GEM_BUG_ON(rps->last_freq != rps->min_freq);
1271
1272 rq = igt_spinner_create_request(&spin,
1273 engine->kernel_context,
1274 MI_NOOP);
1275 if (IS_ERR(rq)) {
1276 err = PTR_ERR(rq);
1277 goto err;
1278 }
1279
1280 i915_request_add(rq);
1281
1282 max.dt = ktime_get();
1283 max.freq = wait_for_freq(rps, rps->max_freq, 500);
1284 max.dt = ktime_sub(ktime_get(), max.dt);
1285
1286 igt_spinner_end(&spin);
1287
1288 min.dt = ktime_get();
1289 min.freq = wait_for_freq(rps, rps->min_freq, 2000);
1290 min.dt = ktime_sub(ktime_get(), min.dt);
1291
1292 pr_info("%s: dynamically reclocked to %u:%uMHz while busy in %lluns, and %u:%uMHz while idle in %lluns\n",
1293 engine->name,
1294 max.freq, intel_gpu_freq(rps, max.freq),
1295 ktime_to_ns(max.dt),
1296 min.freq, intel_gpu_freq(rps, min.freq),
1297 ktime_to_ns(min.dt));
1298 if (min.freq >= max.freq) {
1299 pr_err("%s: dynamic reclocking of spinner failed\n!",
1300 engine->name);
1301 err = -EINVAL;
1302 }
1303
1304 err:
1305 intel_rc6_enable(>->rc6);
1306 intel_engine_pm_put(engine);
1307
1308 if (igt_flush_test(gt->i915))
1309 err = -EIO;
1310 if (err)
1311 break;
1312 }
1313
1314 igt_spinner_fini(&spin);
1315
1316 return err;
1317 }
1318