1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2014 Intel Corporation
4 */
5
6 #include "gem/i915_gem_lmem.h"
7
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18
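/*
 * The *_offsets[] tables below are a compact description of where each
 * register lives in the default context image; set_offsets() expands them.
 * NOP(x) skips x dwords, LRI(count, flags) emits an MI_LOAD_REGISTER_IMM
 * header, and REG()/REG16() encode a register offset (relative to the
 * engine's mmio base) in one or two bytes; only the register offsets are
 * written here, the value dwords are left untouched. END terminates a table.
 */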
19 static void set_offsets(u32 *regs,
20 const u8 *data,
21 const struct intel_engine_cs *engine,
22 bool close)
23 #define NOP(x) (BIT(7) | (x))
24 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
25 #define POSTED BIT(0)
26 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
27 #define REG16(x) \
28 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
29 (((x) >> 2) & 0x7f)
30 #define END 0
31 {
32 const u32 base = engine->mmio_base;
33
34 while (*data) {
35 u8 count, flags;
36
37 if (*data & BIT(7)) { /* skip */
38 count = *data++ & ~BIT(7);
39 regs += count;
40 continue;
41 }
42
43 count = *data & 0x3f;
44 flags = *data >> 6;
45 data++;
46
47 *regs = MI_LOAD_REGISTER_IMM(count);
48 if (flags & POSTED)
49 *regs |= MI_LRI_FORCE_POSTED;
50 if (GRAPHICS_VER(engine->i915) >= 11)
51 *regs |= MI_LRI_LRM_CS_MMIO;
52 regs++;
53
54 GEM_BUG_ON(!count);
55 do {
56 u32 offset = 0;
57 u8 v;
58
59 do {
60 v = *data++;
61 offset <<= 7;
62 offset |= v & ~BIT(7);
63 } while (v & BIT(7));
64
65 regs[0] = base + (offset << 2);
66 regs += 2;
67 } while (--count);
68 }
69
70 if (close) {
71 /* Close the batch; used mainly by live_lrc_layout() */
72 *regs = MI_BATCH_BUFFER_END;
73 if (GRAPHICS_VER(engine->i915) >= 11)
74 *regs |= BIT(0);
75 }
76 }
77
78 static const u8 gen8_xcs_offsets[] = {
79 NOP(1),
80 LRI(11, 0),
81 REG16(0x244),
82 REG(0x034),
83 REG(0x030),
84 REG(0x038),
85 REG(0x03c),
86 REG(0x168),
87 REG(0x140),
88 REG(0x110),
89 REG(0x11c),
90 REG(0x114),
91 REG(0x118),
92
93 NOP(9),
94 LRI(9, 0),
95 REG16(0x3a8),
96 REG16(0x28c),
97 REG16(0x288),
98 REG16(0x284),
99 REG16(0x280),
100 REG16(0x27c),
101 REG16(0x278),
102 REG16(0x274),
103 REG16(0x270),
104
105 NOP(13),
106 LRI(2, 0),
107 REG16(0x200),
108 REG(0x028),
109
110 END
111 };
112
113 static const u8 gen9_xcs_offsets[] = {
114 NOP(1),
115 LRI(14, POSTED),
116 REG16(0x244),
117 REG(0x034),
118 REG(0x030),
119 REG(0x038),
120 REG(0x03c),
121 REG(0x168),
122 REG(0x140),
123 REG(0x110),
124 REG(0x11c),
125 REG(0x114),
126 REG(0x118),
127 REG(0x1c0),
128 REG(0x1c4),
129 REG(0x1c8),
130
131 NOP(3),
132 LRI(9, POSTED),
133 REG16(0x3a8),
134 REG16(0x28c),
135 REG16(0x288),
136 REG16(0x284),
137 REG16(0x280),
138 REG16(0x27c),
139 REG16(0x278),
140 REG16(0x274),
141 REG16(0x270),
142
143 NOP(13),
144 LRI(1, POSTED),
145 REG16(0x200),
146
147 NOP(13),
148 LRI(44, POSTED),
149 REG(0x028),
150 REG(0x09c),
151 REG(0x0c0),
152 REG(0x178),
153 REG(0x17c),
154 REG16(0x358),
155 REG(0x170),
156 REG(0x150),
157 REG(0x154),
158 REG(0x158),
159 REG16(0x41c),
160 REG16(0x600),
161 REG16(0x604),
162 REG16(0x608),
163 REG16(0x60c),
164 REG16(0x610),
165 REG16(0x614),
166 REG16(0x618),
167 REG16(0x61c),
168 REG16(0x620),
169 REG16(0x624),
170 REG16(0x628),
171 REG16(0x62c),
172 REG16(0x630),
173 REG16(0x634),
174 REG16(0x638),
175 REG16(0x63c),
176 REG16(0x640),
177 REG16(0x644),
178 REG16(0x648),
179 REG16(0x64c),
180 REG16(0x650),
181 REG16(0x654),
182 REG16(0x658),
183 REG16(0x65c),
184 REG16(0x660),
185 REG16(0x664),
186 REG16(0x668),
187 REG16(0x66c),
188 REG16(0x670),
189 REG16(0x674),
190 REG16(0x678),
191 REG16(0x67c),
192 REG(0x068),
193
194 END
195 };
196
197 static const u8 gen12_xcs_offsets[] = {
198 NOP(1),
199 LRI(13, POSTED),
200 REG16(0x244),
201 REG(0x034),
202 REG(0x030),
203 REG(0x038),
204 REG(0x03c),
205 REG(0x168),
206 REG(0x140),
207 REG(0x110),
208 REG(0x1c0),
209 REG(0x1c4),
210 REG(0x1c8),
211 REG(0x180),
212 REG16(0x2b4),
213
214 NOP(5),
215 LRI(9, POSTED),
216 REG16(0x3a8),
217 REG16(0x28c),
218 REG16(0x288),
219 REG16(0x284),
220 REG16(0x280),
221 REG16(0x27c),
222 REG16(0x278),
223 REG16(0x274),
224 REG16(0x270),
225
226 END
227 };
228
229 static const u8 gen8_rcs_offsets[] = {
230 NOP(1),
231 LRI(14, POSTED),
232 REG16(0x244),
233 REG(0x034),
234 REG(0x030),
235 REG(0x038),
236 REG(0x03c),
237 REG(0x168),
238 REG(0x140),
239 REG(0x110),
240 REG(0x11c),
241 REG(0x114),
242 REG(0x118),
243 REG(0x1c0),
244 REG(0x1c4),
245 REG(0x1c8),
246
247 NOP(3),
248 LRI(9, POSTED),
249 REG16(0x3a8),
250 REG16(0x28c),
251 REG16(0x288),
252 REG16(0x284),
253 REG16(0x280),
254 REG16(0x27c),
255 REG16(0x278),
256 REG16(0x274),
257 REG16(0x270),
258
259 NOP(13),
260 LRI(1, 0),
261 REG(0x0c8),
262
263 END
264 };
265
266 static const u8 gen9_rcs_offsets[] = {
267 NOP(1),
268 LRI(14, POSTED),
269 REG16(0x244),
270 REG(0x34),
271 REG(0x30),
272 REG(0x38),
273 REG(0x3c),
274 REG(0x168),
275 REG(0x140),
276 REG(0x110),
277 REG(0x11c),
278 REG(0x114),
279 REG(0x118),
280 REG(0x1c0),
281 REG(0x1c4),
282 REG(0x1c8),
283
284 NOP(3),
285 LRI(9, POSTED),
286 REG16(0x3a8),
287 REG16(0x28c),
288 REG16(0x288),
289 REG16(0x284),
290 REG16(0x280),
291 REG16(0x27c),
292 REG16(0x278),
293 REG16(0x274),
294 REG16(0x270),
295
296 NOP(13),
297 LRI(1, 0),
298 REG(0xc8),
299
300 NOP(13),
301 LRI(44, POSTED),
302 REG(0x28),
303 REG(0x9c),
304 REG(0xc0),
305 REG(0x178),
306 REG(0x17c),
307 REG16(0x358),
308 REG(0x170),
309 REG(0x150),
310 REG(0x154),
311 REG(0x158),
312 REG16(0x41c),
313 REG16(0x600),
314 REG16(0x604),
315 REG16(0x608),
316 REG16(0x60c),
317 REG16(0x610),
318 REG16(0x614),
319 REG16(0x618),
320 REG16(0x61c),
321 REG16(0x620),
322 REG16(0x624),
323 REG16(0x628),
324 REG16(0x62c),
325 REG16(0x630),
326 REG16(0x634),
327 REG16(0x638),
328 REG16(0x63c),
329 REG16(0x640),
330 REG16(0x644),
331 REG16(0x648),
332 REG16(0x64c),
333 REG16(0x650),
334 REG16(0x654),
335 REG16(0x658),
336 REG16(0x65c),
337 REG16(0x660),
338 REG16(0x664),
339 REG16(0x668),
340 REG16(0x66c),
341 REG16(0x670),
342 REG16(0x674),
343 REG16(0x678),
344 REG16(0x67c),
345 REG(0x68),
346
347 END
348 };
349
350 static const u8 gen11_rcs_offsets[] = {
351 NOP(1),
352 LRI(15, POSTED),
353 REG16(0x244),
354 REG(0x034),
355 REG(0x030),
356 REG(0x038),
357 REG(0x03c),
358 REG(0x168),
359 REG(0x140),
360 REG(0x110),
361 REG(0x11c),
362 REG(0x114),
363 REG(0x118),
364 REG(0x1c0),
365 REG(0x1c4),
366 REG(0x1c8),
367 REG(0x180),
368
369 NOP(1),
370 LRI(9, POSTED),
371 REG16(0x3a8),
372 REG16(0x28c),
373 REG16(0x288),
374 REG16(0x284),
375 REG16(0x280),
376 REG16(0x27c),
377 REG16(0x278),
378 REG16(0x274),
379 REG16(0x270),
380
381 LRI(1, POSTED),
382 REG(0x1b0),
383
384 NOP(10),
385 LRI(1, 0),
386 REG(0x0c8),
387
388 END
389 };
390
391 static const u8 gen12_rcs_offsets[] = {
392 NOP(1),
393 LRI(13, POSTED),
394 REG16(0x244),
395 REG(0x034),
396 REG(0x030),
397 REG(0x038),
398 REG(0x03c),
399 REG(0x168),
400 REG(0x140),
401 REG(0x110),
402 REG(0x1c0),
403 REG(0x1c4),
404 REG(0x1c8),
405 REG(0x180),
406 REG16(0x2b4),
407
408 NOP(5),
409 LRI(9, POSTED),
410 REG16(0x3a8),
411 REG16(0x28c),
412 REG16(0x288),
413 REG16(0x284),
414 REG16(0x280),
415 REG16(0x27c),
416 REG16(0x278),
417 REG16(0x274),
418 REG16(0x270),
419
420 LRI(3, POSTED),
421 REG(0x1b0),
422 REG16(0x5a8),
423 REG16(0x5ac),
424
425 NOP(6),
426 LRI(1, 0),
427 REG(0x0c8),
428 NOP(3 + 9 + 1),
429
430 LRI(51, POSTED),
431 REG16(0x588),
432 REG16(0x588),
433 REG16(0x588),
434 REG16(0x588),
435 REG16(0x588),
436 REG16(0x588),
437 REG(0x028),
438 REG(0x09c),
439 REG(0x0c0),
440 REG(0x178),
441 REG(0x17c),
442 REG16(0x358),
443 REG(0x170),
444 REG(0x150),
445 REG(0x154),
446 REG(0x158),
447 REG16(0x41c),
448 REG16(0x600),
449 REG16(0x604),
450 REG16(0x608),
451 REG16(0x60c),
452 REG16(0x610),
453 REG16(0x614),
454 REG16(0x618),
455 REG16(0x61c),
456 REG16(0x620),
457 REG16(0x624),
458 REG16(0x628),
459 REG16(0x62c),
460 REG16(0x630),
461 REG16(0x634),
462 REG16(0x638),
463 REG16(0x63c),
464 REG16(0x640),
465 REG16(0x644),
466 REG16(0x648),
467 REG16(0x64c),
468 REG16(0x650),
469 REG16(0x654),
470 REG16(0x658),
471 REG16(0x65c),
472 REG16(0x660),
473 REG16(0x664),
474 REG16(0x668),
475 REG16(0x66c),
476 REG16(0x670),
477 REG16(0x674),
478 REG16(0x678),
479 REG16(0x67c),
480 REG(0x068),
481 REG(0x084),
482 NOP(1),
483
484 END
485 };
486
487 static const u8 xehp_rcs_offsets[] = {
488 NOP(1),
489 LRI(13, POSTED),
490 REG16(0x244),
491 REG(0x034),
492 REG(0x030),
493 REG(0x038),
494 REG(0x03c),
495 REG(0x168),
496 REG(0x140),
497 REG(0x110),
498 REG(0x1c0),
499 REG(0x1c4),
500 REG(0x1c8),
501 REG(0x180),
502 REG16(0x2b4),
503
504 NOP(5),
505 LRI(9, POSTED),
506 REG16(0x3a8),
507 REG16(0x28c),
508 REG16(0x288),
509 REG16(0x284),
510 REG16(0x280),
511 REG16(0x27c),
512 REG16(0x278),
513 REG16(0x274),
514 REG16(0x270),
515
516 LRI(3, POSTED),
517 REG(0x1b0),
518 REG16(0x5a8),
519 REG16(0x5ac),
520
521 NOP(6),
522 LRI(1, 0),
523 REG(0x0c8),
524
525 END
526 };
527
528 #undef END
529 #undef REG16
530 #undef REG
531 #undef LRI
532 #undef NOP
533
534 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
535 {
536 /*
537 * The gen12+ lists only have the registers we program in the basic
538 * default state. We rely on the context image using relative
539 * addressing to automatically fix up the register state between the
540 * physical engines for a virtual engine.
541 */
542 GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
543 !intel_engine_has_relative_mmio(engine));
544
545 if (engine->class == RENDER_CLASS) {
546 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
547 return xehp_rcs_offsets;
548 else if (GRAPHICS_VER(engine->i915) >= 12)
549 return gen12_rcs_offsets;
550 else if (GRAPHICS_VER(engine->i915) >= 11)
551 return gen11_rcs_offsets;
552 else if (GRAPHICS_VER(engine->i915) >= 9)
553 return gen9_rcs_offsets;
554 else
555 return gen8_rcs_offsets;
556 } else {
557 if (GRAPHICS_VER(engine->i915) >= 12)
558 return gen12_xcs_offsets;
559 else if (GRAPHICS_VER(engine->i915) >= 9)
560 return gen9_xcs_offsets;
561 else
562 return gen8_xcs_offsets;
563 }
564 }
565
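/*
 * The lrc_ring_*() helpers below return the dword offset of a register's
 * slot within the context image (the corresponding value lives at
 * offset + 1), or -1 when the register is not present in that engine's
 * layout.
 */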
566 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
567 {
568 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
569 return 0x70;
570 else if (GRAPHICS_VER(engine->i915) >= 12)
571 return 0x60;
572 else if (GRAPHICS_VER(engine->i915) >= 9)
573 return 0x54;
574 else if (engine->class == RENDER_CLASS)
575 return 0x58;
576 else
577 return -1;
578 }
579
580 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
581 {
582 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
583 return 0x84;
584 else if (GRAPHICS_VER(engine->i915) >= 12)
585 return 0x74;
586 else if (GRAPHICS_VER(engine->i915) >= 9)
587 return 0x68;
588 else if (engine->class == RENDER_CLASS)
589 return 0xd8;
590 else
591 return -1;
592 }
593
594 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
595 {
596 if (GRAPHICS_VER(engine->i915) >= 12)
597 return 0x12;
598 else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
599 return 0x18;
600 else
601 return -1;
602 }
603
604 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
605 {
606 int x;
607
608 x = lrc_ring_wa_bb_per_ctx(engine);
609 if (x < 0)
610 return x;
611
612 return x + 2;
613 }
614
615 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
616 {
617 int x;
618
619 x = lrc_ring_indirect_ptr(engine);
620 if (x < 0)
621 return x;
622
623 return x + 2;
624 }
625
626 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
627 {
628
629 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
630 /*
631 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
632 * simply to match the RCS context image layout.
633 */
634 return 0xc6;
635 else if (engine->class != RENDER_CLASS)
636 return -1;
637 else if (GRAPHICS_VER(engine->i915) >= 12)
638 return 0xb6;
639 else if (GRAPHICS_VER(engine->i915) >= 11)
640 return 0xaa;
641 else
642 return -1;
643 }
644
645 static u32
646 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
647 {
648 switch (GRAPHICS_VER(engine->i915)) {
649 default:
650 MISSING_CASE(GRAPHICS_VER(engine->i915));
651 fallthrough;
652 case 12:
653 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
654 case 11:
655 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
656 case 9:
657 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
658 case 8:
659 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
660 }
661 }
662
663 static void
664 lrc_setup_indirect_ctx(u32 *regs,
665 const struct intel_engine_cs *engine,
666 u32 ctx_bb_ggtt_addr,
667 u32 size)
668 {
669 GEM_BUG_ON(!size);
670 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
671 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
672 regs[lrc_ring_indirect_ptr(engine) + 1] =
673 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
674
675 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
676 regs[lrc_ring_indirect_offset(engine) + 1] =
677 lrc_ring_indirect_offset_default(engine) << 6;
678 }
679
680 static void init_common_regs(u32 * const regs,
681 const struct intel_context *ce,
682 const struct intel_engine_cs *engine,
683 bool inhibit)
684 {
685 u32 ctl;
686
687 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
688 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
689 if (inhibit)
690 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
691 if (GRAPHICS_VER(engine->i915) < 11)
692 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
693 CTX_CTRL_RS_CTX_ENABLE);
694 regs[CTX_CONTEXT_CONTROL] = ctl;
695
696 regs[CTX_TIMESTAMP] = ce->runtime.last;
697 }
698
699 static void init_wa_bb_regs(u32 * const regs,
700 const struct intel_engine_cs *engine)
701 {
702 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
703
704 if (wa_ctx->per_ctx.size) {
705 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
706
707 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
708 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
709 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
710 }
711
712 if (wa_ctx->indirect_ctx.size) {
713 lrc_setup_indirect_ctx(regs, engine,
714 i915_ggtt_offset(wa_ctx->vma) +
715 wa_ctx->indirect_ctx.offset,
716 wa_ctx->indirect_ctx.size);
717 }
718 }
719
720 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
721 {
722 if (i915_vm_is_4lvl(&ppgtt->vm)) {
723 /* 64b PPGTT (48bit canonical):
724 * PDP0_DESCRIPTOR contains the base address of the PML4 and the
725 * other PDP descriptors are ignored.
726 */
727 ASSIGN_CTX_PML4(ppgtt, regs);
728 } else {
729 ASSIGN_CTX_PDP(ppgtt, regs, 3);
730 ASSIGN_CTX_PDP(ppgtt, regs, 2);
731 ASSIGN_CTX_PDP(ppgtt, regs, 1);
732 ASSIGN_CTX_PDP(ppgtt, regs, 0);
733 }
734 }
735
736 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
737 {
738 if (i915_is_ggtt(vm))
739 return i915_vm_to_ggtt(vm)->alias;
740 else
741 return i915_vm_to_ppgtt(vm);
742 }
743
744 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
745 {
746 int x;
747
748 x = lrc_ring_mi_mode(engine);
749 if (x != -1) {
750 regs[x + 1] &= ~STOP_RING;
751 regs[x + 1] |= STOP_RING << 16;
752 }
753 }
754
755 static void __lrc_init_regs(u32 *regs,
756 const struct intel_context *ce,
757 const struct intel_engine_cs *engine,
758 bool inhibit)
759 {
760 /*
761 * A context is actually a big batch buffer with several
762 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
763 * values we are setting here are only for the first context restore:
764 * on a subsequent save, the GPU will recreate this batchbuffer with new
765 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
766 * we are not initializing here).
767 *
768 * Must keep consistent with virtual_update_register_offsets().
769 */
770
771 if (inhibit)
772 memset(regs, 0, PAGE_SIZE);
773
774 set_offsets(regs, reg_offsets(engine), engine, inhibit);
775
776 init_common_regs(regs, ce, engine, inhibit);
777 init_ppgtt_regs(regs, vm_alias(ce->vm));
778
779 init_wa_bb_regs(regs, engine);
780
781 __reset_stop_ring(regs, engine);
782 }
783
784 void lrc_init_regs(const struct intel_context *ce,
785 const struct intel_engine_cs *engine,
786 bool inhibit)
787 {
788 __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
789 }
790
791 void lrc_reset_regs(const struct intel_context *ce,
792 const struct intel_engine_cs *engine)
793 {
794 __reset_stop_ring(ce->lrc_reg_state, engine);
795 }
796
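/*
 * With CONFIG_DRM_I915_DEBUG_GEM enabled, an extra page past the end of the
 * context image is filled with CONTEXT_REDZONE when the state is initialised
 * and checked again on unpin, to catch writes beyond the context image.
 */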
797 static void
798 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
799 {
800 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
801 return;
802
803 vaddr += engine->context_size;
804
805 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
806 }
807
808 static void
809 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
810 {
811 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
812 return;
813
814 vaddr += engine->context_size;
815
816 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
817 drm_err_once(&engine->i915->drm,
818 "%s context redzone overwritten!\n",
819 engine->name);
820 }
821
822 void lrc_init_state(struct intel_context *ce,
823 struct intel_engine_cs *engine,
824 void *state)
825 {
826 bool inhibit = true;
827
828 set_redzone(state, engine);
829
830 if (engine->default_state) {
831 shmem_read(engine->default_state, 0,
832 state, engine->context_size);
833 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
834 inhibit = false;
835 }
836
837 /* Clear the ppHWSP (inc. per-context counters) */
838 memset(state, 0, PAGE_SIZE);
839
840 /*
841 * The second page of the context object contains some registers which
842 * must be set up prior to the first execution.
843 */
844 __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
845 }
846
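/*
 * The context state object is the engine's context image rounded up to a
 * page, plus a redzone page under CONFIG_DRM_I915_DEBUG_GEM and, on
 * graphics version 12, one more page for the per-context indirect
 * workaround batch (ce->wa_bb_page). Backed by lmem when available,
 * otherwise by shmem.
 */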
847 static struct i915_vma *
848 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
849 {
850 struct drm_i915_gem_object *obj;
851 struct i915_vma *vma;
852 u32 context_size;
853
854 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
855
856 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
857 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
858
859 if (GRAPHICS_VER(engine->i915) == 12) {
860 ce->wa_bb_page = context_size / PAGE_SIZE;
861 context_size += PAGE_SIZE;
862 }
863
864 obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
865 if (IS_ERR(obj))
866 obj = i915_gem_object_create_shmem(engine->i915, context_size);
867 if (IS_ERR(obj))
868 return ERR_CAST(obj);
869
870 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
871 if (IS_ERR(vma)) {
872 i915_gem_object_put(obj);
873 return vma;
874 }
875
876 return vma;
877 }
878
879 static struct intel_timeline *
880 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
881 {
882 struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
883
884 return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
885 }
886
887 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
888 {
889 struct intel_ring *ring;
890 struct i915_vma *vma;
891 int err;
892
893 GEM_BUG_ON(ce->state);
894
895 vma = __lrc_alloc_state(ce, engine);
896 if (IS_ERR(vma))
897 return PTR_ERR(vma);
898
899 ring = intel_engine_create_ring(engine, ce->ring_size);
900 if (IS_ERR(ring)) {
901 err = PTR_ERR(ring);
902 goto err_vma;
903 }
904
905 if (!page_mask_bits(ce->timeline)) {
906 struct intel_timeline *tl;
907
908 /*
909 * Use the static global HWSP for the kernel context, and
910 * a dynamically allocated cacheline for everyone else.
911 */
912 if (unlikely(ce->timeline))
913 tl = pinned_timeline(ce, engine);
914 else
915 tl = intel_timeline_create(engine->gt);
916 if (IS_ERR(tl)) {
917 err = PTR_ERR(tl);
918 goto err_ring;
919 }
920
921 ce->timeline = tl;
922 }
923
924 ce->ring = ring;
925 ce->state = vma;
926
927 return 0;
928
929 err_ring:
930 intel_ring_put(ring);
931 err_vma:
932 i915_vma_put(vma);
933 return err;
934 }
935
936 void lrc_reset(struct intel_context *ce)
937 {
938 GEM_BUG_ON(!intel_context_is_pinned(ce));
939
940 intel_ring_reset(ce->ring, ce->ring->emit);
941
942 /* Scrub away the garbage */
943 lrc_init_regs(ce, ce->engine, true);
944 ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
945 }
946
947 int
948 lrc_pre_pin(struct intel_context *ce,
949 struct intel_engine_cs *engine,
950 struct i915_gem_ww_ctx *ww,
951 void **vaddr)
952 {
953 GEM_BUG_ON(!ce->state);
954 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
955
956 *vaddr = i915_gem_object_pin_map(ce->state->obj,
957 i915_coherent_map_type(ce->engine->i915,
958 ce->state->obj,
959 false) |
960 I915_MAP_OVERRIDE);
961
962 return PTR_ERR_OR_ZERO(*vaddr);
963 }
964
965 int
966 lrc_pin(struct intel_context *ce,
967 struct intel_engine_cs *engine,
968 void *vaddr)
969 {
970 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
971
972 if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
973 lrc_init_state(ce, engine, vaddr);
974
975 ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
976 return 0;
977 }
978
979 void lrc_unpin(struct intel_context *ce)
980 {
981 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
982 ce->engine);
983 }
984
985 void lrc_post_unpin(struct intel_context *ce)
986 {
987 i915_gem_object_unpin_map(ce->state->obj);
988 }
989
990 void lrc_fini(struct intel_context *ce)
991 {
992 if (!ce->state)
993 return;
994
995 intel_ring_put(fetch_and_zero(&ce->ring));
996 i915_vma_put(fetch_and_zero(&ce->state));
997 }
998
999 void lrc_destroy(struct kref *kref)
1000 {
1001 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1002
1003 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1004 GEM_BUG_ON(intel_context_is_pinned(ce));
1005
1006 lrc_fini(ce);
1007
1008 intel_context_fini(ce);
1009 intel_context_free(ce);
1010 }
1011
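/*
 * The gen12 indirect context batch below runs on every context restore. It
 * reloads CTX_TIMESTAMP and (for the render engine) CMD_BUF_CCTL from the
 * values saved in the context image, using CS GPR0 as a temporary, and then
 * restores GPR0 itself from its saved value.
 */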
1012 static u32 *
1013 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1014 {
1015 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1016 MI_SRM_LRM_GLOBAL_GTT |
1017 MI_LRI_LRM_CS_MMIO;
1018 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1019 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1020 CTX_TIMESTAMP * sizeof(u32);
1021 *cs++ = 0;
1022
1023 *cs++ = MI_LOAD_REGISTER_REG |
1024 MI_LRR_SOURCE_CS_MMIO |
1025 MI_LRI_LRM_CS_MMIO;
1026 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1027 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1028
1029 *cs++ = MI_LOAD_REGISTER_REG |
1030 MI_LRR_SOURCE_CS_MMIO |
1031 MI_LRI_LRM_CS_MMIO;
1032 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1033 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1034
1035 return cs;
1036 }
1037
1038 static u32 *
1039 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1040 {
1041 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1042
1043 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1044 MI_SRM_LRM_GLOBAL_GTT |
1045 MI_LRI_LRM_CS_MMIO;
1046 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1047 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1048 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1049 *cs++ = 0;
1050
1051 return cs;
1052 }
1053
1054 static u32 *
1055 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1056 {
1057 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1058
1059 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1060 MI_SRM_LRM_GLOBAL_GTT |
1061 MI_LRI_LRM_CS_MMIO;
1062 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1063 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1064 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1065 *cs++ = 0;
1066
1067 *cs++ = MI_LOAD_REGISTER_REG |
1068 MI_LRR_SOURCE_CS_MMIO |
1069 MI_LRI_LRM_CS_MMIO;
1070 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1071 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1072
1073 return cs;
1074 }
1075
1076 static u32 *
1077 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1078 {
1079 cs = gen12_emit_timestamp_wa(ce, cs);
1080 cs = gen12_emit_cmd_buf_wa(ce, cs);
1081 cs = gen12_emit_restore_scratch(ce, cs);
1082
1083 return cs;
1084 }
1085
1086 static u32 *
1087 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1088 {
1089 cs = gen12_emit_timestamp_wa(ce, cs);
1090 cs = gen12_emit_restore_scratch(ce, cs);
1091
1092 return cs;
1093 }
1094
1095 static u32 context_wa_bb_offset(const struct intel_context *ce)
1096 {
1097 return PAGE_SIZE * ce->wa_bb_page;
1098 }
1099
1100 static u32 *context_indirect_bb(const struct intel_context *ce)
1101 {
1102 void *ptr;
1103
1104 GEM_BUG_ON(!ce->wa_bb_page);
1105
1106 ptr = ce->lrc_reg_state;
1107 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1108 ptr += context_wa_bb_offset(ce);
1109
1110 return ptr;
1111 }
1112
1113 static void
1114 setup_indirect_ctx_bb(const struct intel_context *ce,
1115 const struct intel_engine_cs *engine,
1116 u32 *(*emit)(const struct intel_context *, u32 *))
1117 {
1118 u32 * const start = context_indirect_bb(ce);
1119 u32 *cs;
1120
1121 cs = emit(ce, start);
1122 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1123 while ((unsigned long)cs % CACHELINE_BYTES)
1124 *cs++ = MI_NOOP;
1125
1126 lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1127 i915_ggtt_offset(ce->state) +
1128 context_wa_bb_offset(ce),
1129 (cs - start) * sizeof(*cs));
1130 }
1131
1132 /*
1133 * The context descriptor encodes various attributes of a context,
1134 * including its GTT address and some flags. Because it's fairly
1135 * expensive to calculate, we'll just do it once and cache the result,
1136 * which remains valid until the context is unpinned.
1137 *
1138 * This is what a descriptor looks like, from LSB to MSB::
1139 *
1140 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
1141 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
1142 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
1143 * bits 53-54: mbz, reserved for use by hardware
1144 * bits 55-63: group ID, currently unused and set to 0
1145 *
1146 * Starting from Gen11, the upper dword of the descriptor has a new format:
1147 *
1148 * bits 32-36: reserved
1149 * bits 37-47: SW context ID
1150 * bits 48-53: engine instance
1151 * bit 54: mbz, reserved for use by hardware
1152 * bits 55-60: SW counter
1153 * bits 61-63: engine class
1154 *
1155 * On Xe_HP, the upper dword of the descriptor has a new format:
1156 *
1157 * bits 32-37: virtual function number
1158 * bit 38: mbz, reserved for use by hardware
1159 * bits 39-54: SW context ID
1160 * bits 55-57: reserved
1161 * bits 58-63: SW counter
1162 *
1163 * engine info, SW context ID and SW counter need to form a unique number
1164 * (Context ID) per lrc.
1165 */
1166 static u32 lrc_descriptor(const struct intel_context *ce)
1167 {
1168 u32 desc;
1169
1170 desc = INTEL_LEGACY_32B_CONTEXT;
1171 if (i915_vm_is_4lvl(ce->vm))
1172 desc = INTEL_LEGACY_64B_CONTEXT;
1173 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1174
1175 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1176 if (GRAPHICS_VER(ce->vm->i915) == 8)
1177 desc |= GEN8_CTX_L3LLC_COHERENT;
1178
1179 return i915_ggtt_offset(ce->state) | desc;
1180 }
1181
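/*
 * lrc_update_regs() refreshes the ring registers (and, when a per-context
 * wa_bb page exists, the indirect context batch) in the context image and
 * returns the context descriptor to use for the next submission, with the
 * force-restore flag set.
 */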
1182 u32 lrc_update_regs(const struct intel_context *ce,
1183 const struct intel_engine_cs *engine,
1184 u32 head)
1185 {
1186 struct intel_ring *ring = ce->ring;
1187 u32 *regs = ce->lrc_reg_state;
1188
1189 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1190 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1191
1192 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1193 regs[CTX_RING_HEAD] = head;
1194 regs[CTX_RING_TAIL] = ring->tail;
1195 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1196
1197 /* RPCS */
1198 if (engine->class == RENDER_CLASS) {
1199 regs[CTX_R_PWR_CLK_STATE] =
1200 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1201
1202 i915_oa_init_reg_state(ce, engine);
1203 }
1204
1205 if (ce->wa_bb_page) {
1206 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1207
1208 fn = gen12_emit_indirect_ctx_xcs;
1209 if (ce->engine->class == RENDER_CLASS)
1210 fn = gen12_emit_indirect_ctx_rcs;
1211
1212 /* Mutually exclusive with respect to the global indirect bb */
1213 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1214 setup_indirect_ctx_bb(ce, engine, fn);
1215 }
1216
1217 return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1218 }
1219
1220 void lrc_update_offsets(struct intel_context *ce,
1221 struct intel_engine_cs *engine)
1222 {
1223 set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1224 }
1225
1226 void lrc_check_regs(const struct intel_context *ce,
1227 const struct intel_engine_cs *engine,
1228 const char *when)
1229 {
1230 const struct intel_ring *ring = ce->ring;
1231 u32 *regs = ce->lrc_reg_state;
1232 bool valid = true;
1233 int x;
1234
1235 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1236 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1237 engine->name,
1238 regs[CTX_RING_START],
1239 i915_ggtt_offset(ring->vma));
1240 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1241 valid = false;
1242 }
1243
1244 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1245 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1246 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1247 engine->name,
1248 regs[CTX_RING_CTL],
1249 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1250 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1251 valid = false;
1252 }
1253
1254 x = lrc_ring_mi_mode(engine);
1255 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1256 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1257 engine->name, regs[x + 1]);
1258 regs[x + 1] &= ~STOP_RING;
1259 regs[x + 1] |= STOP_RING << 16;
1260 valid = false;
1261 }
1262
1263 WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1264 }
1265
1266 /*
1267 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after the
1268 * PIPE_CONTROL instruction. This is required for the flush to happen
1269 * correctly, but there is a slight complication: the WA batch is only
1270 * initialized once, so we cannot read the register value at the beginning
1271 * and reuse it later. Hence we save its value to memory, upload a constant
1272 * value with bit 21 set, and then restore the saved value afterwards.
1273 * To simplify the WA, the constant is formed from the default value of the
1274 * register. This shouldn't be a problem because we only modify it for a
1275 * short period and the batch is non-preemptible. We could of course use
1276 * additional instructions that read the actual register value at that time
1277 * and set our bit of interest, but that makes the WA more complicated.
1278 *
1279 * This WA is also required for Gen9, so extracting it as a function avoids
1280 * code duplication.
1281 */
1282 static u32 *
1283 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1284 {
1285 /* NB no one else is allowed to scribble over scratch + 256! */
1286 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1287 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1288 *batch++ = intel_gt_scratch_offset(engine->gt,
1289 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1290 *batch++ = 0;
1291
1292 *batch++ = MI_LOAD_REGISTER_IMM(1);
1293 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1294 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1295
1296 batch = gen8_emit_pipe_control(batch,
1297 PIPE_CONTROL_CS_STALL |
1298 PIPE_CONTROL_DC_FLUSH_ENABLE,
1299 0);
1300
1301 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1302 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1303 *batch++ = intel_gt_scratch_offset(engine->gt,
1304 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1305 *batch++ = 0;
1306
1307 return batch;
1308 }
1309
1310 /*
1311 * Typically we only have one indirect_ctx and per_ctx batch buffer, which are
1312 * initialized at the beginning and shared across all contexts, but this field
1313 * helps us to have multiple batches at different offsets and select them based
1314 * on some criteria. At the moment the batch always starts at the beginning of
1315 * the page and we don't have multiple wa_ctx batch buffers.
1316 *
1317 * The number of WAs applied is not known at the beginning; we use this field
1318 * to return the number of DWORDs written.
1319 *
1320 * Note that this batch does not contain MI_BATCH_BUFFER_END,
1321 * so it adds NOOPs as padding to make it cacheline aligned.
1322 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
1323 * together make a complete batch buffer.
1324 */
1325 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1326 {
1327 /* WaDisableCtxRestoreArbitration:bdw,chv */
1328 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1329
1330 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1331 if (IS_BROADWELL(engine->i915))
1332 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1333
1334 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1335 /* Actual scratch location is at 128 bytes offset */
1336 batch = gen8_emit_pipe_control(batch,
1337 PIPE_CONTROL_FLUSH_L3 |
1338 PIPE_CONTROL_STORE_DATA_INDEX |
1339 PIPE_CONTROL_CS_STALL |
1340 PIPE_CONTROL_QW_WRITE,
1341 LRC_PPHWSP_SCRATCH_ADDR);
1342
1343 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1344
1345 /* Pad to end of cacheline */
1346 while ((unsigned long)batch % CACHELINE_BYTES)
1347 *batch++ = MI_NOOP;
1348
1349 /*
1350 * MI_BATCH_BUFFER_END is not required in the indirect ctx BB because
1351 * execution depends on the length specified in terms of cache lines
1352 * in the register CTX_RCS_INDIRECT_CTX.
1353 */
1354
1355 return batch;
1356 }
1357
1358 struct lri {
1359 i915_reg_t reg;
1360 u32 value;
1361 };
1362
1363 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1364 {
1365 GEM_BUG_ON(!count || count > 63);
1366
1367 *batch++ = MI_LOAD_REGISTER_IMM(count);
1368 do {
1369 *batch++ = i915_mmio_reg_offset(lri->reg);
1370 *batch++ = lri->value;
1371 } while (lri++, --count);
1372 *batch++ = MI_NOOP;
1373
1374 return batch;
1375 }
1376
1377 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1378 {
1379 static const struct lri lri[] = {
1380 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1381 {
1382 COMMON_SLICE_CHICKEN2,
1383 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1384 0),
1385 },
1386
1387 /* BSpec: 11391 */
1388 {
1389 FF_SLICE_CHICKEN,
1390 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1391 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1392 },
1393
1394 /* BSpec: 11299 */
1395 {
1396 _3D_CHICKEN3,
1397 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1398 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1399 }
1400 };
1401
1402 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1403
1404 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1405 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1406
1407 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1408 batch = gen8_emit_pipe_control(batch,
1409 PIPE_CONTROL_FLUSH_L3 |
1410 PIPE_CONTROL_STORE_DATA_INDEX |
1411 PIPE_CONTROL_CS_STALL |
1412 PIPE_CONTROL_QW_WRITE,
1413 LRC_PPHWSP_SCRATCH_ADDR);
1414
1415 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1416
1417 /* WaMediaPoolStateCmdInWABB:bxt,glk */
1418 if (HAS_POOLED_EU(engine->i915)) {
1419 /*
1420 * The EU pool configuration is set up along with the golden context
1421 * during context initialization. This value depends on the
1422 * device type (2x6 or 3x6) and needs to be updated based
1423 * on which subslice is disabled, especially for 2x6
1424 * devices. However, it is safe to load the default
1425 * configuration of a 3x6 device instead of masking off the
1426 * corresponding bits, because the HW ignores bits of a disabled
1427 * subslice and drops down to the appropriate config. Please
1428 * see render_state_setup() in i915_gem_render_state.c for the
1429 * possible configurations; to avoid duplication they are
1430 * not shown here again.
1431 */
1432 *batch++ = GEN9_MEDIA_POOL_STATE;
1433 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1434 *batch++ = 0x00777000;
1435 *batch++ = 0;
1436 *batch++ = 0;
1437 *batch++ = 0;
1438 }
1439
1440 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1441
1442 /* Pad to end of cacheline */
1443 while ((unsigned long)batch % CACHELINE_BYTES)
1444 *batch++ = MI_NOOP;
1445
1446 return batch;
1447 }
1448
1449 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1450
1451 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1452 {
1453 struct drm_i915_gem_object *obj;
1454 struct i915_vma *vma;
1455 int err;
1456
1457 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1458 if (IS_ERR(obj))
1459 return PTR_ERR(obj);
1460
1461 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1462 if (IS_ERR(vma)) {
1463 err = PTR_ERR(vma);
1464 goto err;
1465 }
1466
1467 engine->wa_ctx.vma = vma;
1468 return 0;
1469
1470 err:
1471 i915_gem_object_put(obj);
1472 return err;
1473 }
1474
1475 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1476 {
1477 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1478 }
1479
1480 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1481
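/*
 * lrc_init_wa_ctx() builds the indirect_ctx/per_ctx workaround batches for
 * gen8 and gen9 render engines; for other engine classes and for gen11+
 * no wa_ctx batches are emitted here.
 */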
1482 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1483 {
1484 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1485 struct i915_wa_ctx_bb *wa_bb[] = {
1486 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1487 };
1488 wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1489 struct i915_gem_ww_ctx ww;
1490 void *batch, *batch_ptr;
1491 unsigned int i;
1492 int err;
1493
1494 if (engine->class != RENDER_CLASS)
1495 return;
1496
1497 switch (GRAPHICS_VER(engine->i915)) {
1498 case 12:
1499 case 11:
1500 return;
1501 case 9:
1502 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1503 wa_bb_fn[1] = NULL;
1504 break;
1505 case 8:
1506 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1507 wa_bb_fn[1] = NULL;
1508 break;
1509 default:
1510 MISSING_CASE(GRAPHICS_VER(engine->i915));
1511 return;
1512 }
1513
1514 err = lrc_create_wa_ctx(engine);
1515 if (err) {
1516 /*
1517 * We continue even if we fail to initialize the WA batch
1518 * because we only expect rare glitches and nothing
1519 * critical enough to prevent us from using the GPU.
1520 */
1521 drm_err(&engine->i915->drm,
1522 "Ignoring context switch w/a allocation error:%d\n",
1523 err);
1524 return;
1525 }
1526
1527 if (!engine->wa_ctx.vma)
1528 return;
1529
1530 i915_gem_ww_ctx_init(&ww, true);
1531 retry:
1532 err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1533 if (!err)
1534 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1535 if (err)
1536 goto err;
1537
1538 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1539 if (IS_ERR(batch)) {
1540 err = PTR_ERR(batch);
1541 goto err_unpin;
1542 }
1543
1544 /*
1545 * Emit the two workaround batch buffers, recording the offset from the
1546 * start of the workaround batch buffer object for each and their
1547 * respective sizes.
1548 */
1549 batch_ptr = batch;
1550 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1551 wa_bb[i]->offset = batch_ptr - batch;
1552 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1553 CACHELINE_BYTES))) {
1554 err = -EINVAL;
1555 break;
1556 }
1557 if (wa_bb_fn[i])
1558 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1559 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1560 }
1561 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1562
1563 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1564 __i915_gem_object_release_map(wa_ctx->vma->obj);
1565
1566 /* Verify that we can handle failure to setup the wa_ctx */
1567 if (!err)
1568 err = i915_inject_probe_error(engine->i915, -ENODEV);
1569
1570 err_unpin:
1571 if (err)
1572 i915_vma_unpin(wa_ctx->vma);
1573 err:
1574 if (err == -EDEADLK) {
1575 err = i915_gem_ww_ctx_backoff(&ww);
1576 if (!err)
1577 goto retry;
1578 }
1579 i915_gem_ww_ctx_fini(&ww);
1580
1581 if (err) {
1582 i915_vma_put(engine->wa_ctx.vma);
1583
1584 /* Clear all flags to prevent further use */
1585 memset(wa_ctx, 0, sizeof(*wa_ctx));
1586 }
1587 }
1588
1589 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1590 {
1591 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1592 ce->runtime.num_underflow++;
1593 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1594 #endif
1595 }
1596
1597 void lrc_update_runtime(struct intel_context *ce)
1598 {
1599 u32 old;
1600 s32 dt;
1601
1602 if (intel_context_is_barrier(ce))
1603 return;
1604
1605 old = ce->runtime.last;
1606 ce->runtime.last = lrc_get_runtime(ce);
1607 dt = ce->runtime.last - old;
1608
1609 if (unlikely(dt < 0)) {
1610 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1611 old, ce->runtime.last, dt);
1612 st_update_runtime_underflow(ce, dt);
1613 return;
1614 }
1615
1616 ewma_runtime_add(&ce->runtime.avg, dt);
1617 ce->runtime.total += dt;
1618 }
1619
1620 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1621 #include "selftest_lrc.c"
1622 #endif
1623