• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5 
6 #include "gem/i915_gem_lmem.h"
7 
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18 
/*
 * Byte-code used by the per-engine offset tables and decoded by
 * set_offsets():
 *
 *   NOP(x)            - bit 7 set, low 7 bits give the number of dwords
 *                       of register state to step over
 *   LRI(count, flags) - low 6 bits give the number of registers that
 *                       follow, the flags sit above them (POSTED is the
 *                       only flag tested by the decoder)
 *   REG(x)            - one byte holding a register offset (< 0x200) in
 *                       dword units
 *   REG16(x)          - two bytes holding a register offset (< 0x10000)
 *                       in dword units; the first byte carries the upper
 *                       bits and has bit 7 set as a continuation marker
 *   END               - zero byte terminating the table
 */
#define NOP(x) (BIT(7) | (x))
#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
#define POSTED BIT(0)
#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
#define REG16(x) \
	(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
	(((x) >> 2) & 0x7f)
#define END 0

/*
 * Expand an encoded offset table into a register state image.
 *
 * @regs:   register state to fill in
 * @data:   encoded table, terminated by END
 * @engine: supplies mmio_base (added to every register offset) and the
 *          graphics version used to pick the command flags
 * @close:  when true, finish the image with MI_BATCH_BUFFER_END
 *
 * Only the MI_LOAD_REGISTER_IMM headers and the register offsets are
 * written; the value dword that follows each offset is stepped over and
 * left as it was.
 */
static void set_offsets(u32 *regs,
			const u8 *data,
			const struct intel_engine_cs *engine,
			bool close)
{
	const u32 mmio_base = engine->mmio_base;

	while (*data != END) {
		const u8 op = *data++;
		u8 count;
		u32 lri;

		if (op & BIT(7)) {
			/* NOP: step over the requested number of dwords */
			regs += op & 0x7f;
			continue;
		}

		count = op & 0x3f;

		lri = MI_LOAD_REGISTER_IMM(count);
		if ((op >> 6) & POSTED)
			lri |= MI_LRI_FORCE_POSTED;
		if (GRAPHICS_VER(engine->i915) >= 11)
			lri |= MI_LRI_LRM_CS_MMIO;
		*regs++ = lri;

		GEM_BUG_ON(!count);
		do {
			u32 offset = 0;
			u8 byte;

			/* Accumulate 7 bits per byte while bit 7 is set */
			do {
				byte = *data++;
				offset = (offset << 7) | (byte & 0x7f);
			} while (byte & BIT(7));

			/* Offsets are stored in dwords; skip the value slot */
			regs[0] = mmio_base + (offset << 2);
			regs += 2;
		} while (--count);
	}

	if (!close)
		return;

	/* Close the batch; used mainly by live_lrc_layout() */
	*regs = MI_BATCH_BUFFER_END;
	if (GRAPHICS_VER(engine->i915) >= 11)
		*regs |= BIT(0);
}
77 
78 static const u8 gen8_xcs_offsets[] = {
79 	NOP(1),
80 	LRI(11, 0),
81 	REG16(0x244),
82 	REG(0x034),
83 	REG(0x030),
84 	REG(0x038),
85 	REG(0x03c),
86 	REG(0x168),
87 	REG(0x140),
88 	REG(0x110),
89 	REG(0x11c),
90 	REG(0x114),
91 	REG(0x118),
92 
93 	NOP(9),
94 	LRI(9, 0),
95 	REG16(0x3a8),
96 	REG16(0x28c),
97 	REG16(0x288),
98 	REG16(0x284),
99 	REG16(0x280),
100 	REG16(0x27c),
101 	REG16(0x278),
102 	REG16(0x274),
103 	REG16(0x270),
104 
105 	NOP(13),
106 	LRI(2, 0),
107 	REG16(0x200),
108 	REG(0x028),
109 
110 	END
111 };
112 
113 static const u8 gen9_xcs_offsets[] = {
114 	NOP(1),
115 	LRI(14, POSTED),
116 	REG16(0x244),
117 	REG(0x034),
118 	REG(0x030),
119 	REG(0x038),
120 	REG(0x03c),
121 	REG(0x168),
122 	REG(0x140),
123 	REG(0x110),
124 	REG(0x11c),
125 	REG(0x114),
126 	REG(0x118),
127 	REG(0x1c0),
128 	REG(0x1c4),
129 	REG(0x1c8),
130 
131 	NOP(3),
132 	LRI(9, POSTED),
133 	REG16(0x3a8),
134 	REG16(0x28c),
135 	REG16(0x288),
136 	REG16(0x284),
137 	REG16(0x280),
138 	REG16(0x27c),
139 	REG16(0x278),
140 	REG16(0x274),
141 	REG16(0x270),
142 
143 	NOP(13),
144 	LRI(1, POSTED),
145 	REG16(0x200),
146 
147 	NOP(13),
148 	LRI(44, POSTED),
149 	REG(0x028),
150 	REG(0x09c),
151 	REG(0x0c0),
152 	REG(0x178),
153 	REG(0x17c),
154 	REG16(0x358),
155 	REG(0x170),
156 	REG(0x150),
157 	REG(0x154),
158 	REG(0x158),
159 	REG16(0x41c),
160 	REG16(0x600),
161 	REG16(0x604),
162 	REG16(0x608),
163 	REG16(0x60c),
164 	REG16(0x610),
165 	REG16(0x614),
166 	REG16(0x618),
167 	REG16(0x61c),
168 	REG16(0x620),
169 	REG16(0x624),
170 	REG16(0x628),
171 	REG16(0x62c),
172 	REG16(0x630),
173 	REG16(0x634),
174 	REG16(0x638),
175 	REG16(0x63c),
176 	REG16(0x640),
177 	REG16(0x644),
178 	REG16(0x648),
179 	REG16(0x64c),
180 	REG16(0x650),
181 	REG16(0x654),
182 	REG16(0x658),
183 	REG16(0x65c),
184 	REG16(0x660),
185 	REG16(0x664),
186 	REG16(0x668),
187 	REG16(0x66c),
188 	REG16(0x670),
189 	REG16(0x674),
190 	REG16(0x678),
191 	REG16(0x67c),
192 	REG(0x068),
193 
194 	END
195 };
196 
197 static const u8 gen12_xcs_offsets[] = {
198 	NOP(1),
199 	LRI(13, POSTED),
200 	REG16(0x244),
201 	REG(0x034),
202 	REG(0x030),
203 	REG(0x038),
204 	REG(0x03c),
205 	REG(0x168),
206 	REG(0x140),
207 	REG(0x110),
208 	REG(0x1c0),
209 	REG(0x1c4),
210 	REG(0x1c8),
211 	REG(0x180),
212 	REG16(0x2b4),
213 
214 	NOP(5),
215 	LRI(9, POSTED),
216 	REG16(0x3a8),
217 	REG16(0x28c),
218 	REG16(0x288),
219 	REG16(0x284),
220 	REG16(0x280),
221 	REG16(0x27c),
222 	REG16(0x278),
223 	REG16(0x274),
224 	REG16(0x270),
225 
226 	END
227 };
228 
229 static const u8 gen8_rcs_offsets[] = {
230 	NOP(1),
231 	LRI(14, POSTED),
232 	REG16(0x244),
233 	REG(0x034),
234 	REG(0x030),
235 	REG(0x038),
236 	REG(0x03c),
237 	REG(0x168),
238 	REG(0x140),
239 	REG(0x110),
240 	REG(0x11c),
241 	REG(0x114),
242 	REG(0x118),
243 	REG(0x1c0),
244 	REG(0x1c4),
245 	REG(0x1c8),
246 
247 	NOP(3),
248 	LRI(9, POSTED),
249 	REG16(0x3a8),
250 	REG16(0x28c),
251 	REG16(0x288),
252 	REG16(0x284),
253 	REG16(0x280),
254 	REG16(0x27c),
255 	REG16(0x278),
256 	REG16(0x274),
257 	REG16(0x270),
258 
259 	NOP(13),
260 	LRI(1, 0),
261 	REG(0x0c8),
262 
263 	END
264 };
265 
266 static const u8 gen9_rcs_offsets[] = {
267 	NOP(1),
268 	LRI(14, POSTED),
269 	REG16(0x244),
270 	REG(0x34),
271 	REG(0x30),
272 	REG(0x38),
273 	REG(0x3c),
274 	REG(0x168),
275 	REG(0x140),
276 	REG(0x110),
277 	REG(0x11c),
278 	REG(0x114),
279 	REG(0x118),
280 	REG(0x1c0),
281 	REG(0x1c4),
282 	REG(0x1c8),
283 
284 	NOP(3),
285 	LRI(9, POSTED),
286 	REG16(0x3a8),
287 	REG16(0x28c),
288 	REG16(0x288),
289 	REG16(0x284),
290 	REG16(0x280),
291 	REG16(0x27c),
292 	REG16(0x278),
293 	REG16(0x274),
294 	REG16(0x270),
295 
296 	NOP(13),
297 	LRI(1, 0),
298 	REG(0xc8),
299 
300 	NOP(13),
301 	LRI(44, POSTED),
302 	REG(0x28),
303 	REG(0x9c),
304 	REG(0xc0),
305 	REG(0x178),
306 	REG(0x17c),
307 	REG16(0x358),
308 	REG(0x170),
309 	REG(0x150),
310 	REG(0x154),
311 	REG(0x158),
312 	REG16(0x41c),
313 	REG16(0x600),
314 	REG16(0x604),
315 	REG16(0x608),
316 	REG16(0x60c),
317 	REG16(0x610),
318 	REG16(0x614),
319 	REG16(0x618),
320 	REG16(0x61c),
321 	REG16(0x620),
322 	REG16(0x624),
323 	REG16(0x628),
324 	REG16(0x62c),
325 	REG16(0x630),
326 	REG16(0x634),
327 	REG16(0x638),
328 	REG16(0x63c),
329 	REG16(0x640),
330 	REG16(0x644),
331 	REG16(0x648),
332 	REG16(0x64c),
333 	REG16(0x650),
334 	REG16(0x654),
335 	REG16(0x658),
336 	REG16(0x65c),
337 	REG16(0x660),
338 	REG16(0x664),
339 	REG16(0x668),
340 	REG16(0x66c),
341 	REG16(0x670),
342 	REG16(0x674),
343 	REG16(0x678),
344 	REG16(0x67c),
345 	REG(0x68),
346 
347 	END
348 };
349 
350 static const u8 gen11_rcs_offsets[] = {
351 	NOP(1),
352 	LRI(15, POSTED),
353 	REG16(0x244),
354 	REG(0x034),
355 	REG(0x030),
356 	REG(0x038),
357 	REG(0x03c),
358 	REG(0x168),
359 	REG(0x140),
360 	REG(0x110),
361 	REG(0x11c),
362 	REG(0x114),
363 	REG(0x118),
364 	REG(0x1c0),
365 	REG(0x1c4),
366 	REG(0x1c8),
367 	REG(0x180),
368 
369 	NOP(1),
370 	LRI(9, POSTED),
371 	REG16(0x3a8),
372 	REG16(0x28c),
373 	REG16(0x288),
374 	REG16(0x284),
375 	REG16(0x280),
376 	REG16(0x27c),
377 	REG16(0x278),
378 	REG16(0x274),
379 	REG16(0x270),
380 
381 	LRI(1, POSTED),
382 	REG(0x1b0),
383 
384 	NOP(10),
385 	LRI(1, 0),
386 	REG(0x0c8),
387 
388 	END
389 };
390 
391 static const u8 gen12_rcs_offsets[] = {
392 	NOP(1),
393 	LRI(13, POSTED),
394 	REG16(0x244),
395 	REG(0x034),
396 	REG(0x030),
397 	REG(0x038),
398 	REG(0x03c),
399 	REG(0x168),
400 	REG(0x140),
401 	REG(0x110),
402 	REG(0x1c0),
403 	REG(0x1c4),
404 	REG(0x1c8),
405 	REG(0x180),
406 	REG16(0x2b4),
407 
408 	NOP(5),
409 	LRI(9, POSTED),
410 	REG16(0x3a8),
411 	REG16(0x28c),
412 	REG16(0x288),
413 	REG16(0x284),
414 	REG16(0x280),
415 	REG16(0x27c),
416 	REG16(0x278),
417 	REG16(0x274),
418 	REG16(0x270),
419 
420 	LRI(3, POSTED),
421 	REG(0x1b0),
422 	REG16(0x5a8),
423 	REG16(0x5ac),
424 
425 	NOP(6),
426 	LRI(1, 0),
427 	REG(0x0c8),
428 	NOP(3 + 9 + 1),
429 
430 	LRI(51, POSTED),
431 	REG16(0x588),
432 	REG16(0x588),
433 	REG16(0x588),
434 	REG16(0x588),
435 	REG16(0x588),
436 	REG16(0x588),
437 	REG(0x028),
438 	REG(0x09c),
439 	REG(0x0c0),
440 	REG(0x178),
441 	REG(0x17c),
442 	REG16(0x358),
443 	REG(0x170),
444 	REG(0x150),
445 	REG(0x154),
446 	REG(0x158),
447 	REG16(0x41c),
448 	REG16(0x600),
449 	REG16(0x604),
450 	REG16(0x608),
451 	REG16(0x60c),
452 	REG16(0x610),
453 	REG16(0x614),
454 	REG16(0x618),
455 	REG16(0x61c),
456 	REG16(0x620),
457 	REG16(0x624),
458 	REG16(0x628),
459 	REG16(0x62c),
460 	REG16(0x630),
461 	REG16(0x634),
462 	REG16(0x638),
463 	REG16(0x63c),
464 	REG16(0x640),
465 	REG16(0x644),
466 	REG16(0x648),
467 	REG16(0x64c),
468 	REG16(0x650),
469 	REG16(0x654),
470 	REG16(0x658),
471 	REG16(0x65c),
472 	REG16(0x660),
473 	REG16(0x664),
474 	REG16(0x668),
475 	REG16(0x66c),
476 	REG16(0x670),
477 	REG16(0x674),
478 	REG16(0x678),
479 	REG16(0x67c),
480 	REG(0x068),
481 	REG(0x084),
482 	NOP(1),
483 
484 	END
485 };
486 
487 static const u8 xehp_rcs_offsets[] = {
488 	NOP(1),
489 	LRI(13, POSTED),
490 	REG16(0x244),
491 	REG(0x034),
492 	REG(0x030),
493 	REG(0x038),
494 	REG(0x03c),
495 	REG(0x168),
496 	REG(0x140),
497 	REG(0x110),
498 	REG(0x1c0),
499 	REG(0x1c4),
500 	REG(0x1c8),
501 	REG(0x180),
502 	REG16(0x2b4),
503 
504 	NOP(5),
505 	LRI(9, POSTED),
506 	REG16(0x3a8),
507 	REG16(0x28c),
508 	REG16(0x288),
509 	REG16(0x284),
510 	REG16(0x280),
511 	REG16(0x27c),
512 	REG16(0x278),
513 	REG16(0x274),
514 	REG16(0x270),
515 
516 	LRI(3, POSTED),
517 	REG(0x1b0),
518 	REG16(0x5a8),
519 	REG16(0x5ac),
520 
521 	NOP(6),
522 	LRI(1, 0),
523 	REG(0x0c8),
524 
525 	END
526 };
527 
/* Retire the table-encoding helpers once the offset tables are defined. */
#undef NOP
#undef LRI
#undef REG
#undef REG16
#undef END
533 
reg_offsets(const struct intel_engine_cs * engine)534 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
535 {
536 	/*
537 	 * The gen12+ lists only have the registers we program in the basic
538 	 * default state. We rely on the context image using relative
539 	 * addressing to automatic fixup the register state between the
540 	 * physical engines for virtual engine.
541 	 */
542 	GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
543 		   !intel_engine_has_relative_mmio(engine));
544 
545 	if (engine->class == RENDER_CLASS) {
546 		if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
547 			return xehp_rcs_offsets;
548 		else if (GRAPHICS_VER(engine->i915) >= 12)
549 			return gen12_rcs_offsets;
550 		else if (GRAPHICS_VER(engine->i915) >= 11)
551 			return gen11_rcs_offsets;
552 		else if (GRAPHICS_VER(engine->i915) >= 9)
553 			return gen9_rcs_offsets;
554 		else
555 			return gen8_rcs_offsets;
556 	} else {
557 		if (GRAPHICS_VER(engine->i915) >= 12)
558 			return gen12_xcs_offsets;
559 		else if (GRAPHICS_VER(engine->i915) >= 9)
560 			return gen9_xcs_offsets;
561 		else
562 			return gen8_xcs_offsets;
563 	}
564 }
565 
lrc_ring_mi_mode(const struct intel_engine_cs * engine)566 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
567 {
568 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
569 		return 0x70;
570 	else if (GRAPHICS_VER(engine->i915) >= 12)
571 		return 0x60;
572 	else if (GRAPHICS_VER(engine->i915) >= 9)
573 		return 0x54;
574 	else if (engine->class == RENDER_CLASS)
575 		return 0x58;
576 	else
577 		return -1;
578 }
579 
lrc_ring_gpr0(const struct intel_engine_cs * engine)580 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
581 {
582 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
583 		return 0x84;
584 	else if (GRAPHICS_VER(engine->i915) >= 12)
585 		return 0x74;
586 	else if (GRAPHICS_VER(engine->i915) >= 9)
587 		return 0x68;
588 	else if (engine->class == RENDER_CLASS)
589 		return 0xd8;
590 	else
591 		return -1;
592 }
593 
lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs * engine)594 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
595 {
596 	if (GRAPHICS_VER(engine->i915) >= 12)
597 		return 0x12;
598 	else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
599 		return 0x18;
600 	else
601 		return -1;
602 }
603 
/*
 * Dword index of the indirect context pointer entry: two dwords past
 * the per-context workaround batch entry. A negative result from
 * lrc_ring_wa_bb_per_ctx() is passed through unchanged.
 */
static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
{
	const int per_ctx = lrc_ring_wa_bb_per_ctx(engine);

	return per_ctx < 0 ? per_ctx : per_ctx + 2;
}
614 
/*
 * Dword index of the indirect context offset entry: two dwords past
 * the indirect context pointer entry. A negative result from
 * lrc_ring_indirect_ptr() is passed through unchanged.
 */
static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
{
	const int ptr = lrc_ring_indirect_ptr(engine);

	return ptr < 0 ? ptr : ptr + 2;
}
625 
lrc_ring_cmd_buf_cctl(const struct intel_engine_cs * engine)626 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
627 {
628 
629 	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
630 		/*
631 		 * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
632 		 * simply to match the RCS context image layout.
633 		 */
634 		return 0xc6;
635 	else if (engine->class != RENDER_CLASS)
636 		return -1;
637 	else if (GRAPHICS_VER(engine->i915) >= 12)
638 		return 0xb6;
639 	else if (GRAPHICS_VER(engine->i915) >= 11)
640 		return 0xaa;
641 	else
642 		return -1;
643 }
644 
645 static u32
lrc_ring_indirect_offset_default(const struct intel_engine_cs * engine)646 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
647 {
648 	switch (GRAPHICS_VER(engine->i915)) {
649 	default:
650 		MISSING_CASE(GRAPHICS_VER(engine->i915));
651 		fallthrough;
652 	case 12:
653 		return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
654 	case 11:
655 		return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
656 	case 9:
657 		return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
658 	case 8:
659 		return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
660 	}
661 }
662 
663 static void
lrc_setup_indirect_ctx(u32 * regs,const struct intel_engine_cs * engine,u32 ctx_bb_ggtt_addr,u32 size)664 lrc_setup_indirect_ctx(u32 *regs,
665 		       const struct intel_engine_cs *engine,
666 		       u32 ctx_bb_ggtt_addr,
667 		       u32 size)
668 {
669 	GEM_BUG_ON(!size);
670 	GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
671 	GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
672 	regs[lrc_ring_indirect_ptr(engine) + 1] =
673 		ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
674 
675 	GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
676 	regs[lrc_ring_indirect_offset(engine) + 1] =
677 		lrc_ring_indirect_offset_default(engine) << 6;
678 }
679 
/*
 * Fill in the context control and timestamp slots of a register state.
 *
 * @regs:    register state to update
 * @ce:      context whose last recorded runtime seeds CTX_TIMESTAMP
 * @engine:  engine, used for the graphics version check
 * @inhibit: when true, additionally set the restore-inhibit bit
 */
static void init_common_regs(u32 * const regs,
			     const struct intel_context *ce,
			     const struct intel_engine_cs *engine,
			     bool inhibit)
{
	u32 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);

	ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
	if (inhibit)
		ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;

	/* Before gen11, also clear save-inhibit and RS context enable */
	if (GRAPHICS_VER(engine->i915) < 11)
		ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
					   CTX_CTRL_RS_CTX_ENABLE);

	regs[CTX_CONTEXT_CONTROL] = ctl;
	regs[CTX_TIMESTAMP] = ce->runtime.last;
}
698 
/*
 * Point the register state at the engine's workaround batches, when
 * the engine has any (a size of zero means "not present").
 */
static void init_wa_bb_regs(u32 * const regs,
			    const struct intel_engine_cs *engine)
{
	const struct i915_ctx_workarounds * const wa = &engine->wa_ctx;

	if (wa->per_ctx.size) {
		const u32 base = i915_ggtt_offset(wa->vma);

		GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
		/* Address of the per-context batch with bit 0 set */
		regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
			(base + wa->per_ctx.offset) | 0x01;
	}

	if (wa->indirect_ctx.size)
		lrc_setup_indirect_ctx(regs, engine,
				       i915_ggtt_offset(wa->vma) +
				       wa->indirect_ctx.offset,
				       wa->indirect_ctx.size);
}
719 
/*
 * Write the page-directory descriptors for @ppgtt into the register
 * state: a single PML4 entry for a 4-level address space, otherwise
 * the four PDP entries.
 */
static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
{
	if (!i915_vm_is_4lvl(&ppgtt->vm)) {
		ASSIGN_CTX_PDP(ppgtt, regs, 3);
		ASSIGN_CTX_PDP(ppgtt, regs, 2);
		ASSIGN_CTX_PDP(ppgtt, regs, 1);
		ASSIGN_CTX_PDP(ppgtt, regs, 0);
		return;
	}

	/*
	 * 64b PPGTT (48bit canonical)
	 * PDP0_DESCRIPTOR contains the base address to PML4 and
	 * other PDP Descriptors are ignored.
	 */
	ASSIGN_CTX_PML4(ppgtt, regs);
}
735 
vm_alias(struct i915_address_space * vm)736 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
737 {
738 	if (i915_is_ggtt(vm))
739 		return i915_vm_to_ggtt(vm)->alias;
740 	else
741 		return i915_vm_to_ppgtt(vm);
742 }
743 
/*
 * Clear STOP_RING in the value dword located by lrc_ring_mi_mode() and
 * set the same bit in the upper 16 bits of that dword. Does nothing
 * when the engine has no such slot.
 */
static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
{
	const int slot = lrc_ring_mi_mode(engine);
	u32 *value;

	if (slot == -1)
		return;

	value = &regs[slot + 1];
	*value &= ~STOP_RING;
	*value |= STOP_RING << 16;
}
754 
/*
 * Initialise a context register state image.
 *
 * @regs:    register state to initialise
 * @ce:      context the image belongs to
 * @engine:  engine the image is laid out for
 * @inhibit: when true the first page of @regs is zeroed first, the
 *           image is closed with a batch-buffer end, and the
 *           restore-inhibit bit is set
 */
static void __lrc_init_regs(u32 *regs,
			    const struct intel_context *ce,
			    const struct intel_engine_cs *engine,
			    bool inhibit)
{
	const u8 *layout;

	/*
	 * A context is actually a big batch buffer with several
	 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
	 * values we are setting here are only for the first context restore:
	 * on a subsequent save, the GPU will recreate this batchbuffer with new
	 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
	 * we are not initializing here).
	 *
	 * Must keep consistent with virtual_update_register_offsets().
	 */

	if (inhibit)
		memset(regs, 0, PAGE_SIZE);

	/* Headers and register offsets first, then the individual values */
	layout = reg_offsets(engine);
	set_offsets(regs, layout, engine, inhibit);

	init_common_regs(regs, ce, engine, inhibit);
	init_ppgtt_regs(regs, vm_alias(ce->vm));

	init_wa_bb_regs(regs, engine);

	__reset_stop_ring(regs, engine);
}
783 
/*
 * Initialise the register state currently referenced by
 * @ce->lrc_reg_state; see __lrc_init_regs() for @inhibit.
 */
void lrc_init_regs(const struct intel_context *ce,
		   const struct intel_engine_cs *engine,
		   bool inhibit)
{
	u32 *regs = ce->lrc_reg_state;

	__lrc_init_regs(regs, ce, engine, inhibit);
}
790 
lrc_reset_regs(const struct intel_context * ce,const struct intel_engine_cs * engine)791 void lrc_reset_regs(const struct intel_context *ce,
792 		    const struct intel_engine_cs *engine)
793 {
794 	__reset_stop_ring(ce->lrc_reg_state, engine);
795 }
796 
797 static void
set_redzone(void * vaddr,const struct intel_engine_cs * engine)798 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
799 {
800 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
801 		return;
802 
803 	vaddr += engine->context_size;
804 
805 	memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
806 }
807 
808 static void
check_redzone(const void * vaddr,const struct intel_engine_cs * engine)809 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
810 {
811 	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
812 		return;
813 
814 	vaddr += engine->context_size;
815 
816 	if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
817 		drm_err_once(&engine->i915->drm,
818 			     "%s context redzone overwritten!\n",
819 			     engine->name);
820 }
821 
lrc_init_state(struct intel_context * ce,struct intel_engine_cs * engine,void * state)822 void lrc_init_state(struct intel_context *ce,
823 		    struct intel_engine_cs *engine,
824 		    void *state)
825 {
826 	bool inhibit = true;
827 
828 	set_redzone(state, engine);
829 
830 	if (engine->default_state) {
831 		shmem_read(engine->default_state, 0,
832 			   state, engine->context_size);
833 		__set_bit(CONTEXT_VALID_BIT, &ce->flags);
834 		inhibit = false;
835 	}
836 
837 	/* Clear the ppHWSP (inc. per-context counters) */
838 	memset(state, 0, PAGE_SIZE);
839 
840 	/*
841 	 * The second page of the context object contains some registers which
842 	 * must be set up prior to the first execution.
843 	 */
844 	__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
845 }
846 
847 static struct i915_vma *
__lrc_alloc_state(struct intel_context * ce,struct intel_engine_cs * engine)848 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
849 {
850 	struct drm_i915_gem_object *obj;
851 	struct i915_vma *vma;
852 	u32 context_size;
853 
854 	context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
855 
856 	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
857 		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
858 
859 	if (GRAPHICS_VER(engine->i915) == 12) {
860 		ce->wa_bb_page = context_size / PAGE_SIZE;
861 		context_size += PAGE_SIZE;
862 	}
863 
864 	obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
865 	if (IS_ERR(obj))
866 		obj = i915_gem_object_create_shmem(engine->i915, context_size);
867 	if (IS_ERR(obj))
868 		return ERR_CAST(obj);
869 
870 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
871 	if (IS_ERR(vma)) {
872 		i915_gem_object_put(obj);
873 		return vma;
874 	}
875 
876 	return vma;
877 }
878 
879 static struct intel_timeline *
pinned_timeline(struct intel_context * ce,struct intel_engine_cs * engine)880 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
881 {
882 	struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
883 
884 	return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
885 }
886 
lrc_alloc(struct intel_context * ce,struct intel_engine_cs * engine)887 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
888 {
889 	struct intel_ring *ring;
890 	struct i915_vma *vma;
891 	int err;
892 
893 	GEM_BUG_ON(ce->state);
894 
895 	vma = __lrc_alloc_state(ce, engine);
896 	if (IS_ERR(vma))
897 		return PTR_ERR(vma);
898 
899 	ring = intel_engine_create_ring(engine, ce->ring_size);
900 	if (IS_ERR(ring)) {
901 		err = PTR_ERR(ring);
902 		goto err_vma;
903 	}
904 
905 	if (!page_mask_bits(ce->timeline)) {
906 		struct intel_timeline *tl;
907 
908 		/*
909 		 * Use the static global HWSP for the kernel context, and
910 		 * a dynamically allocated cacheline for everyone else.
911 		 */
912 		if (unlikely(ce->timeline))
913 			tl = pinned_timeline(ce, engine);
914 		else
915 			tl = intel_timeline_create(engine->gt);
916 		if (IS_ERR(tl)) {
917 			err = PTR_ERR(tl);
918 			goto err_ring;
919 		}
920 
921 		ce->timeline = tl;
922 	}
923 
924 	ce->ring = ring;
925 	ce->state = vma;
926 
927 	return 0;
928 
929 err_ring:
930 	intel_ring_put(ring);
931 err_vma:
932 	i915_vma_put(vma);
933 	return err;
934 }
935 
lrc_reset(struct intel_context * ce)936 void lrc_reset(struct intel_context *ce)
937 {
938 	GEM_BUG_ON(!intel_context_is_pinned(ce));
939 
940 	intel_ring_reset(ce->ring, ce->ring->emit);
941 
942 	/* Scrub away the garbage */
943 	lrc_init_regs(ce, ce->engine, true);
944 	ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
945 }
946 
947 int
lrc_pre_pin(struct intel_context * ce,struct intel_engine_cs * engine,struct i915_gem_ww_ctx * ww,void ** vaddr)948 lrc_pre_pin(struct intel_context *ce,
949 	    struct intel_engine_cs *engine,
950 	    struct i915_gem_ww_ctx *ww,
951 	    void **vaddr)
952 {
953 	GEM_BUG_ON(!ce->state);
954 	GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
955 
956 	*vaddr = i915_gem_object_pin_map(ce->state->obj,
957 					 i915_coherent_map_type(ce->engine->i915,
958 								ce->state->obj,
959 								false) |
960 					 I915_MAP_OVERRIDE);
961 
962 	return PTR_ERR_OR_ZERO(*vaddr);
963 }
964 
965 int
lrc_pin(struct intel_context * ce,struct intel_engine_cs * engine,void * vaddr)966 lrc_pin(struct intel_context *ce,
967 	struct intel_engine_cs *engine,
968 	void *vaddr)
969 {
970 	ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
971 
972 	if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
973 		lrc_init_state(ce, engine, vaddr);
974 
975 	ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
976 	return 0;
977 }
978 
lrc_unpin(struct intel_context * ce)979 void lrc_unpin(struct intel_context *ce)
980 {
981 	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
982 		      ce->engine);
983 }
984 
lrc_post_unpin(struct intel_context * ce)985 void lrc_post_unpin(struct intel_context *ce)
986 {
987 	i915_gem_object_unpin_map(ce->state->obj);
988 }
989 
lrc_fini(struct intel_context * ce)990 void lrc_fini(struct intel_context *ce)
991 {
992 	if (!ce->state)
993 		return;
994 
995 	intel_ring_put(fetch_and_zero(&ce->ring));
996 	i915_vma_put(fetch_and_zero(&ce->state));
997 }
998 
lrc_destroy(struct kref * kref)999 void lrc_destroy(struct kref *kref)
1000 {
1001 	struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1002 
1003 	GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1004 	GEM_BUG_ON(intel_context_is_pinned(ce));
1005 
1006 	lrc_fini(ce);
1007 
1008 	intel_context_fini(ce);
1009 	intel_context_free(ce);
1010 }
1011 
1012 static u32 *
gen12_emit_timestamp_wa(const struct intel_context * ce,u32 * cs)1013 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1014 {
1015 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1016 		MI_SRM_LRM_GLOBAL_GTT |
1017 		MI_LRI_LRM_CS_MMIO;
1018 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1019 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1020 		CTX_TIMESTAMP * sizeof(u32);
1021 	*cs++ = 0;
1022 
1023 	*cs++ = MI_LOAD_REGISTER_REG |
1024 		MI_LRR_SOURCE_CS_MMIO |
1025 		MI_LRI_LRM_CS_MMIO;
1026 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1027 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1028 
1029 	*cs++ = MI_LOAD_REGISTER_REG |
1030 		MI_LRR_SOURCE_CS_MMIO |
1031 		MI_LRI_LRM_CS_MMIO;
1032 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1033 	*cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1034 
1035 	return cs;
1036 }
1037 
1038 static u32 *
gen12_emit_restore_scratch(const struct intel_context * ce,u32 * cs)1039 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1040 {
1041 	GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1042 
1043 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1044 		MI_SRM_LRM_GLOBAL_GTT |
1045 		MI_LRI_LRM_CS_MMIO;
1046 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1047 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1048 		(lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1049 	*cs++ = 0;
1050 
1051 	return cs;
1052 }
1053 
1054 static u32 *
gen12_emit_cmd_buf_wa(const struct intel_context * ce,u32 * cs)1055 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1056 {
1057 	GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1058 
1059 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1060 		MI_SRM_LRM_GLOBAL_GTT |
1061 		MI_LRI_LRM_CS_MMIO;
1062 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1063 	*cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1064 		(lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1065 	*cs++ = 0;
1066 
1067 	*cs++ = MI_LOAD_REGISTER_REG |
1068 		MI_LRR_SOURCE_CS_MMIO |
1069 		MI_LRI_LRM_CS_MMIO;
1070 	*cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1071 	*cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1072 
1073 	return cs;
1074 }
1075 
1076 static u32 *
gen12_emit_indirect_ctx_rcs(const struct intel_context * ce,u32 * cs)1077 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1078 {
1079 	cs = gen12_emit_timestamp_wa(ce, cs);
1080 	cs = gen12_emit_cmd_buf_wa(ce, cs);
1081 	cs = gen12_emit_restore_scratch(ce, cs);
1082 
1083 	return cs;
1084 }
1085 
1086 static u32 *
gen12_emit_indirect_ctx_xcs(const struct intel_context * ce,u32 * cs)1087 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1088 {
1089 	cs = gen12_emit_timestamp_wa(ce, cs);
1090 	cs = gen12_emit_restore_scratch(ce, cs);
1091 
1092 	return cs;
1093 }
1094 
context_wa_bb_offset(const struct intel_context * ce)1095 static u32 context_wa_bb_offset(const struct intel_context *ce)
1096 {
1097 	return PAGE_SIZE * ce->wa_bb_page;
1098 }
1099 
context_indirect_bb(const struct intel_context * ce)1100 static u32 *context_indirect_bb(const struct intel_context *ce)
1101 {
1102 	void *ptr;
1103 
1104 	GEM_BUG_ON(!ce->wa_bb_page);
1105 
1106 	ptr = ce->lrc_reg_state;
1107 	ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1108 	ptr += context_wa_bb_offset(ce);
1109 
1110 	return ptr;
1111 }
1112 
1113 static void
setup_indirect_ctx_bb(const struct intel_context * ce,const struct intel_engine_cs * engine,u32 * (* emit)(const struct intel_context *,u32 *))1114 setup_indirect_ctx_bb(const struct intel_context *ce,
1115 		      const struct intel_engine_cs *engine,
1116 		      u32 *(*emit)(const struct intel_context *, u32 *))
1117 {
1118 	u32 * const start = context_indirect_bb(ce);
1119 	u32 *cs;
1120 
1121 	cs = emit(ce, start);
1122 	GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1123 	while ((unsigned long)cs % CACHELINE_BYTES)
1124 		*cs++ = MI_NOOP;
1125 
1126 	lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1127 			       i915_ggtt_offset(ce->state) +
1128 			       context_wa_bb_offset(ce),
1129 			       (cs - start) * sizeof(*cs));
1130 }
1131 
1132 /*
1133  * The context descriptor encodes various attributes of a context,
1134  * including its GTT address and some flags. Because it's fairly
1135  * expensive to calculate, we'll just do it once and cache the result,
1136  * which remains valid until the context is unpinned.
1137  *
1138  * This is what a descriptor looks like, from LSB to MSB::
1139  *
1140  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1141  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1142  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1143  *      bits 53-54:    mbz, reserved for use by hardware
1144  *      bits 55-63:    group ID, currently unused and set to 0
1145  *
1146  * Starting from Gen11, the upper dword of the descriptor has a new format:
1147  *
1148  *      bits 32-36:    reserved
1149  *      bits 37-47:    SW context ID
1150  *      bits 48:53:    engine instance
1151  *      bit 54:        mbz, reserved for use by hardware
1152  *      bits 55-60:    SW counter
1153  *      bits 61-63:    engine class
1154  *
1155  * On Xe_HP, the upper dword of the descriptor has a new format:
1156  *
1157  *      bits 32-37:    virtual function number
1158  *      bit 38:        mbz, reserved for use by hardware
1159  *      bits 39-54:    SW context ID
1160  *      bits 55-57:    reserved
1161  *      bits 58-63:    SW counter
1162  *
1163  * engine info, SW context ID and SW counter need to form a unique number
1164  * (Context ID) per lrc.
1165  */
lrc_descriptor(const struct intel_context * ce)1166 static u32 lrc_descriptor(const struct intel_context *ce)
1167 {
1168 	u32 desc;
1169 
1170 	desc = INTEL_LEGACY_32B_CONTEXT;
1171 	if (i915_vm_is_4lvl(ce->vm))
1172 		desc = INTEL_LEGACY_64B_CONTEXT;
1173 	desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1174 
1175 	desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1176 	if (GRAPHICS_VER(ce->vm->i915) == 8)
1177 		desc |= GEN8_CTX_L3LLC_COHERENT;
1178 
1179 	return i915_ggtt_offset(ce->state) | desc;
1180 }
1181 
/*
 * Refresh the ring-related slots of the context register state and
 * return the descriptor to submit.
 *
 * @ce:     context whose register state is updated
 * @engine: engine the context will run on
 * @head:   ring head offset to program; must be valid for the ring
 *
 * For the render class the power/clock state slot and the OA state are
 * also refreshed. When the context has a workaround batch page, the
 * per-context indirect batch is rebuilt as well.
 *
 * Returns lrc_descriptor() with CTX_DESC_FORCE_RESTORE set.
 */
u32 lrc_update_regs(const struct intel_context *ce,
		    const struct intel_engine_cs *engine,
		    u32 head)
{
	struct intel_ring *ring = ce->ring;
	u32 *state = ce->lrc_reg_state;

	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));

	state[CTX_RING_START] = i915_ggtt_offset(ring->vma);
	state[CTX_RING_HEAD] = head;
	state[CTX_RING_TAIL] = ring->tail;
	state[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;

	/* RPCS */
	if (engine->class == RENDER_CLASS) {
		state[CTX_R_PWR_CLK_STATE] =
			intel_sseu_make_rpcs(engine->gt, &ce->sseu);

		i915_oa_init_reg_state(ce, engine);
	}

	if (ce->wa_bb_page) {
		u32 *(*emit)(const struct intel_context *ce, u32 *cs);

		if (ce->engine->class == RENDER_CLASS)
			emit = gen12_emit_indirect_ctx_rcs;
		else
			emit = gen12_emit_indirect_ctx_xcs;

		/* Mutually exclusive wrt to global indirect bb */
		GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
		setup_indirect_ctx_bb(ce, engine, emit);
	}

	return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
}
1219 
lrc_update_offsets(struct intel_context * ce,struct intel_engine_cs * engine)1220 void lrc_update_offsets(struct intel_context *ce,
1221 			struct intel_engine_cs *engine)
1222 {
1223 	set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1224 }
1225 
lrc_check_regs(const struct intel_context * ce,const struct intel_engine_cs * engine,const char * when)1226 void lrc_check_regs(const struct intel_context *ce,
1227 		    const struct intel_engine_cs *engine,
1228 		    const char *when)
1229 {
1230 	const struct intel_ring *ring = ce->ring;
1231 	u32 *regs = ce->lrc_reg_state;
1232 	bool valid = true;
1233 	int x;
1234 
1235 	if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1236 		pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1237 		       engine->name,
1238 		       regs[CTX_RING_START],
1239 		       i915_ggtt_offset(ring->vma));
1240 		regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1241 		valid = false;
1242 	}
1243 
1244 	if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1245 	    (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1246 		pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1247 		       engine->name,
1248 		       regs[CTX_RING_CTL],
1249 		       (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1250 		regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1251 		valid = false;
1252 	}
1253 
1254 	x = lrc_ring_mi_mode(engine);
1255 	if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1256 		pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1257 		       engine->name, regs[x + 1]);
1258 		regs[x + 1] &= ~STOP_RING;
1259 		regs[x + 1] |= STOP_RING << 16;
1260 		valid = false;
1261 	}
1262 
1263 	WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1264 }
1265 
1266 /*
1267  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1268  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1269  * but there is a slight complication as this is applied in WA batch where the
1270  * values are only initialized once so we cannot take register value at the
1271  * beginning and reuse it further; hence we save its value to memory, upload a
1272  * constant value with bit21 set and then we restore it back with the saved value.
1273  * To simplify the WA, a constant value is formed by using the default value
1274  * of this register. This shouldn't be a problem because we are only modifying
1275  * it for a short period and this batch is non-preemptible. We can of course
1276  * use additional instructions that read the actual value of the register
1277  * at that time and set our bit of interest but it makes the WA complicated.
1278  *
1279  * This WA is also required for Gen9 so extracting as a function avoids
1280  * code duplication.
1281  */
1282 static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs * engine,u32 * batch)1283 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1284 {
1285 	/* NB no one else is allowed to scribble over scratch + 256! */
1286 	*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1287 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1288 	*batch++ = intel_gt_scratch_offset(engine->gt,
1289 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1290 	*batch++ = 0;
1291 
1292 	*batch++ = MI_LOAD_REGISTER_IMM(1);
1293 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1294 	*batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1295 
1296 	batch = gen8_emit_pipe_control(batch,
1297 				       PIPE_CONTROL_CS_STALL |
1298 				       PIPE_CONTROL_DC_FLUSH_ENABLE,
1299 				       0);
1300 
1301 	*batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1302 	*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1303 	*batch++ = intel_gt_scratch_offset(engine->gt,
1304 					   INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1305 	*batch++ = 0;
1306 
1307 	return batch;
1308 }
1309 
1310 /*
1311  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1312  * initialized at the beginning and shared across all contexts but this field
1313  * helps us to have multiple batches at different offsets and select them based
1314  * on a criterion. At the moment this batch always starts at the beginning of the page
1315  * and at this point we don't have multiple wa_ctx batch buffers.
1316  *
1317  * The number of WAs applied is not known at the beginning; we use this field
1318  * to return the number of DWORDs written.
1319  *
1320  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
1321  * so it adds NOOPs as padding to make it cacheline aligned.
1322  * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
1323  * make a complete batch buffer.
1324  */
/*
 * Write the Gen8 indirect-context workaround batch at @batch and return the
 * cacheline-aligned pointer just past it. Arbitration is disabled around the
 * workarounds and re-enabled before the padding.
 */
static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	u32 *cs = batch;

	/* WaDisableCtxRestoreArbitration:bdw,chv */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
	if (IS_BROADWELL(engine->i915))
		cs = gen8_emit_flush_coherentl3_wa(engine, cs);

	/*
	 * WaClearSlmSpaceAtContextSwitch:bdw,chv
	 * Actual scratch location is at 128 bytes offset.
	 */
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_FLUSH_L3 |
				    PIPE_CONTROL_STORE_DATA_INDEX |
				    PIPE_CONTROL_CS_STALL |
				    PIPE_CONTROL_QW_WRITE,
				    LRC_PPHWSP_SCRATCH_ADDR);

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/*
	 * Fill the remainder of the cacheline with no-ops. No
	 * MI_BATCH_BUFFER_END is written here: execution of the indirect ctx
	 * BB is bounded by the length, in cachelines, programmed into
	 * CTX_RCS_INDIRECT_CTX.
	 */
	for (; (unsigned long)cs % CACHELINE_BYTES; cs++)
		*cs = MI_NOOP;

	return cs;
}
1357 
1358 struct lri {
1359 	i915_reg_t reg;
1360 	u32 value;
1361 };
1362 
/*
 * Emit a single MI_LOAD_REGISTER_IMM covering @count entries of @lri,
 * followed by one MI_NOOP, and return the advanced write pointer.
 *
 * @count must be between 1 and 63 inclusive (asserted with GEM_BUG_ON).
 */
static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
{
	unsigned int idx = 0;
	u32 *cs = batch;

	GEM_BUG_ON(!count || count > 63);

	*cs++ = MI_LOAD_REGISTER_IMM(count);
	do {
		const struct lri *entry = &lri[idx];

		*cs++ = i915_mmio_reg_offset(entry->reg);
		*cs++ = entry->value;
	} while (++idx != count);
	*cs++ = MI_NOOP;

	return cs;
}
1376 
/*
 * Write the Gen9 indirect-context workaround batch at @batch and return the
 * cacheline-aligned pointer just past it. Arbitration is disabled for the
 * duration of the workarounds and re-enabled before the padding.
 */
static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{
	static const struct lri wa_regs[] = {
		/* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
		{
			.reg = COMMON_SLICE_CHICKEN2,
			.value = __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
						0),
		},

		/* BSpec: 11391 */
		{
			.reg = FF_SLICE_CHICKEN,
			.value = __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
						FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
		},

		/* BSpec: 11299 */
		{
			.reg = _3D_CHICKEN3,
			.value = __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
						_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
		}
	};
	u32 *cs = batch;

	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;

	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
	cs = gen8_emit_flush_coherentl3_wa(engine, cs);

	/* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
	cs = gen8_emit_pipe_control(cs,
				    PIPE_CONTROL_FLUSH_L3 |
				    PIPE_CONTROL_STORE_DATA_INDEX |
				    PIPE_CONTROL_CS_STALL |
				    PIPE_CONTROL_QW_WRITE,
				    LRC_PPHWSP_SCRATCH_ADDR);

	cs = emit_lri(cs, wa_regs, ARRAY_SIZE(wa_regs));

	/* WaMediaPoolStateCmdInWABB:bxt,glk */
	if (HAS_POOLED_EU(engine->i915)) {
		/*
		 * The EU pool configuration is set up together with the
		 * golden context at context initialization. Its value depends
		 * on the device type (2x6 or 3x6) and, for 2x6 devices in
		 * particular, on which subslice is disabled. Loading the
		 * default 3x6 configuration is nevertheless safe: HW ignores
		 * the bits of a disabled subslice and drops down to the
		 * appropriate config. See render_state_setup() in
		 * i915_gem_render_state.c for the possible configurations;
		 * they are not repeated here.
		 */
		*cs++ = GEN9_MEDIA_POOL_STATE;
		*cs++ = GEN9_MEDIA_POOL_ENABLE;
		*cs++ = 0x00777000;
		*cs++ = 0;
		*cs++ = 0;
		*cs++ = 0;
	}

	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

	/* Fill the remainder of the cacheline with no-ops */
	for (; (unsigned long)cs % CACHELINE_BYTES; cs++)
		*cs = MI_NOOP;

	return cs;
}
1448 
1449 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1450 
lrc_create_wa_ctx(struct intel_engine_cs * engine)1451 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1452 {
1453 	struct drm_i915_gem_object *obj;
1454 	struct i915_vma *vma;
1455 	int err;
1456 
1457 	obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1458 	if (IS_ERR(obj))
1459 		return PTR_ERR(obj);
1460 
1461 	vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1462 	if (IS_ERR(vma)) {
1463 		err = PTR_ERR(vma);
1464 		goto err;
1465 	}
1466 
1467 	engine->wa_ctx.vma = vma;
1468 	return 0;
1469 
1470 err:
1471 	i915_gem_object_put(obj);
1472 	return err;
1473 }
1474 
lrc_fini_wa_ctx(struct intel_engine_cs * engine)1475 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1476 {
1477 	i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1478 }
1479 
1480 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1481 
lrc_init_wa_ctx(struct intel_engine_cs * engine)1482 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1483 {
1484 	struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1485 	struct i915_wa_ctx_bb *wa_bb[] = {
1486 		&wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1487 	};
1488 	wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1489 	struct i915_gem_ww_ctx ww;
1490 	void *batch, *batch_ptr;
1491 	unsigned int i;
1492 	int err;
1493 
1494 	if (engine->class != RENDER_CLASS)
1495 		return;
1496 
1497 	switch (GRAPHICS_VER(engine->i915)) {
1498 	case 12:
1499 	case 11:
1500 		return;
1501 	case 9:
1502 		wa_bb_fn[0] = gen9_init_indirectctx_bb;
1503 		wa_bb_fn[1] = NULL;
1504 		break;
1505 	case 8:
1506 		wa_bb_fn[0] = gen8_init_indirectctx_bb;
1507 		wa_bb_fn[1] = NULL;
1508 		break;
1509 	default:
1510 		MISSING_CASE(GRAPHICS_VER(engine->i915));
1511 		return;
1512 	}
1513 
1514 	err = lrc_create_wa_ctx(engine);
1515 	if (err) {
1516 		/*
1517 		 * We continue even if we fail to initialize WA batch
1518 		 * because we only expect rare glitches but nothing
1519 		 * critical to prevent us from using GPU
1520 		 */
1521 		drm_err(&engine->i915->drm,
1522 			"Ignoring context switch w/a allocation error:%d\n",
1523 			err);
1524 		return;
1525 	}
1526 
1527 	if (!engine->wa_ctx.vma)
1528 		return;
1529 
1530 	i915_gem_ww_ctx_init(&ww, true);
1531 retry:
1532 	err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1533 	if (!err)
1534 		err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1535 	if (err)
1536 		goto err;
1537 
1538 	batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1539 	if (IS_ERR(batch)) {
1540 		err = PTR_ERR(batch);
1541 		goto err_unpin;
1542 	}
1543 
1544 	/*
1545 	 * Emit the two workaround batch buffers, recording the offset from the
1546 	 * start of the workaround batch buffer object for each and their
1547 	 * respective sizes.
1548 	 */
1549 	batch_ptr = batch;
1550 	for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1551 		wa_bb[i]->offset = batch_ptr - batch;
1552 		if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1553 						  CACHELINE_BYTES))) {
1554 			err = -EINVAL;
1555 			break;
1556 		}
1557 		if (wa_bb_fn[i])
1558 			batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1559 		wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1560 	}
1561 	GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1562 
1563 	__i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1564 	__i915_gem_object_release_map(wa_ctx->vma->obj);
1565 
1566 	/* Verify that we can handle failure to setup the wa_ctx */
1567 	if (!err)
1568 		err = i915_inject_probe_error(engine->i915, -ENODEV);
1569 
1570 err_unpin:
1571 	if (err)
1572 		i915_vma_unpin(wa_ctx->vma);
1573 err:
1574 	if (err == -EDEADLK) {
1575 		err = i915_gem_ww_ctx_backoff(&ww);
1576 		if (!err)
1577 			goto retry;
1578 	}
1579 	i915_gem_ww_ctx_fini(&ww);
1580 
1581 	if (err) {
1582 		i915_vma_put(engine->wa_ctx.vma);
1583 
1584 		/* Clear all flags to prevent further use */
1585 		memset(wa_ctx, 0, sizeof(*wa_ctx));
1586 	}
1587 }
1588 
st_update_runtime_underflow(struct intel_context * ce,s32 dt)1589 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1590 {
1591 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1592 	ce->runtime.num_underflow++;
1593 	ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1594 #endif
1595 }
1596 
lrc_update_runtime(struct intel_context * ce)1597 void lrc_update_runtime(struct intel_context *ce)
1598 {
1599 	u32 old;
1600 	s32 dt;
1601 
1602 	if (intel_context_is_barrier(ce))
1603 		return;
1604 
1605 	old = ce->runtime.last;
1606 	ce->runtime.last = lrc_get_runtime(ce);
1607 	dt = ce->runtime.last - old;
1608 
1609 	if (unlikely(dt < 0)) {
1610 		CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1611 			 old, ce->runtime.last, dt);
1612 		st_update_runtime_underflow(ce, dt);
1613 		return;
1614 	}
1615 
1616 	ewma_runtime_add(&ce->runtime.avg, dt);
1617 	ce->runtime.total += dt;
1618 }
1619 
1620 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1621 #include "selftest_lrc.c"
1622 #endif
1623