1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Ben Widawsky <ben@bwidawsk.net>
25 * Michel Thierry <michel.thierry@intel.com>
26 * Thomas Daniel <thomas.daniel@intel.com>
27 * Oscar Mateo <oscar.mateo@intel.com>
28 *
29 */
30
31 /**
32 * DOC: Logical Rings, Logical Ring Contexts and Execlists
33 *
34 * Motivation:
35 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 * These expanded contexts enable a number of new abilities, especially
37 * "Execlists" (also implemented in this file).
38 *
39 * One of the main differences from the legacy HW contexts is that logical
40 * ring contexts incorporate many more things into the context's state, like
41 * PDPs or ringbuffer control registers:
42 *
43 * The reason why PDPs are included in the context is straightforward: as
44 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 * contained there means you don't need to do a ppgtt->switch_mm yourself;
46 * instead, the GPU will do it for you on the context switch.
47 *
48 * But what about the ringbuffer control registers (head, tail, etc.)?
49 * Shouldn't we just need a set of those per engine command streamer? This is
50 * where the name "Logical Rings" starts to make sense: by virtualizing the
51 * rings, the engine cs shifts to a new "ring buffer" with every context
52 * switch. When you want to submit a workload to the GPU you: A) choose your
53 * context, B) find its appropriate virtualized ring, C) write commands to it
54 * and then, finally, D) tell the GPU to switch to that context.
55 *
56 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 * to a context is via a context execution list, ergo "Execlists".
58 *
59 * LRC implementation:
60 * Regarding the creation of contexts, we have:
61 *
62 * - One global default context.
63 * - One local default context for each opened fd.
64 * - One local extra context for each context create ioctl call.
65 *
66 * Now that ringbuffers belong per-context (and not per-engine, like before)
67 * and that contexts are uniquely tied to a given engine (and not reusable,
68 * like before) we need:
69 *
70 * - One ringbuffer per-engine inside each context.
71 * - One backing object per-engine inside each context.
72 *
73 * The global default context starts its life with these new objects fully
74 * allocated and populated. The local default context for each opened fd is
75 * more complex, because we don't know at creation time which engine is going
76 * to use them. To handle this, we have implemented a deferred creation of LR
77 * contexts:
78 *
79 * The local context starts its life as a hollow or blank holder, that only
80 * gets populated for a given engine once we receive an execbuffer. If later
81 * on we receive another execbuffer ioctl for the same context but a different
82 * engine, we allocate/populate a new ringbuffer and context backing object and
83 * so on.
84 *
85 * Finally, regarding local contexts created using the ioctl call: as they are
86 * only allowed with the render ring, we can allocate & populate them right
87 * away (no need to defer anything, at least for now).
88 *
89 * Execlists implementation:
90 * Execlists are the new method by which, on gen8+ hardware, workloads are
91 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 * This method works as follows:
93 *
94 * When a request is committed, its commands (the BB start and any leading or
95 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 * for the appropriate context. The tail pointer in the hardware context is not
97 * updated at this time, but instead, kept by the driver in the ringbuffer
98 * structure. A structure representing this request is added to a request queue
99 * for the appropriate engine: this structure contains a copy of the context's
100 * tail after the request was written to the ring buffer and a pointer to the
101 * context itself.
102 *
103 * If the engine's request queue was empty before the request was added, the
104 * queue is processed immediately. Otherwise the queue will be processed during
105 * a context switch interrupt. In any case, elements on the queue will get sent
106 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 * globally unique 20-bit submission ID.
108 *
109 * When execution of a request completes, the GPU updates the context status
110 * buffer with a context complete event and generates a context switch interrupt.
111 * During the interrupt handling, the driver examines the events in the buffer:
112 * for each context complete event, if the announced ID matches that on the head
113 * of the request queue, then that request is retired and removed from the queue.
114 *
115 * After processing, if any requests were retired and the queue is not empty
116 * then a new execution list can be submitted. The two requests at the front of
117 * the queue are next to be submitted but since a context may not occur twice in
118 * an execution list, if subsequent requests have the same ID as the first then
119 * the two requests must be combined. This is done simply by discarding requests
120 * at the head of the queue until either only one request is left (in which case
121 * we use a NULL second context) or the first two requests have unique IDs.
122 *
123 * By always executing the first two requests in the queue the driver ensures
124 * that the GPU is kept as busy as possible. In the case where a single context
125 * completes but a second context is still executing, the request for this second
126 * context will be at the head of the queue when we remove the first one. This
127 * request will then be resubmitted along with a new request for a different context,
128 * which will cause the hardware to continue executing the second request and queue
129 * the new request (the GPU detects the condition of a context getting preempted
130 * with the same context and optimizes the context switch flow by not doing
131 * preemption, but just sampling the new tail pointer).
132 *
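 * As a simplified sketch of the pairing rule described above (assuming a
 * hypothetical helper for walking the queue in priority order; the real
 * dequeue code in this file also has to deal with preemption, timeslicing
 * and virtual engines)::
 *
 *	struct i915_request *first = peek_first_request(execlists);
 *	struct i915_request *second = NULL, *rq;
 *
 *	for_each_queued_request_after(rq, first)
 *		if (!can_merge_ctx(rq->context, first->context)) {
 *			second = rq;
 *			break;
 *		}
 *
 *	write ELSP[1] = second (may be NULL), then ELSP[0] = first
 *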
133 */
134 #include <linux/interrupt.h>
135
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_breadcrumbs.h"
141 #include "intel_context.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 #include "shmem_utils.h"
152
153 #define RING_EXECLIST_QFULL (1 << 0x2)
154 #define RING_EXECLIST1_VALID (1 << 0x3)
155 #define RING_EXECLIST0_VALID (1 << 0x4)
156 #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
157 #define RING_EXECLIST1_ACTIVE (1 << 0x11)
158 #define RING_EXECLIST0_ACTIVE (1 << 0x12)
159
160 #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
161 #define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
163 #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
164 #define GEN8_CTX_STATUS_COMPLETE (1 << 4)
165 #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
166
167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
168 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169
170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171
172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
174 #define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
175 #define GEN12_IDLE_CTX_ID 0x7FF
176 #define GEN12_CSB_CTX_VALID(csb_dw) \
177 (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
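/*
 * For example (illustrative value only): a csb dword of 0x03ff8000 has
 * every bit of GENMASK(25, 15) set, so
 * FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, dw) yields 0x7ff == GEN12_IDLE_CTX_ID
 * and GEN12_CSB_CTX_VALID(dw) evaluates to false; any other value in that
 * field marks the entry as carrying a valid software context ID.
 */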
178
179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181
182 struct virtual_engine {
183 struct intel_engine_cs base;
184 struct intel_context context;
185 struct rcu_work rcu;
186
187 /*
188 * We allow only a single request through the virtual engine at a time
189 * (each request in the timeline waits for the completion fence of
190 * the previous before being submitted). By restricting ourselves to
191 * only submitting a single request, each request is placed onto a
192 * physical engine to maximise load spreading (by virtue of the late greedy
193 * scheduling -- each real engine takes the next available request
194 * upon idling).
195 */
196 struct i915_request *request;
197
198 /*
199 * We keep a rbtree of available virtual engines inside each physical
200 * engine, sorted by priority. Here we preallocate the nodes we need
201 * for the virtual engine, indexed by physical_engine->id.
202 */
203 struct ve_node {
204 struct rb_node rb;
205 int prio;
206 } nodes[I915_NUM_ENGINES];
207
208 /*
209 * Keep track of bonded pairs -- restrictions upon our selection
210 * of physical engines any particular request may be submitted to.
211 * If we receive a submit-fence from a master engine, we will only
212 * use one of sibling_mask physical engines.
213 */
214 struct ve_bond {
215 const struct intel_engine_cs *master;
216 intel_engine_mask_t sibling_mask;
217 } *bonds;
218 unsigned int num_bonds;
219
220 /* And finally, which physical engines this virtual engine maps onto. */
221 unsigned int num_siblings;
222 struct intel_engine_cs *siblings[];
223 };
224
225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
226 {
227 GEM_BUG_ON(!intel_engine_is_virtual(engine));
228 return container_of(engine, struct virtual_engine, base);
229 }
230
231 static int __execlists_context_alloc(struct intel_context *ce,
232 struct intel_engine_cs *engine);
233
234 static void execlists_init_reg_state(u32 *reg_state,
235 const struct intel_context *ce,
236 const struct intel_engine_cs *engine,
237 const struct intel_ring *ring,
238 bool close);
239 static void
240 __execlists_update_reg_state(const struct intel_context *ce,
241 const struct intel_engine_cs *engine,
242 u32 head);
243
244 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
245 {
246 if (INTEL_GEN(engine->i915) >= 12)
247 return 0x60;
248 else if (INTEL_GEN(engine->i915) >= 9)
249 return 0x54;
250 else if (engine->class == RENDER_CLASS)
251 return 0x58;
252 else
253 return -1;
254 }
255
256 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
257 {
258 if (INTEL_GEN(engine->i915) >= 12)
259 return 0x74;
260 else if (INTEL_GEN(engine->i915) >= 9)
261 return 0x68;
262 else if (engine->class == RENDER_CLASS)
263 return 0xd8;
264 else
265 return -1;
266 }
267
268 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
269 {
270 if (INTEL_GEN(engine->i915) >= 12)
271 return 0x12;
272 else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
273 return 0x18;
274 else
275 return -1;
276 }
277
278 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
279 {
280 int x;
281
282 x = lrc_ring_wa_bb_per_ctx(engine);
283 if (x < 0)
284 return x;
285
286 return x + 2;
287 }
288
289 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
290 {
291 int x;
292
293 x = lrc_ring_indirect_ptr(engine);
294 if (x < 0)
295 return x;
296
297 return x + 2;
298 }
299
300 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
301 {
302 if (engine->class != RENDER_CLASS)
303 return -1;
304
305 if (INTEL_GEN(engine->i915) >= 12)
306 return 0xb6;
307 else if (INTEL_GEN(engine->i915) >= 11)
308 return 0xaa;
309 else
310 return -1;
311 }
312
313 static u32
314 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
315 {
316 switch (INTEL_GEN(engine->i915)) {
317 default:
318 MISSING_CASE(INTEL_GEN(engine->i915));
319 fallthrough;
320 case 12:
321 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322 case 11:
323 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324 case 10:
325 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326 case 9:
327 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328 case 8:
329 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
330 }
331 }
332
333 static void
334 lrc_ring_setup_indirect_ctx(u32 *regs,
335 const struct intel_engine_cs *engine,
336 u32 ctx_bb_ggtt_addr,
337 u32 size)
338 {
339 GEM_BUG_ON(!size);
340 GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
341 GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
342 regs[lrc_ring_indirect_ptr(engine) + 1] =
343 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
344
345 GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
346 regs[lrc_ring_indirect_offset(engine) + 1] =
347 lrc_ring_indirect_offset_default(engine) << 6;
348 }
349
350 static u32 intel_context_get_runtime(const struct intel_context *ce)
351 {
352 /*
353 * We can use either ppHWSP[16] which is recorded before the context
354 * switch (and so excludes the cost of context switches) or use the
355 * value from the context image itself, which is saved/restored earlier
356 * and so includes the cost of the save.
357 */
358 return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
359 }
360
361 static void mark_eio(struct i915_request *rq)
362 {
363 if (i915_request_completed(rq))
364 return;
365
366 GEM_BUG_ON(i915_request_signaled(rq));
367
368 i915_request_set_error_once(rq, -EIO);
369 i915_request_mark_complete(rq);
370 }
371
372 static struct i915_request *
373 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
374 {
375 struct i915_request *active = rq;
376
377 rcu_read_lock();
378 list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
379 if (i915_request_completed(rq))
380 break;
381
382 active = rq;
383 }
384 rcu_read_unlock();
385
386 return active;
387 }
388
389 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
390 {
391 return (i915_ggtt_offset(engine->status_page.vma) +
392 I915_GEM_HWS_PREEMPT_ADDR);
393 }
394
395 static inline void
396 ring_set_paused(const struct intel_engine_cs *engine, int state)
397 {
398 /*
399 * We inspect HWS_PREEMPT with a semaphore inside
400 * engine->emit_fini_breadcrumb. If the dword is true,
401 * the ring is paused as the semaphore will busywait
402 * until the dword is false.
403 */
404 engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
405 if (state)
406 wmb();
407 }
408
409 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
410 {
411 return rb_entry(rb, struct i915_priolist, node);
412 }
413
414 static inline int rq_prio(const struct i915_request *rq)
415 {
416 return READ_ONCE(rq->sched.attr.priority);
417 }
418
419 static int effective_prio(const struct i915_request *rq)
420 {
421 int prio = rq_prio(rq);
422
423 /*
424 * If this request is special and must not be interrupted at any
425 * cost, so be it. Note we are only checking the most recent request
426 * in the context and so may be masking an earlier vip request. It
427 * is hoped that under the conditions where nopreempt is used, this
428 * will not matter (i.e. all requests to that context will be
429 * nopreempt for as long as desired).
430 */
431 if (i915_request_has_nopreempt(rq))
432 prio = I915_PRIORITY_UNPREEMPTABLE;
433
434 return prio;
435 }
436
437 static int queue_prio(const struct intel_engine_execlists *execlists)
438 {
439 struct i915_priolist *p;
440 struct rb_node *rb;
441
442 rb = rb_first_cached(&execlists->queue);
443 if (!rb)
444 return INT_MIN;
445
446 /*
447 * As the priolist[] are inverted, with the highest priority in [0],
448 * we have to flip the index value to become priority.
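 *
 * For example (taking I915_USER_PRIORITY_SHIFT == 2 purely for
 * illustration): a priolist with p->priority == 0 and only bit 0 of
 * p->used set yields ((0 + 1) << 2) - ffs(0x1) = 3, i.e. user level 0
 * with all internal priority bits set, whereas bit 1 set would give 2,
 * one internal level lower.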
449 */
450 p = to_priolist(rb);
451 if (!I915_USER_PRIORITY_SHIFT)
452 return p->priority;
453
454 return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
455 }
456
457 static inline bool need_preempt(const struct intel_engine_cs *engine,
458 const struct i915_request *rq,
459 struct rb_node *rb)
460 {
461 int last_prio;
462
463 if (!intel_engine_has_semaphores(engine))
464 return false;
465
466 /*
467 * Check if the current priority hint merits a preemption attempt.
468 *
469 * We record the highest value priority we saw during rescheduling
470 * prior to this dequeue, therefore we know that if it is strictly
471 * less than the current tail of ELSP[0], we do not need to force
472 * a preempt-to-idle cycle.
473 *
474 * However, the priority hint is a mere hint that we may need to
475 * preempt. If that hint is stale or we may be trying to preempt
476 * ourselves, ignore the request.
477 *
478 * More naturally we would write
479 * prio >= max(0, last);
480 * except that we wish to prevent triggering preemption at the same
481 * priority level: the task that is running should remain running
482 * to preserve FIFO ordering of dependencies.
483 */
484 last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
485 if (engine->execlists.queue_priority_hint <= last_prio)
486 return false;
487
488 /*
489 * Check against the first request in ELSP[1], it will, thanks to the
490 * power of PI, be the highest priority of that context.
491 */
492 if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
493 rq_prio(list_next_entry(rq, sched.link)) > last_prio)
494 return true;
495
496 if (rb) {
497 struct virtual_engine *ve =
498 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
499 bool preempt = false;
500
501 if (engine == ve->siblings[0]) { /* only preempt one sibling */
502 struct i915_request *next;
503
504 rcu_read_lock();
505 next = READ_ONCE(ve->request);
506 if (next)
507 preempt = rq_prio(next) > last_prio;
508 rcu_read_unlock();
509 }
510
511 if (preempt)
512 return preempt;
513 }
514
515 /*
516 * If the inflight context did not trigger the preemption, then maybe
517 * it was the set of queued requests? Pick the highest priority in
518 * the queue (the first active priolist) and see if it deserves to be
519 * running instead of ELSP[0].
520 *
521 * The highest priority request in the queue cannot be either
522 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
523 * context, its priority would not exceed ELSP[0] aka last_prio.
524 */
525 return queue_prio(&engine->execlists) > last_prio;
526 }
527
528 __maybe_unused static inline bool
529 assert_priority_queue(const struct i915_request *prev,
530 const struct i915_request *next)
531 {
532 /*
533 * Without preemption, the prev may refer to the still active element
534 * which we refuse to let go.
535 *
536 * Even with preemption, there are times when we think it is better not
537 * to preempt and leave an ostensibly lower priority request in flight.
538 */
539 if (i915_request_is_active(prev))
540 return true;
541
542 return rq_prio(prev) >= rq_prio(next);
543 }
544
545 /*
546 * The context descriptor encodes various attributes of a context,
547 * including its GTT address and some flags. Because it's fairly
548 * expensive to calculate, we'll just do it once and cache the result,
549 * which remains valid until the context is unpinned.
550 *
551 * This is what a descriptor looks like, from LSB to MSB::
552 *
553 * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
554 * bits 12-31: LRCA, GTT address of (the HWSP of) this context
555 * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
556 * bits 53-54: mbz, reserved for use by hardware
557 * bits 55-63: group ID, currently unused and set to 0
558 *
559 * Starting from Gen11, the upper dword of the descriptor has a new format:
560 *
561 * bits 32-36: reserved
562 * bits 37-47: SW context ID
563 * bits 48-53: engine instance
564 * bit 54: mbz, reserved for use by hardware
565 * bits 55-60: SW counter
566 * bits 61-63: engine class
567 *
568 * engine info, SW context ID and SW counter need to form a unique number
569 * (Context ID) per lrc.
570 */
571 static u32
572 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
573 {
574 u32 desc;
575
576 desc = INTEL_LEGACY_32B_CONTEXT;
577 if (i915_vm_is_4lvl(ce->vm))
578 desc = INTEL_LEGACY_64B_CONTEXT;
579 desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
580
581 desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
582 if (IS_GEN(engine->i915, 8))
583 desc |= GEN8_CTX_L3LLC_COHERENT;
584
585 return i915_ggtt_offset(ce->state) | desc;
586 }
587
588 static inline unsigned int dword_in_page(void *addr)
589 {
590 return offset_in_page(addr) / sizeof(u32);
591 }
592
593 static void set_offsets(u32 *regs,
594 const u8 *data,
595 const struct intel_engine_cs *engine,
596 bool clear)
597 #define NOP(x) (BIT(7) | (x))
598 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
599 #define POSTED BIT(0)
600 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
601 #define REG16(x) \
602 (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
603 (((x) >> 2) & 0x7f)
604 #define END(total_state_size) 0, (total_state_size)
605 {
606 const u32 base = engine->mmio_base;
607
608 while (*data) {
609 u8 count, flags;
610
611 if (*data & BIT(7)) { /* skip */
612 count = *data++ & ~BIT(7);
613 if (clear)
614 memset32(regs, MI_NOOP, count);
615 regs += count;
616 continue;
617 }
618
619 count = *data & 0x3f;
620 flags = *data >> 6;
621 data++;
622
623 *regs = MI_LOAD_REGISTER_IMM(count);
624 if (flags & POSTED)
625 *regs |= MI_LRI_FORCE_POSTED;
626 if (INTEL_GEN(engine->i915) >= 11)
627 *regs |= MI_LRI_LRM_CS_MMIO;
628 regs++;
629
630 GEM_BUG_ON(!count);
631 do {
632 u32 offset = 0;
633 u8 v;
634
635 do {
636 v = *data++;
637 offset <<= 7;
638 offset |= v & ~BIT(7);
639 } while (v & BIT(7));
640
641 regs[0] = base + (offset << 2);
642 if (clear)
643 regs[1] = 0;
644 regs += 2;
645 } while (--count);
646 }
647
648 if (clear) {
649 u8 count = *++data;
650
651 /* Clear past the tail for HW access */
652 GEM_BUG_ON(dword_in_page(regs) > count);
653 memset32(regs, MI_NOOP, count - dword_in_page(regs));
654
655 /* Close the batch; used mainly by live_lrc_layout() */
656 *regs = MI_BATCH_BUFFER_END;
657 if (INTEL_GEN(engine->i915) >= 10)
658 *regs |= BIT(0);
659 }
660 }
661
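/*
 * A short worked example of the table encoding consumed by set_offsets()
 * above: the start of gen8_xcs_offsets below,
 *
 *	NOP(1), LRI(11, 0), REG16(0x244), REG(0x034), ...
 *
 * means: skip one dword, then emit MI_LOAD_REGISTER_IMM(11) followed by
 * eleven (offset, value) pairs whose offsets decode to
 * engine->mmio_base + 0x244, + 0x034, and so on. The END(80) terminator
 * records the total size of the register state in dwords, which
 * set_offsets() uses (when clearing) to pad the remainder with MI_NOOP
 * and close the state with MI_BATCH_BUFFER_END.
 */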
662 static const u8 gen8_xcs_offsets[] = {
663 NOP(1),
664 LRI(11, 0),
665 REG16(0x244),
666 REG(0x034),
667 REG(0x030),
668 REG(0x038),
669 REG(0x03c),
670 REG(0x168),
671 REG(0x140),
672 REG(0x110),
673 REG(0x11c),
674 REG(0x114),
675 REG(0x118),
676
677 NOP(9),
678 LRI(9, 0),
679 REG16(0x3a8),
680 REG16(0x28c),
681 REG16(0x288),
682 REG16(0x284),
683 REG16(0x280),
684 REG16(0x27c),
685 REG16(0x278),
686 REG16(0x274),
687 REG16(0x270),
688
689 NOP(13),
690 LRI(2, 0),
691 REG16(0x200),
692 REG(0x028),
693
694 END(80)
695 };
696
697 static const u8 gen9_xcs_offsets[] = {
698 NOP(1),
699 LRI(14, POSTED),
700 REG16(0x244),
701 REG(0x034),
702 REG(0x030),
703 REG(0x038),
704 REG(0x03c),
705 REG(0x168),
706 REG(0x140),
707 REG(0x110),
708 REG(0x11c),
709 REG(0x114),
710 REG(0x118),
711 REG(0x1c0),
712 REG(0x1c4),
713 REG(0x1c8),
714
715 NOP(3),
716 LRI(9, POSTED),
717 REG16(0x3a8),
718 REG16(0x28c),
719 REG16(0x288),
720 REG16(0x284),
721 REG16(0x280),
722 REG16(0x27c),
723 REG16(0x278),
724 REG16(0x274),
725 REG16(0x270),
726
727 NOP(13),
728 LRI(1, POSTED),
729 REG16(0x200),
730
731 NOP(13),
732 LRI(44, POSTED),
733 REG(0x028),
734 REG(0x09c),
735 REG(0x0c0),
736 REG(0x178),
737 REG(0x17c),
738 REG16(0x358),
739 REG(0x170),
740 REG(0x150),
741 REG(0x154),
742 REG(0x158),
743 REG16(0x41c),
744 REG16(0x600),
745 REG16(0x604),
746 REG16(0x608),
747 REG16(0x60c),
748 REG16(0x610),
749 REG16(0x614),
750 REG16(0x618),
751 REG16(0x61c),
752 REG16(0x620),
753 REG16(0x624),
754 REG16(0x628),
755 REG16(0x62c),
756 REG16(0x630),
757 REG16(0x634),
758 REG16(0x638),
759 REG16(0x63c),
760 REG16(0x640),
761 REG16(0x644),
762 REG16(0x648),
763 REG16(0x64c),
764 REG16(0x650),
765 REG16(0x654),
766 REG16(0x658),
767 REG16(0x65c),
768 REG16(0x660),
769 REG16(0x664),
770 REG16(0x668),
771 REG16(0x66c),
772 REG16(0x670),
773 REG16(0x674),
774 REG16(0x678),
775 REG16(0x67c),
776 REG(0x068),
777
778 END(176)
779 };
780
781 static const u8 gen12_xcs_offsets[] = {
782 NOP(1),
783 LRI(13, POSTED),
784 REG16(0x244),
785 REG(0x034),
786 REG(0x030),
787 REG(0x038),
788 REG(0x03c),
789 REG(0x168),
790 REG(0x140),
791 REG(0x110),
792 REG(0x1c0),
793 REG(0x1c4),
794 REG(0x1c8),
795 REG(0x180),
796 REG16(0x2b4),
797
798 NOP(5),
799 LRI(9, POSTED),
800 REG16(0x3a8),
801 REG16(0x28c),
802 REG16(0x288),
803 REG16(0x284),
804 REG16(0x280),
805 REG16(0x27c),
806 REG16(0x278),
807 REG16(0x274),
808 REG16(0x270),
809
810 END(80)
811 };
812
813 static const u8 gen8_rcs_offsets[] = {
814 NOP(1),
815 LRI(14, POSTED),
816 REG16(0x244),
817 REG(0x034),
818 REG(0x030),
819 REG(0x038),
820 REG(0x03c),
821 REG(0x168),
822 REG(0x140),
823 REG(0x110),
824 REG(0x11c),
825 REG(0x114),
826 REG(0x118),
827 REG(0x1c0),
828 REG(0x1c4),
829 REG(0x1c8),
830
831 NOP(3),
832 LRI(9, POSTED),
833 REG16(0x3a8),
834 REG16(0x28c),
835 REG16(0x288),
836 REG16(0x284),
837 REG16(0x280),
838 REG16(0x27c),
839 REG16(0x278),
840 REG16(0x274),
841 REG16(0x270),
842
843 NOP(13),
844 LRI(1, 0),
845 REG(0x0c8),
846
847 END(80)
848 };
849
850 static const u8 gen9_rcs_offsets[] = {
851 NOP(1),
852 LRI(14, POSTED),
853 REG16(0x244),
854 REG(0x34),
855 REG(0x30),
856 REG(0x38),
857 REG(0x3c),
858 REG(0x168),
859 REG(0x140),
860 REG(0x110),
861 REG(0x11c),
862 REG(0x114),
863 REG(0x118),
864 REG(0x1c0),
865 REG(0x1c4),
866 REG(0x1c8),
867
868 NOP(3),
869 LRI(9, POSTED),
870 REG16(0x3a8),
871 REG16(0x28c),
872 REG16(0x288),
873 REG16(0x284),
874 REG16(0x280),
875 REG16(0x27c),
876 REG16(0x278),
877 REG16(0x274),
878 REG16(0x270),
879
880 NOP(13),
881 LRI(1, 0),
882 REG(0xc8),
883
884 NOP(13),
885 LRI(44, POSTED),
886 REG(0x28),
887 REG(0x9c),
888 REG(0xc0),
889 REG(0x178),
890 REG(0x17c),
891 REG16(0x358),
892 REG(0x170),
893 REG(0x150),
894 REG(0x154),
895 REG(0x158),
896 REG16(0x41c),
897 REG16(0x600),
898 REG16(0x604),
899 REG16(0x608),
900 REG16(0x60c),
901 REG16(0x610),
902 REG16(0x614),
903 REG16(0x618),
904 REG16(0x61c),
905 REG16(0x620),
906 REG16(0x624),
907 REG16(0x628),
908 REG16(0x62c),
909 REG16(0x630),
910 REG16(0x634),
911 REG16(0x638),
912 REG16(0x63c),
913 REG16(0x640),
914 REG16(0x644),
915 REG16(0x648),
916 REG16(0x64c),
917 REG16(0x650),
918 REG16(0x654),
919 REG16(0x658),
920 REG16(0x65c),
921 REG16(0x660),
922 REG16(0x664),
923 REG16(0x668),
924 REG16(0x66c),
925 REG16(0x670),
926 REG16(0x674),
927 REG16(0x678),
928 REG16(0x67c),
929 REG(0x68),
930
931 END(176)
932 };
933
934 static const u8 gen11_rcs_offsets[] = {
935 NOP(1),
936 LRI(15, POSTED),
937 REG16(0x244),
938 REG(0x034),
939 REG(0x030),
940 REG(0x038),
941 REG(0x03c),
942 REG(0x168),
943 REG(0x140),
944 REG(0x110),
945 REG(0x11c),
946 REG(0x114),
947 REG(0x118),
948 REG(0x1c0),
949 REG(0x1c4),
950 REG(0x1c8),
951 REG(0x180),
952
953 NOP(1),
954 LRI(9, POSTED),
955 REG16(0x3a8),
956 REG16(0x28c),
957 REG16(0x288),
958 REG16(0x284),
959 REG16(0x280),
960 REG16(0x27c),
961 REG16(0x278),
962 REG16(0x274),
963 REG16(0x270),
964
965 LRI(1, POSTED),
966 REG(0x1b0),
967
968 NOP(10),
969 LRI(1, 0),
970 REG(0x0c8),
971
972 END(80)
973 };
974
975 static const u8 gen12_rcs_offsets[] = {
976 NOP(1),
977 LRI(13, POSTED),
978 REG16(0x244),
979 REG(0x034),
980 REG(0x030),
981 REG(0x038),
982 REG(0x03c),
983 REG(0x168),
984 REG(0x140),
985 REG(0x110),
986 REG(0x1c0),
987 REG(0x1c4),
988 REG(0x1c8),
989 REG(0x180),
990 REG16(0x2b4),
991
992 NOP(5),
993 LRI(9, POSTED),
994 REG16(0x3a8),
995 REG16(0x28c),
996 REG16(0x288),
997 REG16(0x284),
998 REG16(0x280),
999 REG16(0x27c),
1000 REG16(0x278),
1001 REG16(0x274),
1002 REG16(0x270),
1003
1004 LRI(3, POSTED),
1005 REG(0x1b0),
1006 REG16(0x5a8),
1007 REG16(0x5ac),
1008
1009 NOP(6),
1010 LRI(1, 0),
1011 REG(0x0c8),
1012 NOP(3 + 9 + 1),
1013
1014 LRI(51, POSTED),
1015 REG16(0x588),
1016 REG16(0x588),
1017 REG16(0x588),
1018 REG16(0x588),
1019 REG16(0x588),
1020 REG16(0x588),
1021 REG(0x028),
1022 REG(0x09c),
1023 REG(0x0c0),
1024 REG(0x178),
1025 REG(0x17c),
1026 REG16(0x358),
1027 REG(0x170),
1028 REG(0x150),
1029 REG(0x154),
1030 REG(0x158),
1031 REG16(0x41c),
1032 REG16(0x600),
1033 REG16(0x604),
1034 REG16(0x608),
1035 REG16(0x60c),
1036 REG16(0x610),
1037 REG16(0x614),
1038 REG16(0x618),
1039 REG16(0x61c),
1040 REG16(0x620),
1041 REG16(0x624),
1042 REG16(0x628),
1043 REG16(0x62c),
1044 REG16(0x630),
1045 REG16(0x634),
1046 REG16(0x638),
1047 REG16(0x63c),
1048 REG16(0x640),
1049 REG16(0x644),
1050 REG16(0x648),
1051 REG16(0x64c),
1052 REG16(0x650),
1053 REG16(0x654),
1054 REG16(0x658),
1055 REG16(0x65c),
1056 REG16(0x660),
1057 REG16(0x664),
1058 REG16(0x668),
1059 REG16(0x66c),
1060 REG16(0x670),
1061 REG16(0x674),
1062 REG16(0x678),
1063 REG16(0x67c),
1064 REG(0x068),
1065 REG(0x084),
1066 NOP(1),
1067
1068 END(192)
1069 };
1070
1071 #undef END
1072 #undef REG16
1073 #undef REG
1074 #undef LRI
1075 #undef NOP
1076
1077 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1078 {
1079 /*
1080 * The gen12+ lists only have the registers we program in the basic
1081 * default state. We rely on the context image using relative
1082 * addressing to automatically fix up the register state between the
1083 * physical engines of a virtual engine.
1084 */
1085 GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1086 !intel_engine_has_relative_mmio(engine));
1087
1088 if (engine->class == RENDER_CLASS) {
1089 if (INTEL_GEN(engine->i915) >= 12)
1090 return gen12_rcs_offsets;
1091 else if (INTEL_GEN(engine->i915) >= 11)
1092 return gen11_rcs_offsets;
1093 else if (INTEL_GEN(engine->i915) >= 9)
1094 return gen9_rcs_offsets;
1095 else
1096 return gen8_rcs_offsets;
1097 } else {
1098 if (INTEL_GEN(engine->i915) >= 12)
1099 return gen12_xcs_offsets;
1100 else if (INTEL_GEN(engine->i915) >= 9)
1101 return gen9_xcs_offsets;
1102 else
1103 return gen8_xcs_offsets;
1104 }
1105 }
1106
1107 static struct i915_request *
1108 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1109 {
1110 struct i915_request *rq, *rn, *active = NULL;
1111 struct list_head *pl;
1112 int prio = I915_PRIORITY_INVALID;
1113
1114 lockdep_assert_held(&engine->active.lock);
1115
1116 list_for_each_entry_safe_reverse(rq, rn,
1117 &engine->active.requests,
1118 sched.link) {
1119 if (i915_request_completed(rq))
1120 continue; /* XXX */
1121
1122 __i915_request_unsubmit(rq);
1123
1124 /*
1125 * Push the request back into the queue for later resubmission.
1126 * If this request is not native to this physical engine (i.e.
1127 * it came from a virtual source), push it back onto the virtual
1128 * engine so that it can be moved across onto another physical
1129 * engine as load dictates.
1130 */
1131 if (likely(rq->execution_mask == engine->mask)) {
1132 GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1133 if (rq_prio(rq) != prio) {
1134 prio = rq_prio(rq);
1135 pl = i915_sched_lookup_priolist(engine, prio);
1136 }
1137 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1138
1139 list_move(&rq->sched.link, pl);
1140 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1141
1142 /* Check in case we roll back so far that we wrap [size/2] */
1143 if (intel_ring_direction(rq->ring,
1144 rq->tail,
1145 rq->ring->tail + 8) > 0)
1146 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1147
1148 active = rq;
1149 } else {
1150 struct intel_engine_cs *owner = rq->context->engine;
1151
1152 WRITE_ONCE(rq->engine, owner);
1153 owner->submit_request(rq);
1154 active = NULL;
1155 }
1156 }
1157
1158 return active;
1159 }
1160
1161 struct i915_request *
1162 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1163 {
1164 struct intel_engine_cs *engine =
1165 container_of(execlists, typeof(*engine), execlists);
1166
1167 return __unwind_incomplete_requests(engine);
1168 }
1169
1170 static inline void
1171 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1172 {
1173 /*
1174 * This is only used when GVT-g is enabled. When GVT-g is disabled,
1175 * the compiler should eliminate this function as dead code.
1176 */
1177 if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1178 return;
1179
1180 atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1181 status, rq);
1182 }
1183
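/*
 * intel_engine_context_in() and intel_engine_context_out() below account
 * per-engine busy time under engine->stats.lock (a seqlock). A hedged
 * sketch of the reader side (the actual sampling helper lives outside
 * this file): the seqlock lets a reader snapshot a consistent
 * { active, start, total } triple and add the currently accumulating
 * slice on top:
 *
 *	unsigned int seq;
 *	ktime_t total;
 *
 *	do {
 *		seq = read_seqbegin(&engine->stats.lock);
 *
 *		total = engine->stats.total;
 *		if (atomic_read(&engine->stats.active))
 *			total = ktime_add(total,
 *					  ktime_sub(ktime_get(),
 *						    engine->stats.start));
 *	} while (read_seqretry(&engine->stats.lock, seq));
 */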
1184 static void intel_engine_context_in(struct intel_engine_cs *engine)
1185 {
1186 unsigned long flags;
1187
1188 if (atomic_add_unless(&engine->stats.active, 1, 0))
1189 return;
1190
1191 write_seqlock_irqsave(&engine->stats.lock, flags);
1192 if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1193 engine->stats.start = ktime_get();
1194 atomic_inc(&engine->stats.active);
1195 }
1196 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1197 }
1198
1199 static void intel_engine_context_out(struct intel_engine_cs *engine)
1200 {
1201 unsigned long flags;
1202
1203 GEM_BUG_ON(!atomic_read(&engine->stats.active));
1204
1205 if (atomic_add_unless(&engine->stats.active, -1, 1))
1206 return;
1207
1208 write_seqlock_irqsave(&engine->stats.lock, flags);
1209 if (atomic_dec_and_test(&engine->stats.active)) {
1210 engine->stats.total =
1211 ktime_add(engine->stats.total,
1212 ktime_sub(ktime_get(), engine->stats.start));
1213 }
1214 write_sequnlock_irqrestore(&engine->stats.lock, flags);
1215 }
1216
1217 static void
1218 execlists_check_context(const struct intel_context *ce,
1219 const struct intel_engine_cs *engine)
1220 {
1221 const struct intel_ring *ring = ce->ring;
1222 u32 *regs = ce->lrc_reg_state;
1223 bool valid = true;
1224 int x;
1225
1226 if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1227 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1228 engine->name,
1229 regs[CTX_RING_START],
1230 i915_ggtt_offset(ring->vma));
1231 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1232 valid = false;
1233 }
1234
1235 if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1236 (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1237 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1238 engine->name,
1239 regs[CTX_RING_CTL],
1240 (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1241 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1242 valid = false;
1243 }
1244
1245 x = lrc_ring_mi_mode(engine);
1246 if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1247 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1248 engine->name, regs[x + 1]);
1249 regs[x + 1] &= ~STOP_RING;
1250 regs[x + 1] |= STOP_RING << 16;
1251 valid = false;
1252 }
1253
1254 WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1255 }
1256
1257 static void restore_default_state(struct intel_context *ce,
1258 struct intel_engine_cs *engine)
1259 {
1260 u32 *regs;
1261
1262 regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1263 execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1264
1265 ce->runtime.last = intel_context_get_runtime(ce);
1266 }
1267
1268 static void reset_active(struct i915_request *rq,
1269 struct intel_engine_cs *engine)
1270 {
1271 struct intel_context * const ce = rq->context;
1272 u32 head;
1273
1274 /*
1275 * The executing context has been cancelled. We want to prevent
1276 * further execution along this context and propagate the error on
1277 * to anything depending on its results.
1278 *
1279 * In __i915_request_submit(), we apply the -EIO and remove the
1280 * requests' payloads for any banned requests. But first, we must
1281 * rewind the context back to the start of the incomplete request so
1282 * that we do not jump back into the middle of the batch.
1283 *
1284 * We preserve the breadcrumbs and semaphores of the incomplete
1285 * requests so that inter-timeline dependencies (i.e other timelines)
1286 * remain correctly ordered. And we defer to __i915_request_submit()
1287 * so that all asynchronous waits are correctly handled.
1288 */
1289 ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1290 rq->fence.context, rq->fence.seqno);
1291
1292 /* On resubmission of the active request, payload will be scrubbed */
1293 if (i915_request_completed(rq))
1294 head = rq->tail;
1295 else
1296 head = active_request(ce->timeline, rq)->head;
1297 head = intel_ring_wrap(ce->ring, head);
1298
1299 /* Scrub the context image to prevent replaying the previous batch */
1300 restore_default_state(ce, engine);
1301 __execlists_update_reg_state(ce, engine, head);
1302
1303 /* We've switched away, so this should be a no-op, but intent matters */
1304 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1305 }
1306
1307 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1308 {
1309 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1310 ce->runtime.num_underflow += dt < 0;
1311 ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1312 #endif
1313 }
1314
1315 static void intel_context_update_runtime(struct intel_context *ce)
1316 {
1317 u32 old;
1318 s32 dt;
1319
1320 if (intel_context_is_barrier(ce))
1321 return;
1322
1323 old = ce->runtime.last;
1324 ce->runtime.last = intel_context_get_runtime(ce);
1325 dt = ce->runtime.last - old;
1326
1327 if (unlikely(dt <= 0)) {
1328 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1329 old, ce->runtime.last, dt);
1330 st_update_runtime_underflow(ce, dt);
1331 return;
1332 }
1333
1334 ewma_runtime_add(&ce->runtime.avg, dt);
1335 ce->runtime.total += dt;
1336 }
1337
1338 static inline struct intel_engine_cs *
1339 __execlists_schedule_in(struct i915_request *rq)
1340 {
1341 struct intel_engine_cs * const engine = rq->engine;
1342 struct intel_context * const ce = rq->context;
1343
1344 intel_context_get(ce);
1345
1346 if (unlikely(intel_context_is_banned(ce)))
1347 reset_active(rq, engine);
1348
1349 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1350 execlists_check_context(ce, engine);
1351
1352 if (ce->tag) {
1353 /* Use a fixed tag for OA and friends */
1354 GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1355 ce->lrc.ccid = ce->tag;
1356 } else {
1357 /* We don't need a strict matching tag, just different values */
1358 unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1359
1360 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1361 clear_bit(tag - 1, &engine->context_tag);
1362 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1363
1364 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1365 }
1366
1367 ce->lrc.ccid |= engine->execlists.ccid;
1368
1369 __intel_gt_pm_get(engine->gt);
1370 if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1371 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1372 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1373 intel_engine_context_in(engine);
1374
1375 return engine;
1376 }
1377
1378 static inline struct i915_request *
1379 execlists_schedule_in(struct i915_request *rq, int idx)
1380 {
1381 struct intel_context * const ce = rq->context;
1382 struct intel_engine_cs *old;
1383
1384 GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1385 trace_i915_request_in(rq, idx);
1386
1387 old = READ_ONCE(ce->inflight);
1388 do {
1389 if (!old) {
1390 WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1391 break;
1392 }
1393 } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1394
1395 GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1396 return i915_request_get(rq);
1397 }
1398
1399 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1400 {
1401 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1402 struct i915_request *next = READ_ONCE(ve->request);
1403
1404 if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1405 tasklet_hi_schedule(&ve->base.execlists.tasklet);
1406 }
1407
1408 static inline void
1409 __execlists_schedule_out(struct i915_request *rq,
1410 struct intel_engine_cs * const engine,
1411 unsigned int ccid)
1412 {
1413 struct intel_context * const ce = rq->context;
1414
1415 /*
1416 * NB process_csb() is not under the engine->active.lock and hence
1417 * schedule_out can race with schedule_in meaning that we should
1418 * refrain from doing non-trivial work here.
1419 */
1420
1421 /*
1422 * If we have just completed this context, the engine may now be
1423 * idle and we want to re-enter powersaving.
1424 */
1425 if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1426 i915_request_completed(rq))
1427 intel_engine_add_retire(engine, ce->timeline);
1428
1429 ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1430 ccid &= GEN12_MAX_CONTEXT_HW_ID;
1431 if (ccid < BITS_PER_LONG) {
1432 GEM_BUG_ON(ccid == 0);
1433 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1434 set_bit(ccid - 1, &engine->context_tag);
1435 }
1436
1437 intel_context_update_runtime(ce);
1438 intel_engine_context_out(engine);
1439 execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1440 if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1441 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1442 intel_gt_pm_put_async(engine->gt);
1443
1444 /*
1445 * If this is part of a virtual engine, its next request may
1446 * have been blocked waiting for access to the active context.
1447 * We have to kick all the siblings again in case we need to
1448 * switch (e.g. the next request is not runnable on this
1449 * engine). Hopefully, we will already have submitted the next
1450 * request before the tasklet runs and do not need to rebuild
1451 * each virtual tree and kick everyone again.
1452 */
1453 if (ce->engine != engine)
1454 kick_siblings(rq, ce);
1455
1456 intel_context_put(ce);
1457 }
1458
1459 static inline void
1460 execlists_schedule_out(struct i915_request *rq)
1461 {
1462 struct intel_context * const ce = rq->context;
1463 struct intel_engine_cs *cur, *old;
1464 u32 ccid;
1465
1466 trace_i915_request_out(rq);
1467
1468 ccid = rq->context->lrc.ccid;
1469 old = READ_ONCE(ce->inflight);
1470 do
1471 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1472 while (!try_cmpxchg(&ce->inflight, &old, cur));
1473 if (!cur)
1474 __execlists_schedule_out(rq, old, ccid);
1475
1476 i915_request_put(rq);
1477 }
1478
1479 static u64 execlists_update_context(struct i915_request *rq)
1480 {
1481 struct intel_context *ce = rq->context;
1482 u64 desc = ce->lrc.desc;
1483 u32 tail, prev;
1484
1485 /*
1486 * WaIdleLiteRestore:bdw,skl
1487 *
1488 * We should never submit the context with the same RING_TAIL twice
1489 * just in case we submit an empty ring, which confuses the HW.
1490 *
1491 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1492 * the normal request to be able to always advance the RING_TAIL on
1493 * subsequent resubmissions (for lite restore). Should that fail us,
1494 * and we try and submit the same tail again, force the context
1495 * reload.
1496 *
1497 * If we need to return to a preempted context, we need to skip the
1498 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1499 * HW has a tendency to ignore us rewinding the TAIL to the end of
1500 * an earlier request.
1501 */
1502 GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1503 prev = rq->ring->tail;
1504 tail = intel_ring_set_tail(rq->ring, rq->tail);
1505 if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1506 desc |= CTX_DESC_FORCE_RESTORE;
1507 ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1508 rq->tail = rq->wa_tail;
1509
1510 /*
1511 * Make sure the context image is complete before we submit it to HW.
1512 *
1513 * Ostensibly, writes (including the WCB) should be flushed prior to
1514 * an uncached write such as our mmio register access, the empirical
1515 * evidence (esp. on Braswell) suggests that the WC write into memory
1516 * may not be visible to the HW prior to the completion of the UC
1517 * register write and that we may begin execution from the context
1518 * before its image is complete leading to invalid PD chasing.
1519 */
1520 wmb();
1521
1522 ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1523 return desc;
1524 }
1525
1526 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1527 {
1528 if (execlists->ctrl_reg) {
1529 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1530 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1531 } else {
1532 writel(upper_32_bits(desc), execlists->submit_reg);
1533 writel(lower_32_bits(desc), execlists->submit_reg);
1534 }
1535 }
1536
1537 static __maybe_unused char *
1538 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1539 {
1540 if (!rq)
1541 return "";
1542
1543 snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1544 prefix,
1545 rq->context->lrc.ccid,
1546 rq->fence.context, rq->fence.seqno,
1547 i915_request_completed(rq) ? "!" :
1548 i915_request_started(rq) ? "*" :
1549 "",
1550 rq_prio(rq));
1551
1552 return buf;
1553 }
1554
1555 static __maybe_unused void
1556 trace_ports(const struct intel_engine_execlists *execlists,
1557 const char *msg,
1558 struct i915_request * const *ports)
1559 {
1560 const struct intel_engine_cs *engine =
1561 container_of(execlists, typeof(*engine), execlists);
1562 char __maybe_unused p0[40], p1[40];
1563
1564 if (!ports[0])
1565 return;
1566
1567 ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1568 dump_port(p0, sizeof(p0), "", ports[0]),
1569 dump_port(p1, sizeof(p1), ", ", ports[1]));
1570 }
1571
1572 static inline bool
1573 reset_in_progress(const struct intel_engine_execlists *execlists)
1574 {
1575 return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1576 }
1577
1578 static __maybe_unused bool
1579 assert_pending_valid(const struct intel_engine_execlists *execlists,
1580 const char *msg)
1581 {
1582 struct intel_engine_cs *engine =
1583 container_of(execlists, typeof(*engine), execlists);
1584 struct i915_request * const *port, *rq;
1585 struct intel_context *ce = NULL;
1586 bool sentinel = false;
1587 u32 ccid = -1;
1588
1589 trace_ports(execlists, msg, execlists->pending);
1590
1591 /* We may be messing around with the lists during reset, lalala */
1592 if (reset_in_progress(execlists))
1593 return true;
1594
1595 if (!execlists->pending[0]) {
1596 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1597 engine->name);
1598 return false;
1599 }
1600
1601 if (execlists->pending[execlists_num_ports(execlists)]) {
1602 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1603 engine->name, execlists_num_ports(execlists));
1604 return false;
1605 }
1606
1607 for (port = execlists->pending; (rq = *port); port++) {
1608 unsigned long flags;
1609 bool ok = true;
1610
1611 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1612 GEM_BUG_ON(!i915_request_is_active(rq));
1613
1614 if (ce == rq->context) {
1615 GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1616 engine->name,
1617 ce->timeline->fence_context,
1618 port - execlists->pending);
1619 return false;
1620 }
1621 ce = rq->context;
1622
1623 if (ccid == ce->lrc.ccid) {
1624 GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1625 engine->name,
1626 ccid, ce->timeline->fence_context,
1627 port - execlists->pending);
1628 return false;
1629 }
1630 ccid = ce->lrc.ccid;
1631
1632 /*
1633 * Sentinels are supposed to be the last request so they flush
1634 * the current execution off the HW. Check that they are the only
1635 * request in the pending submission.
1636 */
1637 if (sentinel) {
1638 GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1639 engine->name,
1640 ce->timeline->fence_context,
1641 port - execlists->pending);
1642 return false;
1643 }
1644 sentinel = i915_request_has_sentinel(rq);
1645
1646 /* Hold tightly onto the lock to prevent concurrent retires! */
1647 if (!spin_trylock_irqsave(&rq->lock, flags))
1648 continue;
1649
1650 if (i915_request_completed(rq))
1651 goto unlock;
1652
1653 if (i915_active_is_idle(&ce->active) &&
1654 !intel_context_is_barrier(ce)) {
1655 GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1656 engine->name,
1657 ce->timeline->fence_context,
1658 port - execlists->pending);
1659 ok = false;
1660 goto unlock;
1661 }
1662
1663 if (!i915_vma_is_pinned(ce->state)) {
1664 GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1665 engine->name,
1666 ce->timeline->fence_context,
1667 port - execlists->pending);
1668 ok = false;
1669 goto unlock;
1670 }
1671
1672 if (!i915_vma_is_pinned(ce->ring->vma)) {
1673 GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1674 engine->name,
1675 ce->timeline->fence_context,
1676 port - execlists->pending);
1677 ok = false;
1678 goto unlock;
1679 }
1680
1681 unlock:
1682 spin_unlock_irqrestore(&rq->lock, flags);
1683 if (!ok)
1684 return false;
1685 }
1686
1687 return ce;
1688 }
1689
1690 static void execlists_submit_ports(struct intel_engine_cs *engine)
1691 {
1692 struct intel_engine_execlists *execlists = &engine->execlists;
1693 unsigned int n;
1694
1695 GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1696
1697 /*
1698 * We can skip acquiring intel_runtime_pm_get() here as it was taken
1699 * on our behalf by the request (see i915_gem_mark_busy()) and it will
1700 * not be relinquished until the device is idle (see
1701 * i915_gem_idle_work_handler()). As a precaution, we make sure
1702 * that all ELSP are drained i.e. we have processed the CSB,
1703 * before allowing ourselves to idle and calling intel_runtime_pm_put().
1704 */
1705 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1706
1707 /*
1708 * ELSQ note: the submit queue is not cleared after being submitted
1709 * to the HW so we need to make sure we always clean it up. This is
1710 * currently ensured by the fact that we always write the same number
1711 * of elsq entries, keep this in mind before changing the loop below.
1712 */
1713 for (n = execlists_num_ports(execlists); n--; ) {
1714 struct i915_request *rq = execlists->pending[n];
1715
1716 write_desc(execlists,
1717 rq ? execlists_update_context(rq) : 0,
1718 n);
1719 }
1720
1721 /* we need to manually load the submit queue */
1722 if (execlists->ctrl_reg)
1723 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1724 }
1725
1726 static bool ctx_single_port_submission(const struct intel_context *ce)
1727 {
1728 return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1729 intel_context_force_single_submission(ce));
1730 }
1731
1732 static bool can_merge_ctx(const struct intel_context *prev,
1733 const struct intel_context *next)
1734 {
1735 if (prev != next)
1736 return false;
1737
1738 if (ctx_single_port_submission(prev))
1739 return false;
1740
1741 return true;
1742 }
1743
1744 static unsigned long i915_request_flags(const struct i915_request *rq)
1745 {
1746 return READ_ONCE(rq->fence.flags);
1747 }
1748
1749 static bool can_merge_rq(const struct i915_request *prev,
1750 const struct i915_request *next)
1751 {
1752 GEM_BUG_ON(prev == next);
1753 GEM_BUG_ON(!assert_priority_queue(prev, next));
1754
1755 /*
1756 * We do not submit known completed requests. Therefore if the next
1757 * request is already completed, we can pretend to merge it in
1758 * with the previous context (and we will skip updating the ELSP
1759 * and tracking). Thus hopefully keeping the ELSP full with active
1760 * contexts, despite the best efforts of preempt-to-busy to confuse
1761 * us.
1762 */
1763 if (i915_request_completed(next))
1764 return true;
1765
1766 if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1767 (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1768 BIT(I915_FENCE_FLAG_SENTINEL))))
1769 return false;
1770
1771 if (!can_merge_ctx(prev->context, next->context))
1772 return false;
1773
1774 GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1775 return true;
1776 }
1777
1778 static void virtual_update_register_offsets(u32 *regs,
1779 struct intel_engine_cs *engine)
1780 {
1781 set_offsets(regs, reg_offsets(engine), engine, false);
1782 }
1783
1784 static bool virtual_matches(const struct virtual_engine *ve,
1785 const struct i915_request *rq,
1786 const struct intel_engine_cs *engine)
1787 {
1788 const struct intel_engine_cs *inflight;
1789
1790 if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1791 return false;
1792
1793 /*
1794 * We track when the HW has completed saving the context image
1795 * (i.e. when we have seen the final CS event switching out of
1796 * the context) and must not overwrite the context image before
1797 * then. This restricts us to only using the active engine
1798 * while the previous virtualized request is inflight (so
1799 * we reuse the register offsets). This is a very small
1800 * hysteresis on the greedy selection algorithm.
1801 */
1802 inflight = intel_context_inflight(&ve->context);
1803 if (inflight && inflight != engine)
1804 return false;
1805
1806 return true;
1807 }
1808
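/*
 * Hand the virtual context over to the physical engine it is about to
 * run on: rewrite the engine-relative register offsets in the context
 * image if the engine cannot use relative mmio, and promote that
 * engine to siblings[0] so the tasklet checks it first next time.
 */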
1809 static void virtual_xfer_context(struct virtual_engine *ve,
1810 struct intel_engine_cs *engine)
1811 {
1812 unsigned int n;
1813
1814 if (likely(engine == ve->siblings[0]))
1815 return;
1816
1817 GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1818 if (!intel_engine_has_relative_mmio(engine))
1819 virtual_update_register_offsets(ve->context.lrc_reg_state,
1820 engine);
1821
1822 /*
1823 * Move the bound engine to the top of the list for
1824 * future execution. We then kick this tasklet first
1825 * before checking others, so that we preferentially
1826 * reuse this set of bound registers.
1827 */
1828 for (n = 1; n < ve->num_siblings; n++) {
1829 if (ve->siblings[n] == engine) {
1830 swap(ve->siblings[n], ve->siblings[0]);
1831 break;
1832 }
1833 }
1834 }
1835
1836 #define for_each_waiter(p__, rq__) \
1837 list_for_each_entry_lockless(p__, \
1838 &(rq__)->sched.waiters_list, \
1839 wait_link)
1840
1841 #define for_each_signaler(p__, rq__) \
1842 list_for_each_entry_rcu(p__, \
1843 &(rq__)->sched.signalers_list, \
1844 signal_link)
1845
1846 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1847 {
1848 LIST_HEAD(list);
1849
1850 /*
1851 * We want to move the interrupted request to the back of
1852 * the round-robin list (i.e. its priority level), but
1853 * in doing so, we must then move all requests that were in
1854 * flight and were waiting for the interrupted request to
1855 * be run after it again.
1856 */
1857 do {
1858 struct i915_dependency *p;
1859
1860 GEM_BUG_ON(i915_request_is_active(rq));
1861 list_move_tail(&rq->sched.link, pl);
1862
1863 for_each_waiter(p, rq) {
1864 struct i915_request *w =
1865 container_of(p->waiter, typeof(*w), sched);
1866
1867 if (p->flags & I915_DEPENDENCY_WEAK)
1868 continue;
1869
1870 /* Leave semaphores spinning on the other engines */
1871 if (w->engine != rq->engine)
1872 continue;
1873
1874 /* No waiter should start before its signaler */
1875 GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1876 i915_request_started(w) &&
1877 !i915_request_completed(rq));
1878
1879 GEM_BUG_ON(i915_request_is_active(w));
1880 if (!i915_request_is_ready(w))
1881 continue;
1882
1883 if (rq_prio(w) < rq_prio(rq))
1884 continue;
1885
1886 GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1887 list_move_tail(&w->sched.link, &list);
1888 }
1889
1890 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1891 } while (rq);
1892 }
1893
1894 static void defer_active(struct intel_engine_cs *engine)
1895 {
1896 struct i915_request *rq;
1897
1898 rq = __unwind_incomplete_requests(engine);
1899 if (!rq)
1900 return;
1901
1902 defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1903 }
1904
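/*
 * Timeslicing is warranted when some other runnable request -- from the
 * priority queue, from a matching virtual engine, or the next request
 * already on this engine's active list -- has a priority at least equal
 * to the effective priority of the currently executing request.
 */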
1905 static bool
1906 need_timeslice(const struct intel_engine_cs *engine,
1907 const struct i915_request *rq,
1908 const struct rb_node *rb)
1909 {
1910 int hint;
1911
1912 if (!intel_engine_has_timeslices(engine))
1913 return false;
1914
1915 hint = engine->execlists.queue_priority_hint;
1916
1917 if (rb) {
1918 const struct virtual_engine *ve =
1919 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1920 const struct intel_engine_cs *inflight =
1921 intel_context_inflight(&ve->context);
1922
1923 if (!inflight || inflight == engine) {
1924 struct i915_request *next;
1925
1926 rcu_read_lock();
1927 next = READ_ONCE(ve->request);
1928 if (next)
1929 hint = max(hint, rq_prio(next));
1930 rcu_read_unlock();
1931 }
1932 }
1933
1934 if (!list_is_last(&rq->sched.link, &engine->active.requests))
1935 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1936
1937 GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1938 return hint >= effective_prio(rq);
1939 }
1940
1941 static bool
1942 timeslice_yield(const struct intel_engine_execlists *el,
1943 const struct i915_request *rq)
1944 {
1945 /*
1946 * Once bitten, forever smitten!
1947 *
1948 * If the active context ever busy-waited on a semaphore,
1949 * it will be treated as a hog until the end of its timeslice (i.e.
1950 * until it is scheduled out and replaced by a new submission,
1951 * possibly even its own lite-restore). The HW only sends an interrupt
1952 * on the first miss, and we do not know if that semaphore has since
1953 * been signaled, or even if it is now stuck on another semaphore. Play
1954 * safe, yield if it might be stuck -- it will be given a fresh
1955 * timeslice in the near future.
1956 */
1957 return rq->context->lrc.ccid == READ_ONCE(el->yield);
1958 }
1959
1960 static bool
1961 timeslice_expired(const struct intel_engine_execlists *el,
1962 const struct i915_request *rq)
1963 {
1964 return timer_expired(&el->timer) || timeslice_yield(el, rq);
1965 }
1966
1967 static int
1968 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1969 {
1970 if (list_is_last(&rq->sched.link, &engine->active.requests))
1971 return engine->execlists.queue_priority_hint;
1972
1973 return rq_prio(list_next_entry(rq, sched.link));
1974 }
1975
1976 static inline unsigned long
1977 timeslice(const struct intel_engine_cs *engine)
1978 {
1979 return READ_ONCE(engine->props.timeslice_duration_ms);
1980 }
1981
1982 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1983 {
1984 const struct intel_engine_execlists *execlists = &engine->execlists;
1985 const struct i915_request *rq = *execlists->active;
1986
1987 if (!rq || i915_request_completed(rq))
1988 return 0;
1989
1990 if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1991 return 0;
1992
1993 return timeslice(engine);
1994 }
1995
1996 static void set_timeslice(struct intel_engine_cs *engine)
1997 {
1998 unsigned long duration;
1999
2000 if (!intel_engine_has_timeslices(engine))
2001 return;
2002
2003 duration = active_timeslice(engine);
2004 ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2005
2006 set_timer_ms(&engine->execlists.timer, duration);
2007 }
2008
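/*
 * Arm the timeslice timer while we wait for the current ELSP to drain
 * before we can submit the queued work of priority prio; a prio of
 * INT_MIN means there is nothing waiting and the timer is left alone.
 */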
2009 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2010 {
2011 struct intel_engine_execlists *execlists = &engine->execlists;
2012 unsigned long duration;
2013
2014 if (!intel_engine_has_timeslices(engine))
2015 return;
2016
2017 WRITE_ONCE(execlists->switch_priority_hint, prio);
2018 if (prio == INT_MIN)
2019 return;
2020
2021 if (timer_pending(&execlists->timer))
2022 return;
2023
2024 duration = timeslice(engine);
2025 ENGINE_TRACE(engine,
2026 "start timeslicing, prio:%d, interval:%lu",
2027 prio, duration);
2028
2029 set_timer_ms(&execlists->timer, duration);
2030 }
2031
2032 static void record_preemption(struct intel_engine_execlists *execlists)
2033 {
2034 (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2035 }
2036
2037 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2038 const struct i915_request *rq)
2039 {
2040 if (!rq)
2041 return 0;
2042
2043 /* Force a fast reset for terminated contexts (ignoring sysfs!) */
2044 if (unlikely(intel_context_is_banned(rq->context)))
2045 return 1;
2046
2047 return READ_ONCE(engine->props.preempt_timeout_ms);
2048 }
2049
2050 static void set_preempt_timeout(struct intel_engine_cs *engine,
2051 const struct i915_request *rq)
2052 {
2053 if (!intel_engine_has_preempt_reset(engine))
2054 return;
2055
2056 set_timer_ms(&engine->execlists.preempt,
2057 active_preempt_timeout(engine, rq));
2058 }
2059
2060 static inline void clear_ports(struct i915_request **ports, int count)
2061 {
2062 memset_p((void **)ports, NULL, count);
2063 }
2064
2065 static inline void
2066 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2067 {
2068 /* A memcpy_p() would be very useful here! */
2069 while (count--)
2070 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2071 }
2072
2073 static void execlists_dequeue(struct intel_engine_cs *engine)
2074 {
2075 struct intel_engine_execlists * const execlists = &engine->execlists;
2076 struct i915_request **port = execlists->pending;
2077 struct i915_request ** const last_port = port + execlists->port_mask;
2078 struct i915_request * const *active;
2079 struct i915_request *last;
2080 struct rb_node *rb;
2081 bool submit = false;
2082
2083 /*
2084 * Hardware submission is through 2 ports. Conceptually each port
2085 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2086 * static for a context, and unique to each, so we only execute
2087 * requests belonging to a single context from each ring. RING_HEAD
2088 * is maintained by the CS in the context image; it marks the place
2089 * where it got up to last time, and through RING_TAIL we tell the CS
2090 * where we want to execute up to this time.
2091 *
2092 * In this list the requests are in order of execution. Consecutive
2093 * requests from the same context are adjacent in the ringbuffer. We
2094 * can combine these requests into a single RING_TAIL update:
2095 *
2096 * RING_HEAD...req1...req2
2097 * ^- RING_TAIL
2098 * since to execute req2 the CS must first execute req1.
2099 *
2100 * Our goal then is to point each port to the end of a consecutive
2101 * sequence of requests as being the most optimal (fewest wake ups
2102 * and context switches) submission.
2103 */
2104
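/*
 * Peek at the highest-priority virtual engine request that could run
 * here, lazily pruning nodes whose request has already been taken by a
 * sibling engine.
 */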
2105 for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2106 struct virtual_engine *ve =
2107 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2108 struct i915_request *rq = READ_ONCE(ve->request);
2109
2110 if (!rq) { /* lazily cleanup after another engine handled rq */
2111 rb_erase_cached(rb, &execlists->virtual);
2112 RB_CLEAR_NODE(rb);
2113 rb = rb_first_cached(&execlists->virtual);
2114 continue;
2115 }
2116
2117 if (!virtual_matches(ve, rq, engine)) {
2118 rb = rb_next(rb);
2119 continue;
2120 }
2121
2122 break;
2123 }
2124
2125 /*
2126 * If the queue is higher priority than the last
2127 * request in the currently active context, submit afresh.
2128 * We will resubmit again afterwards in case we need to split
2129 * the active context to interject the preemption request,
2130 * i.e. we will retrigger preemption following the ack in case
2131 * of trouble.
2132 */
2133 active = READ_ONCE(execlists->active);
2134
2135 /*
2136 * In theory we can skip over completed contexts that have not
2137 * yet been processed by events (as those events are in flight):
2138 *
2139 * while ((last = *active) && i915_request_completed(last))
2140 * active++;
2141 *
2142 * However, the GPU cannot handle this as it will ultimately
2143 * find itself trying to jump back into a context it has just
2144 * completed and barf.
2145 */
2146
2147 if ((last = *active)) {
2148 if (need_preempt(engine, last, rb)) {
2149 if (i915_request_completed(last)) {
2150 tasklet_hi_schedule(&execlists->tasklet);
2151 return;
2152 }
2153
2154 ENGINE_TRACE(engine,
2155 "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2156 last->fence.context,
2157 last->fence.seqno,
2158 last->sched.attr.priority,
2159 execlists->queue_priority_hint);
2160 record_preemption(execlists);
2161
2162 /*
2163 * Don't let the RING_HEAD advance past the breadcrumb
2164 * as we unwind (and until we resubmit) so that we do
2165 * not accidentally tell it to go backwards.
2166 */
2167 ring_set_paused(engine, 1);
2168
2169 /*
2170 * Note that we have not stopped the GPU at this point,
2171 * so we are unwinding the incomplete requests as they
2172 * remain inflight and so by the time we do complete
2173 * the preemption, some of the unwound requests may
2174 * complete!
2175 */
2176 __unwind_incomplete_requests(engine);
2177
2178 last = NULL;
2179 } else if (need_timeslice(engine, last, rb) &&
2180 timeslice_expired(execlists, last)) {
2181 if (i915_request_completed(last)) {
2182 tasklet_hi_schedule(&execlists->tasklet);
2183 return;
2184 }
2185
2186 ENGINE_TRACE(engine,
2187 "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2188 last->fence.context,
2189 last->fence.seqno,
2190 last->sched.attr.priority,
2191 execlists->queue_priority_hint,
2192 yesno(timeslice_yield(execlists, last)));
2193
2194 ring_set_paused(engine, 1);
2195 defer_active(engine);
2196
2197 /*
2198 * Unlike for preemption, if we rewind and continue
2199 * executing the same context as previously active,
2200 * the order of execution will remain the same and
2201 * the tail will only advance. We do not need to
2202 * force a full context restore, as a lite-restore
2203 * is sufficient to resample the monotonic TAIL.
2204 *
2205 * If we switch to any other context, similarly we
2206 * will not rewind TAIL of current context, and
2207 * normal save/restore will preserve state and allow
2208 * us to later continue executing the same request.
2209 */
2210 last = NULL;
2211 } else {
2212 /*
2213 * Otherwise if we already have a request pending
2214 * for execution after the current one, we can
2215 * just wait until the next CS event before
2216 * queuing more. In either case we will force a
2217 * lite-restore preemption event, but if we wait
2218 * we hopefully coalesce several updates into a single
2219 * submission.
2220 */
2221 if (!list_is_last(&last->sched.link,
2222 &engine->active.requests)) {
2223 /*
2224 * Even if ELSP[1] is occupied and not worthy
2225 * of timeslices, our queue might be.
2226 */
2227 start_timeslice(engine, queue_prio(execlists));
2228 return;
2229 }
2230 }
2231 }
2232
2233 while (rb) { /* XXX virtual is always taking precedence */
2234 struct virtual_engine *ve =
2235 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2236 struct i915_request *rq;
2237
2238 spin_lock(&ve->base.active.lock);
2239
2240 rq = ve->request;
2241 if (unlikely(!rq)) { /* lost the race to a sibling */
2242 spin_unlock(&ve->base.active.lock);
2243 rb_erase_cached(rb, &execlists->virtual);
2244 RB_CLEAR_NODE(rb);
2245 rb = rb_first_cached(&execlists->virtual);
2246 continue;
2247 }
2248
2249 GEM_BUG_ON(rq != ve->request);
2250 GEM_BUG_ON(rq->engine != &ve->base);
2251 GEM_BUG_ON(rq->context != &ve->context);
2252
2253 if (rq_prio(rq) >= queue_prio(execlists)) {
2254 if (!virtual_matches(ve, rq, engine)) {
2255 spin_unlock(&ve->base.active.lock);
2256 rb = rb_next(rb);
2257 continue;
2258 }
2259
2260 if (last && !can_merge_rq(last, rq)) {
2261 spin_unlock(&ve->base.active.lock);
2262 start_timeslice(engine, rq_prio(rq));
2263 return; /* leave this for another sibling */
2264 }
2265
2266 ENGINE_TRACE(engine,
2267 "virtual rq=%llx:%lld%s, new engine? %s\n",
2268 rq->fence.context,
2269 rq->fence.seqno,
2270 i915_request_completed(rq) ? "!" :
2271 i915_request_started(rq) ? "*" :
2272 "",
2273 yesno(engine != ve->siblings[0]));
2274
2275 WRITE_ONCE(ve->request, NULL);
2276 WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2277 INT_MIN);
2278 rb_erase_cached(rb, &execlists->virtual);
2279 RB_CLEAR_NODE(rb);
2280
2281 GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2282 WRITE_ONCE(rq->engine, engine);
2283
2284 if (__i915_request_submit(rq)) {
2285 /*
2286 * Only after we confirm that we will submit
2287 * this request (i.e. it has not already
2288 * completed), do we want to update the context.
2289 *
2290 * This serves two purposes. It avoids
2291 * unnecessary work if we are resubmitting an
2292 * already completed request after timeslicing.
2293 * But more importantly, it prevents us altering
2294 * ve->siblings[] on an idle context, where
2295 * we may be using ve->siblings[] in
2296 * virtual_context_enter / virtual_context_exit.
2297 */
2298 virtual_xfer_context(ve, engine);
2299 GEM_BUG_ON(ve->siblings[0] != engine);
2300
2301 submit = true;
2302 last = rq;
2303 }
2304 i915_request_put(rq);
2305
2306 /*
2307 * Hmm, we have a bunch of virtual engine requests,
2308 * but the first one was already completed (thanks
2309 * preempt-to-busy!). Keep looking at the veng queue
2310 * until we have no more relevant requests (i.e.
2311 * the normal submit queue has higher priority).
2312 */
2313 if (!submit) {
2314 spin_unlock(&ve->base.active.lock);
2315 rb = rb_first_cached(&execlists->virtual);
2316 continue;
2317 }
2318 }
2319
2320 spin_unlock(&ve->base.active.lock);
2321 break;
2322 }
2323
2324 while ((rb = rb_first_cached(&execlists->queue))) {
2325 struct i915_priolist *p = to_priolist(rb);
2326 struct i915_request *rq, *rn;
2327 int i;
2328
2329 priolist_for_each_request_consume(rq, rn, p, i) {
2330 bool merge = true;
2331
2332 /*
2333 * Can we combine this request with the current port?
2334 * It has to be the same context/ringbuffer and not
2335 * have any exceptions (e.g. GVT saying never to
2336 * combine contexts).
2337 *
2338 * If we can combine the requests, we can execute both
2339 * by updating the RING_TAIL to point to the end of the
2340 * second request, and so we never need to tell the
2341 * hardware about the first.
2342 */
2343 if (last && !can_merge_rq(last, rq)) {
2344 /*
2345 * If we are on the second port and cannot
2346 * combine this request with the last, then we
2347 * are done.
2348 */
2349 if (port == last_port)
2350 goto done;
2351
2352 /*
2353 * We must not populate both ELSP[] with the
2354 * same LRCA, i.e. we must submit 2 different
2355 * contexts if we submit 2 ELSP.
2356 */
2357 if (last->context == rq->context)
2358 goto done;
2359
2360 if (i915_request_has_sentinel(last))
2361 goto done;
2362
2363 /*
2364 * If GVT overrides us we only ever submit
2365 * port[0], leaving port[1] empty. Note that we
2366 * also have to be careful that we don't queue
2367 * the same context (even though a different
2368 * request) to the second port.
2369 */
2370 if (ctx_single_port_submission(last->context) ||
2371 ctx_single_port_submission(rq->context))
2372 goto done;
2373
2374 merge = false;
2375 }
2376
2377 if (__i915_request_submit(rq)) {
2378 if (!merge) {
2379 *port = execlists_schedule_in(last, port - execlists->pending);
2380 port++;
2381 last = NULL;
2382 }
2383
2384 GEM_BUG_ON(last &&
2385 !can_merge_ctx(last->context,
2386 rq->context));
2387 GEM_BUG_ON(last &&
2388 i915_seqno_passed(last->fence.seqno,
2389 rq->fence.seqno));
2390
2391 submit = true;
2392 last = rq;
2393 }
2394 }
2395
2396 rb_erase_cached(&p->node, &execlists->queue);
2397 i915_priolist_free(p);
2398 }
2399
2400 done:
2401 /*
2402 * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2403 *
2404 * We choose the priority hint such that if we add a request of greater
2405 * priority than this, we kick the submission tasklet to decide on
2406 * the right order of submitting the requests to hardware. We must
2407 * also be prepared to reorder requests as they are in-flight on the
2408 * HW. We derive the priority hint then as the first "hole" in
2409 * the HW submission ports and if there are no available slots,
2410 * the priority of the lowest executing request, i.e. last.
2411 *
2412 * When we do receive a higher priority request ready to run from the
2413 * user, see queue_request(), the priority hint is bumped to that
2414 * request triggering preemption on the next dequeue (or subsequent
2415 * interrupt for secondary ports).
2416 */
2417 execlists->queue_priority_hint = queue_prio(execlists);
2418
2419 if (submit) {
2420 *port = execlists_schedule_in(last, port - execlists->pending);
2421 execlists->switch_priority_hint =
2422 switch_prio(engine, *execlists->pending);
2423
2424 /*
2425 * Skip if we ended up with exactly the same set of requests,
2426 * e.g. trying to timeslice a pair of ordered contexts
2427 */
2428 if (!memcmp(active, execlists->pending,
2429 (port - execlists->pending + 1) * sizeof(*port))) {
2430 do
2431 execlists_schedule_out(fetch_and_zero(port));
2432 while (port-- != execlists->pending);
2433
2434 goto skip_submit;
2435 }
2436 clear_ports(port + 1, last_port - port);
2437
2438 WRITE_ONCE(execlists->yield, -1);
2439 set_preempt_timeout(engine, *active);
2440 execlists_submit_ports(engine);
2441 } else {
2442 start_timeslice(engine, execlists->queue_priority_hint);
2443 skip_submit:
2444 ring_set_paused(engine, 0);
2445 }
2446 }
2447
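/*
 * Drop every request still tracked in pending[] and inflight[] (e.g.
 * across a reset), then point active back at the cleared inflight array
 * using the smp_wmb() seqlock ordering expected by execlists_active().
 */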
2448 static void
2449 cancel_port_requests(struct intel_engine_execlists * const execlists)
2450 {
2451 struct i915_request * const *port;
2452
2453 for (port = execlists->pending; *port; port++)
2454 execlists_schedule_out(*port);
2455 clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2456
2457 /* Mark the end of active before we overwrite *active */
2458 for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2459 execlists_schedule_out(*port);
2460 clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2461
2462 smp_wmb(); /* complete the seqlock for execlists_active() */
2463 WRITE_ONCE(execlists->active, execlists->inflight);
2464 }
2465
2466 static inline void
2467 invalidate_csb_entries(const u64 *first, const u64 *last)
2468 {
2469 clflush((void *)first);
2470 clflush((void *)last);
2471 }
2472
2473 /*
2474 * Starting with Gen12, the status has a new format:
2475 *
2476 * bit 0: switched to new queue
2477 * bit 1: reserved
2478 * bit 2: semaphore wait mode (poll or signal), only valid when
2479 * switch detail is set to "wait on semaphore"
2480 * bits 3-5: engine class
2481 * bits 6-11: engine instance
2482 * bits 12-14: reserved
2483 * bits 15-25: sw context id of the lrc the GT switched to
2484 * bits 26-31: sw counter of the lrc the GT switched to
2485 * bits 32-35: context switch detail
2486 * - 0: ctx complete
2487 * - 1: wait on sync flip
2488 * - 2: wait on vblank
2489 * - 3: wait on scanline
2490 * - 4: wait on semaphore
2491 * - 5: context preempted (not on SEMAPHORE_WAIT or
2492 * WAIT_FOR_EVENT)
2493 * bit 36: reserved
2494 * bits 37-43: wait detail (for switch detail 1 to 4)
2495 * bits 44-46: reserved
2496 * bits 47-57: sw context id of the lrc the GT switched away from
2497 * bits 58-63: sw counter of the lrc the GT switched away from
2498 */
 */
2499 static inline bool gen12_csb_parse(const u64 *csb)
2500 {
2501 bool ctx_away_valid;
2502 bool new_queue;
2503 u64 entry;
2504
2505 /* HSD#22011248461 */
2506 entry = READ_ONCE(*csb);
2507 if (unlikely(entry == -1)) {
2508 preempt_disable();
2509 if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 50))
2510 GEM_WARN_ON("50us CSB timeout");
2511 preempt_enable();
2512 }
2513 WRITE_ONCE(*(u64 *)csb, -1);
2514
2515 ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(entry));
2516 new_queue =
2517 lower_32_bits(entry) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2518
2519 /*
2520 * The context switch detail is not guaranteed to be 5 when a preemption
2521 * occurs, so we can't just check for that. The check below works for
2522 * all the cases we care about, including preemptions of WAIT
2523 * instructions and lite-restore. Preempt-to-idle via the CTRL register
2524 * would require some extra handling, but we don't support that.
2525 */
2526 if (!ctx_away_valid || new_queue) {
2527 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(entry)));
2528 return true;
2529 }
2530
2531 /*
2532 * switch detail = 5 is covered by the case above and we do not expect a
2533 * context switch on an unsuccessful wait instruction since we always
2534 * use polling mode.
2535 */
2536 GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(entry)));
2537 return false;
2538 }
2539
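/*
 * Before Gen12 the CSB status is a simple bitmask: an idle->active or
 * preempted event promotes the pending[] submission; otherwise
 * process_csb() treats the event as a completion and schedules out the
 * oldest inflight port.
 */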
2540 static inline bool gen8_csb_parse(const u64 *csb)
2541 {
2542 return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2543 }
2544
2545 static void process_csb(struct intel_engine_cs *engine)
2546 {
2547 struct intel_engine_execlists * const execlists = &engine->execlists;
2548 const u64 * const buf = execlists->csb_status;
2549 const u8 num_entries = execlists->csb_size;
2550 u8 head, tail;
2551
2552 /*
2553 * As we modify our execlists state tracking we require exclusive
2554 * access. Either we are inside the tasklet, or the tasklet is disabled
2555 * and we assume that is only inside the reset paths and so serialised.
2556 */
2557 GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2558 !reset_in_progress(execlists));
2559 GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2560
2561 /*
2562 * Note that csb_write, csb_status may be either in HWSP or mmio.
2563 * When reading from the csb_write mmio register, we have to be
2564 * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2565 * the low 4 bits. As it happens we know the next 4 bits are always
2566 * zero and so we can simply mask off the low u8 of the register
2567 * and treat it identically to reading from the HWSP (without having
2568 * to use explicit shifting and masking, and probably bifurcating
2569 * the code to handle the legacy mmio read).
2570 */
2571 head = execlists->csb_head;
2572 tail = READ_ONCE(*execlists->csb_write);
2573 if (unlikely(head == tail))
2574 return;
2575
2576 /*
2577 * We will consume all events from HW, or at least pretend to.
2578 *
2579 * The sequence of events from the HW is deterministic, and derived
2580 * from our writes to the ELSP, with a smidgen of variability for
2581 * the arrival of the asynchronous requests wrt the inflight
2582 * execution. If the HW sends an event that does not correspond with
2583 * the one we are expecting, we have to abandon all hope as we lose
2584 * all tracking of what the engine is actually executing. We will
2585 * only detect we are out of sequence with the HW when we get an
2586 * 'impossible' event because we have already drained our own
2587 * preemption/promotion queue. If this occurs, we know that we likely
2588 * lost track of execution earlier and must unwind and restart, the
2589 * simplest way is to stop processing the event queue and force the
2590 * engine to reset.
2591 */
2592 execlists->csb_head = tail;
2593 ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2594
2595 /*
2596 * Hopefully paired with a wmb() in HW!
2597 *
2598 * We must complete the read of the write pointer before any reads
2599 * from the CSB, so that we do not see stale values. Without an rmb
2600 * (lfence) the HW may speculatively perform the CSB[] reads *before*
2601 * we perform the READ_ONCE(*csb_write).
2602 */
2603 rmb();
2604 do {
2605 bool promote;
2606
2607 if (++head == num_entries)
2608 head = 0;
2609
2610 /*
2611 * We are flying near dragons again.
2612 *
2613 * We hold a reference to the request in execlist_port[]
2614 * but no more than that. We are operating in softirq
2615 * context and so cannot hold any mutex or sleep. That
2616 * prevents us stopping the requests we are processing
2617 * in port[] from being retired simultaneously (the
2618 * breadcrumb will be complete before we see the
2619 * context-switch). As we only hold the reference to the
2620 * request, any pointer chasing underneath the request
2621 * is subject to a potential use-after-free. Thus we
2622 * store all of the bookkeeping within port[] as
2623 * required, and avoid using unguarded pointers beneath
2624 * request itself. The same applies to the atomic
2625 * status notifier.
2626 */
2627
2628 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2629 head,
2630 upper_32_bits(buf[head]),
2631 lower_32_bits(buf[head]));
2632
2633 if (INTEL_GEN(engine->i915) >= 12)
2634 promote = gen12_csb_parse(buf + head);
2635 else
2636 promote = gen8_csb_parse(buf + head);
2637 if (promote) {
2638 struct i915_request * const *old = execlists->active;
2639
2640 if (GEM_WARN_ON(!*execlists->pending)) {
2641 execlists->error_interrupt |= ERROR_CSB;
2642 break;
2643 }
2644
2645 ring_set_paused(engine, 0);
2646
2647 /* Point active to the new ELSP; prevent overwriting */
2648 WRITE_ONCE(execlists->active, execlists->pending);
2649 smp_wmb(); /* notify execlists_active() */
2650
2651 /* cancel old inflight, prepare for switch */
2652 trace_ports(execlists, "preempted", old);
2653 while (*old)
2654 execlists_schedule_out(*old++);
2655
2656 /* switch pending to inflight */
2657 GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2658 copy_ports(execlists->inflight,
2659 execlists->pending,
2660 execlists_num_ports(execlists));
2661 smp_wmb(); /* complete the seqlock */
2662 WRITE_ONCE(execlists->active, execlists->inflight);
2663
2664 /* XXX Magic delay for tgl */
2665 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2666
2667 WRITE_ONCE(execlists->pending[0], NULL);
2668 } else {
2669 if (GEM_WARN_ON(!*execlists->active)) {
2670 execlists->error_interrupt |= ERROR_CSB;
2671 break;
2672 }
2673
2674 /* port0 completed, advanced to port1 */
2675 trace_ports(execlists, "completed", execlists->active);
2676
2677 /*
2678 * We rely on the hardware being strongly
2679 * ordered, that the breadcrumb write is
2680 * coherent (visible from the CPU) before the
2681 * user interrupt is processed. One might assume
2682 * that, since the breadcrumb write lands before the
2683 * user interrupt, and the user interrupt before the CS
2684 * event for the context switch, the breadcrumb would
2685 * therefore be visible before the CS event itself...
2686 */
2687 if (GEM_SHOW_DEBUG() &&
2688 !i915_request_completed(*execlists->active)) {
2689 struct i915_request *rq = *execlists->active;
2690 const u32 *regs __maybe_unused =
2691 rq->context->lrc_reg_state;
2692
2693 ENGINE_TRACE(engine,
2694 "context completed before request!\n");
2695 ENGINE_TRACE(engine,
2696 "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2697 ENGINE_READ(engine, RING_START),
2698 ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2699 ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2700 ENGINE_READ(engine, RING_CTL),
2701 ENGINE_READ(engine, RING_MI_MODE));
2702 ENGINE_TRACE(engine,
2703 "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2704 i915_ggtt_offset(rq->ring->vma),
2705 rq->head, rq->tail,
2706 rq->fence.context,
2707 lower_32_bits(rq->fence.seqno),
2708 hwsp_seqno(rq));
2709 ENGINE_TRACE(engine,
2710 "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2711 regs[CTX_RING_START],
2712 regs[CTX_RING_HEAD],
2713 regs[CTX_RING_TAIL]);
2714 }
2715
2716 execlists_schedule_out(*execlists->active++);
2717
2718 GEM_BUG_ON(execlists->active - execlists->inflight >
2719 execlists_num_ports(execlists));
2720 }
2721 } while (head != tail);
2722
2723 set_timeslice(engine);
2724
2725 /*
2726 * Gen11 has proven to fail wrt global observation point between
2727 * entry and tail update, failing on the ordering and thus
2728 * we see an old entry in the context status buffer.
2729 *
2730 * Forcibly evict out entries for the next gpu csb update,
2731 * to increase the odds that we get fresh entries with non-
2732 * working hardware. The cost of doing so comes out mostly in
2733 * the wash as hardware, working or not, will need to do the
2734 * invalidation before.
2735 */
2736 invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2737 }
2738
2739 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2740 {
2741 lockdep_assert_held(&engine->active.lock);
2742 if (!READ_ONCE(engine->execlists.pending[0])) {
2743 rcu_read_lock(); /* protect peeking at execlists->active */
2744 execlists_dequeue(engine);
2745 rcu_read_unlock();
2746 }
2747 }
2748
2749 static void __execlists_hold(struct i915_request *rq)
2750 {
2751 LIST_HEAD(list);
2752
2753 do {
2754 struct i915_dependency *p;
2755
2756 if (i915_request_is_active(rq))
2757 __i915_request_unsubmit(rq);
2758
2759 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2760 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2761 i915_request_set_hold(rq);
2762 RQ_TRACE(rq, "on hold\n");
2763
2764 for_each_waiter(p, rq) {
2765 struct i915_request *w =
2766 container_of(p->waiter, typeof(*w), sched);
2767
2768 /* Leave semaphores spinning on the other engines */
2769 if (w->engine != rq->engine)
2770 continue;
2771
2772 if (!i915_request_is_ready(w))
2773 continue;
2774
2775 if (i915_request_completed(w))
2776 continue;
2777
2778 if (i915_request_on_hold(w))
2779 continue;
2780
2781 list_move_tail(&w->sched.link, &list);
2782 }
2783
2784 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2785 } while (rq);
2786 }
2787
2788 static bool execlists_hold(struct intel_engine_cs *engine,
2789 struct i915_request *rq)
2790 {
2791 if (i915_request_on_hold(rq))
2792 return false;
2793
2794 spin_lock_irq(&engine->active.lock);
2795
2796 if (i915_request_completed(rq)) { /* too late! */
2797 rq = NULL;
2798 goto unlock;
2799 }
2800
2801 if (rq->engine != engine) { /* preempted virtual engine */
2802 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2803
2804 /*
2805 * intel_context_inflight() is only protected by virtue
2806 * of process_csb() being called only by the tasklet (or
2807 * directly from inside reset while the tasklet is suspended).
2808 * Assert that neither of those are allowed to run while we
2809 * poke at the request queues.
2810 */
2811 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2812
2813 /*
2814 * An unsubmitted request along a virtual engine will
2815 * remain on the active (this) engine until we are able
2816 * to process the context switch away (and so mark the
2817 * context as no longer in flight). That cannot have happened
2818 * yet, otherwise we would not be hanging!
2819 */
2820 spin_lock(&ve->base.active.lock);
2821 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2822 GEM_BUG_ON(ve->request != rq);
2823 ve->request = NULL;
2824 spin_unlock(&ve->base.active.lock);
2825 i915_request_put(rq);
2826
2827 rq->engine = engine;
2828 }
2829
2830 /*
2831 * Transfer this request onto the hold queue to prevent it
2832 * being resubmitted to HW (and potentially completed) before we have
2833 * released it. Since we may have already submitted following
2834 * requests, we need to remove those as well.
2835 */
2836 GEM_BUG_ON(i915_request_on_hold(rq));
2837 GEM_BUG_ON(rq->engine != engine);
2838 __execlists_hold(rq);
2839 GEM_BUG_ON(list_empty(&engine->active.hold));
2840
2841 unlock:
2842 spin_unlock_irq(&engine->active.lock);
2843 return rq;
2844 }
2845
2846 static bool hold_request(const struct i915_request *rq)
2847 {
2848 struct i915_dependency *p;
2849 bool result = false;
2850
2851 /*
2852 * If one of our ancestors is on hold, we must also be on hold,
2853 * otherwise we will bypass it and execute before it.
2854 */
2855 rcu_read_lock();
2856 for_each_signaler(p, rq) {
2857 const struct i915_request *s =
2858 container_of(p->signaler, typeof(*s), sched);
2859
2860 if (s->engine != rq->engine)
2861 continue;
2862
2863 result = i915_request_on_hold(s);
2864 if (result)
2865 break;
2866 }
2867 rcu_read_unlock();
2868
2869 return result;
2870 }
2871
2872 static void __execlists_unhold(struct i915_request *rq)
2873 {
2874 LIST_HEAD(list);
2875
2876 do {
2877 struct i915_dependency *p;
2878
2879 RQ_TRACE(rq, "hold release\n");
2880
2881 GEM_BUG_ON(!i915_request_on_hold(rq));
2882 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2883
2884 i915_request_clear_hold(rq);
2885 list_move_tail(&rq->sched.link,
2886 i915_sched_lookup_priolist(rq->engine,
2887 rq_prio(rq)));
2888 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2889
2890 /* Also release any children on this engine that are ready */
2891 for_each_waiter(p, rq) {
2892 struct i915_request *w =
2893 container_of(p->waiter, typeof(*w), sched);
2894
2895 /* Propagate any change in error status */
2896 if (rq->fence.error)
2897 i915_request_set_error_once(w, rq->fence.error);
2898
2899 if (w->engine != rq->engine)
2900 continue;
2901
2902 if (!i915_request_on_hold(w))
2903 continue;
2904
2905 /* Check that no other parents are also on hold */
2906 if (hold_request(w))
2907 continue;
2908
2909 list_move_tail(&w->sched.link, &list);
2910 }
2911
2912 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2913 } while (rq);
2914 }
2915
2916 static void execlists_unhold(struct intel_engine_cs *engine,
2917 struct i915_request *rq)
2918 {
2919 spin_lock_irq(&engine->active.lock);
2920
2921 /*
2922 * Move this request back to the priority queue, and all of its
2923 * children and grandchildren that were suspended along with it.
2924 */
2925 __execlists_unhold(rq);
2926
2927 if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2928 engine->execlists.queue_priority_hint = rq_prio(rq);
2929 tasklet_hi_schedule(&engine->execlists.tasklet);
2930 }
2931
2932 spin_unlock_irq(&engine->active.lock);
2933 }
2934
2935 struct execlists_capture {
2936 struct work_struct work;
2937 struct i915_request *rq;
2938 struct i915_gpu_coredump *error;
2939 };
2940
2941 static void execlists_capture_work(struct work_struct *work)
2942 {
2943 struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2944 const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2945 struct intel_engine_cs *engine = cap->rq->engine;
2946 struct intel_gt_coredump *gt = cap->error->gt;
2947 struct intel_engine_capture_vma *vma;
2948
2949 /* Compress all the objects attached to the request, slow! */
2950 vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2951 if (vma) {
2952 struct i915_vma_compress *compress =
2953 i915_vma_capture_prepare(gt);
2954
2955 intel_engine_coredump_add_vma(gt->engine, vma, compress);
2956 i915_vma_capture_finish(gt, compress);
2957 }
2958
2959 gt->simulated = gt->engine->simulated;
2960 cap->error->simulated = gt->simulated;
2961
2962 /* Publish the error state, and announce it to the world */
2963 i915_error_state_store(cap->error);
2964 i915_gpu_coredump_put(cap->error);
2965
2966 /* Return this request and all that depend upon it for signaling */
2967 execlists_unhold(engine, cap->rq);
2968 i915_request_put(cap->rq);
2969
2970 kfree(cap);
2971 }
2972
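/*
 * Allocate the coredump skeleton with GFP_ATOMIC: we are called from
 * softirq context while delaying the forced preemption, so we must not
 * sleep here. The heavyweight capture itself is deferred to
 * execlists_capture_work().
 */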
2973 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2974 {
2975 const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2976 struct execlists_capture *cap;
2977
2978 cap = kmalloc(sizeof(*cap), gfp);
2979 if (!cap)
2980 return NULL;
2981
2982 cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2983 if (!cap->error)
2984 goto err_cap;
2985
2986 cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2987 if (!cap->error->gt)
2988 goto err_gpu;
2989
2990 cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2991 if (!cap->error->gt->engine)
2992 goto err_gt;
2993
2994 return cap;
2995
2996 err_gt:
2997 kfree(cap->error->gt);
2998 err_gpu:
2999 kfree(cap->error);
3000 err_cap:
3001 kfree(cap);
3002 return NULL;
3003 }
3004
3005 static struct i915_request *
3006 active_context(struct intel_engine_cs *engine, u32 ccid)
3007 {
3008 const struct intel_engine_execlists * const el = &engine->execlists;
3009 struct i915_request * const *port, *rq;
3010
3011 /*
3012 * Use the most recent result from process_csb(), but just in case
3013 * we trigger an error (via interrupt) before the first CS event has
3014 * been written, peek at the next submission.
3015 */
3016
3017 for (port = el->active; (rq = *port); port++) {
3018 if (rq->context->lrc.ccid == ccid) {
3019 ENGINE_TRACE(engine,
3020 "ccid found at active:%zd\n",
3021 port - el->active);
3022 return rq;
3023 }
3024 }
3025
3026 for (port = el->pending; (rq = *port); port++) {
3027 if (rq->context->lrc.ccid == ccid) {
3028 ENGINE_TRACE(engine,
3029 "ccid found at pending:%zd\n",
3030 port - el->pending);
3031 return rq;
3032 }
3033 }
3034
3035 ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3036 return NULL;
3037 }
3038
3039 static u32 active_ccid(struct intel_engine_cs *engine)
3040 {
3041 return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3042 }
3043
3044 static void execlists_capture(struct intel_engine_cs *engine)
3045 {
3046 struct execlists_capture *cap;
3047
3048 if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3049 return;
3050
3051 /*
3052 * We need to _quickly_ capture the engine state before we reset.
3053 * We are inside an atomic section (softirq) here and we are delaying
3054 * the forced preemption event.
3055 */
3056 cap = capture_regs(engine);
3057 if (!cap)
3058 return;
3059
3060 spin_lock_irq(&engine->active.lock);
3061 cap->rq = active_context(engine, active_ccid(engine));
3062 if (cap->rq) {
3063 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3064 cap->rq = i915_request_get_rcu(cap->rq);
3065 }
3066 spin_unlock_irq(&engine->active.lock);
3067 if (!cap->rq)
3068 goto err_free;
3069
3070 /*
3071 * Remove the request from the execlists queue, and take ownership
3072 * of the request. We pass it to our worker who will _slowly_ compress
3073 * all the pages the _user_ requested for debugging their batch, after
3074 * which we return it to the queue for signaling.
3075 *
3076 * By removing them from the execlists queue, we also remove the
3077 * requests from being processed by __unwind_incomplete_requests()
3078 * during the intel_engine_reset(), and so they will *not* be replayed
3079 * afterwards.
3080 *
3081 * Note that because we have not yet reset the engine at this point,
3082 * it is possible that the request we have identified as being
3083 * guilty did in fact complete and we will then hit an arbitration
3084 * point allowing the outstanding preemption to succeed. The likelihood
3085 * of that is very low (as capturing of the engine registers should be
3086 * fast enough to run inside an irq-off atomic section!), so we will
3087 * simply hold that request accountable for being non-preemptible
3088 * long enough to force the reset.
3089 */
3090 if (!execlists_hold(engine, cap->rq))
3091 goto err_rq;
3092
3093 INIT_WORK(&cap->work, execlists_capture_work);
3094 schedule_work(&cap->work);
3095 return;
3096
3097 err_rq:
3098 i915_request_put(cap->rq);
3099 err_free:
3100 i915_gpu_coredump_put(cap->error);
3101 kfree(cap);
3102 }
3103
3104 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3105 {
3106 const unsigned int bit = I915_RESET_ENGINE + engine->id;
3107 unsigned long *lock = &engine->gt->reset.flags;
3108
3109 if (!intel_has_reset_engine(engine->gt))
3110 return;
3111
3112 if (test_and_set_bit(bit, lock))
3113 return;
3114
3115 ENGINE_TRACE(engine, "reset for %s\n", msg);
3116
3117 /* Mark this tasklet as disabled to avoid waiting for it to complete */
3118 tasklet_disable_nosync(&engine->execlists.tasklet);
3119
3120 ring_set_paused(engine, 1); /* Freeze the current request in place */
3121 execlists_capture(engine);
3122 intel_engine_reset(engine, msg);
3123
3124 tasklet_enable(&engine->execlists.tasklet);
3125 clear_and_wake_up_bit(bit, lock);
3126 }
3127
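/*
 * A forced preemption is deemed to have timed out if the preempt timer
 * has expired while pending[] is still waiting to be accepted by the HW.
 */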
3128 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3129 {
3130 const struct timer_list *t = &engine->execlists.preempt;
3131
3132 if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3133 return false;
3134
3135 if (!timer_expired(t))
3136 return false;
3137
3138 return READ_ONCE(engine->execlists.pending[0]);
3139 }
3140
3141 /*
3142 * Check the unread Context Status Buffers and manage the submission of new
3143 * contexts to the ELSP accordingly.
3144 */
3145 static void execlists_submission_tasklet(unsigned long data)
3146 {
3147 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3148 bool timeout = preempt_timeout(engine);
3149
3150 process_csb(engine);
3151
3152 if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3153 const char *msg;
3154
3155 /* Generate the error message in priority wrt the user! */
3156 if (engine->execlists.error_interrupt & GENMASK(15, 0))
3157 msg = "CS error"; /* thrown by a user payload */
3158 else if (engine->execlists.error_interrupt & ERROR_CSB)
3159 msg = "invalid CSB event";
3160 else
3161 msg = "internal error";
3162
3163 engine->execlists.error_interrupt = 0;
3164 execlists_reset(engine, msg);
3165 }
3166
3167 if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3168 unsigned long flags;
3169
3170 spin_lock_irqsave(&engine->active.lock, flags);
3171 __execlists_submission_tasklet(engine);
3172 spin_unlock_irqrestore(&engine->active.lock, flags);
3173
3174 /* Recheck after serialising with direct-submission */
3175 if (unlikely(timeout && preempt_timeout(engine))) {
3176 cancel_timer(&engine->execlists.preempt);
3177 execlists_reset(engine, "preemption time out");
3178 }
3179 }
3180 }
3181
3182 static void __execlists_kick(struct intel_engine_execlists *execlists)
3183 {
3184 /* Kick the tasklet for some interrupt coalescing and reset handling */
3185 tasklet_hi_schedule(&execlists->tasklet);
3186 }
3187
3188 #define execlists_kick(t, member) \
3189 __execlists_kick(container_of(t, struct intel_engine_execlists, member))
3190
3191 static void execlists_timeslice(struct timer_list *timer)
3192 {
3193 execlists_kick(timer, timer);
3194 }
3195
3196 static void execlists_preempt(struct timer_list *timer)
3197 {
3198 execlists_kick(timer, preempt);
3199 }
3200
3201 static void queue_request(struct intel_engine_cs *engine,
3202 struct i915_request *rq)
3203 {
3204 GEM_BUG_ON(!list_empty(&rq->sched.link));
3205 list_add_tail(&rq->sched.link,
3206 i915_sched_lookup_priolist(engine, rq_prio(rq)));
3207 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3208 }
3209
3210 static void __submit_queue_imm(struct intel_engine_cs *engine)
3211 {
3212 struct intel_engine_execlists * const execlists = &engine->execlists;
3213
3214 if (reset_in_progress(execlists))
3215 return; /* defer until we restart the engine following reset */
3216
3217 __execlists_submission_tasklet(engine);
3218 }
3219
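/*
 * Only kick direct submission if this request raises the queue priority
 * hint; anything at or below the current hint will be picked up on the
 * next natural dequeue.
 */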
3220 static void submit_queue(struct intel_engine_cs *engine,
3221 const struct i915_request *rq)
3222 {
3223 struct intel_engine_execlists *execlists = &engine->execlists;
3224
3225 if (rq_prio(rq) <= execlists->queue_priority_hint)
3226 return;
3227
3228 execlists->queue_priority_hint = rq_prio(rq);
3229 __submit_queue_imm(engine);
3230 }
3231
3232 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3233 const struct i915_request *rq)
3234 {
3235 GEM_BUG_ON(i915_request_on_hold(rq));
3236 return !list_empty(&engine->active.hold) && hold_request(rq);
3237 }
3238
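/*
 * Opportunistically drain any outstanding CSB events before we take the
 * submission path, so that pending[] is hopefully clear; skip if the
 * tasklet is already running or a reset is in progress.
 */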
3239 static void flush_csb(struct intel_engine_cs *engine)
3240 {
3241 struct intel_engine_execlists *el = &engine->execlists;
3242
3243 if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3244 if (!reset_in_progress(el))
3245 process_csb(engine);
3246 tasklet_unlock(&el->tasklet);
3247 }
3248 }
3249
3250 static void execlists_submit_request(struct i915_request *request)
3251 {
3252 struct intel_engine_cs *engine = request->engine;
3253 unsigned long flags;
3254
3255 /* Hopefully we clear execlists->pending[] to let us through */
3256 flush_csb(engine);
3257
3258 /* Will be called from irq-context when using foreign fences. */
3259 spin_lock_irqsave(&engine->active.lock, flags);
3260
3261 if (unlikely(ancestor_on_hold(engine, request))) {
3262 RQ_TRACE(request, "ancestor on hold\n");
3263 list_add_tail(&request->sched.link, &engine->active.hold);
3264 i915_request_set_hold(request);
3265 } else {
3266 queue_request(engine, request);
3267
3268 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3269 GEM_BUG_ON(list_empty(&request->sched.link));
3270
3271 submit_queue(engine, request);
3272 }
3273
3274 spin_unlock_irqrestore(&engine->active.lock, flags);
3275 }
3276
3277 static void __execlists_context_fini(struct intel_context *ce)
3278 {
3279 intel_ring_put(ce->ring);
3280 i915_vma_put(ce->state);
3281 }
3282
3283 static void execlists_context_destroy(struct kref *kref)
3284 {
3285 struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3286
3287 GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3288 GEM_BUG_ON(intel_context_is_pinned(ce));
3289
3290 if (ce->state)
3291 __execlists_context_fini(ce);
3292
3293 intel_context_fini(ce);
3294 intel_context_free(ce);
3295 }
3296
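/*
 * When CONFIG_DRM_I915_DEBUG_GEM is enabled, poison the page following
 * the context image so that check_redzone() can detect anything (us or
 * the HW) writing past the end of the context state.
 */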
3297 static void
3298 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3299 {
3300 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3301 return;
3302
3303 vaddr += engine->context_size;
3304
3305 memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3306 }
3307
3308 static void
3309 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3310 {
3311 if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3312 return;
3313
3314 vaddr += engine->context_size;
3315
3316 if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3317 drm_err_once(&engine->i915->drm,
3318 "%s context redzone overwritten!\n",
3319 engine->name);
3320 }
3321
3322 static void execlists_context_unpin(struct intel_context *ce)
3323 {
3324 check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3325 ce->engine);
3326 }
3327
3328 static void execlists_context_post_unpin(struct intel_context *ce)
3329 {
3330 i915_gem_object_unpin_map(ce->state->obj);
3331 }
3332
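/*
 * The gen12_emit_*() helpers below build the per-context indirect
 * context (workaround) batch. This first one reloads the saved
 * CTX_TIMESTAMP from the context image into RING_CTX_TIMESTAMP via
 * CS GPR0; the register-to-register copy is issued twice, presumably
 * as required by the workaround.
 */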
3333 static u32 *
3334 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3335 {
3336 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3337 MI_SRM_LRM_GLOBAL_GTT |
3338 MI_LRI_LRM_CS_MMIO;
3339 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3340 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3341 CTX_TIMESTAMP * sizeof(u32);
3342 *cs++ = 0;
3343
3344 *cs++ = MI_LOAD_REGISTER_REG |
3345 MI_LRR_SOURCE_CS_MMIO |
3346 MI_LRI_LRM_CS_MMIO;
3347 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3348 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3349
3350 *cs++ = MI_LOAD_REGISTER_REG |
3351 MI_LRR_SOURCE_CS_MMIO |
3352 MI_LRI_LRM_CS_MMIO;
3353 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3354 *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3355
3356 return cs;
3357 }
3358
3359 static u32 *
3360 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3361 {
3362 GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3363
3364 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3365 MI_SRM_LRM_GLOBAL_GTT |
3366 MI_LRI_LRM_CS_MMIO;
3367 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3368 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3369 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3370 *cs++ = 0;
3371
3372 return cs;
3373 }
3374
3375 static u32 *
3376 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3377 {
3378 GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3379
3380 *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3381 MI_SRM_LRM_GLOBAL_GTT |
3382 MI_LRI_LRM_CS_MMIO;
3383 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3384 *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3385 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3386 *cs++ = 0;
3387
3388 *cs++ = MI_LOAD_REGISTER_REG |
3389 MI_LRR_SOURCE_CS_MMIO |
3390 MI_LRI_LRM_CS_MMIO;
3391 *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3392 *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3393
3394 return cs;
3395 }
3396
3397 static u32 *
3398 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3399 {
3400 cs = gen12_emit_timestamp_wa(ce, cs);
3401 cs = gen12_emit_cmd_buf_wa(ce, cs);
3402 cs = gen12_emit_restore_scratch(ce, cs);
3403
3404 return cs;
3405 }
3406
3407 static u32 *
3408 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3409 {
3410 cs = gen12_emit_timestamp_wa(ce, cs);
3411 cs = gen12_emit_restore_scratch(ce, cs);
3412
3413 return cs;
3414 }
3415
3416 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3417 {
3418 return PAGE_SIZE * ce->wa_bb_page;
3419 }
3420
3421 static u32 *context_indirect_bb(const struct intel_context *ce)
3422 {
3423 void *ptr;
3424
3425 GEM_BUG_ON(!ce->wa_bb_page);
3426
3427 ptr = ce->lrc_reg_state;
3428 ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3429 ptr += context_wa_bb_offset(ce);
3430
3431 return ptr;
3432 }
3433
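/*
 * Emit the per-context indirect context buffer into the wa_bb page,
 * pad it to a cacheline with MI_NOOPs, and point the context image at
 * it via lrc_ring_setup_indirect_ctx().
 */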
3434 static void
3435 setup_indirect_ctx_bb(const struct intel_context *ce,
3436 const struct intel_engine_cs *engine,
3437 u32 *(*emit)(const struct intel_context *, u32 *))
3438 {
3439 u32 * const start = context_indirect_bb(ce);
3440 u32 *cs;
3441
3442 cs = emit(ce, start);
3443 GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3444 while ((unsigned long)cs % CACHELINE_BYTES)
3445 *cs++ = MI_NOOP;
3446
3447 lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3448 i915_ggtt_offset(ce->state) +
3449 context_wa_bb_offset(ce),
3450 (cs - start) * sizeof(*cs));
3451 }
3452
3453 static void
3454 __execlists_update_reg_state(const struct intel_context *ce,
3455 const struct intel_engine_cs *engine,
3456 u32 head)
3457 {
3458 struct intel_ring *ring = ce->ring;
3459 u32 *regs = ce->lrc_reg_state;
3460
3461 GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3462 GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3463
3464 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3465 regs[CTX_RING_HEAD] = head;
3466 regs[CTX_RING_TAIL] = ring->tail;
3467 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3468
3469 /* RPCS */
3470 if (engine->class == RENDER_CLASS) {
3471 regs[CTX_R_PWR_CLK_STATE] =
3472 intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3473
3474 i915_oa_init_reg_state(ce, engine);
3475 }
3476
3477 if (ce->wa_bb_page) {
3478 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3479
3480 fn = gen12_emit_indirect_ctx_xcs;
3481 if (ce->engine->class == RENDER_CLASS)
3482 fn = gen12_emit_indirect_ctx_rcs;
3483
3484 /* Mutually exclusive wrt the global indirect bb */
3485 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3486 setup_indirect_ctx_bb(ce, engine, fn);
3487 }
3488 }
3489
3490 static int
3491 execlists_context_pre_pin(struct intel_context *ce,
3492 struct i915_gem_ww_ctx *ww, void **vaddr)
3493 {
3494 GEM_BUG_ON(!ce->state);
3495 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3496
3497 *vaddr = i915_gem_object_pin_map(ce->state->obj,
3498 i915_coherent_map_type(ce->engine->i915) |
3499 I915_MAP_OVERRIDE);
3500
3501 return PTR_ERR_OR_ZERO(*vaddr);
3502 }
3503
3504 static int
3505 __execlists_context_pin(struct intel_context *ce,
3506 struct intel_engine_cs *engine,
3507 void *vaddr)
3508 {
3509 ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3510 ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3511 __execlists_update_reg_state(ce, engine, ce->ring->tail);
3512
3513 return 0;
3514 }
3515
3516 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3517 {
3518 return __execlists_context_pin(ce, ce->engine, vaddr);
3519 }
3520
3521 static int execlists_context_alloc(struct intel_context *ce)
3522 {
3523 return __execlists_context_alloc(ce, ce->engine);
3524 }
3525
3526 static void execlists_context_reset(struct intel_context *ce)
3527 {
3528 CE_TRACE(ce, "reset\n");
3529 GEM_BUG_ON(!intel_context_is_pinned(ce));
3530
3531 intel_ring_reset(ce->ring, ce->ring->emit);
3532
3533 /* Scrub away the garbage */
3534 execlists_init_reg_state(ce->lrc_reg_state,
3535 ce, ce->engine, ce->ring, true);
3536 __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3537
3538 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3539 }
3540
3541 static const struct intel_context_ops execlists_context_ops = {
3542 .alloc = execlists_context_alloc,
3543
3544 .pre_pin = execlists_context_pre_pin,
3545 .pin = execlists_context_pin,
3546 .unpin = execlists_context_unpin,
3547 .post_unpin = execlists_context_post_unpin,
3548
3549 .enter = intel_context_enter_engine,
3550 .exit = intel_context_exit_engine,
3551
3552 .reset = execlists_context_reset,
3553 .destroy = execlists_context_destroy,
3554 };
3555
3556 static u32 hwsp_offset(const struct i915_request *rq)
3557 {
3558 const struct intel_timeline_cacheline *cl;
3559
3560 /* Before the request is executed, the timeline/cacheline is fixed */
3561
3562 cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3563 if (cl)
3564 return cl->ggtt_offset;
3565
3566 return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3567 }
3568
3569 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3570 {
3571 u32 *cs;
3572
3573 GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3574 if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3575 return 0;
3576
3577 cs = intel_ring_begin(rq, 6);
3578 if (IS_ERR(cs))
3579 return PTR_ERR(cs);
3580
3581 /*
3582 * Check if we have been preempted before we even get started.
3583 *
3584 * After this point i915_request_started() reports true, even if
3585 * we get preempted and so are no longer running.
3586 */
3587 *cs++ = MI_ARB_CHECK;
3588 *cs++ = MI_NOOP;
3589
3590 *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3591 *cs++ = hwsp_offset(rq);
3592 *cs++ = 0;
3593 *cs++ = rq->fence.seqno - 1;
3594
3595 intel_ring_advance(rq, cs);
3596
3597 /* Record the updated position of the request's payload */
3598 rq->infix = intel_ring_offset(rq, cs);
3599
3600 __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3601
3602 return 0;
3603 }
3604
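/*
 * Reload the four PDP root pointers for a 32b (3-level) ppgtt with a
 * force-posted LRI, bracketed by the flushes below. Only used when the
 * vm is not 4-level, see execlists_request_alloc().
 */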
3605 static int emit_pdps(struct i915_request *rq)
3606 {
3607 const struct intel_engine_cs * const engine = rq->engine;
3608 struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3609 int err, i;
3610 u32 *cs;
3611
3612 GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3613
3614 /*
3615 * Beware ye of the dragons, this sequence is magic!
3616 *
3617 * Small changes to this sequence can cause anything from
3618 * GPU hangs to forcewake errors and machine lockups!
3619 */
3620
3621 /* Flush any residual operations from the context load */
3622 err = engine->emit_flush(rq, EMIT_FLUSH);
3623 if (err)
3624 return err;
3625
3626 /* Magic required to prevent forcewake errors! */
3627 err = engine->emit_flush(rq, EMIT_INVALIDATE);
3628 if (err)
3629 return err;
3630
3631 cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3632 if (IS_ERR(cs))
3633 return PTR_ERR(cs);
3634
3635 /* Ensure the LRI have landed before we invalidate & continue */
3636 *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3637 for (i = GEN8_3LVL_PDPES; i--; ) {
3638 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3639 u32 base = engine->mmio_base;
3640
3641 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3642 *cs++ = upper_32_bits(pd_daddr);
3643 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3644 *cs++ = lower_32_bits(pd_daddr);
3645 }
3646 *cs++ = MI_NOOP;
3647
3648 intel_ring_advance(rq, cs);
3649
3650 return 0;
3651 }
3652
3653 static int execlists_request_alloc(struct i915_request *request)
3654 {
3655 int ret;
3656
3657 GEM_BUG_ON(!intel_context_is_pinned(request->context));
3658
3659 /*
3660 * Flush enough space to reduce the likelihood of waiting after
3661 * we start building the request - in which case we will just
3662 * have to repeat work.
3663 */
3664 request->reserved_space += EXECLISTS_REQUEST_SIZE;
3665
3666 /*
3667 * Note that after this point, we have committed to using
3668 * this request as it is being used to both track the
3669 * state of engine initialisation and liveness of the
3670 * golden renderstate above. Think twice before you try
3671 * to cancel/unwind this request now.
3672 */
3673
3674 if (!i915_vm_is_4lvl(request->context->vm)) {
3675 ret = emit_pdps(request);
3676 if (ret)
3677 return ret;
3678 }
3679
3680 /* Unconditionally invalidate GPU caches and TLBs. */
3681 ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3682 if (ret)
3683 return ret;
3684
3685 request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3686 return 0;
3687 }
3688
3689 /*
3690 * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3691 * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3692 * but there is a slight complication as this is applied in a WA batch where the
3693 * values are only initialized once so we cannot take register value at the
3694 * beginning and reuse it further; hence we save its value to memory, upload a
3695 * constant value with bit21 set and then we restore it back with the saved value.
3696 * To simplify the WA, a constant value is formed by using the default value
3697 * of this register. This shouldn't be a problem because we are only modifying
3698 * it for a short period and this batch is non-preemptible. We can of course
3699 * use additional instructions that read the actual value of the register
3700 * at that time and set our bit of interest but it makes the WA complicated.
3701 *
3702 * This WA is also required for Gen9 so extracting as a function avoids
3703 * code duplication.
3704 */
3705 static u32 *
3706 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3707 {
3708 /* NB no one else is allowed to scribble over scratch + 256! */
3709 *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3710 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3711 *batch++ = intel_gt_scratch_offset(engine->gt,
3712 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3713 *batch++ = 0;
3714
3715 *batch++ = MI_LOAD_REGISTER_IMM(1);
3716 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3717 *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3718
3719 batch = gen8_emit_pipe_control(batch,
3720 PIPE_CONTROL_CS_STALL |
3721 PIPE_CONTROL_DC_FLUSH_ENABLE,
3722 0);
3723
3724 *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3725 *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3726 *batch++ = intel_gt_scratch_offset(engine->gt,
3727 INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3728 *batch++ = 0;
3729
3730 return batch;
3731 }
3732
3733 /*
3734 * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3735 * initialized at the beginning and shared across all contexts but this field
3736 * helps us to have multiple batches at different offsets and select them based
3737 * on a criterion. At the moment this batch always starts at the beginning of the page
3738 * and at this point we don't have multiple wa_ctx batch buffers.
3739 *
3740 * The number of WAs applied is not known at the beginning; we use this field
3741 * to return the number of DWORDs written.
3742 *
3743 * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3744 * so it adds NOOPs as padding to make it cacheline aligned.
3745 * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
3746 * make a complete batch buffer.
3747 */
3748 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3749 {
3750 /* WaDisableCtxRestoreArbitration:bdw,chv */
3751 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3752
3753 /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3754 if (IS_BROADWELL(engine->i915))
3755 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3756
3757 /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3758 /* Actual scratch location is at 128 bytes offset */
3759 batch = gen8_emit_pipe_control(batch,
3760 PIPE_CONTROL_FLUSH_L3 |
3761 PIPE_CONTROL_STORE_DATA_INDEX |
3762 PIPE_CONTROL_CS_STALL |
3763 PIPE_CONTROL_QW_WRITE,
3764 LRC_PPHWSP_SCRATCH_ADDR);
3765
3766 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3767
3768 /* Pad to end of cacheline */
3769 while ((unsigned long)batch % CACHELINE_BYTES)
3770 *batch++ = MI_NOOP;
3771
3772 /*
3773 * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3774 * execution depends on the length specified in terms of cache lines
3775 * in the register CTX_RCS_INDIRECT_CTX
3776 */
3777
3778 return batch;
3779 }
3780
3781 struct lri {
3782 i915_reg_t reg;
3783 u32 value;
3784 };
3785
3786 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3787 {
3788 GEM_BUG_ON(!count || count > 63);
3789
3790 *batch++ = MI_LOAD_REGISTER_IMM(count);
3791 do {
3792 *batch++ = i915_mmio_reg_offset(lri->reg);
3793 *batch++ = lri->value;
3794 } while (lri++, --count);
3795 *batch++ = MI_NOOP;
3796
3797 return batch;
3798 }
3799
3800 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3801 {
3802 static const struct lri lri[] = {
3803 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3804 {
3805 COMMON_SLICE_CHICKEN2,
3806 __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3807 0),
3808 },
3809
3810 /* BSpec: 11391 */
3811 {
3812 FF_SLICE_CHICKEN,
3813 __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3814 FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3815 },
3816
3817 /* BSpec: 11299 */
3818 {
3819 _3D_CHICKEN3,
3820 __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3821 _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3822 }
3823 };
3824
3825 *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3826
3827 /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3828 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3829
3830 /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3831 batch = gen8_emit_pipe_control(batch,
3832 PIPE_CONTROL_FLUSH_L3 |
3833 PIPE_CONTROL_STORE_DATA_INDEX |
3834 PIPE_CONTROL_CS_STALL |
3835 PIPE_CONTROL_QW_WRITE,
3836 LRC_PPHWSP_SCRATCH_ADDR);
3837
3838 batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3839
3840 /* WaMediaPoolStateCmdInWABB:bxt,glk */
3841 if (HAS_POOLED_EU(engine->i915)) {
3842 /*
3843 * EU pool configuration is set up along with the golden context
3844 * during context initialization. This value depends on
3845 * device type (2x6 or 3x6) and needs to be updated based
3846 * on which subslice is disabled, especially for 2x6
3847 * devices; however, it is safe to load the default
3848 * configuration of a 3x6 device instead of masking off
3849 * corresponding bits because HW ignores bits of a disabled
3850 * subslice and drops down to appropriate config. Please
3851 * see render_state_setup() in i915_gem_render_state.c for
3852 * possible configurations, to avoid duplication they are
3853 * not shown here again.
3854 */
3855 *batch++ = GEN9_MEDIA_POOL_STATE;
3856 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3857 *batch++ = 0x00777000;
3858 *batch++ = 0;
3859 *batch++ = 0;
3860 *batch++ = 0;
3861 }
3862
3863 *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3864
3865 /* Pad to end of cacheline */
3866 while ((unsigned long)batch % CACHELINE_BYTES)
3867 *batch++ = MI_NOOP;
3868
3869 return batch;
3870 }
3871
3872 static u32 *
3873 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3874 {
3875 int i;
3876
3877 /*
3878 * WaPipeControlBefore3DStateSamplePattern: cnl
3879 *
3880 * Ensure the engine is idle prior to programming a
3881 * 3DSTATE_SAMPLE_PATTERN during a context restore.
3882 */
3883 batch = gen8_emit_pipe_control(batch,
3884 PIPE_CONTROL_CS_STALL,
3885 0);
3886 /*
3887 * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3888 * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3889 * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3890 * confusing. Since gen8_emit_pipe_control() already advances the
3891 * batch by 6 dwords, we advance the other 10 here, completing a
3892 * cacheline. It's not clear if the workaround requires this padding
3893 * before other commands, or if it's just the regular padding we would
3894 * already have for the workaround bb, so leave it here for now.
3895 */
3896 for (i = 0; i < 10; i++)
3897 *batch++ = MI_NOOP;
3898
3899 /* Pad to end of cacheline */
3900 while ((unsigned long)batch % CACHELINE_BYTES)
3901 *batch++ = MI_NOOP;
3902
3903 return batch;
3904 }
3905
3906 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3907
3908 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3909 {
3910 struct drm_i915_gem_object *obj;
3911 struct i915_vma *vma;
3912 int err;
3913
3914 obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3915 if (IS_ERR(obj))
3916 return PTR_ERR(obj);
3917
3918 vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3919 if (IS_ERR(vma)) {
3920 err = PTR_ERR(vma);
3921 goto err;
3922 }
3923
3924 err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3925 if (err)
3926 goto err;
3927
3928 engine->wa_ctx.vma = vma;
3929 return 0;
3930
3931 err:
3932 i915_gem_object_put(obj);
3933 return err;
3934 }
3935
3936 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3937 {
3938 i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3939
3940 /* Called on error unwind, clear all flags to prevent further use */
3941 memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx));
3942 }
3943
3944 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3945
3946 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3947 {
3948 struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3949 struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3950 &wa_ctx->per_ctx };
3951 wa_bb_func_t wa_bb_fn[2];
3952 void *batch, *batch_ptr;
3953 unsigned int i;
3954 int ret;
3955
3956 if (engine->class != RENDER_CLASS)
3957 return 0;
3958
3959 switch (INTEL_GEN(engine->i915)) {
3960 case 12:
3961 case 11:
3962 return 0;
3963 case 10:
3964 wa_bb_fn[0] = gen10_init_indirectctx_bb;
3965 wa_bb_fn[1] = NULL;
3966 break;
3967 case 9:
3968 wa_bb_fn[0] = gen9_init_indirectctx_bb;
3969 wa_bb_fn[1] = NULL;
3970 break;
3971 case 8:
3972 wa_bb_fn[0] = gen8_init_indirectctx_bb;
3973 wa_bb_fn[1] = NULL;
3974 break;
3975 default:
3976 MISSING_CASE(INTEL_GEN(engine->i915));
3977 return 0;
3978 }
3979
3980 ret = lrc_setup_wa_ctx(engine);
3981 if (ret) {
3982 drm_dbg(&engine->i915->drm,
3983 "Failed to setup context WA page: %d\n", ret);
3984 return ret;
3985 }
3986
3987 batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
3988
3989 /*
3990 * Emit the two workaround batch buffers, recording the offset from the
3991 * start of the workaround batch buffer object for each and their
3992 * respective sizes.
3993 */
3994 batch_ptr = batch;
3995 for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3996 wa_bb[i]->offset = batch_ptr - batch;
3997 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3998 CACHELINE_BYTES))) {
3999 ret = -EINVAL;
4000 break;
4001 }
4002 if (wa_bb_fn[i])
4003 batch_ptr = wa_bb_fn[i](engine, batch_ptr);
4004 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
4005 }
4006 GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
4007
4008 __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4009 __i915_gem_object_release_map(wa_ctx->vma->obj);
4010 if (ret)
4011 lrc_destroy_wa_ctx(engine);
4012
4013 return ret;
4014 }
4015
4016 static void reset_csb_pointers(struct intel_engine_cs *engine)
4017 {
4018 struct intel_engine_execlists * const execlists = &engine->execlists;
4019 const unsigned int reset_value = execlists->csb_size - 1;
4020
4021 ring_set_paused(engine, 0);
4022
4023 /*
4024 * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4025 * Bludgeon them with a mmio update to be sure.
4026 */
4027 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4028 0xffff << 16 | reset_value << 8 | reset_value);
4029 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4030
4031 /*
4032 * After a reset, the HW starts writing into CSB entry [0]. We
4033 * therefore have to set our HEAD pointer back one entry so that
4034 * the *first* entry we check is entry 0. To complicate this further,
4035 * as we don't wait for the first interrupt after reset, we have to
4036 * fake the HW write to point back to the last entry so that our
4037 * inline comparison of our cached head position against the last HW
4038 * write works even before the first interrupt.
4039 */
4040 execlists->csb_head = reset_value;
4041 WRITE_ONCE(*execlists->csb_write, reset_value);
4042 wmb(); /* Make sure this is visible to HW (paranoia?) */
4043
4044 /* Check that the GPU does indeed update the CSB entries! */
4045 memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4046 invalidate_csb_entries(&execlists->csb_status[0],
4047 &execlists->csb_status[reset_value]);
4048
4049 /* Once more for luck and our trusty paranoia */
4050 ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4051 0xffff << 16 | reset_value << 8 | reset_value);
4052 ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4053
4054 GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4055 }
4056
4057 static void execlists_sanitize(struct intel_engine_cs *engine)
4058 {
4059 /*
4060 * Poison residual state on resume, in case the suspend didn't!
4061 *
4062 * We have to assume that across suspend/resume (or other loss
4063 * of control) that the contents of our pinned buffers has been
4064 * lost, replaced by garbage. Since this doesn't always happen,
4065 * let's poison such state so that we more quickly spot when
4066 * we falsely assume it has been preserved.
4067 */
4068 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4069 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4070
4071 reset_csb_pointers(engine);
4072
4073 /*
4074 * The kernel_context HWSP is stored in the status_page. As above,
4075 * that may be lost on resume/initialisation, and so we need to
4076 * reset the value in the HWSP.
4077 */
4078 intel_timeline_reset_seqno(engine->kernel_context->timeline);
4079
4080 /* And scrub the dirty cachelines for the HWSP */
4081 clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4082 }
4083
4084 static void enable_error_interrupt(struct intel_engine_cs *engine)
4085 {
4086 u32 status;
4087
4088 engine->execlists.error_interrupt = 0;
4089 ENGINE_WRITE(engine, RING_EMR, ~0u);
4090 ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4091
4092 status = ENGINE_READ(engine, RING_ESR);
4093 if (unlikely(status)) {
4094 drm_err(&engine->i915->drm,
4095 "engine '%s' resumed still in error: %08x\n",
4096 engine->name, status);
4097 __intel_gt_reset(engine->gt, engine->mask);
4098 }
4099
4100 /*
4101 * On current gen8+, we have 2 signals to play with
4102 *
4103 * - I915_ERROR_INSTRUCTION (bit 0)
4104 *
4105 * Generate an error if the command parser encounters an invalid
4106 * instruction
4107 *
4108 * This is a fatal error.
4109 *
4110 * - CP_PRIV (bit 2)
4111 *
4112 * Generate an error on privilege violation (where the CP replaces
4113 * the instruction with a no-op). This also fires for writes into
4114 * read-only scratch pages.
4115 *
4116 * This is a non-fatal error, parsing continues.
4117 *
4118 * * there are a few others defined for odd HW that we do not use
4119 *
4120 * Since CP_PRIV fires for cases where we have chosen to ignore the
4121 * error (as the HW is validating and suppressing the mistakes), we
4122 * only unmask the instruction error bit.
4123 */
4124 ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4125 }
4126
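/*
 * Put the engine into execlist submission mode: program the HWSP write
 * mask (HWSTAM), enable the run list (or disable legacy ring mode on
 * Gen11+), clear STOP_RING, point RING_HWS_PGA at the status page,
 * unmask the CS error interrupts and seed the pool of context tags.
 */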
4127 static void enable_execlists(struct intel_engine_cs *engine)
4128 {
4129 u32 mode;
4130
4131 assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4132
4133 intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4134
4135 if (INTEL_GEN(engine->i915) >= 11)
4136 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4137 else
4138 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4139 ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4140
4141 ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4142
4143 ENGINE_WRITE_FW(engine,
4144 RING_HWS_PGA,
4145 i915_ggtt_offset(engine->status_page.vma));
4146 ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4147
4148 enable_error_interrupt(engine);
4149
4150 engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4151 }
4152
4153 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4154 {
4155 bool unexpected = false;
4156
4157 if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4158 drm_dbg(&engine->i915->drm,
4159 "STOP_RING still set in RING_MI_MODE\n");
4160 unexpected = true;
4161 }
4162
4163 return unexpected;
4164 }
4165
4166 static int execlists_resume(struct intel_engine_cs *engine)
4167 {
4168 intel_mocs_init_engine(engine);
4169
4170 intel_breadcrumbs_reset(engine->breadcrumbs);
4171
4172 if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4173 struct drm_printer p = drm_debug_printer(__func__);
4174
4175 intel_engine_dump(engine, &p, NULL);
4176 }
4177
4178 enable_execlists(engine);
4179
4180 return 0;
4181 }
4182
4183 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4184 {
4185 struct intel_engine_execlists * const execlists = &engine->execlists;
4186 unsigned long flags;
4187
4188 ENGINE_TRACE(engine, "depth<-%d\n",
4189 atomic_read(&execlists->tasklet.count));
4190
4191 /*
4192 * Prevent request submission to the hardware until we have
4193 * completed the reset in i915_gem_reset_finish(). If a request
4194 * is completed by one engine, it may then queue a request
4195 * to a second via its execlists->tasklet *just* as we are
4196 * calling engine->resume() and also writing the ELSP.
4197 * Turning off the execlists->tasklet until the reset is over
4198 * prevents the race.
4199 */
4200 __tasklet_disable_sync_once(&execlists->tasklet);
4201 GEM_BUG_ON(!reset_in_progress(execlists));
4202
4203 /* And flush any current direct submission. */
4204 spin_lock_irqsave(&engine->active.lock, flags);
4205 spin_unlock_irqrestore(&engine->active.lock, flags);
4206
4207 /*
4208 * We stop the engines, otherwise we might get a failed reset and a
4209 * dead gpu (on elk). Also, even a gpu as modern as kbl can suffer
4210 * a system hang if a batchbuffer is progressing when
4211 * the reset is issued, regardless of the READY_TO_RESET ack.
4212 * Thus assume it is best to stop engines on all gens
4213 * where we have a gpu reset.
4214 *
4215 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4216 *
4217 * FIXME: Wa for more modern gens needs to be validated
4218 */
4219 ring_set_paused(engine, 1);
4220 intel_engine_stop_cs(engine);
4221
4222 engine->execlists.reset_ccid = active_ccid(engine);
4223 }
4224
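/*
 * Clear STOP_RING in the context image's copy of RING_MI_MODE. The
 * register uses the masked-write format, so set the mask bit in the
 * upper half while clearing the value bit in the lower half.
 */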
4225 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4226 {
4227 int x;
4228
4229 x = lrc_ring_mi_mode(engine);
4230 if (x != -1) {
4231 regs[x + 1] &= ~STOP_RING;
4232 regs[x + 1] |= STOP_RING << 16;
4233 }
4234 }
4235
4236 static void __execlists_reset_reg_state(const struct intel_context *ce,
4237 const struct intel_engine_cs *engine)
4238 {
4239 u32 *regs = ce->lrc_reg_state;
4240
4241 __reset_stop_ring(regs, engine);
4242 }
4243
4244 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4245 {
4246 struct intel_engine_execlists * const execlists = &engine->execlists;
4247 struct intel_context *ce;
4248 struct i915_request *rq;
4249 u32 head;
4250
4251 mb(); /* paranoia: read the CSB pointers from after the reset */
4252 clflush(execlists->csb_write);
4253 mb();
4254
4255 process_csb(engine); /* drain preemption events */
4256
4257 /* Following the reset, we need to reload the CSB read/write pointers */
4258 reset_csb_pointers(engine);
4259
4260 /*
4261 * Save the currently executing context, even if we completed
4262 * its request, it was still running at the time of the
4263 * reset and will have been clobbered.
4264 */
4265 rq = active_context(engine, engine->execlists.reset_ccid);
4266 if (!rq)
4267 goto unwind;
4268
4269 ce = rq->context;
4270 GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4271
4272 if (i915_request_completed(rq)) {
4273 /* Idle context; tidy up the ring so we can restart afresh */
4274 head = intel_ring_wrap(ce->ring, rq->tail);
4275 goto out_replay;
4276 }
4277
4278 /* We still have requests in-flight; the engine should be active */
4279 GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4280
4281 /* Context has requests still in-flight; it should not be idle! */
4282 GEM_BUG_ON(i915_active_is_idle(&ce->active));
4283
4284 rq = active_request(ce->timeline, rq);
4285 head = intel_ring_wrap(ce->ring, rq->head);
4286 GEM_BUG_ON(head == ce->ring->tail);
4287
4288 /*
4289 * If this request hasn't started yet, e.g. it is waiting on a
4290 * semaphore, we need to avoid skipping the request or else we
4291 * break the signaling chain. However, if the context is corrupt
4292 * the request will not restart and we will be stuck with a wedged
4293 * device. It is quite often the case that if we issue a reset
4294 * while the GPU is loading the context image, that the context
4295 * image becomes corrupt.
4296 *
4297 * Otherwise, if we have not started yet, the request should replay
4298 * perfectly and we do not need to flag the result as being erroneous.
4299 */
4300 if (!i915_request_started(rq))
4301 goto out_replay;
4302
4303 /*
4304 * If the request was innocent, we leave the request in the ELSP
4305 * and will try to replay it on restarting. The context image may
4306 * have been corrupted by the reset, in which case we may have
4307 * to service a new GPU hang, but more likely we can continue on
4308 * without impact.
4309 *
4310 * If the request was guilty, we presume the context is corrupt
4311 * and have to at least restore the RING register in the context
4312 * image back to the expected values to skip over the guilty request.
4313 */
4314 __i915_request_reset(rq, stalled);
4315
4316 /*
4317 * We want a simple context + ring to execute the breadcrumb update.
4318 * We cannot rely on the context being intact across the GPU hang,
4319 * so clear it and rebuild just what we need for the breadcrumb.
4320 * All pending requests for this context will be zapped, and any
4321 * future request will be after userspace has had the opportunity
4322 * to recreate its own state.
4323 */
4324 out_replay:
4325 ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4326 head, ce->ring->tail);
4327 __execlists_reset_reg_state(ce, engine);
4328 __execlists_update_reg_state(ce, engine, head);
4329 ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4330
4331 unwind:
4332 /* Push back any incomplete requests for replay after the reset. */
4333 cancel_port_requests(execlists);
4334 __unwind_incomplete_requests(engine);
4335 }
4336
4337 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4338 {
4339 unsigned long flags;
4340
4341 ENGINE_TRACE(engine, "\n");
4342
4343 spin_lock_irqsave(&engine->active.lock, flags);
4344
4345 __execlists_reset(engine, stalled);
4346
4347 spin_unlock_irqrestore(&engine->active.lock, flags);
4348 }
4349
4350 static void nop_submission_tasklet(unsigned long data)
4351 {
4352 struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4353
4354 /* The driver is wedged; don't process any more events. */
4355 WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4356 }
4357
4358 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4359 {
4360 struct intel_engine_execlists * const execlists = &engine->execlists;
4361 struct i915_request *rq, *rn;
4362 struct rb_node *rb;
4363 unsigned long flags;
4364
4365 ENGINE_TRACE(engine, "\n");
4366
4367 /*
4368 * Before we call engine->cancel_requests(), we should have exclusive
4369 * access to the submission state. This is arranged for us by the
4370 * caller disabling the interrupt generation, the tasklet and other
4371 * threads that may then access the same state, giving us a free hand
4372 * to reset state. However, we still need to let lockdep be aware that
4373 * we know this state may be accessed in hardirq context, so we
4374 * disable the irq around this manipulation and we want to keep
4375 * the spinlock focused on its duties and not accidentally extend
4376 * its coverage to the submission's irq state. (Similarly, although we
4377 * shouldn't need to disable irq around the manipulation of the
4378 * submission's irq state, we also wish to remind ourselves that
4379 * it is irq state.)
4380 */
4381 spin_lock_irqsave(&engine->active.lock, flags);
4382
4383 __execlists_reset(engine, true);
4384
4385 /* Mark all executing requests as skipped. */
4386 list_for_each_entry(rq, &engine->active.requests, sched.link)
4387 mark_eio(rq);
4388
4389 /* Flush the queued requests to the timeline list (for retiring). */
4390 while ((rb = rb_first_cached(&execlists->queue))) {
4391 struct i915_priolist *p = to_priolist(rb);
4392 int i;
4393
4394 priolist_for_each_request_consume(rq, rn, p, i) {
4395 mark_eio(rq);
4396 __i915_request_submit(rq);
4397 }
4398
4399 rb_erase_cached(&p->node, &execlists->queue);
4400 i915_priolist_free(p);
4401 }
4402
4403 /* On-hold requests will be flushed to timeline upon their release */
4404 list_for_each_entry(rq, &engine->active.hold, sched.link)
4405 mark_eio(rq);
4406
4407 /* Cancel all attached virtual engines */
4408 while ((rb = rb_first_cached(&execlists->virtual))) {
4409 struct virtual_engine *ve =
4410 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4411
4412 rb_erase_cached(rb, &execlists->virtual);
4413 RB_CLEAR_NODE(rb);
4414
4415 spin_lock(&ve->base.active.lock);
4416 rq = fetch_and_zero(&ve->request);
4417 if (rq) {
4418 mark_eio(rq);
4419
4420 rq->engine = engine;
4421 __i915_request_submit(rq);
4422 i915_request_put(rq);
4423
4424 ve->base.execlists.queue_priority_hint = INT_MIN;
4425 }
4426 spin_unlock(&ve->base.active.lock);
4427 }
4428
4429 /* Remaining _unready_ requests will be nop'ed when submitted */
4430
4431 execlists->queue_priority_hint = INT_MIN;
4432 execlists->queue = RB_ROOT_CACHED;
4433
4434 GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4435 execlists->tasklet.func = nop_submission_tasklet;
4436
4437 spin_unlock_irqrestore(&engine->active.lock, flags);
4438 }
4439
4440 static void execlists_reset_finish(struct intel_engine_cs *engine)
4441 {
4442 struct intel_engine_execlists * const execlists = &engine->execlists;
4443
4444 /*
4445 * After a GPU reset, we may have requests to replay. Do so now while
4446 * we still have the forcewake to be sure that the GPU is not allowed
4447 * to sleep before we restart and reload a context.
4448 */
4449 GEM_BUG_ON(!reset_in_progress(execlists));
4450 if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4451 execlists->tasklet.func(execlists->tasklet.data);
4452
4453 if (__tasklet_enable(&execlists->tasklet))
4454 /* And kick in case we missed a new request submission. */
4455 tasklet_hi_schedule(&execlists->tasklet);
4456 ENGINE_TRACE(engine, "depth->%d\n",
4457 atomic_read(&execlists->tasklet.count));
4458 }
4459
4460 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4461 u64 offset, u32 len,
4462 const unsigned int flags)
4463 {
4464 u32 *cs;
4465
4466 cs = intel_ring_begin(rq, 4);
4467 if (IS_ERR(cs))
4468 return PTR_ERR(cs);
4469
4470 /*
4471 * WaDisableCtxRestoreArbitration:bdw,chv
4472 *
4473 * We don't need to perform MI_ARB_ENABLE as often as we do (in
4474 * particular on all the gens that do not need the w/a at all!); if we
4475 * took care to make sure that on every switch into this context
4476 * (both ordinary and for preemption) arbitration was enabled,
4477 * we would be fine. However, for gen8 there is another w/a that
4478 * requires us to not preempt inside GPGPU execution, so we keep
4479 * arbitration disabled for gen8 batches. Arbitration will be
4480 * re-enabled before we close the request
4481 * (engine->emit_fini_breadcrumb).
4482 */
4483 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4484
4485 /* FIXME(BDW+): Address space and security selectors. */
4486 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4487 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4488 *cs++ = lower_32_bits(offset);
4489 *cs++ = upper_32_bits(offset);
4490
4491 intel_ring_advance(rq, cs);
4492
4493 return 0;
4494 }
4495
4496 static int gen8_emit_bb_start(struct i915_request *rq,
4497 u64 offset, u32 len,
4498 const unsigned int flags)
4499 {
4500 u32 *cs;
4501
4502 cs = intel_ring_begin(rq, 6);
4503 if (IS_ERR(cs))
4504 return PTR_ERR(cs);
4505
4506 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4507
4508 *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4509 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4510 *cs++ = lower_32_bits(offset);
4511 *cs++ = upper_32_bits(offset);
4512
4513 *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4514 *cs++ = MI_NOOP;
4515
4516 intel_ring_advance(rq, cs);
4517
4518 return 0;
4519 }
4520
4521 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4522 {
4523 ENGINE_WRITE(engine, RING_IMR,
4524 ~(engine->irq_enable_mask | engine->irq_keep_mask));
4525 ENGINE_POSTING_READ(engine, RING_IMR);
4526 }
4527
4528 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4529 {
4530 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4531 }
4532
4533 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4534 {
4535 u32 cmd, *cs;
4536
4537 cs = intel_ring_begin(request, 4);
4538 if (IS_ERR(cs))
4539 return PTR_ERR(cs);
4540
4541 cmd = MI_FLUSH_DW + 1;
4542
4543 /* We always require a command barrier so that subsequent
4544 * commands, such as breadcrumb interrupts, are strictly ordered
4545 * wrt the contents of the write cache being flushed to memory
4546 * (and thus being coherent from the CPU).
4547 */
4548 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4549
4550 if (mode & EMIT_INVALIDATE) {
4551 cmd |= MI_INVALIDATE_TLB;
4552 if (request->engine->class == VIDEO_DECODE_CLASS)
4553 cmd |= MI_INVALIDATE_BSD;
4554 }
4555
4556 *cs++ = cmd;
4557 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4558 *cs++ = 0; /* upper addr */
4559 *cs++ = 0; /* value */
4560 intel_ring_advance(request, cs);
4561
4562 return 0;
4563 }
4564
4565 static int gen8_emit_flush_render(struct i915_request *request,
4566 u32 mode)
4567 {
4568 bool vf_flush_wa = false, dc_flush_wa = false;
4569 u32 *cs, flags = 0;
4570 int len;
4571
4572 flags |= PIPE_CONTROL_CS_STALL;
4573
4574 if (mode & EMIT_FLUSH) {
4575 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4576 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4577 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4578 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4579 }
4580
4581 if (mode & EMIT_INVALIDATE) {
4582 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4583 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4584 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4585 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4586 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4587 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4588 flags |= PIPE_CONTROL_QW_WRITE;
4589 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4590
4591 /*
4592 * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4593 * pipe control.
4594 */
4595 if (IS_GEN(request->engine->i915, 9))
4596 vf_flush_wa = true;
4597
4598 /* WaForGAMHang:kbl */
4599 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4600 dc_flush_wa = true;
4601 }
4602
4603 len = 6;
4604
4605 if (vf_flush_wa)
4606 len += 6;
4607
4608 if (dc_flush_wa)
4609 len += 12;
4610
4611 cs = intel_ring_begin(request, len);
4612 if (IS_ERR(cs))
4613 return PTR_ERR(cs);
4614
4615 if (vf_flush_wa)
4616 cs = gen8_emit_pipe_control(cs, 0, 0);
4617
4618 if (dc_flush_wa)
4619 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4620 0);
4621
4622 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4623
4624 if (dc_flush_wa)
4625 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4626
4627 intel_ring_advance(request, cs);
4628
4629 return 0;
4630 }
4631
4632 static int gen11_emit_flush_render(struct i915_request *request,
4633 u32 mode)
4634 {
4635 if (mode & EMIT_FLUSH) {
4636 u32 *cs;
4637 u32 flags = 0;
4638
4639 flags |= PIPE_CONTROL_CS_STALL;
4640
4641 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4642 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4643 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4644 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4645 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4646 flags |= PIPE_CONTROL_QW_WRITE;
4647 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4648
4649 cs = intel_ring_begin(request, 6);
4650 if (IS_ERR(cs))
4651 return PTR_ERR(cs);
4652
4653 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4654 intel_ring_advance(request, cs);
4655 }
4656
4657 if (mode & EMIT_INVALIDATE) {
4658 u32 *cs;
4659 u32 flags = 0;
4660
4661 flags |= PIPE_CONTROL_CS_STALL;
4662
4663 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4664 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4665 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4666 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4667 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4668 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4669 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4670 flags |= PIPE_CONTROL_QW_WRITE;
4671 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4672
4673 cs = intel_ring_begin(request, 6);
4674 if (IS_ERR(cs))
4675 return PTR_ERR(cs);
4676
4677 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4678 intel_ring_advance(request, cs);
4679 }
4680
4681 return 0;
4682 }
4683
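/*
 * On Gen12 MI_ARB_CHECK doubles as a pre-parser control: bit 8
 * qualifies the pre-fetch disable field and bit 0 carries the disable
 * state (true = pre-parser off). It is emitted around the
 * invalidations below so stale pre-fetched data is not consumed.
 */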
4684 static u32 preparser_disable(bool state)
4685 {
4686 return MI_ARB_CHECK | 1 << 8 | state;
4687 }
4688
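/*
 * Look up the AUX table invalidation register for an engine; only the
 * video decode and video enhancement engines have one, anything else
 * is a programming error.
 */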
4689 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4690 {
4691 static const i915_reg_t vd[] = {
4692 GEN12_VD0_AUX_NV,
4693 GEN12_VD1_AUX_NV,
4694 GEN12_VD2_AUX_NV,
4695 GEN12_VD3_AUX_NV,
4696 };
4697
4698 static const i915_reg_t ve[] = {
4699 GEN12_VE0_AUX_NV,
4700 GEN12_VE1_AUX_NV,
4701 };
4702
4703 if (engine->class == VIDEO_DECODE_CLASS)
4704 return vd[engine->instance];
4705
4706 if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4707 return ve[engine->instance];
4708
4709 GEM_BUG_ON("unknown aux_inv_reg\n");
4710
4711 return INVALID_MMIO_REG;
4712 }
4713
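/* Single LRI poking AUX_INV into the given invalidation register (hsdes: 1809175790). */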
4714 static u32 *
4715 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4716 {
4717 *cs++ = MI_LOAD_REGISTER_IMM(1);
4718 *cs++ = i915_mmio_reg_offset(inv_reg);
4719 *cs++ = AUX_INV;
4720 *cs++ = MI_NOOP;
4721
4722 return cs;
4723 }
4724
4725 static int gen12_emit_flush_render(struct i915_request *request,
4726 u32 mode)
4727 {
4728 if (mode & EMIT_FLUSH) {
4729 u32 flags = 0;
4730 u32 *cs;
4731
4732 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4733 flags |= PIPE_CONTROL_FLUSH_L3;
4734 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4735 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4736 /* Wa_1409600907:tgl */
4737 flags |= PIPE_CONTROL_DEPTH_STALL;
4738 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4739 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4740
4741 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4742 flags |= PIPE_CONTROL_QW_WRITE;
4743
4744 flags |= PIPE_CONTROL_CS_STALL;
4745
4746 cs = intel_ring_begin(request, 6);
4747 if (IS_ERR(cs))
4748 return PTR_ERR(cs);
4749
4750 cs = gen12_emit_pipe_control(cs,
4751 PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4752 flags, LRC_PPHWSP_SCRATCH_ADDR);
4753 intel_ring_advance(request, cs);
4754 }
4755
4756 if (mode & EMIT_INVALIDATE) {
4757 u32 flags = 0;
4758 u32 *cs;
4759
4760 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4761 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4762 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4763 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4764 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4765 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4766 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4767
4768 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4769 flags |= PIPE_CONTROL_QW_WRITE;
4770
4771 flags |= PIPE_CONTROL_CS_STALL;
4772
4773 cs = intel_ring_begin(request, 8 + 4);
4774 if (IS_ERR(cs))
4775 return PTR_ERR(cs);
4776
4777 /*
4778 * Prevent the pre-parser from skipping past the TLB
4779 * invalidate and loading a stale page for the batch
4780 * buffer / request payload.
4781 */
4782 *cs++ = preparser_disable(true);
4783
4784 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4785
4786 /* hsdes: 1809175790 */
4787 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4788
4789 *cs++ = preparser_disable(false);
4790 intel_ring_advance(request, cs);
4791 }
4792
4793 return 0;
4794 }
4795
4796 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4797 {
4798 intel_engine_mask_t aux_inv = 0;
4799 u32 cmd, *cs;
4800
4801 cmd = 4;
4802 if (mode & EMIT_INVALIDATE)
4803 cmd += 2;
4804 if (mode & EMIT_INVALIDATE)
4805 aux_inv = request->engine->mask & ~BIT(BCS0);
4806 if (aux_inv)
4807 cmd += 2 * hweight8(aux_inv) + 2;
4808
4809 cs = intel_ring_begin(request, cmd);
4810 if (IS_ERR(cs))
4811 return PTR_ERR(cs);
4812
4813 if (mode & EMIT_INVALIDATE)
4814 *cs++ = preparser_disable(true);
4815
4816 cmd = MI_FLUSH_DW + 1;
4817
4818 /* We always require a command barrier so that subsequent
4819 * commands, such as breadcrumb interrupts, are strictly ordered
4820 * wrt the contents of the write cache being flushed to memory
4821 * (and thus being coherent from the CPU).
4822 */
4823 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4824
4825 if (mode & EMIT_INVALIDATE) {
4826 cmd |= MI_INVALIDATE_TLB;
4827 if (request->engine->class == VIDEO_DECODE_CLASS)
4828 cmd |= MI_INVALIDATE_BSD;
4829 }
4830
4831 *cs++ = cmd;
4832 *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4833 *cs++ = 0; /* upper addr */
4834 *cs++ = 0; /* value */
4835
4836 if (aux_inv) { /* hsdes: 1809175790 */
4837 struct intel_engine_cs *engine;
4838 unsigned int tmp;
4839
4840 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4841 for_each_engine_masked(engine, request->engine->gt,
4842 aux_inv, tmp) {
4843 *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4844 *cs++ = AUX_INV;
4845 }
4846 *cs++ = MI_NOOP;
4847 }
4848
4849 if (mode & EMIT_INVALIDATE)
4850 *cs++ = preparser_disable(false);
4851
4852 intel_ring_advance(request, cs);
4853
4854 return 0;
4855 }
4856
4857 static void assert_request_valid(struct i915_request *rq)
4858 {
4859 struct intel_ring *ring __maybe_unused = rq->ring;
4860
4861 /* Can we unwind this request without appearing to go forwards? */
4862 GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4863 }
4864
4865 /*
4866 * Reserve space for 2 NOOPs at the end of each request to be
4867 * used as a workaround for not being allowed to do lite
4868 * restore with HEAD==TAIL (WaIdleLiteRestore).
4869 */
4870 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4871 {
4872 /* Ensure there's always at least one preemption point per-request. */
4873 *cs++ = MI_ARB_CHECK;
4874 *cs++ = MI_NOOP;
4875 request->wa_tail = intel_ring_offset(request, cs);
4876
4877 /* Check that entire request is less than half the ring */
4878 assert_request_valid(request);
4879
4880 return cs;
4881 }
4882
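/*
 * Busywait on the preemption semaphore in the HWSP: the MI_SEMAPHORE_WAIT
 * polls until the dword reads back as zero, so while ring_set_paused()
 * has written a non-zero value the CS spins here at the end of the
 * request, giving the driver a preemption point at every breadcrumb.
 */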
4883 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4884 {
4885 *cs++ = MI_SEMAPHORE_WAIT |
4886 MI_SEMAPHORE_GLOBAL_GTT |
4887 MI_SEMAPHORE_POLL |
4888 MI_SEMAPHORE_SAD_EQ_SDD;
4889 *cs++ = 0;
4890 *cs++ = intel_hws_preempt_address(request->engine);
4891 *cs++ = 0;
4892
4893 return cs;
4894 }
4895
4896 static __always_inline u32*
4897 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4898 {
4899 *cs++ = MI_USER_INTERRUPT;
4900
4901 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4902 if (intel_engine_has_semaphores(request->engine))
4903 cs = emit_preempt_busywait(request, cs);
4904
4905 request->tail = intel_ring_offset(request, cs);
4906 assert_ring_tail_valid(request->ring, request->tail);
4907
4908 return gen8_emit_wa_tail(request, cs);
4909 }
4910
4911 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4912 {
4913 return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4914 }
4915
4916 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4917 {
4918 return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4919 }
4920
4921 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4922 {
4923 cs = gen8_emit_pipe_control(cs,
4924 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4925 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4926 PIPE_CONTROL_DC_FLUSH_ENABLE,
4927 0);
4928
4929 /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4930 cs = gen8_emit_ggtt_write_rcs(cs,
4931 request->fence.seqno,
4932 hwsp_offset(request),
4933 PIPE_CONTROL_FLUSH_ENABLE |
4934 PIPE_CONTROL_CS_STALL);
4935
4936 return gen8_emit_fini_breadcrumb_tail(request, cs);
4937 }
4938
4939 static u32 *
4940 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4941 {
4942 cs = gen8_emit_ggtt_write_rcs(cs,
4943 request->fence.seqno,
4944 hwsp_offset(request),
4945 PIPE_CONTROL_CS_STALL |
4946 PIPE_CONTROL_TILE_CACHE_FLUSH |
4947 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4948 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4949 PIPE_CONTROL_DC_FLUSH_ENABLE |
4950 PIPE_CONTROL_FLUSH_ENABLE);
4951
4952 return gen8_emit_fini_breadcrumb_tail(request, cs);
4953 }
4954
4955 /*
4956 * Note that the CS instruction pre-parser will not stall on the breadcrumb
4957 * flush and will continue pre-fetching the instructions after it before the
4958 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4959 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4960 * of the next request before the memory has been flushed, we're guaranteed that
4961 * we won't access the batch itself too early.
4962 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4963 * so, if the current request is modifying an instruction in the next request on
4964 * the same intel_context, we might pre-fetch and then execute the pre-update
4965 * instruction. To avoid this, the users of self-modifying code should either
4966 * disable the parser around the code emitting the memory writes, via a new flag
4967 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4968 * the in-kernel use-cases we've opted to use a separate context, see
4969 * reloc_gpu() as an example.
4970 * All the above applies only to the instructions themselves. Non-inline data
4971 * used by the instructions is not pre-fetched.
4972 */
4973
4974 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4975 {
4976 *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4977 MI_SEMAPHORE_GLOBAL_GTT |
4978 MI_SEMAPHORE_POLL |
4979 MI_SEMAPHORE_SAD_EQ_SDD;
4980 *cs++ = 0;
4981 *cs++ = intel_hws_preempt_address(request->engine);
4982 *cs++ = 0;
4983 *cs++ = 0;
4984 *cs++ = MI_NOOP;
4985
4986 return cs;
4987 }
4988
4989 static __always_inline u32*
4990 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4991 {
4992 *cs++ = MI_USER_INTERRUPT;
4993
4994 *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4995 if (intel_engine_has_semaphores(request->engine))
4996 cs = gen12_emit_preempt_busywait(request, cs);
4997
4998 request->tail = intel_ring_offset(request, cs);
4999 assert_ring_tail_valid(request->ring, request->tail);
5000
5001 return gen8_emit_wa_tail(request, cs);
5002 }
5003
5004 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
5005 {
5006 /* XXX Stalling flush before seqno write; not done as a post-sync op */
5007 cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
5008 return gen12_emit_fini_breadcrumb_tail(rq, cs);
5009 }
5010
5011 static u32 *
5012 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5013 {
5014 cs = gen12_emit_ggtt_write_rcs(cs,
5015 request->fence.seqno,
5016 hwsp_offset(request),
5017 PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5018 PIPE_CONTROL_CS_STALL |
5019 PIPE_CONTROL_TILE_CACHE_FLUSH |
5020 PIPE_CONTROL_FLUSH_L3 |
5021 PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5022 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5023 /* Wa_1409600907:tgl */
5024 PIPE_CONTROL_DEPTH_STALL |
5025 PIPE_CONTROL_DC_FLUSH_ENABLE |
5026 PIPE_CONTROL_FLUSH_ENABLE);
5027
5028 return gen12_emit_fini_breadcrumb_tail(request, cs);
5029 }
5030
5031 static void execlists_park(struct intel_engine_cs *engine)
5032 {
5033 cancel_timer(&engine->execlists.timer);
5034 cancel_timer(&engine->execlists.preempt);
5035 }
5036
5037 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5038 {
5039 engine->submit_request = execlists_submit_request;
5040 engine->schedule = i915_schedule;
5041 engine->execlists.tasklet.func = execlists_submission_tasklet;
5042
5043 engine->reset.prepare = execlists_reset_prepare;
5044 engine->reset.rewind = execlists_reset_rewind;
5045 engine->reset.cancel = execlists_reset_cancel;
5046 engine->reset.finish = execlists_reset_finish;
5047
5048 engine->park = execlists_park;
5049 engine->unpark = NULL;
5050
5051 engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5052 if (!intel_vgpu_active(engine->i915)) {
5053 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5054 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5055 engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5056 if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5057 engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5058 }
5059 }
5060
5061 if (INTEL_GEN(engine->i915) >= 12)
5062 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5063
5064 if (intel_engine_has_preemption(engine))
5065 engine->emit_bb_start = gen8_emit_bb_start;
5066 else
5067 engine->emit_bb_start = gen8_emit_bb_start_noarb;
5068 }
5069
5070 static void execlists_shutdown(struct intel_engine_cs *engine)
5071 {
5072 /* Synchronise with residual timers and any softirq they raise */
5073 del_timer_sync(&engine->execlists.timer);
5074 del_timer_sync(&engine->execlists.preempt);
5075 tasklet_kill(&engine->execlists.tasklet);
5076 }
5077
5078 static void execlists_release(struct intel_engine_cs *engine)
5079 {
5080 engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5081
5082 execlists_shutdown(engine);
5083
5084 intel_engine_cleanup_common(engine);
5085 lrc_destroy_wa_ctx(engine);
5086 }
5087
5088 static void
5089 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5090 {
5091 /* Default vfuncs which can be overridden by each engine. */
5092
5093 engine->resume = execlists_resume;
5094
5095 engine->cops = &execlists_context_ops;
5096 engine->request_alloc = execlists_request_alloc;
5097
5098 engine->emit_flush = gen8_emit_flush;
5099 engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5100 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5101 if (INTEL_GEN(engine->i915) >= 12) {
5102 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5103 engine->emit_flush = gen12_emit_flush;
5104 }
5105 engine->set_default_submission = intel_execlists_set_default_submission;
5106
5107 if (INTEL_GEN(engine->i915) < 11) {
5108 engine->irq_enable = gen8_logical_ring_enable_irq;
5109 engine->irq_disable = gen8_logical_ring_disable_irq;
5110 } else {
5111 /*
5112 * TODO: On Gen11 interrupt masks need to be clear
5113 * to allow C6 entry. Keep interrupts enabled
5114 * and take the hit of generating extra interrupts
5115 * until a more refined solution exists.
5116 */
5117 }
5118 }
5119
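/*
 * Before Gen11 the engines share the GT interrupt registers, so each
 * engine's interrupt bits live at a per-engine shift; on Gen11+ no
 * shift is needed.
 */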
5120 static inline void
5121 logical_ring_default_irqs(struct intel_engine_cs *engine)
5122 {
5123 unsigned int shift = 0;
5124
5125 if (INTEL_GEN(engine->i915) < 11) {
5126 const u8 irq_shifts[] = {
5127 [RCS0] = GEN8_RCS_IRQ_SHIFT,
5128 [BCS0] = GEN8_BCS_IRQ_SHIFT,
5129 [VCS0] = GEN8_VCS0_IRQ_SHIFT,
5130 [VCS1] = GEN8_VCS1_IRQ_SHIFT,
5131 [VECS0] = GEN8_VECS_IRQ_SHIFT,
5132 };
5133
5134 shift = irq_shifts[engine->id];
5135 }
5136
5137 engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5138 engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5139 engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5140 engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5141 }
5142
5143 static void rcs_submission_override(struct intel_engine_cs *engine)
5144 {
5145 switch (INTEL_GEN(engine->i915)) {
5146 case 12:
5147 engine->emit_flush = gen12_emit_flush_render;
5148 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5149 break;
5150 case 11:
5151 engine->emit_flush = gen11_emit_flush_render;
5152 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5153 break;
5154 default:
5155 engine->emit_flush = gen8_emit_flush_render;
5156 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5157 break;
5158 }
5159 }
5160
5161 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5162 {
5163 struct intel_engine_execlists * const execlists = &engine->execlists;
5164 struct drm_i915_private *i915 = engine->i915;
5165 struct intel_uncore *uncore = engine->uncore;
5166 u32 base = engine->mmio_base;
5167
5168 tasklet_init(&engine->execlists.tasklet,
5169 execlists_submission_tasklet, (unsigned long)engine);
5170 timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5171 timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5172
5173 logical_ring_default_vfuncs(engine);
5174 logical_ring_default_irqs(engine);
5175
5176 if (engine->class == RENDER_CLASS)
5177 rcs_submission_override(engine);
5178
5179 if (intel_init_workaround_bb(engine))
5180 /*
5181 * We continue even if we fail to initialize the WA batch
5182 * because we only expect rare glitches, nothing critical
5183 * enough to prevent us from using the GPU
5184 */
5185 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5186
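	/*
	 * Pick the submission ports: with ELSQ the pair of context
	 * descriptors is written into the submit queue and latched via the
	 * control register, otherwise they are written directly to the ELSP.
	 */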
5187 if (HAS_LOGICAL_RING_ELSQ(i915)) {
5188 execlists->submit_reg = uncore->regs +
5189 i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5190 execlists->ctrl_reg = uncore->regs +
5191 i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5192 } else {
5193 execlists->submit_reg = uncore->regs +
5194 i915_mmio_reg_offset(RING_ELSP(base));
5195 }
5196
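	/*
	 * The context status buffer (CSB) lives in the engine's HWSP: the HW
	 * appends an event per context switch and advances the write pointer
	 * we sample below (6 entries before Gen11, 12 from Gen11 onwards).
	 */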
5197 execlists->csb_status =
5198 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5199
5200 execlists->csb_write =
5201 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
5202
5203 if (INTEL_GEN(i915) < 11)
5204 execlists->csb_size = GEN8_CSB_ENTRIES;
5205 else
5206 execlists->csb_size = GEN11_CSB_ENTRIES;
5207
5208 if (INTEL_GEN(engine->i915) >= 11) {
5209 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5210 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5211 }
5212
5213 /* Finally, take ownership and responsibility for cleanup! */
5214 engine->sanitize = execlists_sanitize;
5215 engine->release = execlists_release;
5216
5217 return 0;
5218 }
5219
5220 static void init_common_reg_state(u32 * const regs,
5221 const struct intel_engine_cs *engine,
5222 const struct intel_ring *ring,
5223 bool inhibit)
5224 {
5225 u32 ctl;
5226
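	/*
	 * RING_CONTEXT_CONTROL takes masked writes: the upper 16 bits select
	 * which bits to update, the lower 16 bits carry the values; roughly,
	 * _MASKED_BIT_ENABLE(x) == (x << 16 | x) and
	 * _MASKED_BIT_DISABLE(x) == (x << 16) (see the helpers in i915_reg.h).
	 */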
5227 ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5228 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5229 if (inhibit)
5230 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5231 if (INTEL_GEN(engine->i915) < 11)
5232 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5233 CTX_CTRL_RS_CTX_ENABLE);
5234 regs[CTX_CONTEXT_CONTROL] = ctl;
5235
5236 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5237 regs[CTX_TIMESTAMP] = 0;
5238 }
5239
5240 static void init_wa_bb_reg_state(u32 * const regs,
5241 const struct intel_engine_cs *engine)
5242 {
5243 const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5244
5245 if (wa_ctx->per_ctx.size) {
5246 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5247
5248 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5249 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5250 (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5251 }
5252
5253 if (wa_ctx->indirect_ctx.size) {
5254 lrc_ring_setup_indirect_ctx(regs, engine,
5255 i915_ggtt_offset(wa_ctx->vma) +
5256 wa_ctx->indirect_ctx.offset,
5257 wa_ctx->indirect_ctx.size);
5258 }
5259 }
5260
5261 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5262 {
5263 if (i915_vm_is_4lvl(&ppgtt->vm)) {
5264 /* 64b PPGTT (48bit canonical)
5265 * PDP0_DESCRIPTOR contains the base address to PML4 and
5266 * other PDP Descriptors are ignored.
5267 */
5268 ASSIGN_CTX_PML4(ppgtt, regs);
5269 } else {
5270 ASSIGN_CTX_PDP(ppgtt, regs, 3);
5271 ASSIGN_CTX_PDP(ppgtt, regs, 2);
5272 ASSIGN_CTX_PDP(ppgtt, regs, 1);
5273 ASSIGN_CTX_PDP(ppgtt, regs, 0);
5274 }
5275 }
5276
5277 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5278 {
5279 if (i915_is_ggtt(vm))
5280 return i915_vm_to_ggtt(vm)->alias;
5281 else
5282 return i915_vm_to_ppgtt(vm);
5283 }
5284
5285 static void execlists_init_reg_state(u32 *regs,
5286 const struct intel_context *ce,
5287 const struct intel_engine_cs *engine,
5288 const struct intel_ring *ring,
5289 bool inhibit)
5290 {
5291 /*
5292 * A context is actually a big batch buffer with several
5293 * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5294 * values we are setting here are only for the first context restore:
5295 * on a subsequent save, the GPU will recreate this batchbuffer with new
5296 * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5297 * we are not initializing here).
5298 *
5299 * Must keep consistent with virtual_update_register_offsets().
5300 */
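	/*
	 * Illustrative sketch of the state image built below (the exact
	 * register list and offsets come from reg_offsets()/set_offsets()):
	 *
	 *	MI_LOAD_REGISTER_IMM(N)
	 *	  { RING_CONTEXT_CONTROL,	<CTX_CONTEXT_CONTROL> }
	 *	  { RING_HEAD,			<CTX_RING_HEAD> }
	 *	  { RING_TAIL,			<CTX_RING_TAIL> }
	 *	  ...
	 */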
5301 set_offsets(regs, reg_offsets(engine), engine, inhibit);
5302
5303 init_common_reg_state(regs, engine, ring, inhibit);
5304 init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5305
5306 init_wa_bb_reg_state(regs, engine);
5307
5308 __reset_stop_ring(regs, engine);
5309 }
5310
5311 static int
5312 populate_lr_context(struct intel_context *ce,
5313 struct drm_i915_gem_object *ctx_obj,
5314 struct intel_engine_cs *engine,
5315 struct intel_ring *ring)
5316 {
5317 bool inhibit = true;
5318 void *vaddr;
5319
5320 vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5321 if (IS_ERR(vaddr)) {
5322 drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5323 return PTR_ERR(vaddr);
5324 }
5325
5326 set_redzone(vaddr, engine);
5327
5328 if (engine->default_state) {
5329 shmem_read(engine->default_state, 0,
5330 vaddr, engine->context_size);
5331 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
5332 inhibit = false;
5333 }
5334
5335 /* Clear the ppHWSP (inc. per-context counters) */
5336 memset(vaddr, 0, PAGE_SIZE);
5337
5338 /*
5339 * The second page of the context object contains some registers which
5340 * must be set up prior to the first execution.
5341 */
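	/*
	 * Rough layout of the context object (a sketch): the first page is
	 * the ppHWSP cleared above, the register state starts at
	 * LRC_STATE_OFFSET, and __execlists_context_alloc() may add a wa_bb
	 * page (Gen12) and a redzone page (CONFIG_DRM_I915_DEBUG_GEM).
	 */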
5342 execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5343 ce, engine, ring, inhibit);
5344
5345 __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5346 i915_gem_object_unpin_map(ctx_obj);
5347 return 0;
5348 }
5349
5350 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5351 {
5352 struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5353
5354 return intel_timeline_create_from_engine(ce->engine,
5355 page_unmask_bits(tl));
5356 }
5357
5358 static int __execlists_context_alloc(struct intel_context *ce,
5359 struct intel_engine_cs *engine)
5360 {
5361 struct drm_i915_gem_object *ctx_obj;
5362 struct intel_ring *ring;
5363 struct i915_vma *vma;
5364 u32 context_size;
5365 int ret;
5366
5367 GEM_BUG_ON(ce->state);
5368 context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5369
5370 if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5371 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5372
5373 if (INTEL_GEN(engine->i915) == 12) {
5374 ce->wa_bb_page = context_size / PAGE_SIZE;
5375 context_size += PAGE_SIZE;
5376 }
5377
5378 ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5379 if (IS_ERR(ctx_obj))
5380 return PTR_ERR(ctx_obj);
5381
5382 vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5383 if (IS_ERR(vma)) {
5384 ret = PTR_ERR(vma);
5385 goto error_deref_obj;
5386 }
5387
5388 if (!page_mask_bits(ce->timeline)) {
5389 struct intel_timeline *tl;
5390
5391 /*
5392 * Use the static global HWSP for the kernel context, and
5393 * a dynamically allocated cacheline for everyone else.
5394 */
5395 if (unlikely(ce->timeline))
5396 tl = pinned_timeline(ce);
5397 else
5398 tl = intel_timeline_create(engine->gt);
5399 if (IS_ERR(tl)) {
5400 ret = PTR_ERR(tl);
5401 goto error_deref_obj;
5402 }
5403
5404 ce->timeline = tl;
5405 }
5406
5407 ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5408 if (IS_ERR(ring)) {
5409 ret = PTR_ERR(ring);
5410 goto error_deref_obj;
5411 }
5412
5413 ret = populate_lr_context(ce, ctx_obj, engine, ring);
5414 if (ret) {
5415 drm_dbg(&engine->i915->drm,
5416 "Failed to populate LRC: %d\n", ret);
5417 goto error_ring_free;
5418 }
5419
5420 ce->ring = ring;
5421 ce->state = vma;
5422
5423 return 0;
5424
5425 error_ring_free:
5426 intel_ring_put(ring);
5427 error_deref_obj:
5428 i915_gem_object_put(ctx_obj);
5429 return ret;
5430 }
5431
5432 static struct list_head *virtual_queue(struct virtual_engine *ve)
5433 {
5434 return &ve->base.execlists.default_priolist.requests[0];
5435 }
5436
5437 static void rcu_virtual_context_destroy(struct work_struct *wrk)
5438 {
5439 struct virtual_engine *ve =
5440 container_of(wrk, typeof(*ve), rcu.work);
5441 unsigned int n;
5442
5443 GEM_BUG_ON(ve->context.inflight);
5444
5445 /* Preempt-to-busy may leave a stale request behind. */
5446 if (unlikely(ve->request)) {
5447 struct i915_request *old;
5448
5449 spin_lock_irq(&ve->base.active.lock);
5450
5451 old = fetch_and_zero(&ve->request);
5452 if (old) {
5453 GEM_BUG_ON(!i915_request_completed(old));
5454 __i915_request_submit(old);
5455 i915_request_put(old);
5456 }
5457
5458 spin_unlock_irq(&ve->base.active.lock);
5459 }
5460
5461 /*
5462 * Flush the tasklet in case it is still running on another core.
5463 *
5464 * This needs to be done before we remove ourselves from the siblings'
5465 * rbtrees as, if it is running in parallel, it may reinsert
5466 * the rb_node into a sibling.
5467 */
5468 tasklet_kill(&ve->base.execlists.tasklet);
5469
5470 /* Decouple ourselves from the siblings, no more access allowed. */
5471 for (n = 0; n < ve->num_siblings; n++) {
5472 struct intel_engine_cs *sibling = ve->siblings[n];
5473 struct rb_node *node = &ve->nodes[sibling->id].rb;
5474
5475 if (RB_EMPTY_NODE(node))
5476 continue;
5477
5478 spin_lock_irq(&sibling->active.lock);
5479
5480 /* Detachment is lazily performed in the execlists tasklet */
5481 if (!RB_EMPTY_NODE(node))
5482 rb_erase_cached(node, &sibling->execlists.virtual);
5483
5484 spin_unlock_irq(&sibling->active.lock);
5485 }
5486 GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5487 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5488
5489 if (ve->context.state)
5490 __execlists_context_fini(&ve->context);
5491 intel_context_fini(&ve->context);
5492
5493 intel_breadcrumbs_free(ve->base.breadcrumbs);
5494 intel_engine_free_request_pool(&ve->base);
5495
5496 kfree(ve->bonds);
5497 kfree(ve);
5498 }
5499
5500 static void virtual_context_destroy(struct kref *kref)
5501 {
5502 struct virtual_engine *ve =
5503 container_of(kref, typeof(*ve), context.ref);
5504
5505 GEM_BUG_ON(!list_empty(&ve->context.signals));
5506
5507 /*
5508 * When destroying the virtual engine, we have to be aware that
5509 * it may still be in use from a hardirq/softirq context causing
5510 * the resubmission of a completed request (background completion
5511 * due to preempt-to-busy). Before we can free the engine, we need
5512 * to flush the submission code and tasklets that are still potentially
5513 * accessing the engine. Flushing the tasklets requires process context,
5514 * and since we can guard the resubmit onto the engine with an RCU read
5515 * lock, we can delegate the free of the engine to an RCU worker.
5516 */
5517 INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
5518 queue_rcu_work(system_wq, &ve->rcu);
5519 }
5520
5521 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5522 {
5523 int swp;
5524
5525 /*
5526 * Pick a random sibling on starting to help spread the load around.
5527 *
5528 * New contexts are typically created with exactly the same order
5529 * of siblings, and often started in batches. Due to the way we iterate
5530 * the array of siblings when submitting requests, sibling[0] is
5531 * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5532 * randomised across the system, we also help spread the load by the
5533 * first engine we inspect being different each time.
5534 *
5535 * NB This does not force us to execute on this engine, it will just
5536 * typically be the first we inspect for submission.
5537 */
5538 swp = prandom_u32_max(ve->num_siblings);
5539 if (swp)
5540 swap(ve->siblings[swp], ve->siblings[0]);
5541 }
5542
5543 static int virtual_context_alloc(struct intel_context *ce)
5544 {
5545 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5546
5547 return __execlists_context_alloc(ce, ve->siblings[0]);
5548 }
5549
5550 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5551 {
5552 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5553
5554 /* Note: we must use a real engine class for setting up reg state */
5555 return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5556 }
5557
5558 static void virtual_context_enter(struct intel_context *ce)
5559 {
5560 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5561 unsigned int n;
5562
5563 for (n = 0; n < ve->num_siblings; n++)
5564 intel_engine_pm_get(ve->siblings[n]);
5565
5566 intel_timeline_enter(ce->timeline);
5567 }
5568
5569 static void virtual_context_exit(struct intel_context *ce)
5570 {
5571 struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5572 unsigned int n;
5573
5574 intel_timeline_exit(ce->timeline);
5575
5576 for (n = 0; n < ve->num_siblings; n++)
5577 intel_engine_pm_put(ve->siblings[n]);
5578 }
5579
5580 static const struct intel_context_ops virtual_context_ops = {
5581 .alloc = virtual_context_alloc,
5582
5583 .pre_pin = execlists_context_pre_pin,
5584 .pin = virtual_context_pin,
5585 .unpin = execlists_context_unpin,
5586 .post_unpin = execlists_context_post_unpin,
5587
5588 .enter = virtual_context_enter,
5589 .exit = virtual_context_exit,
5590
5591 .destroy = virtual_context_destroy,
5592 };
5593
5594 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5595 {
5596 struct i915_request *rq;
5597 intel_engine_mask_t mask;
5598
5599 rq = READ_ONCE(ve->request);
5600 if (!rq)
5601 return 0;
5602
5603 /* The rq is ready for submission; rq->execution_mask is now stable. */
5604 mask = rq->execution_mask;
5605 if (unlikely(!mask)) {
5606 /* Invalid selection, submit to a random engine in error */
5607 i915_request_set_error_once(rq, -ENODEV);
5608 mask = ve->siblings[0]->mask;
5609 }
5610
5611 ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5612 rq->fence.context, rq->fence.seqno,
5613 mask, ve->base.execlists.queue_priority_hint);
5614
5615 return mask;
5616 }
5617
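/*
 * Propagate the single queued virtual request into the per-sibling rbtrees:
 * for each physical engine still allowed by the request's execution_mask we
 * (re)insert our ve_node keyed by priority and kick that engine's tasklet if
 * we became its highest-priority virtual candidate.
 */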
5618 static void virtual_submission_tasklet(unsigned long data)
5619 {
5620 struct virtual_engine * const ve = (struct virtual_engine *)data;
5621 const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5622 intel_engine_mask_t mask;
5623 unsigned int n;
5624
5625 rcu_read_lock();
5626 mask = virtual_submission_mask(ve);
5627 rcu_read_unlock();
5628 if (unlikely(!mask))
5629 return;
5630
5631 local_irq_disable();
5632 for (n = 0; n < ve->num_siblings; n++) {
5633 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5634 struct ve_node * const node = &ve->nodes[sibling->id];
5635 struct rb_node **parent, *rb;
5636 bool first;
5637
5638 if (!READ_ONCE(ve->request))
5639 break; /* already handled by a sibling's tasklet */
5640
5641 if (unlikely(!(mask & sibling->mask))) {
5642 if (!RB_EMPTY_NODE(&node->rb)) {
5643 spin_lock(&sibling->active.lock);
5644 rb_erase_cached(&node->rb,
5645 &sibling->execlists.virtual);
5646 RB_CLEAR_NODE(&node->rb);
5647 spin_unlock(&sibling->active.lock);
5648 }
5649 continue;
5650 }
5651
5652 spin_lock(&sibling->active.lock);
5653
5654 if (!RB_EMPTY_NODE(&node->rb)) {
5655 /*
5656 * Cheat and avoid rebalancing the tree if we can
5657 * reuse this node in situ.
5658 */
5659 first = rb_first_cached(&sibling->execlists.virtual) ==
5660 &node->rb;
5661 if (prio == node->prio || (prio > node->prio && first))
5662 goto submit_engine;
5663
5664 rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5665 }
5666
5667 rb = NULL;
5668 first = true;
5669 parent = &sibling->execlists.virtual.rb_root.rb_node;
5670 while (*parent) {
5671 struct ve_node *other;
5672
5673 rb = *parent;
5674 other = rb_entry(rb, typeof(*other), rb);
5675 if (prio > other->prio) {
5676 parent = &rb->rb_left;
5677 } else {
5678 parent = &rb->rb_right;
5679 first = false;
5680 }
5681 }
5682
5683 rb_link_node(&node->rb, rb, parent);
5684 rb_insert_color_cached(&node->rb,
5685 &sibling->execlists.virtual,
5686 first);
5687
5688 submit_engine:
5689 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5690 node->prio = prio;
5691 if (first && prio > sibling->execlists.queue_priority_hint)
5692 tasklet_hi_schedule(&sibling->execlists.tasklet);
5693
5694 spin_unlock(&sibling->active.lock);
5695 }
5696 local_irq_enable();
5697 }
5698
5699 static void virtual_submit_request(struct i915_request *rq)
5700 {
5701 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5702 struct i915_request *old;
5703 unsigned long flags;
5704
5705 ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5706 rq->fence.context,
5707 rq->fence.seqno);
5708
5709 GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5710
5711 spin_lock_irqsave(&ve->base.active.lock, flags);
5712
5713 old = ve->request;
5714 if (old) { /* background completion event from preempt-to-busy */
5715 GEM_BUG_ON(!i915_request_completed(old));
5716 __i915_request_submit(old);
5717 i915_request_put(old);
5718 }
5719
5720 if (i915_request_completed(rq)) {
5721 __i915_request_submit(rq);
5722
5723 ve->base.execlists.queue_priority_hint = INT_MIN;
5724 ve->request = NULL;
5725 } else {
5726 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5727 ve->request = i915_request_get(rq);
5728
5729 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5730 list_move_tail(&rq->sched.link, virtual_queue(ve));
5731
5732 tasklet_hi_schedule(&ve->base.execlists.tasklet);
5733 }
5734
5735 spin_unlock_irqrestore(&ve->base.active.lock, flags);
5736 }
5737
5738 static struct ve_bond *
5739 virtual_find_bond(struct virtual_engine *ve,
5740 const struct intel_engine_cs *master)
5741 {
5742 int i;
5743
5744 for (i = 0; i < ve->num_bonds; i++) {
5745 if (ve->bonds[i].master == master)
5746 return &ve->bonds[i];
5747 }
5748
5749 return NULL;
5750 }
5751
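/*
 * Worked example (hypothetical engines): with a bond { master = vcs0,
 * sibling_mask = BIT(vcs1) }, a bonded request signalled by a request on
 * vcs0 has its execution_mask narrowed to at most vcs1, while the signalling
 * request can no longer be moved onto those bonded engines itself.
 */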
5752 static void
5753 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5754 {
5755 struct virtual_engine *ve = to_virtual_engine(rq->engine);
5756 intel_engine_mask_t allowed, exec;
5757 struct ve_bond *bond;
5758
5759 allowed = ~to_request(signal)->engine->mask;
5760
5761 bond = virtual_find_bond(ve, to_request(signal)->engine);
5762 if (bond)
5763 allowed &= bond->sibling_mask;
5764
5765 /* Restrict the bonded request to run on only the available engines */
5766 exec = READ_ONCE(rq->execution_mask);
5767 while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5768 ;
5769
5770 /* Prevent the master from being re-run on the bonded engines */
5771 to_request(signal)->execution_mask &= ~allowed;
5772 }
5773
5774 struct intel_context *
5775 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5776 unsigned int count)
5777 {
5778 struct virtual_engine *ve;
5779 unsigned int n;
5780 int err;
5781
5782 if (count == 0)
5783 return ERR_PTR(-EINVAL);
5784
5785 if (count == 1)
5786 return intel_context_create(siblings[0]);
5787
5788 ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5789 if (!ve)
5790 return ERR_PTR(-ENOMEM);
5791
5792 ve->base.i915 = siblings[0]->i915;
5793 ve->base.gt = siblings[0]->gt;
5794 ve->base.uncore = siblings[0]->uncore;
5795 ve->base.id = -1;
5796
5797 ve->base.class = OTHER_CLASS;
5798 ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5799 ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5800 ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5801
5802 /*
5803 * The decision on whether to submit a request using semaphores
5804 * depends on the saturated state of the engine. We only compute
5805 * this during HW submission of the request, and we need this
5806 * state to be globally applied to all requests being submitted
5807 * to this engine. Virtual engines encompass more than one physical
5808 * engine and so we cannot accurately tell in advance if one of those
5809 * engines is already saturated and so cannot afford to use a semaphore
5810 * and be pessimized in priority for doing so -- if we are the only
5811 * context using semaphores after all other clients have stopped, we
5812 * will be starved on the saturated system. Such a global switch for
5813 * semaphores is less than ideal, but alas is the current compromise.
5814 */
5815 ve->base.saturated = ALL_ENGINES;
5816
5817 snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5818
5819 intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5820 intel_engine_init_execlists(&ve->base);
5821
5822 ve->base.cops = &virtual_context_ops;
5823 ve->base.request_alloc = execlists_request_alloc;
5824
5825 ve->base.schedule = i915_schedule;
5826 ve->base.submit_request = virtual_submit_request;
5827 ve->base.bond_execute = virtual_bond_execute;
5828
5829 INIT_LIST_HEAD(virtual_queue(ve));
5830 ve->base.execlists.queue_priority_hint = INT_MIN;
5831 tasklet_init(&ve->base.execlists.tasklet,
5832 virtual_submission_tasklet,
5833 (unsigned long)ve);
5834
5835 intel_context_init(&ve->context, &ve->base);
5836
5837 ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5838 if (!ve->base.breadcrumbs) {
5839 err = -ENOMEM;
5840 goto err_put;
5841 }
5842
5843 for (n = 0; n < count; n++) {
5844 struct intel_engine_cs *sibling = siblings[n];
5845
5846 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5847 if (sibling->mask & ve->base.mask) {
5848 DRM_DEBUG("duplicate %s entry in load balancer\n",
5849 sibling->name);
5850 err = -EINVAL;
5851 goto err_put;
5852 }
5853
5854 /*
5855 * The virtual engine implementation is tightly coupled to
5856 * the execlists backend -- we push out requests directly
5857 * into a tree inside each physical engine. We could support
5858 * layering if we handle cloning of the requests and
5859 * submitting a copy into each backend.
5860 */
5861 if (sibling->execlists.tasklet.func !=
5862 execlists_submission_tasklet) {
5863 err = -ENODEV;
5864 goto err_put;
5865 }
5866
5867 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5868 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5869
5870 ve->siblings[ve->num_siblings++] = sibling;
5871 ve->base.mask |= sibling->mask;
5872
5873 /*
5874 * All physical engines must be compatible for their emission
5875 * functions (as we build the instructions during request
5876 * construction and do not alter them before submission
5877 * on the physical engine). We use the engine class as a guide
5878 * here, although that could be refined.
5879 */
5880 if (ve->base.class != OTHER_CLASS) {
5881 if (ve->base.class != sibling->class) {
5882 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5883 sibling->class, ve->base.class);
5884 err = -EINVAL;
5885 goto err_put;
5886 }
5887 continue;
5888 }
5889
5890 ve->base.class = sibling->class;
5891 ve->base.uabi_class = sibling->uabi_class;
5892 snprintf(ve->base.name, sizeof(ve->base.name),
5893 "v%dx%d", ve->base.class, count);
5894 ve->base.context_size = sibling->context_size;
5895
5896 ve->base.emit_bb_start = sibling->emit_bb_start;
5897 ve->base.emit_flush = sibling->emit_flush;
5898 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5899 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5900 ve->base.emit_fini_breadcrumb_dw =
5901 sibling->emit_fini_breadcrumb_dw;
5902
5903 ve->base.flags = sibling->flags;
5904 }
5905
5906 ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5907
5908 virtual_engine_initial_hint(ve);
5909 return &ve->context;
5910
5911 err_put:
5912 intel_context_put(&ve->context);
5913 return ERR_PTR(err);
5914 }
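/*
 * Minimal usage sketch (hypothetical caller): build a load-balancing context
 * over two video engines; the returned intel_context behaves like a regular
 * one but may execute on either sibling.
 *
 *	struct intel_engine_cs *siblings[] = { vcs0, vcs1 };
 *	struct intel_context *ce;
 *
 *	ce = intel_execlists_create_virtual(siblings, ARRAY_SIZE(siblings));
 *	if (IS_ERR(ce))
 *		return PTR_ERR(ce);
 */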
5915
5916 struct intel_context *
5917 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5918 {
5919 struct virtual_engine *se = to_virtual_engine(src);
5920 struct intel_context *dst;
5921
5922 dst = intel_execlists_create_virtual(se->siblings,
5923 se->num_siblings);
5924 if (IS_ERR(dst))
5925 return dst;
5926
5927 if (se->num_bonds) {
5928 struct virtual_engine *de = to_virtual_engine(dst->engine);
5929
5930 de->bonds = kmemdup(se->bonds,
5931 sizeof(*se->bonds) * se->num_bonds,
5932 GFP_KERNEL);
5933 if (!de->bonds) {
5934 intel_context_put(dst);
5935 return ERR_PTR(-ENOMEM);
5936 }
5937
5938 de->num_bonds = se->num_bonds;
5939 }
5940
5941 return dst;
5942 }
5943
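/*
 * A bond records, per master engine, the subset of siblings a bonded request
 * may run on once its master has been submitted; virtual_bond_execute() above
 * applies that restriction. Attaching two different siblings for the same
 * master simply ORs their masks into one bond.
 */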
5944 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5945 const struct intel_engine_cs *master,
5946 const struct intel_engine_cs *sibling)
5947 {
5948 struct virtual_engine *ve = to_virtual_engine(engine);
5949 struct ve_bond *bond;
5950 int n;
5951
5952 /* Sanity check the sibling is part of the virtual engine */
5953 for (n = 0; n < ve->num_siblings; n++)
5954 if (sibling == ve->siblings[n])
5955 break;
5956 if (n == ve->num_siblings)
5957 return -EINVAL;
5958
5959 bond = virtual_find_bond(ve, master);
5960 if (bond) {
5961 bond->sibling_mask |= sibling->mask;
5962 return 0;
5963 }
5964
5965 bond = krealloc(ve->bonds,
5966 sizeof(*bond) * (ve->num_bonds + 1),
5967 GFP_KERNEL);
5968 if (!bond)
5969 return -ENOMEM;
5970
5971 bond[ve->num_bonds].master = master;
5972 bond[ve->num_bonds].sibling_mask = sibling->mask;
5973
5974 ve->bonds = bond;
5975 ve->num_bonds++;
5976
5977 return 0;
5978 }
5979
5980 struct intel_engine_cs *
5981 intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5982 unsigned int sibling)
5983 {
5984 struct virtual_engine *ve = to_virtual_engine(engine);
5985
5986 if (sibling >= ve->num_siblings)
5987 return NULL;
5988
5989 return ve->siblings[sibling];
5990 }
5991
5992 void intel_execlists_show_requests(struct intel_engine_cs *engine,
5993 struct drm_printer *m,
5994 void (*show_request)(struct drm_printer *m,
5995 struct i915_request *rq,
5996 const char *prefix),
5997 unsigned int max)
5998 {
5999 const struct intel_engine_execlists *execlists = &engine->execlists;
6000 struct i915_request *rq, *last;
6001 unsigned long flags;
6002 unsigned int count;
6003 struct rb_node *rb;
6004
6005 spin_lock_irqsave(&engine->active.lock, flags);
6006
6007 last = NULL;
6008 count = 0;
6009 list_for_each_entry(rq, &engine->active.requests, sched.link) {
6010 if (count++ < max - 1)
6011 show_request(m, rq, "\t\tE ");
6012 else
6013 last = rq;
6014 }
6015 if (last) {
6016 if (count > max) {
6017 drm_printf(m,
6018 "\t\t...skipping %d executing requests...\n",
6019 count - max);
6020 }
6021 show_request(m, last, "\t\tE ");
6022 }
6023
6024 if (execlists->switch_priority_hint != INT_MIN)
6025 drm_printf(m, "\t\tSwitch priority hint: %d\n",
6026 READ_ONCE(execlists->switch_priority_hint));
6027 if (execlists->queue_priority_hint != INT_MIN)
6028 drm_printf(m, "\t\tQueue priority hint: %d\n",
6029 READ_ONCE(execlists->queue_priority_hint));
6030
6031 last = NULL;
6032 count = 0;
6033 for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
6034 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
6035 int i;
6036
6037 priolist_for_each_request(rq, p, i) {
6038 if (count++ < max - 1)
6039 show_request(m, rq, "\t\tQ ");
6040 else
6041 last = rq;
6042 }
6043 }
6044 if (last) {
6045 if (count > max) {
6046 drm_printf(m,
6047 "\t\t...skipping %d queued requests...\n",
6048 count - max);
6049 }
6050 show_request(m, last, "\t\tQ ");
6051 }
6052
6053 last = NULL;
6054 count = 0;
6055 for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6056 struct virtual_engine *ve =
6057 rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6058 struct i915_request *rq = READ_ONCE(ve->request);
6059
6060 if (rq) {
6061 if (count++ < max - 1)
6062 show_request(m, rq, "\t\tV ");
6063 else
6064 last = rq;
6065 }
6066 }
6067 if (last) {
6068 if (count > max) {
6069 drm_printf(m,
6070 "\t\t...skipping %d virtual requests...\n",
6071 count - max);
6072 }
6073 show_request(m, last, "\t\tV ");
6074 }
6075
6076 spin_unlock_irqrestore(&engine->active.lock, flags);
6077 }
6078
6079 void intel_lr_context_reset(struct intel_engine_cs *engine,
6080 struct intel_context *ce,
6081 u32 head,
6082 bool scrub)
6083 {
6084 GEM_BUG_ON(!intel_context_is_pinned(ce));
6085
6086 /*
6087 * We want a simple context + ring to execute the breadcrumb update.
6088 * We cannot rely on the context being intact across the GPU hang,
6089 * so clear it and rebuild just what we need for the breadcrumb.
6090 * All pending requests for this context will be zapped, and any
6091 * future request will be after userspace has had the opportunity
6092 * to recreate its own state.
6093 */
6094 if (scrub)
6095 restore_default_state(ce, engine);
6096
6097 /* Rerun the request; its payload has been neutered (if guilty). */
6098 __execlists_update_reg_state(ce, engine, head);
6099 }
6100
6101 bool
6102 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6103 {
6104 return engine->set_default_submission ==
6105 intel_execlists_set_default_submission;
6106 }
6107
6108 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6109 #include "selftest_lrc.c"
6110 #endif
6111