1 // SPDX-License-Identifier: MIT
2 /*
3 * Copyright © 2008-2018 Intel Corporation
4 */
5
6 #include <linux/sched/mm.h>
7 #include <linux/stop_machine.h>
8
9 #include "display/intel_display_types.h"
10 #include "display/intel_overlay.h"
11
12 #include "gem/i915_gem_context.h"
13
14 #include "i915_drv.h"
15 #include "i915_gpu_error.h"
16 #include "i915_irq.h"
17 #include "intel_breadcrumbs.h"
18 #include "intel_engine_pm.h"
19 #include "intel_gt.h"
20 #include "intel_gt_pm.h"
21 #include "intel_gt_requests.h"
22 #include "intel_reset.h"
23
24 #include "uc/intel_guc.h"
25
26 #define RESET_MAX_RETRIES 3
27
28 /* XXX How to handle concurrent GGTT updates using tiling registers? */
29 #define RESET_UNDER_STOP_MACHINE 0
30
31 static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set)
32 {
33 intel_uncore_rmw_fw(uncore, reg, 0, set);
34 }
35
36 static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr)
37 {
38 intel_uncore_rmw_fw(uncore, reg, clr, 0);
39 }
40
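/*
 * Propagate blame for a hang to the client (file) that owned the guilty
 * context: a banned context, or hangs in rapid succession, add to the
 * per-client ban score.
 */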
41 static void client_mark_guilty(struct i915_gem_context *ctx, bool banned)
42 {
43 struct drm_i915_file_private *file_priv = ctx->file_priv;
44 unsigned long prev_hang;
45 unsigned int score;
46
47 if (IS_ERR_OR_NULL(file_priv))
48 return;
49
50 score = 0;
51 if (banned)
52 score = I915_CLIENT_SCORE_CONTEXT_BAN;
53
54 prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
55 if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
56 score += I915_CLIENT_SCORE_HANG_FAST;
57
58 if (score) {
59 atomic_add(score, &file_priv->ban_score);
60
61 drm_dbg(&ctx->i915->drm,
62 "client %s: gained %u ban score, now %u\n",
63 ctx->name, score,
64 atomic_read(&file_priv->ban_score));
65 }
66 }
67
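/*
 * Blame the GEM context behind the hung request: bump its guilty count,
 * record the hang timestamp, and decide whether the context should be
 * banned (non-recoverable contexts, or repeated hangs in quick succession).
 * Returns true if the context is to be banned.
 */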
68 static bool mark_guilty(struct i915_request *rq)
69 {
70 struct i915_gem_context *ctx;
71 unsigned long prev_hang;
72 bool banned;
73 int i;
74
75 if (intel_context_is_closed(rq->context))
76 return true;
77
78 rcu_read_lock();
79 ctx = rcu_dereference(rq->context->gem_context);
80 if (ctx && !kref_get_unless_zero(&ctx->ref))
81 ctx = NULL;
82 rcu_read_unlock();
83 if (!ctx)
84 return intel_context_is_banned(rq->context);
85
86 atomic_inc(&ctx->guilty_count);
87
88 /* Cool contexts are too cool to be banned! (Used for reset testing.) */
89 if (!i915_gem_context_is_bannable(ctx)) {
90 banned = false;
91 goto out;
92 }
93
94 drm_notice(&ctx->i915->drm,
95 "%s context reset due to GPU hang\n",
96 ctx->name);
97
98 /* Record the timestamp for the last N hangs */
99 prev_hang = ctx->hang_timestamp[0];
100 for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++)
101 ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1];
102 ctx->hang_timestamp[i] = jiffies;
103
104 /* If we have hung N+1 times in rapid succession, we ban the context! */
105 banned = !i915_gem_context_is_recoverable(ctx);
106 if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES))
107 banned = true;
108 if (banned)
109 drm_dbg(&ctx->i915->drm, "context %s: guilty %d, banned\n",
110 ctx->name, atomic_read(&ctx->guilty_count));
111
112 client_mark_guilty(ctx, banned);
113
114 out:
115 i915_gem_context_put(ctx);
116 return banned;
117 }
118
119 static void mark_innocent(struct i915_request *rq)
120 {
121 struct i915_gem_context *ctx;
122
123 rcu_read_lock();
124 ctx = rcu_dereference(rq->context->gem_context);
125 if (ctx)
126 atomic_inc(&ctx->active_count);
127 rcu_read_unlock();
128 }
129
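/*
 * Update a request after its engine has been reset: a guilty request is
 * skipped with its fence error set to -EIO (and its context potentially
 * banned), while an innocent request is marked with -EAGAIN and its
 * context's active_count is incremented.
 */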
130 void __i915_request_reset(struct i915_request *rq, bool guilty)
131 {
132 bool banned = false;
133
134 RQ_TRACE(rq, "guilty? %s\n", yesno(guilty));
135 GEM_BUG_ON(__i915_request_is_complete(rq));
136
137 rcu_read_lock(); /* protect the GEM context */
138 if (guilty) {
139 i915_request_set_error_once(rq, -EIO);
140 __i915_request_skip(rq);
141 banned = mark_guilty(rq);
142 } else {
143 i915_request_set_error_once(rq, -EAGAIN);
144 mark_innocent(rq);
145 }
146 rcu_read_unlock();
147
148 if (banned)
149 intel_context_ban(rq->context, rq);
150 }
151
152 static bool i915_in_reset(struct pci_dev *pdev)
153 {
154 u8 gdrst;
155
156 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
157 return gdrst & GRDOM_RESET_STATUS;
158 }
159
160 static int i915_do_reset(struct intel_gt *gt,
161 intel_engine_mask_t engine_mask,
162 unsigned int retry)
163 {
164 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
165 int err;
166
167 /* Assert reset for at least 20 usec, and wait for acknowledgement. */
168 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
169 udelay(50);
170 err = wait_for_atomic(i915_in_reset(pdev), 50);
171
172 /* Clear the reset request. */
173 pci_write_config_byte(pdev, I915_GDRST, 0);
174 udelay(50);
175 if (!err)
176 err = wait_for_atomic(!i915_in_reset(pdev), 50);
177
178 return err;
179 }
180
181 static bool g4x_reset_complete(struct pci_dev *pdev)
182 {
183 u8 gdrst;
184
185 pci_read_config_byte(pdev, I915_GDRST, &gdrst);
186 return (gdrst & GRDOM_RESET_ENABLE) == 0;
187 }
188
189 static int g33_do_reset(struct intel_gt *gt,
190 intel_engine_mask_t engine_mask,
191 unsigned int retry)
192 {
193 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
194
195 pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
196 return wait_for_atomic(g4x_reset_complete(pdev), 50);
197 }
198
199 static int g4x_do_reset(struct intel_gt *gt,
200 intel_engine_mask_t engine_mask,
201 unsigned int retry)
202 {
203 struct pci_dev *pdev = to_pci_dev(gt->i915->drm.dev);
204 struct intel_uncore *uncore = gt->uncore;
205 int ret;
206
207 /* WaVcpClkGateDisableForMediaReset:ctg,elk */
208 rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
209 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
210
211 pci_write_config_byte(pdev, I915_GDRST,
212 GRDOM_MEDIA | GRDOM_RESET_ENABLE);
213 ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
214 if (ret) {
215 GT_TRACE(gt, "Wait for media reset failed\n");
216 goto out;
217 }
218
219 pci_write_config_byte(pdev, I915_GDRST,
220 GRDOM_RENDER | GRDOM_RESET_ENABLE);
221 ret = wait_for_atomic(g4x_reset_complete(pdev), 50);
222 if (ret) {
223 GT_TRACE(gt, "Wait for render reset failed\n");
224 goto out;
225 }
226
227 out:
228 pci_write_config_byte(pdev, I915_GDRST, 0);
229
230 rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE);
231 intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D);
232
233 return ret;
234 }
235
236 static int ilk_do_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask,
237 unsigned int retry)
238 {
239 struct intel_uncore *uncore = gt->uncore;
240 int ret;
241
242 intel_uncore_write_fw(uncore, ILK_GDSR,
243 ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
244 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
245 ILK_GRDOM_RESET_ENABLE, 0,
246 5000, 0,
247 NULL);
248 if (ret) {
249 GT_TRACE(gt, "Wait for render reset failed\n");
250 goto out;
251 }
252
253 intel_uncore_write_fw(uncore, ILK_GDSR,
254 ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
255 ret = __intel_wait_for_register_fw(uncore, ILK_GDSR,
256 ILK_GRDOM_RESET_ENABLE, 0,
257 5000, 0,
258 NULL);
259 if (ret) {
260 GT_TRACE(gt, "Wait for media reset failed\n");
261 goto out;
262 }
263
264 out:
265 intel_uncore_write_fw(uncore, ILK_GDSR, 0);
266 intel_uncore_posting_read_fw(uncore, ILK_GDSR);
267 return ret;
268 }
269
270 /* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
271 static int gen6_hw_domain_reset(struct intel_gt *gt, u32 hw_domain_mask)
272 {
273 struct intel_uncore *uncore = gt->uncore;
274 int loops = 2;
275 int err;
276
277 /*
278 * GEN6_GDRST is not in the gt power well, no need to check
279 * for fifo space for the write or forcewake the chip for
280 * the read
281 */
282 do {
283 intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask);
284
285 /*
286 * Wait for the device to ack the reset requests.
287 *
288 * On some platforms, e.g. Jasperlake, we see that the
289 * engine register state is not cleared until shortly after
290 * GDRST reports completion, causing a failure as we try
291 * to immediately resume while the internal state is still
292 * in flux. If we immediately repeat the reset, the second
293 * reset appears to serialise with the first, and since
294 * it is a no-op, the registers should retain their reset
295 * value. However, there is still a concern that upon
296 * leaving the second reset, the internal engine state
297 * is still in flux and not ready for resuming.
298 */
299 err = __intel_wait_for_register_fw(uncore, GEN6_GDRST,
300 hw_domain_mask, 0,
301 2000, 0,
302 NULL);
303 } while (err == 0 && --loops);
304 if (err)
305 GT_TRACE(gt,
306 "Wait for 0x%08x engines reset failed\n",
307 hw_domain_mask);
308
309 /*
310 * As we have observed that the engine state is still volatile
311 * after GDRST is acked, impose a small delay to let everything settle.
312 */
313 udelay(50);
314
315 return err;
316 }
317
318 static int __gen6_reset_engines(struct intel_gt *gt,
319 intel_engine_mask_t engine_mask,
320 unsigned int retry)
321 {
322 static const u32 hw_engine_mask[] = {
323 [RCS0] = GEN6_GRDOM_RENDER,
324 [BCS0] = GEN6_GRDOM_BLT,
325 [VCS0] = GEN6_GRDOM_MEDIA,
326 [VCS1] = GEN8_GRDOM_MEDIA2,
327 [VECS0] = GEN6_GRDOM_VECS,
328 };
329 struct intel_engine_cs *engine;
330 u32 hw_mask;
331
332 if (engine_mask == ALL_ENGINES) {
333 hw_mask = GEN6_GRDOM_FULL;
334 } else {
335 intel_engine_mask_t tmp;
336
337 hw_mask = 0;
338 for_each_engine_masked(engine, gt, engine_mask, tmp) {
339 GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
340 hw_mask |= hw_engine_mask[engine->id];
341 }
342 }
343
344 return gen6_hw_domain_reset(gt, hw_mask);
345 }
346
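/*
 * Serialise the GDRST request/ack sequence against other locked MMIO
 * accesses by holding the uncore spinlock (irqs off) across the reset.
 */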
347 static int gen6_reset_engines(struct intel_gt *gt,
348 intel_engine_mask_t engine_mask,
349 unsigned int retry)
350 {
351 unsigned long flags;
352 int ret;
353
354 spin_lock_irqsave(&gt->uncore->lock, flags);
355 ret = __gen6_reset_engines(gt, engine_mask, retry);
356 spin_unlock_irqrestore(&gt->uncore->lock, flags);
357
358 return ret;
359 }
360
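/*
 * The SFC units are shared: VCS instances 2n and 2n+1 pair with VECS
 * instance n. Return the VECS engine that shares an SFC with the given
 * video decode engine.
 */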
361 static struct intel_engine_cs *find_sfc_paired_vecs_engine(struct intel_engine_cs *engine)
362 {
363 int vecs_id;
364
365 GEM_BUG_ON(engine->class != VIDEO_DECODE_CLASS);
366
367 vecs_id = _VECS((engine->instance) / 2);
368
369 return engine->gt->engine[vecs_id];
370 }
371
372 struct sfc_lock_data {
373 i915_reg_t lock_reg;
374 i915_reg_t ack_reg;
375 i915_reg_t usage_reg;
376 u32 lock_bit;
377 u32 ack_bit;
378 u32 usage_bit;
379 u32 reset_bit;
380 };
381
382 static void get_sfc_forced_lock_data(struct intel_engine_cs *engine,
383 struct sfc_lock_data *sfc_lock)
384 {
385 switch (engine->class) {
386 default:
387 MISSING_CASE(engine->class);
388 fallthrough;
389 case VIDEO_DECODE_CLASS:
390 sfc_lock->lock_reg = GEN11_VCS_SFC_FORCED_LOCK(engine);
391 sfc_lock->lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
392
393 sfc_lock->ack_reg = GEN11_VCS_SFC_LOCK_STATUS(engine);
394 sfc_lock->ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT;
395
396 sfc_lock->usage_reg = GEN11_VCS_SFC_LOCK_STATUS(engine);
397 sfc_lock->usage_bit = GEN11_VCS_SFC_USAGE_BIT;
398 sfc_lock->reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
399
400 break;
401 case VIDEO_ENHANCEMENT_CLASS:
402 sfc_lock->lock_reg = GEN11_VECS_SFC_FORCED_LOCK(engine);
403 sfc_lock->lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
404
405 sfc_lock->ack_reg = GEN11_VECS_SFC_LOCK_ACK(engine);
406 sfc_lock->ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT;
407
408 sfc_lock->usage_reg = GEN11_VECS_SFC_USAGE(engine);
409 sfc_lock->usage_bit = GEN11_VECS_SFC_USAGE_BIT;
410 sfc_lock->reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
411
412 break;
413 }
414 }
415
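/*
 * Before resetting an engine that may be using a shared SFC, force a lock
 * on that SFC to determine whether it must be reset along with the engine.
 * On success the SFC reset bit is added to @reset_mask, and @unlock_mask
 * records which engine's forced lock must be released after the reset.
 */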
416 static int gen11_lock_sfc(struct intel_engine_cs *engine,
417 u32 *reset_mask,
418 u32 *unlock_mask)
419 {
420 struct intel_uncore *uncore = engine->uncore;
421 u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
422 struct sfc_lock_data sfc_lock;
423 bool lock_obtained, lock_to_other = false;
424 int ret;
425
426 switch (engine->class) {
427 case VIDEO_DECODE_CLASS:
428 if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
429 return 0;
430
431 fallthrough;
432 case VIDEO_ENHANCEMENT_CLASS:
433 get_sfc_forced_lock_data(engine, &sfc_lock);
434
435 break;
436 default:
437 return 0;
438 }
439
440 if (!(intel_uncore_read_fw(uncore, sfc_lock.usage_reg) & sfc_lock.usage_bit)) {
441 struct intel_engine_cs *paired_vecs;
442
443 if (engine->class != VIDEO_DECODE_CLASS ||
444 GRAPHICS_VER(engine->i915) != 12)
445 return 0;
446
447 /*
448 * Wa_14010733141
449 *
450 * If the VCS-MFX isn't using the SFC, we also need to check
451 * whether VCS-HCP is using it. If so, we need to issue a *VE*
452 * forced lock on the VE engine that shares the same SFC.
453 */
454 if (!(intel_uncore_read_fw(uncore,
455 GEN12_HCP_SFC_LOCK_STATUS(engine)) &
456 GEN12_HCP_SFC_USAGE_BIT))
457 return 0;
458
459 paired_vecs = find_sfc_paired_vecs_engine(engine);
460 get_sfc_forced_lock_data(paired_vecs, &sfc_lock);
461 lock_to_other = true;
462 *unlock_mask |= paired_vecs->mask;
463 } else {
464 *unlock_mask |= engine->mask;
465 }
466
467 /*
468 * If the engine is using an SFC, tell the engine that a software reset
469 * is going to happen. The engine will then try to force lock the SFC.
470 * If SFC ends up being locked to the engine we want to reset, we have
471 * to reset it as well (we will unlock it once the reset sequence is
472 * completed).
473 */
474 rmw_set_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
475
476 ret = __intel_wait_for_register_fw(uncore,
477 sfc_lock.ack_reg,
478 sfc_lock.ack_bit,
479 sfc_lock.ack_bit,
480 1000, 0, NULL);
481
482 /*
483 * Was the SFC released while we were trying to lock it?
484 *
485 * We should reset both the engine and the SFC if:
486 * - We were locking the SFC to this engine and the lock succeeded
487 * OR
488 * - We were locking the SFC to a different engine (Wa_14010733141)
489 * but the SFC was released before the lock was obtained.
490 *
491 * Otherwise we need only reset the engine by itself and we can
492 * leave the SFC alone.
493 */
494 lock_obtained = (intel_uncore_read_fw(uncore, sfc_lock.usage_reg) &
495 sfc_lock.usage_bit) != 0;
496 if (lock_obtained == lock_to_other)
497 return 0;
498
499 if (ret) {
500 ENGINE_TRACE(engine, "Wait for SFC forced lock ack failed\n");
501 return ret;
502 }
503
504 *reset_mask |= sfc_lock.reset_bit;
505 return 0;
506 }
507
508 static void gen11_unlock_sfc(struct intel_engine_cs *engine)
509 {
510 struct intel_uncore *uncore = engine->uncore;
511 u8 vdbox_sfc_access = engine->gt->info.vdbox_sfc_access;
512 struct sfc_lock_data sfc_lock = {};
513
514 if (engine->class != VIDEO_DECODE_CLASS &&
515 engine->class != VIDEO_ENHANCEMENT_CLASS)
516 return;
517
518 if (engine->class == VIDEO_DECODE_CLASS &&
519 (BIT(engine->instance) & vdbox_sfc_access) == 0)
520 return;
521
522 get_sfc_forced_lock_data(engine, &sfc_lock);
523
524 rmw_clear_fw(uncore, sfc_lock.lock_reg, sfc_lock.lock_bit);
525 }
526
527 static int __gen11_reset_engines(struct intel_gt *gt,
528 intel_engine_mask_t engine_mask,
529 unsigned int retry)
530 {
531 static const u32 hw_engine_mask[] = {
532 [RCS0] = GEN11_GRDOM_RENDER,
533 [BCS0] = GEN11_GRDOM_BLT,
534 [VCS0] = GEN11_GRDOM_MEDIA,
535 [VCS1] = GEN11_GRDOM_MEDIA2,
536 [VCS2] = GEN11_GRDOM_MEDIA3,
537 [VCS3] = GEN11_GRDOM_MEDIA4,
538 [VCS4] = GEN11_GRDOM_MEDIA5,
539 [VCS5] = GEN11_GRDOM_MEDIA6,
540 [VCS6] = GEN11_GRDOM_MEDIA7,
541 [VCS7] = GEN11_GRDOM_MEDIA8,
542 [VECS0] = GEN11_GRDOM_VECS,
543 [VECS1] = GEN11_GRDOM_VECS2,
544 [VECS2] = GEN11_GRDOM_VECS3,
545 [VECS3] = GEN11_GRDOM_VECS4,
546 };
547 struct intel_engine_cs *engine;
548 intel_engine_mask_t tmp;
549 u32 reset_mask, unlock_mask = 0;
550 int ret;
551
552 if (engine_mask == ALL_ENGINES) {
553 reset_mask = GEN11_GRDOM_FULL;
554 } else {
555 reset_mask = 0;
556 for_each_engine_masked(engine, gt, engine_mask, tmp) {
557 GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask));
558 reset_mask |= hw_engine_mask[engine->id];
559 ret = gen11_lock_sfc(engine, &reset_mask, &unlock_mask);
560 if (ret)
561 goto sfc_unlock;
562 }
563 }
564
565 ret = gen6_hw_domain_reset(gt, reset_mask);
566
567 sfc_unlock:
568 /*
569 * We unlock the SFC based on the lock status and not the result of
570 * gen11_lock_sfc to make sure that we clean up properly if something
571 * went wrong during the lock (e.g. the lock was acquired after the
572 * timeout expired).
573 *
574 * Due to Wa_14010733141, we may have locked an SFC to an engine that
575 * wasn't being reset. So instead of calling gen11_unlock_sfc()
576 * on engine_mask, we instead call it on the mask of engines that our
577 * gen11_lock_sfc() calls told us actually had locks attempted.
578 */
579 for_each_engine_masked(engine, gt, unlock_mask, tmp)
580 gen11_unlock_sfc(engine);
581
582 return ret;
583 }
584
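/*
 * Perform the ready-for-reset handshake with the engine via RING_RESET_CTL
 * before resetting it. Catastrophic errors bypass the handshake and instead
 * wait for the error indication to be cleared by hardware (HAS#396813).
 */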
585 static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
586 {
587 struct intel_uncore *uncore = engine->uncore;
588 const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base);
589 u32 request, mask, ack;
590 int ret;
591
592 if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
593 return -ETIMEDOUT;
594
595 ack = intel_uncore_read_fw(uncore, reg);
596 if (ack & RESET_CTL_CAT_ERROR) {
597 /*
598 * For catastrophic errors, ready-for-reset sequence
599 * needs to be bypassed: HAS#396813
600 */
601 request = RESET_CTL_CAT_ERROR;
602 mask = RESET_CTL_CAT_ERROR;
603
604 /* Catastrophic errors need to be cleared by HW */
605 ack = 0;
606 } else if (!(ack & RESET_CTL_READY_TO_RESET)) {
607 request = RESET_CTL_REQUEST_RESET;
608 mask = RESET_CTL_READY_TO_RESET;
609 ack = RESET_CTL_READY_TO_RESET;
610 } else {
611 return 0;
612 }
613
614 intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request));
615 ret = __intel_wait_for_register_fw(uncore, reg, mask, ack,
616 700, 0, NULL);
617 if (ret)
618 drm_err(&engine->i915->drm,
619 "%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n",
620 engine->name, request,
621 intel_uncore_read_fw(uncore, reg));
622
623 return ret;
624 }
625
626 static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
627 {
628 intel_uncore_write_fw(engine->uncore,
629 RING_RESET_CTL(engine->mmio_base),
630 _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
631 }
632
633 static int gen8_reset_engines(struct intel_gt *gt,
634 intel_engine_mask_t engine_mask,
635 unsigned int retry)
636 {
637 struct intel_engine_cs *engine;
638 const bool reset_non_ready = retry >= 1;
639 intel_engine_mask_t tmp;
640 unsigned long flags;
641 int ret;
642
643 spin_lock_irqsave(&gt->uncore->lock, flags);
644
645 for_each_engine_masked(engine, gt, engine_mask, tmp) {
646 ret = gen8_engine_reset_prepare(engine);
647 if (ret && !reset_non_ready)
648 goto skip_reset;
649
650 /*
651 * If this is not the first failed attempt to prepare,
652 * we decide to proceed anyway.
653 *
654 * By doing so we risk context corruption and with
655 * some gens (kbl), possible system hang if reset
656 * happens during active bb execution.
657 *
658 * We would rather risk context corruption than fail
659 * the reset and be left with a wedged driver/gpu. The
660 * active bb execution case should be covered by the
661 * stop_engines() we have before the reset.
662 */
663 }
664
665 /*
666 * Wa_22011100796:dg2, whenever a full soft reset is required,
667 * reset all individual engines first, and then do the full soft reset.
668 *
669 * This is best effort, so ignore any error from the initial reset.
670 */
671 if (IS_DG2(gt->i915) && engine_mask == ALL_ENGINES)
672 __gen11_reset_engines(gt, gt->info.engine_mask, 0);
673
674 if (GRAPHICS_VER(gt->i915) >= 11)
675 ret = __gen11_reset_engines(gt, engine_mask, retry);
676 else
677 ret = __gen6_reset_engines(gt, engine_mask, retry);
678
679 skip_reset:
680 for_each_engine_masked(engine, gt, engine_mask, tmp)
681 gen8_engine_reset_cancel(engine);
682
683 spin_unlock_irqrestore(&gt->uncore->lock, flags);
684
685 return ret;
686 }
687
688 static int mock_reset(struct intel_gt *gt,
689 intel_engine_mask_t mask,
690 unsigned int retry)
691 {
692 return 0;
693 }
694
695 typedef int (*reset_func)(struct intel_gt *,
696 intel_engine_mask_t engine_mask,
697 unsigned int retry);
698
699 static reset_func intel_get_gpu_reset(const struct intel_gt *gt)
700 {
701 struct drm_i915_private *i915 = gt->i915;
702
703 if (is_mock_gt(gt))
704 return mock_reset;
705 else if (GRAPHICS_VER(i915) >= 8)
706 return gen8_reset_engines;
707 else if (GRAPHICS_VER(i915) >= 6)
708 return gen6_reset_engines;
709 else if (GRAPHICS_VER(i915) >= 5)
710 return ilk_do_reset;
711 else if (IS_G4X(i915))
712 return g4x_do_reset;
713 else if (IS_G33(i915) || IS_PINEVIEW(i915))
714 return g33_do_reset;
715 else if (GRAPHICS_VER(i915) >= 3)
716 return i915_do_reset;
717 else
718 return NULL;
719 }
720
721 int __intel_gt_reset(struct intel_gt *gt, intel_engine_mask_t engine_mask)
722 {
723 const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1;
724 reset_func reset;
725 int ret = -ETIMEDOUT;
726 int retry;
727
728 reset = intel_get_gpu_reset(gt);
729 if (!reset)
730 return -ENODEV;
731
732 /*
733 * If the power well sleeps during the reset, the reset
734 * request may be dropped and never completes (causing -EIO).
735 */
736 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
737 for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) {
738 GT_TRACE(gt, "engine_mask=%x\n", engine_mask);
739 preempt_disable();
740 ret = reset(gt, engine_mask, retry);
741 preempt_enable();
742 }
743 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
744
745 return ret;
746 }
747
748 bool intel_has_gpu_reset(const struct intel_gt *gt)
749 {
750 if (!gt->i915->params.reset)
751 return false;
752
753 return intel_get_gpu_reset(gt);
754 }
755
756 bool intel_has_reset_engine(const struct intel_gt *gt)
757 {
758 if (gt->i915->params.reset < 2)
759 return false;
760
761 return INTEL_INFO(gt->i915)->has_reset_engine;
762 }
763
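/*
 * Reset only the GuC, using its dedicated GDRST domain
 * (GEN9_GRDOM_GUC or GEN11_GRDOM_GUC depending on platform).
 */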
764 int intel_reset_guc(struct intel_gt *gt)
765 {
766 u32 guc_domain =
767 GRAPHICS_VER(gt->i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
768 int ret;
769
770 GEM_BUG_ON(!HAS_GT_UC(gt->i915));
771
772 intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
773 ret = gen6_hw_domain_reset(gt, guc_domain);
774 intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
775
776 return ret;
777 }
778
779 /*
780 * Ensure the irq handler finishes, and is not run again.
781 * Also return the active request so that we only search for it once.
782 */
783 static void reset_prepare_engine(struct intel_engine_cs *engine)
784 {
785 /*
786 * During the reset sequence, we must prevent the engine from
787 * entering RC6. As the context state is undefined until we restart
788 * the engine, if it does enter RC6 during the reset, the state
789 * written to the powercontext is undefined and so we may lose
790 * GPU state upon resume, i.e. fail to restart after a reset.
791 */
792 intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
793 if (engine->reset.prepare)
794 engine->reset.prepare(engine);
795 }
796
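/*
 * Invalidate userspace CPU mmaps that rely on a GGTT fence register, so
 * that the next user access faults and picks up the state restored after
 * the reset.
 */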
797 static void revoke_mmaps(struct intel_gt *gt)
798 {
799 int i;
800
801 for (i = 0; i < gt->ggtt->num_fences; i++) {
802 struct drm_vma_offset_node *node;
803 struct i915_vma *vma;
804 u64 vma_offset;
805
806 vma = READ_ONCE(gt->ggtt->fence_regs[i].vma);
807 if (!vma)
808 continue;
809
810 if (!i915_vma_has_userfault(vma))
811 continue;
812
813 GEM_BUG_ON(vma->fence != &gt->ggtt->fence_regs[i]);
814
815 if (!vma->mmo)
816 continue;
817
818 node = &vma->mmo->vma_node;
819 vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
820
821 unmap_mapping_range(gt->i915->drm.anon_inode->i_mapping,
822 drm_vma_node_offset_addr(node) + vma_offset,
823 vma->size,
824 1);
825 }
826 }
827
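/*
 * Quiesce all engines before the reset, taking an engine-pm reference for
 * each engine that is currently awake. The returned mask of awake engines
 * is handed back to reset_finish() so the references can be dropped.
 */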
828 static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
829 {
830 struct intel_engine_cs *engine;
831 intel_engine_mask_t awake = 0;
832 enum intel_engine_id id;
833
834 for_each_engine(engine, gt, id) {
835 if (intel_engine_pm_get_if_awake(engine))
836 awake |= engine->mask;
837 reset_prepare_engine(engine);
838 }
839
840 intel_uc_reset_prepare(&gt->uc);
841
842 return awake;
843 }
844
845 static void gt_revoke(struct intel_gt *gt)
846 {
847 revoke_mmaps(gt);
848 }
849
850 static int gt_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
851 {
852 struct intel_engine_cs *engine;
853 enum intel_engine_id id;
854 int err;
855
856 /*
857 * Everything depends on having the GTT running, so we need to start
858 * there.
859 */
860 err = i915_ggtt_enable_hw(gt->i915);
861 if (err)
862 return err;
863
864 local_bh_disable();
865 for_each_engine(engine, gt, id)
866 __intel_engine_reset(engine, stalled_mask & engine->mask);
867 local_bh_enable();
868
869 intel_uc_reset(&gt->uc, true);
870
871 intel_ggtt_restore_fences(gt->ggtt);
872
873 return err;
874 }
875
876 static void reset_finish_engine(struct intel_engine_cs *engine)
877 {
878 if (engine->reset.finish)
879 engine->reset.finish(engine);
880 intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL);
881
882 intel_engine_signal_breadcrumbs(engine);
883 }
884
885 static void reset_finish(struct intel_gt *gt, intel_engine_mask_t awake)
886 {
887 struct intel_engine_cs *engine;
888 enum intel_engine_id id;
889
890 for_each_engine(engine, gt, id) {
891 reset_finish_engine(engine);
892 if (awake & engine->mask)
893 intel_engine_pm_put(engine);
894 }
895
896 intel_uc_reset_finish(&gt->uc);
897 }
898
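/*
 * Submission stub installed once the GT is wedged: complete each incoming
 * request immediately with -EIO and signal its breadcrumbs so that no
 * waiter is left stuck on a dead GPU.
 */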
899 static void nop_submit_request(struct i915_request *request)
900 {
901 RQ_TRACE(request, "-EIO\n");
902
903 request = i915_request_mark_eio(request);
904 if (request) {
905 i915_request_submit(request);
906 intel_engine_signal_breadcrumbs(request->engine);
907
908 i915_request_put(request);
909 }
910 }
911
912 static void __intel_gt_set_wedged(struct intel_gt *gt)
913 {
914 struct intel_engine_cs *engine;
915 intel_engine_mask_t awake;
916 enum intel_engine_id id;
917
918 if (test_bit(I915_WEDGED, &gt->reset.flags))
919 return;
920
921 GT_TRACE(gt, "start\n");
922
923 /*
924 * First, stop submission to hw, but do not yet complete requests by
925 * rolling the global seqno forward (since this would complete requests
926 * for which we haven't set the fence error to EIO yet).
927 */
928 awake = reset_prepare(gt);
929
930 /* Even if the GPU reset fails, it should still stop the engines */
931 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
932 __intel_gt_reset(gt, ALL_ENGINES);
933
934 for_each_engine(engine, gt, id)
935 engine->submit_request = nop_submit_request;
936
937 /*
938 * Make sure no request can slip through without getting completed by
939 * either this call here to intel_engine_write_global_seqno, or the one
940 * in nop_submit_request.
941 */
942 synchronize_rcu_expedited();
943 set_bit(I915_WEDGED, &gt->reset.flags);
944
945 /* Mark all executing requests as skipped */
946 local_bh_disable();
947 for_each_engine(engine, gt, id)
948 if (engine->reset.cancel)
949 engine->reset.cancel(engine);
950 intel_uc_cancel_requests(&gt->uc);
951 local_bh_enable();
952
953 reset_finish(gt, awake);
954
955 GT_TRACE(gt, "end\n");
956 }
957
958 void intel_gt_set_wedged(struct intel_gt *gt)
959 {
960 intel_wakeref_t wakeref;
961
962 if (test_bit(I915_WEDGED, &gt->reset.flags))
963 return;
964
965 wakeref = intel_runtime_pm_get(gt->uncore->rpm);
966 mutex_lock(&gt->reset.mutex);
967
968 if (GEM_SHOW_DEBUG()) {
969 struct drm_printer p = drm_debug_printer(__func__);
970 struct intel_engine_cs *engine;
971 enum intel_engine_id id;
972
973 drm_printf(&p, "called from %pS\n", (void *)_RET_IP_);
974 for_each_engine(engine, gt, id) {
975 if (intel_engine_is_idle(engine))
976 continue;
977
978 intel_engine_dump(engine, &p, "%s\n", engine->name);
979 }
980 }
981
982 __intel_gt_set_wedged(gt);
983
984 mutex_unlock(&gt->reset.mutex);
985 intel_runtime_pm_put(gt->uncore->rpm, wakeref);
986 }
987
988 static bool __intel_gt_unset_wedged(struct intel_gt *gt)
989 {
990 struct intel_gt_timelines *timelines = &gt->timelines;
991 struct intel_timeline *tl;
992 bool ok;
993
994 if (!test_bit(I915_WEDGED, &gt->reset.flags))
995 return true;
996
997 /* Never fully initialised, recovery impossible */
998 if (intel_gt_has_unrecoverable_error(gt))
999 return false;
1000
1001 GT_TRACE(gt, "start\n");
1002
1003 /*
1004 * Before unwedging, make sure that all pending operations
1005 * are flushed and errored out - we may have requests waiting upon
1006 * third party fences. We marked all inflight requests as EIO, and
1007 * every execbuf since returned EIO, for consistency we want all
1008 * the currently pending requests to also be marked as EIO, which
1009 * is done inside our nop_submit_request - and so we must wait.
1010 *
1011 * No more can be submitted until we reset the wedged bit.
1012 */
1013 spin_lock(&timelines->lock);
1014 list_for_each_entry(tl, &timelines->active_list, link) {
1015 struct dma_fence *fence;
1016
1017 fence = i915_active_fence_get(&tl->last_request);
1018 if (!fence)
1019 continue;
1020
1021 spin_unlock(&timelines->lock);
1022
1023 /*
1024 * All internal dependencies (i915_requests) will have
1025 * been flushed by the set-wedge, but we may be stuck waiting
1026 * for external fences. These should all be capped to 10s
1027 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
1028 * in the worst case.
1029 */
1030 dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
1031 dma_fence_put(fence);
1032
1033 /* Restart iteration after dropping the lock */
1034 spin_lock(&timelines->lock);
1035 tl = list_entry(&timelines->active_list, typeof(*tl), link);
1036 }
1037 spin_unlock(&timelines->lock);
1038
1039 /* We must reset pending GPU events before restoring our submission */
1040 ok = !HAS_EXECLISTS(gt->i915); /* XXX better agnosticism desired */
1041 if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1042 ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
1043 if (!ok) {
1044 /*
1045 * Warn CI about the unrecoverable wedged condition.
1046 * Time for a reboot.
1047 */
1048 add_taint_for_CI(gt->i915, TAINT_WARN);
1049 return false;
1050 }
1051
1052 /*
1053 * Undo nop_submit_request. We prevent all new i915 requests from
1054 * being queued (by disallowing execbuf whilst wedged) so having
1055 * waited for all active requests above, we know the system is idle
1056 * and do not have to worry about a thread being inside
1057 * engine->submit_request() as we swap over. So unlike installing
1058 * the nop_submit_request on reset, we can do this from normal
1059 * context and do not require stop_machine().
1060 */
1061 intel_engines_reset_default_submission(gt);
1062
1063 GT_TRACE(gt, "end\n");
1064
1065 smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
1066 clear_bit(I915_WEDGED, &gt->reset.flags);
1067
1068 return true;
1069 }
1070
1071 bool intel_gt_unset_wedged(struct intel_gt *gt)
1072 {
1073 bool result;
1074
1075 mutex_lock(&gt->reset.mutex);
1076 result = __intel_gt_unset_wedged(gt);
1077 mutex_unlock(&gt->reset.mutex);
1078
1079 return result;
1080 }
1081
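/*
 * Issue the hardware reset, retrying with an increasing delay if it fails,
 * and then restore the GT state (GGTT, per-engine state, fences).
 */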
1082 static int do_reset(struct intel_gt *gt, intel_engine_mask_t stalled_mask)
1083 {
1084 int err, i;
1085
1086 err = __intel_gt_reset(gt, ALL_ENGINES);
1087 for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
1088 msleep(10 * (i + 1));
1089 err = __intel_gt_reset(gt, ALL_ENGINES);
1090 }
1091 if (err)
1092 return err;
1093
1094 return gt_reset(gt, stalled_mask);
1095 }
1096
1097 static int resume(struct intel_gt *gt)
1098 {
1099 struct intel_engine_cs *engine;
1100 enum intel_engine_id id;
1101 int ret;
1102
1103 for_each_engine(engine, gt, id) {
1104 ret = intel_engine_resume(engine);
1105 if (ret)
1106 return ret;
1107 }
1108
1109 return 0;
1110 }
1111
1112 /**
1113 * intel_gt_reset - reset chip after a hang
1114 * @gt: #intel_gt to reset
1115 * @stalled_mask: mask of the stalled engines with the guilty requests
1116 * @reason: user error message for why we are resetting
1117 *
1118 * Reset the chip. Useful if a hang is detected. Marks the device as wedged
1119 * on failure.
1120 *
1121 * Procedure is fairly simple:
1122 * - reset the chip using the reset reg
1123 * - re-init context state
1124 * - re-init hardware status page
1125 * - re-init ring buffer
1126 * - re-init interrupt state
1127 * - re-init display
1128 */
1129 void intel_gt_reset(struct intel_gt *gt,
1130 intel_engine_mask_t stalled_mask,
1131 const char *reason)
1132 {
1133 intel_engine_mask_t awake;
1134 int ret;
1135
1136 GT_TRACE(gt, "flags=%lx\n", gt->reset.flags);
1137
1138 might_sleep();
1139 GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1140
1141 /*
1142 * FIXME: Revoking cpu mmap ptes cannot be done from a dma_fence
1143 * critical section like gpu reset.
1144 */
1145 gt_revoke(gt);
1146
1147 mutex_lock(&gt->reset.mutex);
1148
1149 /* Clear any previous failed attempts at recovery. Time to try again. */
1150 if (!__intel_gt_unset_wedged(gt))
1151 goto unlock;
1152
1153 if (reason)
1154 drm_notice(&gt->i915->drm,
1155 "Resetting chip for %s\n", reason);
1156 atomic_inc(&gt->i915->gpu_error.reset_count);
1157
1158 awake = reset_prepare(gt);
1159
1160 if (!intel_has_gpu_reset(gt)) {
1161 if (gt->i915->params.reset)
1162 drm_err(&gt->i915->drm, "GPU reset not supported\n");
1163 else
1164 drm_dbg(&gt->i915->drm, "GPU reset disabled\n");
1165 goto error;
1166 }
1167
1168 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1169 intel_runtime_pm_disable_interrupts(gt->i915);
1170
1171 if (do_reset(gt, stalled_mask)) {
1172 drm_err(&gt->i915->drm, "Failed to reset chip\n");
1173 goto taint;
1174 }
1175
1176 if (INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
1177 intel_runtime_pm_enable_interrupts(gt->i915);
1178
1179 intel_overlay_reset(gt->i915);
1180
1181 /*
1182 * Next we need to restore the context, but we don't use those
1183 * yet either...
1184 *
1185 * Ring buffer needs to be re-initialized in the KMS case, or if X
1186 * was running at the time of the reset (i.e. we weren't VT
1187 * switched away).
1188 */
1189 ret = intel_gt_init_hw(gt);
1190 if (ret) {
1191 drm_err(&gt->i915->drm,
1192 "Failed to initialise HW following reset (%d)\n",
1193 ret);
1194 goto taint;
1195 }
1196
1197 ret = resume(gt);
1198 if (ret)
1199 goto taint;
1200
1201 finish:
1202 reset_finish(gt, awake);
1203 unlock:
1204 mutex_unlock(&gt->reset.mutex);
1205 return;
1206
1207 taint:
1208 /*
1209 * History tells us that if we cannot reset the GPU now, we
1210 * never will. This then impacts everything that is run
1211 * subsequently. On failing the reset, we mark the driver
1212 * as wedged, preventing further execution on the GPU.
1213 * We also want to go one step further and add a taint to the
1214 * kernel so that any subsequent faults can be traced back to
1215 * this failure. This is important for CI, where if the
1216 * GPU/driver fails we would like to reboot and restart testing
1217 * rather than continue on into oblivion. For everyone else,
1218 * the system should still plod along, but they have been warned!
1219 */
1220 add_taint_for_CI(gt->i915, TAINT_WARN);
1221 error:
1222 __intel_gt_set_wedged(gt);
1223 goto finish;
1224 }
1225
1226 static int intel_gt_reset_engine(struct intel_engine_cs *engine)
1227 {
1228 return __intel_gt_reset(engine->gt, engine->mask);
1229 }
1230
1231 int __intel_engine_reset_bh(struct intel_engine_cs *engine, const char *msg)
1232 {
1233 struct intel_gt *gt = engine->gt;
1234 int ret;
1235
1236 ENGINE_TRACE(engine, "flags=%lx\n", gt->reset.flags);
1237 GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags));
1238
1239 if (intel_engine_uses_guc(engine))
1240 return -ENODEV;
1241
1242 if (!intel_engine_pm_get_if_awake(engine))
1243 return 0;
1244
1245 reset_prepare_engine(engine);
1246
1247 if (msg)
1248 drm_notice(&engine->i915->drm,
1249 "Resetting %s for %s\n", engine->name, msg);
1250 atomic_inc(&engine->i915->gpu_error.reset_engine_count[engine->uabi_class]);
1251
1252 ret = intel_gt_reset_engine(engine);
1253 if (ret) {
1254 /* If we fail here, we expect to fallback to a global reset */
1255 ENGINE_TRACE(engine, "Failed to reset %s, err: %d\n", engine->name, ret);
1256 goto out;
1257 }
1258
1259 /*
1260 * The request that caused the hang is stuck on elsp; we know the
1261 * active request and can drop it, then adjust the head to skip the
1262 * offending request and resume executing the remaining requests in the queue.
1263 */
1264 __intel_engine_reset(engine, true);
1265
1266 /*
1267 * The engine and its registers (and workarounds in case of render)
1268 * have been reset to their default values. Follow the init_ring
1269 * process to program RING_MODE, HWSP and re-enable submission.
1270 */
1271 ret = intel_engine_resume(engine);
1272
1273 out:
1274 intel_engine_cancel_stop_cs(engine);
1275 reset_finish_engine(engine);
1276 intel_engine_pm_put_async(engine);
1277 return ret;
1278 }
1279
1280 /**
1281 * intel_engine_reset - reset GPU engine to recover from a hang
1282 * @engine: engine to reset
1283 * @msg: reason for GPU reset; or NULL for no drm_notice()
1284 *
1285 * Reset a specific GPU engine. Useful if a hang is detected.
1286 * Returns zero on successful reset or otherwise an error code.
1287 *
1288 * Procedure is:
1289 * - identify the request that caused the hang and drop it
1290 * - reset engine (which will force the engine to idle)
1291 * - re-init/configure engine
1292 */
1293 int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
1294 {
1295 int err;
1296
1297 local_bh_disable();
1298 err = __intel_engine_reset_bh(engine, msg);
1299 local_bh_enable();
1300
1301 return err;
1302 }
1303
1304 static void intel_gt_reset_global(struct intel_gt *gt,
1305 u32 engine_mask,
1306 const char *reason)
1307 {
1308 struct kobject *kobj = &gt->i915->drm.primary->kdev->kobj;
1309 char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
1310 char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
1311 char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
1312 struct intel_wedge_me w;
1313
1314 kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
1315
1316 GT_TRACE(gt, "resetting chip, engines=%x\n", engine_mask);
1317 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
1318
1319 /* Use a watchdog to ensure that our reset completes */
1320 intel_wedge_on_timeout(&w, gt, 5 * HZ) {
1321 intel_display_prepare_reset(gt->i915);
1322
1323 /* Flush everyone using a resource about to be clobbered */
1324 synchronize_srcu_expedited(&gt->reset.backoff_srcu);
1325
1326 intel_gt_reset(gt, engine_mask, reason);
1327
1328 intel_display_finish_reset(gt->i915);
1329 }
1330
1331 if (!test_bit(I915_WEDGED, &gt->reset.flags))
1332 kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
1333 }
1334
1335 /**
1336 * intel_gt_handle_error - handle a gpu error
1337 * @gt: the intel_gt
1338 * @engine_mask: mask representing engines that are hung
1339 * @flags: control flags
1340 * @fmt: Error message format string
1341 *
1342 * Do some basic checking of register state at error time and
1343 * dump it to the syslog. Also call i915_capture_error_state() to make
1344 * sure we get a record and make it available in debugfs. Fire a uevent
1345 * so userspace knows something bad happened (should trigger collection
1346 * of a ring dump etc.).
1347 */
1348 void intel_gt_handle_error(struct intel_gt *gt,
1349 intel_engine_mask_t engine_mask,
1350 unsigned long flags,
1351 const char *fmt, ...)
1352 {
1353 struct intel_engine_cs *engine;
1354 intel_wakeref_t wakeref;
1355 intel_engine_mask_t tmp;
1356 char error_msg[80];
1357 char *msg = NULL;
1358
1359 if (fmt) {
1360 va_list args;
1361
1362 va_start(args, fmt);
1363 vscnprintf(error_msg, sizeof(error_msg), fmt, args);
1364 va_end(args);
1365
1366 msg = error_msg;
1367 }
1368
1369 /*
1370 * In most cases it's guaranteed that we get here with an RPM
1371 * reference held, for example because there is a pending GPU
1372 * request that won't finish until the reset is done. This
1373 * isn't the case at least when we get here by doing a
1374 * simulated reset via debugfs, so get an RPM reference.
1375 */
1376 wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1377
1378 engine_mask &= gt->info.engine_mask;
1379
1380 if (flags & I915_ERROR_CAPTURE) {
1381 i915_capture_error_state(gt, engine_mask);
1382 intel_gt_clear_error_registers(gt, engine_mask);
1383 }
1384
1385 /*
1386 * Try engine reset when available. We fall back to full reset if
1387 * single reset fails.
1388 */
1389 if (!intel_uc_uses_guc_submission(&gt->uc) &&
1390 intel_has_reset_engine(gt) && !intel_gt_is_wedged(gt)) {
1391 local_bh_disable();
1392 for_each_engine_masked(engine, gt, engine_mask, tmp) {
1393 BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
1394 if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1395 &gt->reset.flags))
1396 continue;
1397
1398 if (__intel_engine_reset_bh(engine, msg) == 0)
1399 engine_mask &= ~engine->mask;
1400
1401 clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id,
1402 &gt->reset.flags);
1403 }
1404 local_bh_enable();
1405 }
1406
1407 if (!engine_mask)
1408 goto out;
1409
1410 /* Full reset needs the mutex, stop any other user trying to do so. */
1411 if (test_and_set_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1412 wait_event(gt->reset.queue,
1413 !test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
1414 goto out; /* piggy-back on the other reset */
1415 }
1416
1417 /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
1418 synchronize_rcu_expedited();
1419
1420 /* Prevent any other reset-engine attempt. */
1421 for_each_engine(engine, gt, tmp) {
1422 while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
1423 &gt->reset.flags))
1424 wait_on_bit(&gt->reset.flags,
1425 I915_RESET_ENGINE + engine->id,
1426 TASK_UNINTERRUPTIBLE);
1427 }
1428
1429 intel_gt_reset_global(gt, engine_mask, msg);
1430
1431 for_each_engine(engine, gt, tmp)
1432 clear_bit_unlock(I915_RESET_ENGINE + engine->id,
1433 &gt->reset.flags);
1434 clear_bit_unlock(I915_RESET_BACKOFF, &gt->reset.flags);
1435 smp_mb__after_atomic();
1436 wake_up_all(&gt->reset.queue);
1437
1438 out:
1439 intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1440 }
1441
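/*
 * Enter the reset SRCU read-side critical section, first waiting for any
 * global reset (I915_RESET_BACKOFF) in progress to complete. The tag stored
 * in @srcu must be handed back to intel_gt_reset_unlock().
 */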
1442 int intel_gt_reset_trylock(struct intel_gt *gt, int *srcu)
1443 {
1444 might_lock(&gt->reset.backoff_srcu);
1445 might_sleep();
1446
1447 rcu_read_lock();
1448 while (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) {
1449 rcu_read_unlock();
1450
1451 if (wait_event_interruptible(gt->reset.queue,
1452 !test_bit(I915_RESET_BACKOFF,
1453 &gt->reset.flags)))
1454 return -EINTR;
1455
1456 rcu_read_lock();
1457 }
1458 *srcu = srcu_read_lock(&gt->reset.backoff_srcu);
1459 rcu_read_unlock();
1460
1461 return 0;
1462 }
1463
1464 void intel_gt_reset_unlock(struct intel_gt *gt, int tag)
1465 __releases(&gt->reset.backoff_srcu)
1466 {
1467 srcu_read_unlock(&gt->reset.backoff_srcu, tag);
1468 }
1469
1470 int intel_gt_terminally_wedged(struct intel_gt *gt)
1471 {
1472 might_sleep();
1473
1474 if (!intel_gt_is_wedged(gt))
1475 return 0;
1476
1477 if (intel_gt_has_unrecoverable_error(gt))
1478 return -EIO;
1479
1480 /* Reset still in progress? Maybe we will recover? */
1481 if (wait_event_interruptible(gt->reset.queue,
1482 !test_bit(I915_RESET_BACKOFF,
1483 &gt->reset.flags)))
1484 return -EINTR;
1485
1486 return intel_gt_is_wedged(gt) ? -EIO : 0;
1487 }
1488
1489 void intel_gt_set_wedged_on_init(struct intel_gt *gt)
1490 {
1491 BUILD_BUG_ON(I915_RESET_ENGINE + I915_NUM_ENGINES >
1492 I915_WEDGED_ON_INIT);
1493 intel_gt_set_wedged(gt);
1494 set_bit(I915_WEDGED_ON_INIT, &gt->reset.flags);
1495
1496 /* Wedged on init is non-recoverable */
1497 add_taint_for_CI(gt->i915, TAINT_WARN);
1498 }
1499
1500 void intel_gt_set_wedged_on_fini(struct intel_gt *gt)
1501 {
1502 intel_gt_set_wedged(gt);
1503 set_bit(I915_WEDGED_ON_FINI, &gt->reset.flags);
1504 intel_gt_retire_requests(gt); /* cleanup any wedged requests */
1505 }
1506
1507 void intel_gt_init_reset(struct intel_gt *gt)
1508 {
1509 init_waitqueue_head(&gt->reset.queue);
1510 mutex_init(&gt->reset.mutex);
1511 init_srcu_struct(&gt->reset.backoff_srcu);
1512
1513 /*
1514 * While undesirable to wait inside the shrinker, complain anyway.
1515 *
1516 * If we have to wait during shrinking, we guarantee forward progress
1517 * by forcing the reset. Therefore during the reset we must not
1518 * re-enter the shrinker. By declaring that we take the reset mutex
1519 * within the shrinker, we forbid ourselves from performing any
1520 * fs-reclaim or taking related locks during reset.
1521 */
1522 i915_gem_shrinker_taints_mutex(gt->i915, &gt->reset.mutex);
1523
1524 /* no GPU until we are ready! */
1525 __set_bit(I915_WEDGED, &gt->reset.flags);
1526 }
1527
1528 void intel_gt_fini_reset(struct intel_gt *gt)
1529 {
1530 cleanup_srcu_struct(&gt->reset.backoff_srcu);
1531 }
1532
1533 static void intel_wedge_me(struct work_struct *work)
1534 {
1535 struct intel_wedge_me *w = container_of(work, typeof(*w), work.work);
1536
1537 drm_err(&w->gt->i915->drm,
1538 "%s timed out, cancelling all in-flight rendering.\n",
1539 w->name);
1540 intel_gt_set_wedged(w->gt);
1541 }
1542
1543 void __intel_init_wedge(struct intel_wedge_me *w,
1544 struct intel_gt *gt,
1545 long timeout,
1546 const char *name)
1547 {
1548 w->gt = gt;
1549 w->name = name;
1550
1551 INIT_DELAYED_WORK_ONSTACK(&w->work, intel_wedge_me);
1552 schedule_delayed_work(&w->work, timeout);
1553 }
1554
1555 void __intel_fini_wedge(struct intel_wedge_me *w)
1556 {
1557 cancel_delayed_work_sync(&w->work);
1558 destroy_delayed_work_on_stack(&w->work);
1559 w->gt = NULL;
1560 }
1561
1562 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1563 #include "selftest_reset.c"
1564 #include "selftest_hangcheck.c"
1565 #endif
1566