// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT 5

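/*
 * Prepare the device for a cold boot: drop all command queue contexts,
 * reset the IPC channel and the firmware log, reload the firmware image
 * and point the boot entry at the cold boot entry point.
 */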
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
        struct ivpu_fw_info *fw = vdev->fw;

        ivpu_cmdq_reset_all_contexts(vdev);
        ivpu_ipc_reset(vdev);
        ivpu_fw_log_reset(vdev);
        ivpu_fw_load(vdev);
        fw->entry_point = fw->cold_boot_entry_point;
}

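/*
 * Prepare a warm boot using the save/restore return address published by
 * the firmware in the boot parameters. Falls back to a cold boot when the
 * firmware has not provided a save/restore entry point.
 */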
static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
        struct ivpu_fw_info *fw = vdev->fw;
        struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem);

        if (!bp->save_restore_ret_address) {
                ivpu_pm_prepare_cold_boot(vdev);
                return;
        }

        ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx", bp->save_restore_ret_address);
        fw->entry_point = bp->save_restore_ret_address;
}

static int ivpu_suspend(struct ivpu_device *vdev)
{
        int ret;

        ivpu_prepare_for_reset(vdev);

        ret = ivpu_shutdown(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

        return ret;
}

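/*
 * Bring the device back to D0, power up the HW, enable the MMU and boot
 * the firmware. On failure the HW is powered down again and, if the failed
 * attempt was a warm boot, the resume is retried once as a cold boot.
 */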
static int ivpu_resume(struct ivpu_device *vdev)
{
        int ret;

retry:
        pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
        pci_restore_state(to_pci_dev(vdev->drm.dev));

        ret = ivpu_hw_power_up(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
                goto err_power_down;
        }

        ret = ivpu_mmu_enable(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
                goto err_power_down;
        }

        ret = ivpu_boot(vdev);
        if (ret)
                goto err_mmu_disable;

        return 0;

err_mmu_disable:
        ivpu_mmu_disable(vdev);
err_power_down:
        ivpu_hw_power_down(vdev);
        pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

        if (!ivpu_fw_is_cold_boot(vdev)) {
                ivpu_pm_prepare_cold_boot(vdev);
                goto retry;
        } else {
                ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
        }

        return ret;
}

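/*
 * Reset bracket helpers: ivpu_pm_reset_begin() blocks runtime PM, bumps the
 * reset counter, marks a reset as pending and takes the reset lock for
 * writing; ivpu_pm_reset_complete() reinitializes the device for a cold
 * boot, aborts outstanding jobs, cleans up metric streamer state, resumes
 * the HW and re-enables runtime PM.
 */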
static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
        pm_runtime_disable(vdev->drm.dev);

        atomic_inc(&vdev->pm->reset_counter);
        atomic_set(&vdev->pm->reset_pending, 1);
        down_write(&vdev->pm->reset_lock);
}

static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
        int ret;

        ivpu_pm_prepare_cold_boot(vdev);
        ivpu_jobs_abort_all(vdev);
        ivpu_ms_cleanup_all(vdev);

        ret = ivpu_resume(vdev);
        if (ret) {
                ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
                pm_runtime_set_suspended(vdev->drm.dev);
        } else {
                pm_runtime_set_active(vdev->drm.dev);
        }

        up_write(&vdev->pm->reset_lock);
        atomic_set(&vdev->pm->reset_pending, 0);

        pm_runtime_mark_last_busy(vdev->drm.dev);
        pm_runtime_enable(vdev->drm.dev);
}

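/*
 * Recovery worker: collects a JSM state dump and a device coredump when the
 * device is not runtime suspended, performs a full reset and then notifies
 * user space with an IVPU_PM_EVENT=IVPU_RECOVER uevent.
 */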
static void ivpu_pm_recovery_work(struct work_struct *work)
{
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
        struct ivpu_device *vdev = pm->vdev;
        char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

        ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

        ivpu_pm_reset_begin(vdev);

        if (!pm_runtime_status_suspended(vdev->drm.dev)) {
                ivpu_jsm_state_dump(vdev);
                ivpu_dev_coredump(vdev);
                ivpu_suspend(vdev);
        }

        ivpu_pm_reset_complete(vdev);

        kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

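/*
 * Schedule NPU recovery unless it is disabled by the module parameter, not
 * available on the platform (FPGA) or already in progress. IRQs are disabled
 * before the work is queued to protect against an interrupt storm from a
 * misbehaving device.
 */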
void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
        ivpu_err(vdev, "Recovery triggered by %s\n", reason);

        if (ivpu_disable_recovery) {
                ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
                return;
        }

        if (ivpu_is_fpga(vdev)) {
                ivpu_err(vdev, "Recovery not available on FPGA\n");
                return;
        }

        /* Trigger recovery if it's not in progress */
        if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
                ivpu_hw_diagnose_failure(vdev);
                ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
                queue_work(system_long_wq, &vdev->pm->recovery_work);
        }
}

static void ivpu_job_timeout_work(struct work_struct *work)
{
        struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
        struct ivpu_device *vdev = pm->vdev;

        ivpu_pm_trigger_recovery(vdev, "TDR");
}

void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
        unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

        /* No-op if already queued */
        queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
        cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
}

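/*
 * System suspend callback: waits (up to the TDR timeout) for the HW to
 * become idle, requests D0i3 entry from the firmware, shuts the device
 * down and prepares a warm boot for the following resume.
 */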
int ivpu_pm_suspend_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        unsigned long timeout;

        ivpu_dbg(vdev, PM, "Suspend..\n");

        timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
        while (!ivpu_hw_is_idle(vdev)) {
                cond_resched();
                if (time_after_eq(jiffies, timeout)) {
                        ivpu_err(vdev, "Failed to enter idle on system suspend\n");
                        return -EBUSY;
                }
        }

        ivpu_jsm_pwr_d0i3_enter(vdev);

        ivpu_suspend(vdev);
        ivpu_pm_prepare_warm_boot(vdev);

        ivpu_dbg(vdev, PM, "Suspend done.\n");

        return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret;

        ivpu_dbg(vdev, PM, "Resume..\n");

        ret = ivpu_resume(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to resume: %d\n", ret);

        ivpu_dbg(vdev, PM, "Resume done.\n");

        return ret;
}

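/*
 * Runtime suspend callback: expects no submitted jobs and no pending
 * recovery work. If the device failed to idle or to enter D0i3, a coredump
 * is collected and the next resume is forced to be a cold boot; otherwise
 * a warm boot is prepared. Always returns 0 so runtime PM completes the
 * suspend.
 */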
int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret, ret_d0i3;
        bool is_idle;

        drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
        drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

        ivpu_dbg(vdev, PM, "Runtime suspend..\n");

        ivpu_mmu_disable(vdev);

        is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
        if (!is_idle)
                ivpu_err(vdev, "NPU is not idle before autosuspend\n");

        ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
        if (ret_d0i3)
                ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

        ret = ivpu_suspend(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

        if (!is_idle || ret_d0i3) {
                ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
                atomic_inc(&vdev->pm->reset_counter);
                ivpu_dev_coredump(vdev);
                ivpu_pm_prepare_cold_boot(vdev);
        } else {
                ivpu_pm_prepare_warm_boot(vdev);
        }

        ivpu_dbg(vdev, PM, "Runtime suspend done.\n");

        return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
        struct drm_device *drm = dev_get_drvdata(dev);
        struct ivpu_device *vdev = to_ivpu_device(drm);
        int ret;

        ivpu_dbg(vdev, PM, "Runtime resume..\n");

        ret = ivpu_resume(vdev);
        if (ret)
                ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

        ivpu_dbg(vdev, PM, "Runtime resume done.\n");

        return ret;
}

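/*
 * Runtime PM reference helpers used around device accesses: ivpu_rpm_get()
 * resumes the device and takes a reference, ivpu_rpm_put() drops it and
 * rearms the autosuspend timer.
 */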
int ivpu_rpm_get(struct ivpu_device *vdev)
{
        int ret;

        ret = pm_runtime_resume_and_get(vdev->drm.dev);
        if (ret < 0) {
                ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
                pm_runtime_set_suspended(vdev->drm.dev);
        }

        return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
        pm_runtime_mark_last_busy(vdev->drm.dev);
        pm_runtime_put_autosuspend(vdev->drm.dev);
}

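/*
 * PCI reset_prepare/reset_done callbacks: the prepare callback quiesces and
 * resets the HW if it is powered up, the done callback brings the device
 * back with a cold boot.
 */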
void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
        struct ivpu_device *vdev = pci_get_drvdata(pdev);

        ivpu_dbg(vdev, PM, "Pre-reset..\n");

        ivpu_pm_reset_begin(vdev);

        if (!pm_runtime_status_suspended(vdev->drm.dev)) {
                ivpu_prepare_for_reset(vdev);
                ivpu_hw_reset(vdev);
        }

        ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
        struct ivpu_device *vdev = pci_get_drvdata(pdev);

        ivpu_dbg(vdev, PM, "Post-reset..\n");

        ivpu_pm_reset_complete(vdev);

        ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

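/*
 * One-time PM state setup: initializes the reset lock and counters, the
 * recovery and job timeout work items, and configures runtime PM
 * autosuspend. A negative autosuspend delay (used when recovery is
 * disabled) prevents runtime suspend altogether.
 */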
void ivpu_pm_init(struct ivpu_device *vdev)
{
        struct device *dev = vdev->drm.dev;
        struct ivpu_pm_info *pm = vdev->pm;
        int delay;

        pm->vdev = vdev;

        init_rwsem(&pm->reset_lock);
        atomic_set(&pm->reset_pending, 0);
        atomic_set(&pm->reset_counter, 0);

        INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
        INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

        if (ivpu_disable_recovery)
                delay = -1;
        else
                delay = vdev->timeout.autosuspend;

        pm_runtime_use_autosuspend(dev);
        pm_runtime_set_autosuspend_delay(dev, delay);
        pm_runtime_set_active(dev);

        ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
        drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
        disable_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
        struct device *dev = vdev->drm.dev;

        pm_runtime_allow(dev);
        pm_runtime_mark_last_busy(dev);
        pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
        pm_runtime_get_noresume(vdev->drm.dev);
        pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
        if (vdev->pm->dct_active_percent)
                return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

        return 0;
}

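/*
 * Duty Cycle Throttling (DCT): the requested active percentage is split
 * into active/inactive time within DCT_PERIOD_US and sent to the firmware.
 * For example, 30% of a hypothetical 10000 us period would give 3000 us in
 * D0 and 7000 us in D0i2.
 */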
int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
        u32 active_us, inactive_us;
        int ret;

        if (active_percent == 0 || active_percent > 100)
                return -EINVAL;

        active_us = (DCT_PERIOD_US * active_percent) / 100;
        inactive_us = DCT_PERIOD_US - active_us;

        vdev->pm->dct_active_percent = active_percent;

        ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
                 active_percent, active_us, inactive_us);

        ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
        if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
                return ret;
        }

        return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
        int ret;

        vdev->pm->dct_active_percent = 0;

        ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

        ret = ivpu_jsm_dct_disable(vdev);
        if (ret) {
                ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
                return ret;
        }

        return 0;
}

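/*
 * Handles a DCT request interrupt from the buttress (BTRS) block: reads
 * whether throttling should be enabled, applies the default active
 * percentage or disables DCT, and reports the resulting status back to HW.
 */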
void ivpu_pm_dct_irq_thread_handler(struct ivpu_device *vdev)
{
        bool enable;
        int ret;

        if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
                return;

        if (enable)
                ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
        else
                ret = ivpu_pm_dct_disable(vdev);

        if (!ret)
                ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent);
}