// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020-2024 Intel Corporation
 */

#include <linux/highmem.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/reboot.h>

#include "ivpu_coredump.h"
#include "ivpu_drv.h"
#include "ivpu_fw.h"
#include "ivpu_fw_log.h"
#include "ivpu_hw.h"
#include "ivpu_ipc.h"
#include "ivpu_job.h"
#include "ivpu_jsm_msg.h"
#include "ivpu_mmu.h"
#include "ivpu_ms.h"
#include "ivpu_pm.h"
#include "vpu_boot_api.h"

static bool ivpu_disable_recovery;
module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644);
MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected");

static unsigned long ivpu_tdr_timeout_ms;
module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644);
MODULE_PARM_DESC(tdr_timeout_ms, "Timeout for device hang detection, in milliseconds, 0 - default");

#define PM_RESCHEDULE_LIMIT     5

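/*
 * Reset command queues, IPC and the FW log, reload the firmware image and
 * select the cold boot entry point for the next boot.
 */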
static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;

	ivpu_cmdq_reset_all_contexts(vdev);
	ivpu_ipc_reset(vdev);
	ivpu_fw_log_reset(vdev);
	ivpu_fw_load(vdev);
	fw->entry_point = fw->cold_boot_entry_point;
}

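/*
 * Resume the firmware from the save/restore return address published in the
 * boot parameters; fall back to a cold boot if the firmware has not set one.
 */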
static void ivpu_pm_prepare_warm_boot(struct ivpu_device *vdev)
{
	struct ivpu_fw_info *fw = vdev->fw;
	struct vpu_boot_params *bp = ivpu_bo_vaddr(fw->mem);

	if (!bp->save_restore_ret_address) {
		ivpu_pm_prepare_cold_boot(vdev);
		return;
	}

	ivpu_dbg(vdev, FW_BOOT, "Save/restore entry point %llx", bp->save_restore_ret_address);
	fw->entry_point = bp->save_restore_ret_address;
}

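/* Prepare the device for reset and shut it down, logging any shutdown failure. */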
static int ivpu_suspend(struct ivpu_device *vdev)
{
	int ret;

	ivpu_prepare_for_reset(vdev);

	ret = ivpu_shutdown(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret);

	return ret;
}

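/*
 * Power up the device, re-enable the MMU and boot the firmware. On failure
 * everything is torn down again; if the failed attempt was a warm boot, the
 * device is switched to cold boot state and the resume is retried.
 */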
static int ivpu_resume(struct ivpu_device *vdev)
{
	int ret;

retry:
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0);
	pci_restore_state(to_pci_dev(vdev->drm.dev));

	ret = ivpu_hw_power_up(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to power up HW: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_mmu_enable(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume MMU: %d\n", ret);
		goto err_power_down;
	}

	ret = ivpu_boot(vdev);
	if (ret)
		goto err_mmu_disable;

	return 0;

err_mmu_disable:
	ivpu_mmu_disable(vdev);
err_power_down:
	ivpu_hw_power_down(vdev);
	pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot);

	if (!ivpu_fw_is_cold_boot(vdev)) {
		ivpu_pm_prepare_cold_boot(vdev);
		goto retry;
	} else {
		ivpu_err(vdev, "Failed to resume the FW: %d\n", ret);
	}

	return ret;
}

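/*
 * Start a device reset: disable runtime PM, bump the reset counter, mark a
 * reset as pending and take the reset lock for writing.
 */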
static void ivpu_pm_reset_begin(struct ivpu_device *vdev)
{
	pm_runtime_disable(vdev->drm.dev);

	atomic_inc(&vdev->pm->reset_counter);
	atomic_set(&vdev->pm->reset_pending, 1);
	down_write(&vdev->pm->reset_lock);
}

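/*
 * Finish a device reset: switch to cold boot state, abort all jobs, clean up
 * metric streamer (MS) state, resume the device and re-enable runtime PM.
 */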
static void ivpu_pm_reset_complete(struct ivpu_device *vdev)
{
	int ret;

	ivpu_pm_prepare_cold_boot(vdev);
	ivpu_jobs_abort_all(vdev);
	ivpu_ms_cleanup_all(vdev);

	ret = ivpu_resume(vdev);
	if (ret) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	} else {
		pm_runtime_set_active(vdev->drm.dev);
	}

	up_write(&vdev->pm->reset_lock);
	atomic_set(&vdev->pm->reset_pending, 0);

	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_enable(vdev->drm.dev);
}

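/*
 * Recovery worker: capture diagnostic state (JSM state dump and coredump) if
 * the device is powered, reset it, and notify user space with a uevent.
 */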
static void ivpu_pm_recovery_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work);
	struct ivpu_device *vdev = pm->vdev;
	char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};

	ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_jsm_state_dump(vdev);
		ivpu_dev_coredump(vdev);
		ivpu_suspend(vdev);
	}

	ivpu_pm_reset_complete(vdev);

	kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
}

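/*
 * Schedule the recovery worker unless recovery is disabled by the module
 * parameter, the device is an FPGA, or a recovery is already in progress.
 */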
void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
{
	ivpu_err(vdev, "Recovery triggered by %s\n", reason);

	if (ivpu_disable_recovery) {
		ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
		return;
	}

	if (ivpu_is_fpga(vdev)) {
		ivpu_err(vdev, "Recovery not available on FPGA\n");
		return;
	}

	/* Trigger recovery if it's not in progress */
	if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
		ivpu_hw_diagnose_failure(vdev);
		ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
		queue_work(system_long_wq, &vdev->pm->recovery_work);
	}
}

static void ivpu_job_timeout_work(struct work_struct *work)
{
	struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
	struct ivpu_device *vdev = pm->vdev;

	ivpu_pm_trigger_recovery(vdev, "TDR");
}

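/*
 * Arm the job timeout (hang detection) timer, using the module parameter
 * override when set, otherwise the per-platform TDR timeout.
 */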
void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
{
	unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;

	/* No-op if already queued */
	queue_delayed_work(system_wq, &vdev->pm->job_timeout_work, msecs_to_jiffies(timeout_ms));
}

void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev)
{
	cancel_delayed_work_sync(&vdev->pm->job_timeout_work);
}

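/*
 * System suspend callback: wait (up to the TDR timeout) for the NPU to go
 * idle, request D0i3 entry from the firmware, power the device down and
 * prepare a warm boot for the following resume.
 */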
int ivpu_pm_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	unsigned long timeout;

	ivpu_dbg(vdev, PM, "Suspend..\n");

	timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr);
	while (!ivpu_hw_is_idle(vdev)) {
		cond_resched();
		if (time_after_eq(jiffies, timeout)) {
			ivpu_err(vdev, "Failed to enter idle on system suspend\n");
			return -EBUSY;
		}
	}

	ivpu_jsm_pwr_d0i3_enter(vdev);

	ivpu_suspend(vdev);
	ivpu_pm_prepare_warm_boot(vdev);

	ivpu_dbg(vdev, PM, "Suspend done.\n");

	return 0;
}

int ivpu_pm_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	ivpu_dbg(vdev, PM, "Resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to resume: %d\n", ret);

	ivpu_dbg(vdev, PM, "Resume done.\n");

	return ret;
}

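/*
 * Runtime suspend callback: request D0i3 entry and power the device down.
 * If the NPU was not idle or D0i3 entry failed, capture a coredump and force
 * a cold boot on the next resume; otherwise prepare a warm boot.
 */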
int ivpu_pm_runtime_suspend_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret, ret_d0i3;
	bool is_idle;

	drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
	drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));

	ivpu_dbg(vdev, PM, "Runtime suspend..\n");

	ivpu_mmu_disable(vdev);

	is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent;
	if (!is_idle)
		ivpu_err(vdev, "NPU is not idle before autosuspend\n");

	ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev);
	if (ret_d0i3)
		ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3);

	ret = ivpu_suspend(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret);

	if (!is_idle || ret_d0i3) {
		ivpu_err(vdev, "Forcing cold boot due to previous errors\n");
		atomic_inc(&vdev->pm->reset_counter);
		ivpu_dev_coredump(vdev);
		ivpu_pm_prepare_cold_boot(vdev);
	} else {
		ivpu_pm_prepare_warm_boot(vdev);
	}

	ivpu_dbg(vdev, PM, "Runtime suspend done.\n");

	return 0;
}

int ivpu_pm_runtime_resume_cb(struct device *dev)
{
	struct drm_device *drm = dev_get_drvdata(dev);
	struct ivpu_device *vdev = to_ivpu_device(drm);
	int ret;

	ivpu_dbg(vdev, PM, "Runtime resume..\n");

	ret = ivpu_resume(vdev);
	if (ret)
		ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);

	ivpu_dbg(vdev, PM, "Runtime resume done.\n");

	return ret;
}

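/*
 * Runtime PM reference helpers: ivpu_rpm_get() resumes the device and takes a
 * reference, ivpu_rpm_put() drops it and schedules autosuspend.
 */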
int ivpu_rpm_get(struct ivpu_device *vdev)
{
	int ret;

	ret = pm_runtime_resume_and_get(vdev->drm.dev);
	if (ret < 0) {
		ivpu_err(vdev, "Failed to resume NPU: %d\n", ret);
		pm_runtime_set_suspended(vdev->drm.dev);
	}

	return ret;
}

void ivpu_rpm_put(struct ivpu_device *vdev)
{
	pm_runtime_mark_last_busy(vdev->drm.dev);
	pm_runtime_put_autosuspend(vdev->drm.dev);
}

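/*
 * PCI reset callbacks, called by the PCI core before and after a device
 * reset: quiesce and reset the hardware first, then fully reinitialize it.
 */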
void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Pre-reset..\n");

	ivpu_pm_reset_begin(vdev);

	if (!pm_runtime_status_suspended(vdev->drm.dev)) {
		ivpu_prepare_for_reset(vdev);
		ivpu_hw_reset(vdev);
	}

	ivpu_dbg(vdev, PM, "Pre-reset done.\n");
}

void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
{
	struct ivpu_device *vdev = pci_get_drvdata(pdev);

	ivpu_dbg(vdev, PM, "Post-reset..\n");

	ivpu_pm_reset_complete(vdev);

	ivpu_dbg(vdev, PM, "Post-reset done.\n");
}

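/*
 * Initialize PM state: reset bookkeeping, recovery and job timeout workers,
 * and runtime PM autosuspend (delay -1, i.e. disabled, when recovery is
 * disabled).
 */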
void ivpu_pm_init(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;
	struct ivpu_pm_info *pm = vdev->pm;
	int delay;

	pm->vdev = vdev;

	init_rwsem(&pm->reset_lock);
	atomic_set(&pm->reset_pending, 0);
	atomic_set(&pm->reset_counter, 0);

	INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
	INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);

	if (ivpu_disable_recovery)
		delay = -1;
	else
		delay = vdev->timeout.autosuspend;

	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);

	ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
}

void ivpu_pm_disable_recovery(struct ivpu_device *vdev)
{
	drm_WARN_ON(&vdev->drm, delayed_work_pending(&vdev->pm->job_timeout_work));
	disable_work_sync(&vdev->pm->recovery_work);
}

void ivpu_pm_enable(struct ivpu_device *vdev)
{
	struct device *dev = vdev->drm.dev;

	pm_runtime_allow(dev);
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
}

void ivpu_pm_disable(struct ivpu_device *vdev)
{
	pm_runtime_get_noresume(vdev->drm.dev);
	pm_runtime_forbid(vdev->drm.dev);
}

int ivpu_pm_dct_init(struct ivpu_device *vdev)
{
	if (vdev->pm->dct_active_percent)
		return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent);

	return 0;
}

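/*
 * Enable DCT (duty cycle throttling): split DCT_PERIOD_US into active (D0)
 * and inactive (D0i2) time according to the requested active percentage and
 * send the request to the firmware.
 */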
int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent)
{
	u32 active_us, inactive_us;
	int ret;

	if (active_percent == 0 || active_percent > 100)
		return -EINVAL;

	active_us = (DCT_PERIOD_US * active_percent) / 100;
	inactive_us = DCT_PERIOD_US - active_us;

	vdev->pm->dct_active_percent = active_percent;

	ivpu_dbg(vdev, PM, "DCT requested %u%% (D0: %uus, D0i2: %uus)\n",
		 active_percent, active_us, inactive_us);

	ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

int ivpu_pm_dct_disable(struct ivpu_device *vdev)
{
	int ret;

	vdev->pm->dct_active_percent = 0;

	ivpu_dbg(vdev, PM, "DCT requested to be disabled\n");

	ret = ivpu_jsm_dct_disable(vdev);
	if (ret) {
		ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret);
		return ret;
	}

	return 0;
}

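/*
 * Handle a hardware DCT request: enable DCT at the default active percentage
 * or disable it, then report the resulting status back to the hardware.
 */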
void ivpu_pm_dct_irq_thread_handler(struct ivpu_device *vdev)
{
	bool enable;
	int ret;

	if (ivpu_hw_btrs_dct_get_request(vdev, &enable))
		return;

	if (enable)
		ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT);
	else
		ret = ivpu_pm_dct_disable(vdev);

	if (!ret)
		ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent);
}
472