1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/pci-p2pdma.h>
36 #include <linux/apple-gmux.h>
37
38 #include <drm/drm_aperture.h>
39 #include <drm/drm_atomic_helper.h>
40 #include <drm/drm_crtc_helper.h>
41 #include <drm/drm_fb_helper.h>
42 #include <drm/drm_probe_helper.h>
43 #include <drm/amdgpu_drm.h>
44 #include <linux/device.h>
45 #include <linux/vgaarb.h>
46 #include <linux/vga_switcheroo.h>
47 #include <linux/efi.h>
48 #include "amdgpu.h"
49 #include "amdgpu_trace.h"
50 #include "amdgpu_i2c.h"
51 #include "atom.h"
52 #include "amdgpu_atombios.h"
53 #include "amdgpu_atomfirmware.h"
54 #include "amd_pcie.h"
55 #ifdef CONFIG_DRM_AMDGPU_SI
56 #include "si.h"
57 #endif
58 #ifdef CONFIG_DRM_AMDGPU_CIK
59 #include "cik.h"
60 #endif
61 #include "vi.h"
62 #include "soc15.h"
63 #include "nv.h"
64 #include "bif/bif_4_1_d.h"
65 #include <linux/firmware.h>
66 #include "amdgpu_vf_error.h"
67
68 #include "amdgpu_amdkfd.h"
69 #include "amdgpu_pm.h"
70
71 #include "amdgpu_xgmi.h"
72 #include "amdgpu_ras.h"
73 #include "amdgpu_pmu.h"
74 #include "amdgpu_fru_eeprom.h"
75 #include "amdgpu_reset.h"
76 #include "amdgpu_virt.h"
77 #include "amdgpu_dev_coredump.h"
78
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82
83 #include <drm/drm_drv.h>
84
85 #if IS_ENABLED(CONFIG_X86)
86 #include <asm/intel-family.h>
87 #endif
88
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96
97 #define AMDGPU_RESUME_MS 2000
98 #define AMDGPU_MAX_RETRY_LIMIT 2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
101 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
102 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
103
104 static const struct drm_driver amdgpu_kms_driver;
105
106 const char *amdgpu_asic_name[] = {
107 "TAHITI",
108 "PITCAIRN",
109 "VERDE",
110 "OLAND",
111 "HAINAN",
112 "BONAIRE",
113 "KAVERI",
114 "KABINI",
115 "HAWAII",
116 "MULLINS",
117 "TOPAZ",
118 "TONGA",
119 "FIJI",
120 "CARRIZO",
121 "STONEY",
122 "POLARIS10",
123 "POLARIS11",
124 "POLARIS12",
125 "VEGAM",
126 "VEGA10",
127 "VEGA12",
128 "VEGA20",
129 "RAVEN",
130 "ARCTURUS",
131 "RENOIR",
132 "ALDEBARAN",
133 "NAVI10",
134 "CYAN_SKILLFISH",
135 "NAVI14",
136 "NAVI12",
137 "SIENNA_CICHLID",
138 "NAVY_FLOUNDER",
139 "VANGOGH",
140 "DIMGREY_CAVEFISH",
141 "BEIGE_GOBY",
142 "YELLOW_CARP",
143 "IP DISCOVERY",
144 "LAST",
145 };
146
147 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
148 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
149 void *data);
150
151 /**
152 * DOC: pcie_replay_count
153 *
154 * The amdgpu driver provides a sysfs API for reporting the total number
155 * of PCIe replays (NAKs).
156 * The file pcie_replay_count is used for this and returns the total
157 * number of replays as a sum of the NAKs generated and NAKs received.
158 */
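/*
 * Illustrative sketch (not part of the driver): how userspace might read this
 * attribute. The PCI BDF in the path is a placeholder; the attribute lives on
 * the PCI device's sysfs directory because it is created on adev->dev->kobj.
 *
 *   FILE *f = fopen("/sys/bus/pci/devices/0000:03:00.0/pcie_replay_count", "r");
 *   unsigned long long replays = 0;
 *
 *   if (f) {
 *       if (fscanf(f, "%llu", &replays) != 1)
 *           replays = 0;
 *       fclose(f);
 *   }
 */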
159
160 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
161 struct device_attribute *attr, char *buf)
162 {
163 struct drm_device *ddev = dev_get_drvdata(dev);
164 struct amdgpu_device *adev = drm_to_adev(ddev);
165 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
166
167 return sysfs_emit(buf, "%llu\n", cnt);
168 }
169
170 static DEVICE_ATTR(pcie_replay_count, 0444,
171 amdgpu_device_get_pcie_replay_count, NULL);
172
173 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
174 {
175 int ret = 0;
176
177 if (!amdgpu_sriov_vf(adev))
178 ret = sysfs_create_file(&adev->dev->kobj,
179 &dev_attr_pcie_replay_count.attr);
180
181 return ret;
182 }
183
184 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
185 {
186 if (!amdgpu_sriov_vf(adev))
187 sysfs_remove_file(&adev->dev->kobj,
188 &dev_attr_pcie_replay_count.attr);
189 }
190
191 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
192 struct bin_attribute *attr, char *buf,
193 loff_t ppos, size_t count)
194 {
195 struct device *dev = kobj_to_dev(kobj);
196 struct drm_device *ddev = dev_get_drvdata(dev);
197 struct amdgpu_device *adev = drm_to_adev(ddev);
198 ssize_t bytes_read;
199
200 switch (ppos) {
201 case AMDGPU_SYS_REG_STATE_XGMI:
202 bytes_read = amdgpu_asic_get_reg_state(
203 adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
204 break;
205 case AMDGPU_SYS_REG_STATE_WAFL:
206 bytes_read = amdgpu_asic_get_reg_state(
207 adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
208 break;
209 case AMDGPU_SYS_REG_STATE_PCIE:
210 bytes_read = amdgpu_asic_get_reg_state(
211 adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
212 break;
213 case AMDGPU_SYS_REG_STATE_USR:
214 bytes_read = amdgpu_asic_get_reg_state(
215 adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
216 break;
217 case AMDGPU_SYS_REG_STATE_USR_1:
218 bytes_read = amdgpu_asic_get_reg_state(
219 adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
220 break;
221 default:
222 return -EINVAL;
223 }
224
225 return bytes_read;
226 }
227
228 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
229 AMDGPU_SYS_REG_STATE_END);
230
231 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
232 {
233 int ret;
234
235 if (!amdgpu_asic_get_reg_state_supported(adev))
236 return 0;
237
238 ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
239
240 return ret;
241 }
242
243 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
244 {
245 if (!amdgpu_asic_get_reg_state_supported(adev))
246 return;
247 sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
248 }
249
250 /**
251 * DOC: board_info
252 *
253 * The amdgpu driver provides a sysfs API for reporting board-related information.
254 * It provides the form factor information in the format
255 *
256 * type : form factor
257 *
258 * Possible form factor values
259 *
260 * - "cem" - PCIE CEM card
261 * - "oam" - Open Compute Accelerator Module
262 * - "unknown" - Not known
263 *
264 */
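/*
 * Illustrative sketch (not part of the driver): reading board_info from
 * userspace. The PCI BDF in the path is a placeholder.
 *
 *   char line[32];
 *   FILE *f = fopen("/sys/bus/pci/devices/0000:03:00.0/board_info", "r");
 *
 *   if (f && fgets(line, sizeof(line), f))
 *       printf("%s", line);   // e.g. "type : oam"
 *   if (f)
 *       fclose(f);
 */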
265
266 static ssize_t amdgpu_device_get_board_info(struct device *dev,
267 struct device_attribute *attr,
268 char *buf)
269 {
270 struct drm_device *ddev = dev_get_drvdata(dev);
271 struct amdgpu_device *adev = drm_to_adev(ddev);
272 enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
273 const char *pkg;
274
275 if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
276 pkg_type = adev->smuio.funcs->get_pkg_type(adev);
277
278 switch (pkg_type) {
279 case AMDGPU_PKG_TYPE_CEM:
280 pkg = "cem";
281 break;
282 case AMDGPU_PKG_TYPE_OAM:
283 pkg = "oam";
284 break;
285 default:
286 pkg = "unknown";
287 break;
288 }
289
290 return sysfs_emit(buf, "%s : %s\n", "type", pkg);
291 }
292
293 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
294
295 static struct attribute *amdgpu_board_attrs[] = {
296 &dev_attr_board_info.attr,
297 NULL,
298 };
299
300 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
301 struct attribute *attr, int n)
302 {
303 struct device *dev = kobj_to_dev(kobj);
304 struct drm_device *ddev = dev_get_drvdata(dev);
305 struct amdgpu_device *adev = drm_to_adev(ddev);
306
307 if (adev->flags & AMD_IS_APU)
308 return 0;
309
310 return attr->mode;
311 }
312
313 static const struct attribute_group amdgpu_board_attrs_group = {
314 .attrs = amdgpu_board_attrs,
315 .is_visible = amdgpu_board_attrs_is_visible
316 };
317
318 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
319
320
321 /**
322 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
323 *
324 * @dev: drm_device pointer
325 *
326 * Returns true if the device is a dGPU with ATPX power control,
327 * otherwise returns false.
328 */
329 bool amdgpu_device_supports_px(struct drm_device *dev)
330 {
331 struct amdgpu_device *adev = drm_to_adev(dev);
332
333 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
334 return true;
335 return false;
336 }
337
338 /**
339 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
340 *
341 * @dev: drm_device pointer
342 *
343 * Returns true if the device is a dGPU with ACPI power control,
344 * otherwise returns false.
345 */
346 bool amdgpu_device_supports_boco(struct drm_device *dev)
347 {
348 struct amdgpu_device *adev = drm_to_adev(dev);
349
350 if (adev->has_pr3 ||
351 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
352 return true;
353 return false;
354 }
355
356 /**
357 * amdgpu_device_supports_baco - Does the device support BACO
358 *
359 * @dev: drm_device pointer
360 *
361 * Return:
362 * 1 if the device supports BACO;
363 * 3 if the device supports MACO (only works if BACO is supported);
364 * otherwise returns 0.
365 */
366 int amdgpu_device_supports_baco(struct drm_device *dev)
367 {
368 struct amdgpu_device *adev = drm_to_adev(dev);
369
370 return amdgpu_asic_supports_baco(adev);
371 }
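/*
 * Illustrative sketch (not part of the driver): how a caller can decode the
 * bitmask returned above, using the BACO_SUPPORT/MACO_SUPPORT flags that are
 * also checked in amdgpu_device_detect_runtime_pm_mode() below.
 *
 *   int caps = amdgpu_device_supports_baco(dev);
 *
 *   if (caps & MACO_SUPPORT)
 *       ;   // both BACO and MACO (BAMACO) are usable
 *   else if (caps & BACO_SUPPORT)
 *       ;   // plain BACO only
 *   else
 *       ;   // neither is supported
 */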
372
373 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
374 {
375 struct drm_device *dev;
376 int bamaco_support;
377
378 dev = adev_to_drm(adev);
379
380 adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
381 bamaco_support = amdgpu_device_supports_baco(dev);
382
383 switch (amdgpu_runtime_pm) {
384 case 2:
385 if (bamaco_support & MACO_SUPPORT) {
386 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
387 dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
388 } else if (bamaco_support == BACO_SUPPORT) {
389 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
390 dev_info(adev->dev, "Requested mode BAMACO not available, falling back to BACO\n");
391 }
392 break;
393 case 1:
394 if (bamaco_support & BACO_SUPPORT) {
395 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
396 dev_info(adev->dev, "Forcing BACO for runtime pm\n");
397 }
398 break;
399 case -1:
400 case -2:
401 if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
402 adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
403 dev_info(adev->dev, "Using ATPX for runtime pm\n");
404 } else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
405 adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
406 dev_info(adev->dev, "Using BOCO for runtime pm\n");
407 } else {
408 if (!bamaco_support)
409 goto no_runtime_pm;
410
411 switch (adev->asic_type) {
412 case CHIP_VEGA20:
413 case CHIP_ARCTURUS:
414 /* BACO is not supported on vega20 and arcturus */
415 break;
416 case CHIP_VEGA10:
417 /* enable BACO as runpm mode if noretry=0 */
418 if (!adev->gmc.noretry)
419 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
420 break;
421 default:
422 /* enable BACO as runpm mode on CI+ */
423 adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
424 break;
425 }
426
427 if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
428 if (bamaco_support & MACO_SUPPORT) {
429 adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
430 dev_info(adev->dev, "Using BAMACO for runtime pm\n");
431 } else {
432 dev_info(adev->dev, "Using BACO for runtime pm\n");
433 }
434 }
435 }
436 break;
437 case 0:
438 dev_info(adev->dev, "runtime pm is manually disabled\n");
439 break;
440 default:
441 break;
442 }
443
444 no_runtime_pm:
445 if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
446 dev_info(adev->dev, "Runtime PM not available\n");
447 }
448 /**
449 * amdgpu_device_supports_smart_shift - Is the device dGPU with
450 * smart shift support
451 *
452 * @dev: drm_device pointer
453 *
454 * Returns true if the device is a dGPU with Smart Shift support,
455 * otherwise returns false.
456 */
457 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
458 {
459 return (amdgpu_device_supports_boco(dev) &&
460 amdgpu_acpi_is_power_shift_control_supported());
461 }
462
463 /*
464 * VRAM access helper functions
465 */
466
467 /**
468 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
469 *
470 * @adev: amdgpu_device pointer
471 * @pos: offset of the buffer in vram
472 * @buf: virtual address of the buffer in system memory
473 * @size: read/write size; the buffer at @buf must be at least @size bytes
474 * @write: true - write to vram, otherwise - read from vram
475 */
476 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
477 void *buf, size_t size, bool write)
478 {
479 unsigned long flags;
480 uint32_t hi = ~0, tmp = 0;
481 uint32_t *data = buf;
482 uint64_t last;
483 int idx;
484
485 if (!drm_dev_enter(adev_to_drm(adev), &idx))
486 return;
487
488 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
489
490 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
491 for (last = pos + size; pos < last; pos += 4) {
492 tmp = pos >> 31;
493
494 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
495 if (tmp != hi) {
496 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
497 hi = tmp;
498 }
499 if (write)
500 WREG32_NO_KIQ(mmMM_DATA, *data++);
501 else
502 *data++ = RREG32_NO_KIQ(mmMM_DATA);
503 }
504
505 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
506 drm_dev_exit(idx);
507 }
508
509 /**
510 * amdgpu_device_aper_access - access vram through the vram aperture
511 *
512 * @adev: amdgpu_device pointer
513 * @pos: offset of the buffer in vram
514 * @buf: virtual address of the buffer in system memory
515 * @size: read/write size; the buffer at @buf must be at least @size bytes
516 * @write: true - write to vram, otherwise - read from vram
517 *
518 * Returns the number of bytes transferred.
519 */
520 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
521 void *buf, size_t size, bool write)
522 {
523 #ifdef CONFIG_64BIT
524 void __iomem *addr;
525 size_t count = 0;
526 uint64_t last;
527
528 if (!adev->mman.aper_base_kaddr)
529 return 0;
530
531 last = min(pos + size, adev->gmc.visible_vram_size);
532 if (last > pos) {
533 addr = adev->mman.aper_base_kaddr + pos;
534 count = last - pos;
535
536 if (write) {
537 memcpy_toio(addr, buf, count);
538 /* Make sure the HDP write cache flush happens without any reordering
539 * after the system memory contents have been sent over PCIe to the device
540 */
541 mb();
542 amdgpu_device_flush_hdp(adev, NULL);
543 } else {
544 amdgpu_device_invalidate_hdp(adev, NULL);
545 /* Make sure HDP read cache is invalidated before issuing a read
546 * to the PCIe device
547 */
548 mb();
549 memcpy_fromio(buf, addr, count);
550 }
551
552 }
553
554 return count;
555 #else
556 return 0;
557 #endif
558 }
559
560 /**
561 * amdgpu_device_vram_access - read/write a buffer in vram
562 *
563 * @adev: amdgpu_device pointer
564 * @pos: offset of the buffer in vram
565 * @buf: virtual address of the buffer in system memory
566 * @size: read/write size; the buffer at @buf must be at least @size bytes
567 * @write: true - write to vram, otherwise - read from vram
568 */
569 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
570 void *buf, size_t size, bool write)
571 {
572 size_t count;
573
574 /* try using the vram aperture to access vram first */
575 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
576 size -= count;
577 if (size) {
578 /* use MM_INDEX/MM_DATA to access the rest of vram */
579 pos += count;
580 buf += count;
581 amdgpu_device_mm_access(adev, pos, buf, size, write);
582 }
583 }
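/*
 * Illustrative sketch (not part of the driver): reading a few dwords from the
 * start of VRAM with the helper above. Offset and size should be dword
 * aligned, since the MM_INDEX/MM_DATA fallback path requires it.
 *
 *   uint32_t data[4];
 *
 *   amdgpu_device_vram_access(adev, 0x0, data, sizeof(data), false);
 *   // data[] now holds the first 16 bytes of VRAM; pass write=true to copy
 *   // the buffer into VRAM instead.
 */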
584
585 /*
586 * register access helper functions.
587 */
588
589 /* Check if hw access should be skipped because of hotplug or device error */
590 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
591 {
592 if (adev->no_hw_access)
593 return true;
594
595 #ifdef CONFIG_LOCKDEP
596 /*
597 * This is a bit complicated to understand, so worth a comment. What we assert
598 * here is that the GPU reset is not running on another thread in parallel.
599 *
600 * For this we trylock the read side of the reset semaphore, if that succeeds
601 * we know that the reset is not running in parallel.
602 *
603 * If the trylock fails we assert that we are either already holding the read
604 * side of the lock or are the reset thread itself and hold the write side of
605 * the lock.
606 */
607 if (in_task()) {
608 if (down_read_trylock(&adev->reset_domain->sem))
609 up_read(&adev->reset_domain->sem);
610 else
611 lockdep_assert_held(&adev->reset_domain->sem);
612 }
613 #endif
614 return false;
615 }
616
617 /**
618 * amdgpu_device_rreg - read a memory mapped IO or indirect register
619 *
620 * @adev: amdgpu_device pointer
621 * @reg: dword aligned register offset
622 * @acc_flags: access flags which require special behavior
623 *
624 * Returns the 32 bit value from the offset specified.
625 */
626 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
627 uint32_t reg, uint32_t acc_flags)
628 {
629 uint32_t ret;
630
631 if (amdgpu_device_skip_hw_access(adev))
632 return 0;
633
634 if ((reg * 4) < adev->rmmio_size) {
635 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
636 amdgpu_sriov_runtime(adev) &&
637 down_read_trylock(&adev->reset_domain->sem)) {
638 ret = amdgpu_kiq_rreg(adev, reg, 0);
639 up_read(&adev->reset_domain->sem);
640 } else {
641 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
642 }
643 } else {
644 ret = adev->pcie_rreg(adev, reg * 4);
645 }
646
647 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
648
649 return ret;
650 }
651
652 /*
653 * MMIO register read helper functions that take a byte offset
654 * @offset: byte offset from MMIO start
655 */
656
657 /**
658 * amdgpu_mm_rreg8 - read a memory mapped IO register
659 *
660 * @adev: amdgpu_device pointer
661 * @offset: byte aligned register offset
662 *
663 * Returns the 8 bit value from the offset specified.
664 */
665 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
666 {
667 if (amdgpu_device_skip_hw_access(adev))
668 return 0;
669
670 if (offset < adev->rmmio_size)
671 return (readb(adev->rmmio + offset));
672 BUG();
673 }
674
675
676 /**
677 * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
678 *
679 * @adev: amdgpu_device pointer
680 * @reg: dword aligned register offset
681 * @acc_flags: access flags which require special behavior
682 * @xcc_id: xcc accelerated compute core id
683 *
684 * Returns the 32 bit value from the offset specified.
685 */
686 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
687 uint32_t reg, uint32_t acc_flags,
688 uint32_t xcc_id)
689 {
690 uint32_t ret, rlcg_flag;
691
692 if (amdgpu_device_skip_hw_access(adev))
693 return 0;
694
695 if ((reg * 4) < adev->rmmio_size) {
696 if (amdgpu_sriov_vf(adev) &&
697 !amdgpu_sriov_runtime(adev) &&
698 adev->gfx.rlc.rlcg_reg_access_supported &&
699 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
700 GC_HWIP, false,
701 &rlcg_flag)) {
702 ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
703 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
704 amdgpu_sriov_runtime(adev) &&
705 down_read_trylock(&adev->reset_domain->sem)) {
706 ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
707 up_read(&adev->reset_domain->sem);
708 } else {
709 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
710 }
711 } else {
712 ret = adev->pcie_rreg(adev, reg * 4);
713 }
714
715 return ret;
716 }
717
718 /*
719 * MMIO register write helper functions that take a byte offset
720 * @offset: byte offset from MMIO start
721 * @value: the value to be written to the register
722 */
723
724 /**
725 * amdgpu_mm_wreg8 - write a memory mapped IO register
726 *
727 * @adev: amdgpu_device pointer
728 * @offset: byte aligned register offset
729 * @value: 8 bit value to write
730 *
731 * Writes the value specified to the offset specified.
732 */
733 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
734 {
735 if (amdgpu_device_skip_hw_access(adev))
736 return;
737
738 if (offset < adev->rmmio_size)
739 writeb(value, adev->rmmio + offset);
740 else
741 BUG();
742 }
743
744 /**
745 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
746 *
747 * @adev: amdgpu_device pointer
748 * @reg: dword aligned register offset
749 * @v: 32 bit value to write to the register
750 * @acc_flags: access flags which require special behavior
751 *
752 * Writes the value specified to the offset specified.
753 */
754 void amdgpu_device_wreg(struct amdgpu_device *adev,
755 uint32_t reg, uint32_t v,
756 uint32_t acc_flags)
757 {
758 if (amdgpu_device_skip_hw_access(adev))
759 return;
760
761 if ((reg * 4) < adev->rmmio_size) {
762 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
763 amdgpu_sriov_runtime(adev) &&
764 down_read_trylock(&adev->reset_domain->sem)) {
765 amdgpu_kiq_wreg(adev, reg, v, 0);
766 up_read(&adev->reset_domain->sem);
767 } else {
768 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
769 }
770 } else {
771 adev->pcie_wreg(adev, reg * 4, v);
772 }
773
774 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
775 }
776
777 /**
778 * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
779 *
780 * @adev: amdgpu_device pointer
781 * @reg: mmio/rlc register
782 * @v: value to write
783 * @xcc_id: xcc accelerated compute core id
784 *
785 * this function is invoked only for the debugfs register access
786 */
787 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
788 uint32_t reg, uint32_t v,
789 uint32_t xcc_id)
790 {
791 if (amdgpu_device_skip_hw_access(adev))
792 return;
793
794 if (amdgpu_sriov_fullaccess(adev) &&
795 adev->gfx.rlc.funcs &&
796 adev->gfx.rlc.funcs->is_rlcg_access_range) {
797 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
798 return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
799 } else if ((reg * 4) >= adev->rmmio_size) {
800 adev->pcie_wreg(adev, reg * 4, v);
801 } else {
802 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
803 }
804 }
805
806 /**
807 * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
808 *
809 * @adev: amdgpu_device pointer
810 * @reg: dword aligned register offset
811 * @v: 32 bit value to write to the register
812 * @acc_flags: access flags which require special behavior
813 * @xcc_id: xcc accelerated compute core id
814 *
815 * Writes the value specified to the offset specified.
816 */
817 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
818 uint32_t reg, uint32_t v,
819 uint32_t acc_flags, uint32_t xcc_id)
820 {
821 uint32_t rlcg_flag;
822
823 if (amdgpu_device_skip_hw_access(adev))
824 return;
825
826 if ((reg * 4) < adev->rmmio_size) {
827 if (amdgpu_sriov_vf(adev) &&
828 !amdgpu_sriov_runtime(adev) &&
829 adev->gfx.rlc.rlcg_reg_access_supported &&
830 amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
831 GC_HWIP, true,
832 &rlcg_flag)) {
833 amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
834 } else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
835 amdgpu_sriov_runtime(adev) &&
836 down_read_trylock(&adev->reset_domain->sem)) {
837 amdgpu_kiq_wreg(adev, reg, v, xcc_id);
838 up_read(&adev->reset_domain->sem);
839 } else {
840 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
841 }
842 } else {
843 adev->pcie_wreg(adev, reg * 4, v);
844 }
845 }
846
847 /**
848 * amdgpu_device_indirect_rreg - read an indirect register
849 *
850 * @adev: amdgpu_device pointer
851 * @reg_addr: indirect register address to read from
852 *
853 * Returns the value of indirect register @reg_addr
854 */
855 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
856 u32 reg_addr)
857 {
858 unsigned long flags, pcie_index, pcie_data;
859 void __iomem *pcie_index_offset;
860 void __iomem *pcie_data_offset;
861 u32 r;
862
863 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
864 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
865
866 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
867 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
868 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
869
870 writel(reg_addr, pcie_index_offset);
871 readl(pcie_index_offset);
872 r = readl(pcie_data_offset);
873 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
874
875 return r;
876 }
877
878 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
879 u64 reg_addr)
880 {
881 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
882 u32 r;
883 void __iomem *pcie_index_offset;
884 void __iomem *pcie_index_hi_offset;
885 void __iomem *pcie_data_offset;
886
887 if (unlikely(!adev->nbio.funcs)) {
888 pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
889 pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
890 } else {
891 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
892 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
893 }
894
895 if (reg_addr >> 32) {
896 if (unlikely(!adev->nbio.funcs))
897 pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
898 else
899 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
900 } else {
901 pcie_index_hi = 0;
902 }
903
904 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
905 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
906 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
907 if (pcie_index_hi != 0)
908 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
909 pcie_index_hi * 4;
910
911 writel(reg_addr, pcie_index_offset);
912 readl(pcie_index_offset);
913 if (pcie_index_hi != 0) {
914 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
915 readl(pcie_index_hi_offset);
916 }
917 r = readl(pcie_data_offset);
918
919 /* clear the high bits */
920 if (pcie_index_hi != 0) {
921 writel(0, pcie_index_hi_offset);
922 readl(pcie_index_hi_offset);
923 }
924
925 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
926
927 return r;
928 }
929
930 /**
931 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
932 *
933 * @adev: amdgpu_device pointer
934 * @reg_addr: indirect register address to read from
935 *
936 * Returns the value of indirect register @reg_addr
937 */
938 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
939 u32 reg_addr)
940 {
941 unsigned long flags, pcie_index, pcie_data;
942 void __iomem *pcie_index_offset;
943 void __iomem *pcie_data_offset;
944 u64 r;
945
946 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
947 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
948
949 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
950 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
951 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
952
953 /* read low 32 bits */
954 writel(reg_addr, pcie_index_offset);
955 readl(pcie_index_offset);
956 r = readl(pcie_data_offset);
957 /* read high 32 bits */
958 writel(reg_addr + 4, pcie_index_offset);
959 readl(pcie_index_offset);
960 r |= ((u64)readl(pcie_data_offset) << 32);
961 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
962
963 return r;
964 }
965
966 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
967 u64 reg_addr)
968 {
969 unsigned long flags, pcie_index, pcie_data;
970 unsigned long pcie_index_hi = 0;
971 void __iomem *pcie_index_offset;
972 void __iomem *pcie_index_hi_offset;
973 void __iomem *pcie_data_offset;
974 u64 r;
975
976 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
977 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
978 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
979 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
980
981 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
982 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
983 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
984 if (pcie_index_hi != 0)
985 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
986 pcie_index_hi * 4;
987
988 /* read low 32 bits */
989 writel(reg_addr, pcie_index_offset);
990 readl(pcie_index_offset);
991 if (pcie_index_hi != 0) {
992 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
993 readl(pcie_index_hi_offset);
994 }
995 r = readl(pcie_data_offset);
996 /* read high 32 bits */
997 writel(reg_addr + 4, pcie_index_offset);
998 readl(pcie_index_offset);
999 if (pcie_index_hi != 0) {
1000 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1001 readl(pcie_index_hi_offset);
1002 }
1003 r |= ((u64)readl(pcie_data_offset) << 32);
1004
1005 /* clear the high bits */
1006 if (pcie_index_hi != 0) {
1007 writel(0, pcie_index_hi_offset);
1008 readl(pcie_index_hi_offset);
1009 }
1010
1011 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1012
1013 return r;
1014 }
1015
1016 /**
1017 * amdgpu_device_indirect_wreg - write an indirect register
1018 *
1019 * @adev: amdgpu_device pointer
1020 * @reg_addr: indirect register offset
1021 * @reg_data: indirect register data
1022 *
1023 */
1024 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1025 u32 reg_addr, u32 reg_data)
1026 {
1027 unsigned long flags, pcie_index, pcie_data;
1028 void __iomem *pcie_index_offset;
1029 void __iomem *pcie_data_offset;
1030
1031 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1032 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1033
1034 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1035 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1036 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1037
1038 writel(reg_addr, pcie_index_offset);
1039 readl(pcie_index_offset);
1040 writel(reg_data, pcie_data_offset);
1041 readl(pcie_data_offset);
1042 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1043 }
1044
1045 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1046 u64 reg_addr, u32 reg_data)
1047 {
1048 unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1049 void __iomem *pcie_index_offset;
1050 void __iomem *pcie_index_hi_offset;
1051 void __iomem *pcie_data_offset;
1052
1053 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1054 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1055 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1056 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1057 else
1058 pcie_index_hi = 0;
1059
1060 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1061 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1062 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1063 if (pcie_index_hi != 0)
1064 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1065 pcie_index_hi * 4;
1066
1067 writel(reg_addr, pcie_index_offset);
1068 readl(pcie_index_offset);
1069 if (pcie_index_hi != 0) {
1070 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1071 readl(pcie_index_hi_offset);
1072 }
1073 writel(reg_data, pcie_data_offset);
1074 readl(pcie_data_offset);
1075
1076 /* clear the high bits */
1077 if (pcie_index_hi != 0) {
1078 writel(0, pcie_index_hi_offset);
1079 readl(pcie_index_hi_offset);
1080 }
1081
1082 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1083 }
1084
1085 /**
1086 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
1087 *
1088 * @adev: amdgpu_device pointer
1089 * @reg_addr: indirect register offset
1090 * @reg_data: indirect register data
1091 *
1092 */
1093 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1094 u32 reg_addr, u64 reg_data)
1095 {
1096 unsigned long flags, pcie_index, pcie_data;
1097 void __iomem *pcie_index_offset;
1098 void __iomem *pcie_data_offset;
1099
1100 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1101 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1102
1103 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1104 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1105 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1106
1107 /* write low 32 bits */
1108 writel(reg_addr, pcie_index_offset);
1109 readl(pcie_index_offset);
1110 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1111 readl(pcie_data_offset);
1112 /* write high 32 bits */
1113 writel(reg_addr + 4, pcie_index_offset);
1114 readl(pcie_index_offset);
1115 writel((u32)(reg_data >> 32), pcie_data_offset);
1116 readl(pcie_data_offset);
1117 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1118 }
1119
1120 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1121 u64 reg_addr, u64 reg_data)
1122 {
1123 unsigned long flags, pcie_index, pcie_data;
1124 unsigned long pcie_index_hi = 0;
1125 void __iomem *pcie_index_offset;
1126 void __iomem *pcie_index_hi_offset;
1127 void __iomem *pcie_data_offset;
1128
1129 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1130 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1131 if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1132 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1133
1134 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1135 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1136 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1137 if (pcie_index_hi != 0)
1138 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1139 pcie_index_hi * 4;
1140
1141 /* write low 32 bits */
1142 writel(reg_addr, pcie_index_offset);
1143 readl(pcie_index_offset);
1144 if (pcie_index_hi != 0) {
1145 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1146 readl(pcie_index_hi_offset);
1147 }
1148 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1149 readl(pcie_data_offset);
1150 /* write high 32 bits */
1151 writel(reg_addr + 4, pcie_index_offset);
1152 readl(pcie_index_offset);
1153 if (pcie_index_hi != 0) {
1154 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1155 readl(pcie_index_hi_offset);
1156 }
1157 writel((u32)(reg_data >> 32), pcie_data_offset);
1158 readl(pcie_data_offset);
1159
1160 /* clear the high bits */
1161 if (pcie_index_hi != 0) {
1162 writel(0, pcie_index_hi_offset);
1163 readl(pcie_index_hi_offset);
1164 }
1165
1166 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1167 }
1168
1169 /**
1170 * amdgpu_device_get_rev_id - query device rev_id
1171 *
1172 * @adev: amdgpu_device pointer
1173 *
1174 * Return device rev_id
1175 */
1176 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1177 {
1178 return adev->nbio.funcs->get_rev_id(adev);
1179 }
1180
1181 /**
1182 * amdgpu_invalid_rreg - dummy reg read function
1183 *
1184 * @adev: amdgpu_device pointer
1185 * @reg: offset of register
1186 *
1187 * Dummy register read function. Used for register blocks
1188 * that certain asics don't have (all asics).
1189 * Returns the value in the register.
1190 */
1191 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1192 {
1193 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1194 BUG();
1195 return 0;
1196 }
1197
1198 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1199 {
1200 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1201 BUG();
1202 return 0;
1203 }
1204
1205 /**
1206 * amdgpu_invalid_wreg - dummy reg write function
1207 *
1208 * @adev: amdgpu_device pointer
1209 * @reg: offset of register
1210 * @v: value to write to the register
1211 *
1212 * Dummy register write function. Used for register blocks
1213 * that certain asics don't have (all asics).
1214 */
1215 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1216 {
1217 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1218 reg, v);
1219 BUG();
1220 }
1221
1222 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1223 {
1224 DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1225 reg, v);
1226 BUG();
1227 }
1228
1229 /**
1230 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1231 *
1232 * @adev: amdgpu_device pointer
1233 * @reg: offset of register
1234 *
1235 * Dummy register read function. Used for register blocks
1236 * that certain asics don't have (all asics).
1237 * Returns the value in the register.
1238 */
1239 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1240 {
1241 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1242 BUG();
1243 return 0;
1244 }
1245
1246 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1247 {
1248 DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1249 BUG();
1250 return 0;
1251 }
1252
1253 /**
1254 * amdgpu_invalid_wreg64 - dummy reg write function
1255 *
1256 * @adev: amdgpu_device pointer
1257 * @reg: offset of register
1258 * @v: value to write to the register
1259 *
1260 * Dummy register write function. Used for register blocks
1261 * that certain asics don't have (all asics).
1262 */
1263 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1264 {
1265 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1266 reg, v);
1267 BUG();
1268 }
1269
1270 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1271 {
1272 DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1273 reg, v);
1274 BUG();
1275 }
1276
1277 /**
1278 * amdgpu_block_invalid_rreg - dummy reg read function
1279 *
1280 * @adev: amdgpu_device pointer
1281 * @block: offset of instance
1282 * @reg: offset of register
1283 *
1284 * Dummy register read function. Used for register blocks
1285 * that certain asics don't have (all asics).
1286 * Returns the value in the register.
1287 */
1288 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1289 uint32_t block, uint32_t reg)
1290 {
1291 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1292 reg, block);
1293 BUG();
1294 return 0;
1295 }
1296
1297 /**
1298 * amdgpu_block_invalid_wreg - dummy reg write function
1299 *
1300 * @adev: amdgpu_device pointer
1301 * @block: offset of instance
1302 * @reg: offset of register
1303 * @v: value to write to the register
1304 *
1305 * Dummy register write function. Used for register blocks
1306 * that certain asics don't have (all asics).
1307 */
1308 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1309 uint32_t block,
1310 uint32_t reg, uint32_t v)
1311 {
1312 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1313 reg, block, v);
1314 BUG();
1315 }
1316
1317 /**
1318 * amdgpu_device_asic_init - Wrapper for atom asic_init
1319 *
1320 * @adev: amdgpu_device pointer
1321 *
1322 * Does any asic specific work and then calls atom asic init.
1323 */
1324 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1325 {
1326 int ret;
1327
1328 amdgpu_asic_pre_asic_init(adev);
1329
1330 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1331 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1332 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1333 amdgpu_psp_wait_for_bootloader(adev);
1334 ret = amdgpu_atomfirmware_asic_init(adev, true);
1335 return ret;
1336 } else {
1337 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1338 }
1339
1340 return 0;
1341 }
1342
1343 /**
1344 * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1345 *
1346 * @adev: amdgpu_device pointer
1347 *
1348 * Allocates a scratch page of VRAM for use by various things in the
1349 * driver.
1350 */
1351 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1352 {
1353 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1354 AMDGPU_GEM_DOMAIN_VRAM |
1355 AMDGPU_GEM_DOMAIN_GTT,
1356 &adev->mem_scratch.robj,
1357 &adev->mem_scratch.gpu_addr,
1358 (void **)&adev->mem_scratch.ptr);
1359 }
1360
1361 /**
1362 * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1363 *
1364 * @adev: amdgpu_device pointer
1365 *
1366 * Frees the VRAM scratch page.
1367 */
1368 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1369 {
1370 amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1371 }
1372
1373 /**
1374 * amdgpu_device_program_register_sequence - program an array of registers.
1375 *
1376 * @adev: amdgpu_device pointer
1377 * @registers: pointer to the register array
1378 * @array_size: size of the register array
1379 *
1380 * Programs an array of registers with AND/OR masks.
1381 * This is a helper for setting golden registers.
1382 */
1383 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1384 const u32 *registers,
1385 const u32 array_size)
1386 {
1387 u32 tmp, reg, and_mask, or_mask;
1388 int i;
1389
1390 if (array_size % 3)
1391 return;
1392
1393 for (i = 0; i < array_size; i += 3) {
1394 reg = registers[i + 0];
1395 and_mask = registers[i + 1];
1396 or_mask = registers[i + 2];
1397
1398 if (and_mask == 0xffffffff) {
1399 tmp = or_mask;
1400 } else {
1401 tmp = RREG32(reg);
1402 tmp &= ~and_mask;
1403 if (adev->family >= AMDGPU_FAMILY_AI)
1404 tmp |= (or_mask & and_mask);
1405 else
1406 tmp |= or_mask;
1407 }
1408 WREG32(reg, tmp);
1409 }
1410 }
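/*
 * Illustrative sketch (not part of the driver): the register array is a flat
 * list of {offset, and_mask, or_mask} triplets. The offsets below are
 * placeholders, not real golden settings.
 *
 *   static const u32 example_golden_regs[] = {
 *       // offset     and_mask     or_mask
 *       0x0000315c, 0xffffffff, 0x00000010,  // full mask: write or_mask as-is
 *       0x00009508, 0x00010000, 0x00010000,  // clear bit 16, then set it
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_regs,
 *                                           ARRAY_SIZE(example_golden_regs));
 */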
1411
1412 /**
1413 * amdgpu_device_pci_config_reset - reset the GPU
1414 *
1415 * @adev: amdgpu_device pointer
1416 *
1417 * Resets the GPU using the pci config reset sequence.
1418 * Only applicable to asics prior to vega10.
1419 */
1420 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1421 {
1422 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1423 }
1424
1425 /**
1426 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1427 *
1428 * @adev: amdgpu_device pointer
1429 *
1430 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1431 */
1432 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1433 {
1434 return pci_reset_function(adev->pdev);
1435 }
1436
1437 /*
1438 * amdgpu_device_wb_*()
1439 * Writeback is the method by which the GPU updates special pages in memory
1440 * with the status of certain GPU events (fences, ring pointers, etc.).
1441 */
1442
1443 /**
1444 * amdgpu_device_wb_fini - Disable Writeback and free memory
1445 *
1446 * @adev: amdgpu_device pointer
1447 *
1448 * Disables Writeback and frees the Writeback memory (all asics).
1449 * Used at driver shutdown.
1450 */
1451 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1452 {
1453 if (adev->wb.wb_obj) {
1454 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1455 &adev->wb.gpu_addr,
1456 (void **)&adev->wb.wb);
1457 adev->wb.wb_obj = NULL;
1458 }
1459 }
1460
1461 /**
1462 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1463 *
1464 * @adev: amdgpu_device pointer
1465 *
1466 * Initializes writeback and allocates writeback memory (all asics).
1467 * Used at driver startup.
1468 * Returns 0 on success or a negative error code on failure.
1469 */
1470 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1471 {
1472 int r;
1473
1474 if (adev->wb.wb_obj == NULL) {
1475 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1476 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1477 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1478 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1479 (void **)&adev->wb.wb);
1480 if (r) {
1481 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1482 return r;
1483 }
1484
1485 adev->wb.num_wb = AMDGPU_MAX_WB;
1486 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1487
1488 /* clear wb memory */
1489 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1490 }
1491
1492 return 0;
1493 }
1494
1495 /**
1496 * amdgpu_device_wb_get - Allocate a wb entry
1497 *
1498 * @adev: amdgpu_device pointer
1499 * @wb: wb index
1500 *
1501 * Allocate a wb slot for use by the driver (all asics).
1502 * Returns 0 on success or -EINVAL on failure.
1503 */
1504 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1505 {
1506 unsigned long flags, offset;
1507
1508 spin_lock_irqsave(&adev->wb.lock, flags);
1509 offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1510 if (offset < adev->wb.num_wb) {
1511 __set_bit(offset, adev->wb.used);
1512 spin_unlock_irqrestore(&adev->wb.lock, flags);
1513 *wb = offset << 3; /* convert to dw offset */
1514 return 0;
1515 } else {
1516 spin_unlock_irqrestore(&adev->wb.lock, flags);
1517 return -EINVAL;
1518 }
1519 }
1520
1521 /**
1522 * amdgpu_device_wb_free - Free a wb entry
1523 *
1524 * @adev: amdgpu_device pointer
1525 * @wb: wb index
1526 *
1527 * Free a wb slot allocated for use by the driver (all asics)
1528 */
1529 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1530 {
1531 unsigned long flags;
1532
1533 wb >>= 3;
1534 spin_lock_irqsave(&adev->wb.lock, flags);
1535 if (wb < adev->wb.num_wb)
1536 __clear_bit(wb, adev->wb.used);
1537 spin_unlock_irqrestore(&adev->wb.lock, flags);
1538 }
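/*
 * Illustrative sketch (not part of the driver): a typical writeback slot
 * lifecycle. The index returned by amdgpu_device_wb_get() is in dwords, so
 * the CPU pointer and GPU address are derived as shown.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *       volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *       u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *       // hand gpu_addr to a ring/engine, poll *cpu_ptr for updates, then:
 *       amdgpu_device_wb_free(adev, wb);
 *   }
 */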
1539
1540 /**
1541 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1542 *
1543 * @adev: amdgpu_device pointer
1544 *
1545 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1546 * to fail, but if any of the BARs is not accessible after the resize we abort
1547 * driver loading by returning -ENODEV.
1548 */
1549 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1550 {
1551 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1552 struct pci_bus *root;
1553 struct resource *res;
1554 unsigned int i;
1555 u16 cmd;
1556 int r;
1557
1558 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1559 return 0;
1560
1561 /* Bypass for VF */
1562 if (amdgpu_sriov_vf(adev))
1563 return 0;
1564
1565 /* resizing on Dell G5 SE platforms causes problems with runtime pm */
1566 if ((amdgpu_runtime_pm != 0) &&
1567 adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1568 adev->pdev->device == 0x731f &&
1569 adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1570 return 0;
1571
1572 /* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1573 if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1574 DRM_WARN("System can't access extended configuration space, please check!!\n");
1575
1576 /* skip if the bios has already enabled large BAR */
1577 if (adev->gmc.real_vram_size &&
1578 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1579 return 0;
1580
1581 /* Check if the root BUS has 64bit memory resources */
1582 root = adev->pdev->bus;
1583 while (root->parent)
1584 root = root->parent;
1585
1586 pci_bus_for_each_resource(root, res, i) {
1587 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1588 res->start > 0x100000000ull)
1589 break;
1590 }
1591
1592 /* Trying to resize is pointless without a root hub window above 4GB */
1593 if (!res)
1594 return 0;
1595
1596 /* Limit the BAR size to what is available */
1597 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1598 rbar_size);
1599
1600 /* Disable memory decoding while we change the BAR addresses and size */
1601 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1602 pci_write_config_word(adev->pdev, PCI_COMMAND,
1603 cmd & ~PCI_COMMAND_MEMORY);
1604
1605 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1606 amdgpu_doorbell_fini(adev);
1607 if (adev->asic_type >= CHIP_BONAIRE)
1608 pci_release_resource(adev->pdev, 2);
1609
1610 pci_release_resource(adev->pdev, 0);
1611
1612 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1613 if (r == -ENOSPC)
1614 DRM_INFO("Not enough PCI address space for a large BAR.");
1615 else if (r && r != -ENOTSUPP)
1616 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1617
1618 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1619
1620 /* When the doorbell or fb BAR isn't available we have no chance of
1621 * using the device.
1622 */
1623 r = amdgpu_doorbell_init(adev);
1624 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1625 return -ENODEV;
1626
1627 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1628
1629 return 0;
1630 }
1631
1632 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1633 {
1634 if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1635 return false;
1636
1637 return true;
1638 }
1639
1640 /*
1641 * GPU helpers function.
1642 */
1643 /**
1644 * amdgpu_device_need_post - check if the hw need post or not
1645 *
1646 * @adev: amdgpu_device pointer
1647 *
1648 * Check if the asic needs to be posted, e.g. because it has not been
1649 * initialized at driver startup or because a hw reset was performed.
1650 * Returns true if post is needed, false if not.
1651 */
1652 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1653 {
1654 uint32_t reg;
1655
1656 if (amdgpu_sriov_vf(adev))
1657 return false;
1658
1659 if (!amdgpu_device_read_bios(adev))
1660 return false;
1661
1662 if (amdgpu_passthrough(adev)) {
1663 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
1664 * reboot some old SMC firmware still needs the driver to do a vPost,
1665 * otherwise the GPU hangs. SMC firmware versions above 22.15 do not have
1666 * this flaw, so force vPost for SMC versions below 22.15.
1667 */
1668 if (adev->asic_type == CHIP_FIJI) {
1669 int err;
1670 uint32_t fw_ver;
1671
1672 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1673 /* force vPost if an error occurred */
1674 if (err)
1675 return true;
1676
1677 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1678 release_firmware(adev->pm.fw);
1679 if (fw_ver < 0x00160e00)
1680 return true;
1681 }
1682 }
1683
1684 /* Don't post if we need to reset whole hive on init */
1685 if (adev->gmc.xgmi.pending_reset)
1686 return false;
1687
1688 if (adev->has_hw_reset) {
1689 adev->has_hw_reset = false;
1690 return true;
1691 }
1692
1693 /* bios scratch used on CIK+ */
1694 if (adev->asic_type >= CHIP_BONAIRE)
1695 return amdgpu_atombios_scratch_need_asic_init(adev);
1696
1697 /* check MEM_SIZE for older asics */
1698 reg = amdgpu_asic_get_config_memsize(adev);
1699
1700 if ((reg != 0) && (reg != 0xffffffff))
1701 return false;
1702
1703 return true;
1704 }
1705
1706 /*
1707 * Check whether seamless boot is supported.
1708 *
1709 * So far we only support seamless boot on DCE 3.0 or later.
1710 * If users report that it works on older ASICs as well, we may
1711 * loosen this.
1712 */
1713 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1714 {
1715 switch (amdgpu_seamless) {
1716 case -1:
1717 break;
1718 case 1:
1719 return true;
1720 case 0:
1721 return false;
1722 default:
1723 DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1724 amdgpu_seamless);
1725 return false;
1726 }
1727
1728 if (!(adev->flags & AMD_IS_APU))
1729 return false;
1730
1731 if (adev->mman.keep_stolen_vga_memory)
1732 return false;
1733
1734 return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1735 }
1736
1737 /*
1738 * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1739 * don't support dynamic speed switching. Until we have confirmation from Intel
1740 * that a specific host supports it, it's safer that we keep it disabled for all.
1741 *
1742 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1743 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1744 */
1745 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1746 {
1747 #if IS_ENABLED(CONFIG_X86)
1748 struct cpuinfo_x86 *c = &cpu_data(0);
1749
1750 /* eGPU change speeds based on USB4 fabric conditions */
1751 if (dev_is_removable(adev->dev))
1752 return true;
1753
1754 if (c->x86_vendor == X86_VENDOR_INTEL)
1755 return false;
1756 #endif
1757 return true;
1758 }
1759
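/*
 * Illustrative sketch (editor's example): callers use the helper above to
 * mask PCIe DPM out of the powerplay feature mask, mirroring what
 * amdgpu_device_ip_early_init() does further down in this file.
 */
static void __maybe_unused amdgpu_example_mask_pcie_dpm(struct amdgpu_device *adev)
{
        if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
                adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
}
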
1760 /**
1761 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1762 *
1763 * @adev: amdgpu_device pointer
1764 *
1765 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1766 * be set for this device.
1767 *
1768 * Returns true if it should be used or false if not.
1769 */
1770 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1771 {
1772 switch (amdgpu_aspm) {
1773 case -1:
1774 break;
1775 case 0:
1776 return false;
1777 case 1:
1778 return true;
1779 default:
1780 return false;
1781 }
1782 if (adev->flags & AMD_IS_APU)
1783 return false;
1784 if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1785 return false;
1786 return pcie_aspm_enabled(adev->pdev);
1787 }
1788
1789 /* if we get transitioned to only one device, take VGA back */
1790 /**
1791 * amdgpu_device_vga_set_decode - enable/disable vga decode
1792 *
1793 * @pdev: PCI device pointer
1794 * @state: enable/disable vga decode
1795 *
1796 * Enable/disable vga decode (all asics).
1797 * Returns VGA resource flags.
1798 */
1799 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1800 bool state)
1801 {
1802 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1803
1804 amdgpu_asic_set_vga_state(adev, state);
1805 if (state)
1806 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1807 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1808 else
1809 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1810 }
1811
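/*
 * Illustrative sketch (editor's example): the decode callback above is handed
 * to the VGA arbiter during device init, roughly as below; the VGA-class
 * check mirrors how the init path skips non-VGA devices.
 */
static void __maybe_unused amdgpu_example_register_vga_decode(struct amdgpu_device *adev)
{
        /* only VGA-class devices take part in legacy VGA arbitration */
        if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
                vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
}
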
1812 /**
1813 * amdgpu_device_check_block_size - validate the vm block size
1814 *
1815 * @adev: amdgpu_device pointer
1816 *
1817 * Validates the vm block size specified via module parameter.
1818 * The vm block size defines number of bits in page table versus page directory,
1819 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1820 * page table and the remaining bits are in the page directory.
1821 */
1822 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1823 {
1824 /* defines number of bits in page table versus page directory,
1825 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1826 * page table and the remaining bits are in the page directory
1827 */
1828 if (amdgpu_vm_block_size == -1)
1829 return;
1830
1831 if (amdgpu_vm_block_size < 9) {
1832 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1833 amdgpu_vm_block_size);
1834 amdgpu_vm_block_size = -1;
1835 }
1836 }
1837
1838 /**
1839 * amdgpu_device_check_vm_size - validate the vm size
1840 *
1841 * @adev: amdgpu_device pointer
1842 *
1843 * Validates the vm size in GB specified via module parameter.
1844 * The VM size is the size of the GPU virtual memory space in GB.
1845 */
1846 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1847 {
1848 /* no need to check the default value */
1849 if (amdgpu_vm_size == -1)
1850 return;
1851
1852 if (amdgpu_vm_size < 1) {
1853 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1854 amdgpu_vm_size);
1855 amdgpu_vm_size = -1;
1856 }
1857 }
1858
1859 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1860 {
1861 struct sysinfo si;
1862 bool is_os_64 = (sizeof(void *) == 8);
1863 uint64_t total_memory;
1864 uint64_t dram_size_seven_GB = 0x1B8000000;
1865 uint64_t dram_size_three_GB = 0xB8000000;
1866
1867 if (amdgpu_smu_memory_pool_size == 0)
1868 return;
1869
1870 if (!is_os_64) {
1871 DRM_WARN("Not 64-bit OS, feature not supported\n");
1872 goto def_value;
1873 }
1874 si_meminfo(&si);
1875 total_memory = (uint64_t)si.totalram * si.mem_unit;
1876
1877 if ((amdgpu_smu_memory_pool_size == 1) ||
1878 (amdgpu_smu_memory_pool_size == 2)) {
1879 if (total_memory < dram_size_three_GB)
1880 goto def_value1;
1881 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1882 (amdgpu_smu_memory_pool_size == 8)) {
1883 if (total_memory < dram_size_seven_GB)
1884 goto def_value1;
1885 } else {
1886 DRM_WARN("Smu memory pool size not supported\n");
1887 goto def_value;
1888 }
1889 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1890
1891 return;
1892
1893 def_value1:
1894 DRM_WARN("Not enough system memory\n");
1895 def_value:
1896 adev->pm.smu_prv_buffer_size = 0;
1897 }
1898
1899 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1900 {
1901 if (!(adev->flags & AMD_IS_APU) ||
1902 adev->asic_type < CHIP_RAVEN)
1903 return 0;
1904
1905 switch (adev->asic_type) {
1906 case CHIP_RAVEN:
1907 if (adev->pdev->device == 0x15dd)
1908 adev->apu_flags |= AMD_APU_IS_RAVEN;
1909 if (adev->pdev->device == 0x15d8)
1910 adev->apu_flags |= AMD_APU_IS_PICASSO;
1911 break;
1912 case CHIP_RENOIR:
1913 if ((adev->pdev->device == 0x1636) ||
1914 (adev->pdev->device == 0x164c))
1915 adev->apu_flags |= AMD_APU_IS_RENOIR;
1916 else
1917 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1918 break;
1919 case CHIP_VANGOGH:
1920 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1921 break;
1922 case CHIP_YELLOW_CARP:
1923 break;
1924 case CHIP_CYAN_SKILLFISH:
1925 if ((adev->pdev->device == 0x13FE) ||
1926 (adev->pdev->device == 0x143F))
1927 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1928 break;
1929 default:
1930 break;
1931 }
1932
1933 return 0;
1934 }
1935
1936 /**
1937 * amdgpu_device_check_arguments - validate module params
1938 *
1939 * @adev: amdgpu_device pointer
1940 *
1941 * Validates certain module parameters and updates
1942 * the associated values used by the driver (all asics).
1943 */
1944 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1945 {
1946 int i;
1947
1948 if (amdgpu_sched_jobs < 4) {
1949 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1950 amdgpu_sched_jobs);
1951 amdgpu_sched_jobs = 4;
1952 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1953 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1954 amdgpu_sched_jobs);
1955 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1956 }
1957
1958 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1959 /* gart size must be greater than or equal to 32M */
1960 dev_warn(adev->dev, "gart size (%d) too small\n",
1961 amdgpu_gart_size);
1962 amdgpu_gart_size = -1;
1963 }
1964
1965 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1966 /* gtt size must be greater than or equal to 32M */
1967 dev_warn(adev->dev, "gtt size (%d) too small\n",
1968 amdgpu_gtt_size);
1969 amdgpu_gtt_size = -1;
1970 }
1971
1972 /* valid range is between 4 and 9 inclusive */
1973 if (amdgpu_vm_fragment_size != -1 &&
1974 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1975 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1976 amdgpu_vm_fragment_size = -1;
1977 }
1978
1979 if (amdgpu_sched_hw_submission < 2) {
1980 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1981 amdgpu_sched_hw_submission);
1982 amdgpu_sched_hw_submission = 2;
1983 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1984 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1985 amdgpu_sched_hw_submission);
1986 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1987 }
1988
1989 if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1990 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1991 amdgpu_reset_method = -1;
1992 }
1993
1994 amdgpu_device_check_smu_prv_buffer_size(adev);
1995
1996 amdgpu_device_check_vm_size(adev);
1997
1998 amdgpu_device_check_block_size(adev);
1999
2000 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2001
2002 for (i = 0; i < MAX_XCP; i++)
2003 adev->enforce_isolation[i] = !!enforce_isolation;
2004
2005 return 0;
2006 }
2007
2008 /**
2009 * amdgpu_switcheroo_set_state - set switcheroo state
2010 *
2011 * @pdev: pci dev pointer
2012 * @state: vga_switcheroo state
2013 *
2014 * Callback for the switcheroo driver. Suspends or resumes
2015 * the asic before or after it is powered up using ACPI methods.
2016 */
2017 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2018 enum vga_switcheroo_state state)
2019 {
2020 struct drm_device *dev = pci_get_drvdata(pdev);
2021 int r;
2022
2023 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2024 return;
2025
2026 if (state == VGA_SWITCHEROO_ON) {
2027 pr_info("switched on\n");
2028 /* don't suspend or resume card normally */
2029 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2030
2031 pci_set_power_state(pdev, PCI_D0);
2032 amdgpu_device_load_pci_state(pdev);
2033 r = pci_enable_device(pdev);
2034 if (r)
2035 DRM_WARN("pci_enable_device failed (%d)\n", r);
2036 amdgpu_device_resume(dev, true);
2037
2038 dev->switch_power_state = DRM_SWITCH_POWER_ON;
2039 } else {
2040 pr_info("switched off\n");
2041 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2042 amdgpu_device_prepare(dev);
2043 amdgpu_device_suspend(dev, true);
2044 amdgpu_device_cache_pci_state(pdev);
2045 /* Shut down the device */
2046 pci_disable_device(pdev);
2047 pci_set_power_state(pdev, PCI_D3cold);
2048 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2049 }
2050 }
2051
2052 /**
2053 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2054 *
2055 * @pdev: pci dev pointer
2056 *
2057 * Callback for the switcheroo driver. Checks if the switcheroo
2058 * state can be changed.
2059 * Returns true if the state can be changed, false if not.
2060 */
2061 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2062 {
2063 struct drm_device *dev = pci_get_drvdata(pdev);
2064
2065 /*
2066 * FIXME: open_count is protected by drm_global_mutex but that would lead to
2067 * locking inversion with the driver load path. And the access here is
2068 * completely racy anyway. So don't bother with locking for now.
2069 */
2070 return atomic_read(&dev->open_count) == 0;
2071 }
2072
2073 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2074 .set_gpu_state = amdgpu_switcheroo_set_state,
2075 .reprobe = NULL,
2076 .can_switch = amdgpu_switcheroo_can_switch,
2077 };
2078
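/*
 * Illustrative sketch (editor's example): the ops table above is registered
 * with vga_switcheroo during init; the PX capability decides whether the
 * driver controls the dGPU power state itself.
 */
static int __maybe_unused amdgpu_example_register_switcheroo(struct amdgpu_device *adev)
{
        bool px = amdgpu_device_supports_px(adev_to_drm(adev));

        return vga_switcheroo_register_client(adev->pdev,
                                              &amdgpu_switcheroo_ops, px);
}
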
2079 /**
2080 * amdgpu_device_ip_set_clockgating_state - set the CG state
2081 *
2082 * @dev: amdgpu_device pointer
2083 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2084 * @state: clockgating state (gate or ungate)
2085 *
2086 * Sets the requested clockgating state for all instances of
2087 * the hardware IP specified.
2088 * Returns the error code from the last instance.
2089 */
2090 int amdgpu_device_ip_set_clockgating_state(void *dev,
2091 enum amd_ip_block_type block_type,
2092 enum amd_clockgating_state state)
2093 {
2094 struct amdgpu_device *adev = dev;
2095 int i, r = 0;
2096
2097 for (i = 0; i < adev->num_ip_blocks; i++) {
2098 if (!adev->ip_blocks[i].status.valid)
2099 continue;
2100 if (adev->ip_blocks[i].version->type != block_type)
2101 continue;
2102 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2103 continue;
2104 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2105 (void *)adev, state);
2106 if (r)
2107 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2108 adev->ip_blocks[i].version->funcs->name, r);
2109 }
2110 return r;
2111 }
2112
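/*
 * Illustrative sketch (editor's example): an IP or power-management path
 * would gate UVD clocks across all instances like this; ungating uses
 * AMD_CG_STATE_UNGATE, and the powergating variant below follows the same
 * pattern with the AMD_PG_STATE_* values.
 */
static int __maybe_unused amdgpu_example_gate_uvd_clocks(struct amdgpu_device *adev)
{
        return amdgpu_device_ip_set_clockgating_state(adev,
                                                      AMD_IP_BLOCK_TYPE_UVD,
                                                      AMD_CG_STATE_GATE);
}
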
2113 /**
2114 * amdgpu_device_ip_set_powergating_state - set the PG state
2115 *
2116 * @dev: amdgpu_device pointer
2117 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2118 * @state: powergating state (gate or ungate)
2119 *
2120 * Sets the requested powergating state for all instances of
2121 * the hardware IP specified.
2122 * Returns the error code from the last instance.
2123 */
2124 int amdgpu_device_ip_set_powergating_state(void *dev,
2125 enum amd_ip_block_type block_type,
2126 enum amd_powergating_state state)
2127 {
2128 struct amdgpu_device *adev = dev;
2129 int i, r = 0;
2130
2131 for (i = 0; i < adev->num_ip_blocks; i++) {
2132 if (!adev->ip_blocks[i].status.valid)
2133 continue;
2134 if (adev->ip_blocks[i].version->type != block_type)
2135 continue;
2136 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2137 continue;
2138 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2139 (void *)adev, state);
2140 if (r)
2141 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2142 adev->ip_blocks[i].version->funcs->name, r);
2143 }
2144 return r;
2145 }
2146
2147 /**
2148 * amdgpu_device_ip_get_clockgating_state - get the CG state
2149 *
2150 * @adev: amdgpu_device pointer
2151 * @flags: clockgating feature flags
2152 *
2153 * Walks the list of IPs on the device and updates the clockgating
2154 * flags for each IP.
2155 * Updates @flags with the feature flags for each hardware IP where
2156 * clockgating is enabled.
2157 */
2158 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2159 u64 *flags)
2160 {
2161 int i;
2162
2163 for (i = 0; i < adev->num_ip_blocks; i++) {
2164 if (!adev->ip_blocks[i].status.valid)
2165 continue;
2166 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2167 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2168 }
2169 }
2170
2171 /**
2172 * amdgpu_device_ip_wait_for_idle - wait for idle
2173 *
2174 * @adev: amdgpu_device pointer
2175 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2176 *
2177 * Waits for the requested hardware IP to be idle.
2178 * Returns 0 for success or a negative error code on failure.
2179 */
2180 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2181 enum amd_ip_block_type block_type)
2182 {
2183 int i, r;
2184
2185 for (i = 0; i < adev->num_ip_blocks; i++) {
2186 if (!adev->ip_blocks[i].status.valid)
2187 continue;
2188 if (adev->ip_blocks[i].version->type == block_type) {
2189 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
2190 if (r)
2191 return r;
2192 break;
2193 }
2194 }
2195 return 0;
2196
2197 }
2198
2199 /**
2200 * amdgpu_device_ip_is_idle - is the hardware IP idle
2201 *
2202 * @adev: amdgpu_device pointer
2203 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2204 *
2205 * Check if the hardware IP is idle or not.
2206 * Returns true if the IP is idle, false if not.
2207 */
2208 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
2209 enum amd_ip_block_type block_type)
2210 {
2211 int i;
2212
2213 for (i = 0; i < adev->num_ip_blocks; i++) {
2214 if (!adev->ip_blocks[i].status.valid)
2215 continue;
2216 if (adev->ip_blocks[i].version->type == block_type)
2217 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
2218 }
2219 return true;
2220
2221 }
2222
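/*
 * Illustrative sketch (editor's example): a reset or suspend path can combine
 * the two helpers above, skipping the blocking wait when GFX already reports
 * idle.
 */
static int __maybe_unused amdgpu_example_quiesce_gfx(struct amdgpu_device *adev)
{
        if (amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX))
                return 0;

        /* block until the GFX IP's wait_for_idle callback succeeds */
        return amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
}
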
2223 /**
2224 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2225 *
2226 * @adev: amdgpu_device pointer
2227 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2228 *
2229 * Returns a pointer to the hardware IP block structure
2230 * if it exists for the asic, otherwise NULL.
2231 */
2232 struct amdgpu_ip_block *
2233 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2234 enum amd_ip_block_type type)
2235 {
2236 int i;
2237
2238 for (i = 0; i < adev->num_ip_blocks; i++)
2239 if (adev->ip_blocks[i].version->type == type)
2240 return &adev->ip_blocks[i];
2241
2242 return NULL;
2243 }
2244
2245 /**
2246 * amdgpu_device_ip_block_version_cmp
2247 *
2248 * @adev: amdgpu_device pointer
2249 * @type: enum amd_ip_block_type
2250 * @major: major version
2251 * @minor: minor version
2252 *
2253 * return 0 if equal or greater
2254 * return 1 if smaller or the ip_block doesn't exist
2255 */
2256 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2257 enum amd_ip_block_type type,
2258 u32 major, u32 minor)
2259 {
2260 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2261
2262 if (ip_block && ((ip_block->version->major > major) ||
2263 ((ip_block->version->major == major) &&
2264 (ip_block->version->minor >= minor))))
2265 return 0;
2266
2267 return 1;
2268 }
2269
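/*
 * Illustrative sketch (editor's example): combining the two lookups above to
 * answer "is a GFX IP of at least major.minor registered on this asic?".
 */
static bool __maybe_unused amdgpu_example_has_gfx_version(struct amdgpu_device *adev,
                                                          u32 major, u32 minor)
{
        if (!amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX))
                return false;

        /* 0 means the registered block is equal to or newer than major.minor */
        return amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
                                                  major, minor) == 0;
}
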
2270 /**
2271 * amdgpu_device_ip_block_add
2272 *
2273 * @adev: amdgpu_device pointer
2274 * @ip_block_version: pointer to the IP to add
2275 *
2276 * Adds the IP block driver information to the collection of IPs
2277 * on the asic.
2278 */
2279 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2280 const struct amdgpu_ip_block_version *ip_block_version)
2281 {
2282 if (!ip_block_version)
2283 return -EINVAL;
2284
2285 switch (ip_block_version->type) {
2286 case AMD_IP_BLOCK_TYPE_VCN:
2287 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2288 return 0;
2289 break;
2290 case AMD_IP_BLOCK_TYPE_JPEG:
2291 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2292 return 0;
2293 break;
2294 default:
2295 break;
2296 }
2297
2298 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2299 ip_block_version->funcs->name);
2300
2301 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2302
2303 return 0;
2304 }
2305
2306 /**
2307 * amdgpu_device_enable_virtual_display - enable virtual display feature
2308 *
2309 * @adev: amdgpu_device pointer
2310 *
2311 * Enables the virtual display feature if the user has enabled it via
2312 * the module parameter virtual_display. This feature provides virtual
2313 * display hardware on headless boards or in virtualized environments.
2314 * This function parses and validates the configuration string specified by
2315 * the user and configures the virtual display configuration (number of
2316 * virtual connectors, crtcs, etc.) specified.
2317 */
2318 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2319 {
2320 adev->enable_virtual_display = false;
2321
2322 if (amdgpu_virtual_display) {
2323 const char *pci_address_name = pci_name(adev->pdev);
2324 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2325
2326 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2327 pciaddstr_tmp = pciaddstr;
2328 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2329 pciaddname = strsep(&pciaddname_tmp, ",");
2330 if (!strcmp("all", pciaddname)
2331 || !strcmp(pci_address_name, pciaddname)) {
2332 long num_crtc;
2333 int res = -1;
2334
2335 adev->enable_virtual_display = true;
2336
2337 if (pciaddname_tmp)
2338 res = kstrtol(pciaddname_tmp, 10,
2339 &num_crtc);
2340
2341 if (!res) {
2342 if (num_crtc < 1)
2343 num_crtc = 1;
2344 if (num_crtc > 6)
2345 num_crtc = 6;
2346 adev->mode_info.num_crtc = num_crtc;
2347 } else {
2348 adev->mode_info.num_crtc = 1;
2349 }
2350 break;
2351 }
2352 }
2353
2354 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2355 amdgpu_virtual_display, pci_address_name,
2356 adev->enable_virtual_display, adev->mode_info.num_crtc);
2357
2358 kfree(pciaddstr);
2359 }
2360 }
2361
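/*
 * Illustrative note (editor's example): per the parser above, the
 * virtual_display string is a semicolon-separated list of
 * "<pci address>,<crtc count>" entries, or "all" to match every device, e.g.:
 *
 *      modprobe amdgpu virtual_display=0000:03:00.0,2
 *
 * The PCI address shown is hypothetical.
 */
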
2362 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2363 {
2364 if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2365 adev->mode_info.num_crtc = 1;
2366 adev->enable_virtual_display = true;
2367 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2368 adev->enable_virtual_display, adev->mode_info.num_crtc);
2369 }
2370 }
2371
2372 /**
2373 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2374 *
2375 * @adev: amdgpu_device pointer
2376 *
2377 * Parses the asic configuration parameters specified in the gpu info
2378 * firmware and makes them available to the driver for use in configuring
2379 * the asic.
2380 * Returns 0 on success, -EINVAL on failure.
2381 */
2382 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2383 {
2384 const char *chip_name;
2385 int err;
2386 const struct gpu_info_firmware_header_v1_0 *hdr;
2387
2388 adev->firmware.gpu_info_fw = NULL;
2389
2390 switch (adev->asic_type) {
2391 default:
2392 return 0;
2393 case CHIP_VEGA10:
2394 chip_name = "vega10";
2395 break;
2396 case CHIP_VEGA12:
2397 chip_name = "vega12";
2398 break;
2399 case CHIP_RAVEN:
2400 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2401 chip_name = "raven2";
2402 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2403 chip_name = "picasso";
2404 else
2405 chip_name = "raven";
2406 break;
2407 case CHIP_ARCTURUS:
2408 chip_name = "arcturus";
2409 break;
2410 case CHIP_NAVI12:
2411 if (adev->mman.discovery_bin)
2412 return 0;
2413 chip_name = "navi12";
2414 break;
2415 }
2416
2417 err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2418 "amdgpu/%s_gpu_info.bin", chip_name);
2419 if (err) {
2420 dev_err(adev->dev,
2421 "Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2422 chip_name);
2423 goto out;
2424 }
2425
2426 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2427 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2428
2429 switch (hdr->version_major) {
2430 case 1:
2431 {
2432 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2433 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2434 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2435
2436 /*
2437 * Should be dropped when DAL no longer needs it.
2438 */
2439 if (adev->asic_type == CHIP_NAVI12)
2440 goto parse_soc_bounding_box;
2441
2442 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2443 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2444 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2445 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2446 adev->gfx.config.max_texture_channel_caches =
2447 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2448 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2449 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2450 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2451 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2452 adev->gfx.config.double_offchip_lds_buf =
2453 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2454 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2455 adev->gfx.cu_info.max_waves_per_simd =
2456 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2457 adev->gfx.cu_info.max_scratch_slots_per_cu =
2458 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2459 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2460 if (hdr->version_minor >= 1) {
2461 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2462 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2463 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2464 adev->gfx.config.num_sc_per_sh =
2465 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2466 adev->gfx.config.num_packer_per_sc =
2467 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2468 }
2469
2470 parse_soc_bounding_box:
2471 /*
2472 * soc bounding box info is not integrated into the discovery table, so
2473 * we always need to parse it from the gpu info firmware when it is needed.
2474 */
2475 if (hdr->version_minor == 2) {
2476 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2477 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2478 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2479 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2480 }
2481 break;
2482 }
2483 default:
2484 dev_err(adev->dev,
2485 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2486 err = -EINVAL;
2487 goto out;
2488 }
2489 out:
2490 return err;
2491 }
2492
2493 /**
2494 * amdgpu_device_ip_early_init - run early init for hardware IPs
2495 *
2496 * @adev: amdgpu_device pointer
2497 *
2498 * Early initialization pass for hardware IPs. The hardware IPs that make
2499 * up each asic are discovered and each IP's early_init callback is run. This
2500 * is the first stage in initializing the asic.
2501 * Returns 0 on success, negative error code on failure.
2502 */
2503 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2504 {
2505 struct amdgpu_ip_block *ip_block;
2506 struct pci_dev *parent;
2507 int i, r;
2508 bool total;
2509
2510 amdgpu_device_enable_virtual_display(adev);
2511
2512 if (amdgpu_sriov_vf(adev)) {
2513 r = amdgpu_virt_request_full_gpu(adev, true);
2514 if (r)
2515 return r;
2516 }
2517
2518 switch (adev->asic_type) {
2519 #ifdef CONFIG_DRM_AMDGPU_SI
2520 case CHIP_VERDE:
2521 case CHIP_TAHITI:
2522 case CHIP_PITCAIRN:
2523 case CHIP_OLAND:
2524 case CHIP_HAINAN:
2525 adev->family = AMDGPU_FAMILY_SI;
2526 r = si_set_ip_blocks(adev);
2527 if (r)
2528 return r;
2529 break;
2530 #endif
2531 #ifdef CONFIG_DRM_AMDGPU_CIK
2532 case CHIP_BONAIRE:
2533 case CHIP_HAWAII:
2534 case CHIP_KAVERI:
2535 case CHIP_KABINI:
2536 case CHIP_MULLINS:
2537 if (adev->flags & AMD_IS_APU)
2538 adev->family = AMDGPU_FAMILY_KV;
2539 else
2540 adev->family = AMDGPU_FAMILY_CI;
2541
2542 r = cik_set_ip_blocks(adev);
2543 if (r)
2544 return r;
2545 break;
2546 #endif
2547 case CHIP_TOPAZ:
2548 case CHIP_TONGA:
2549 case CHIP_FIJI:
2550 case CHIP_POLARIS10:
2551 case CHIP_POLARIS11:
2552 case CHIP_POLARIS12:
2553 case CHIP_VEGAM:
2554 case CHIP_CARRIZO:
2555 case CHIP_STONEY:
2556 if (adev->flags & AMD_IS_APU)
2557 adev->family = AMDGPU_FAMILY_CZ;
2558 else
2559 adev->family = AMDGPU_FAMILY_VI;
2560
2561 r = vi_set_ip_blocks(adev);
2562 if (r)
2563 return r;
2564 break;
2565 default:
2566 r = amdgpu_discovery_set_ip_blocks(adev);
2567 if (r)
2568 return r;
2569 break;
2570 }
2571
2572 if (amdgpu_has_atpx() &&
2573 (amdgpu_is_atpx_hybrid() ||
2574 amdgpu_has_atpx_dgpu_power_cntl()) &&
2575 ((adev->flags & AMD_IS_APU) == 0) &&
2576 !dev_is_removable(&adev->pdev->dev))
2577 adev->flags |= AMD_IS_PX;
2578
2579 if (!(adev->flags & AMD_IS_APU)) {
2580 parent = pcie_find_root_port(adev->pdev);
2581 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2582 }
2583
2584
2585 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2586 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2587 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2588 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2589 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2590 if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2591 adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2592
2593 total = true;
2594 for (i = 0; i < adev->num_ip_blocks; i++) {
2595 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2596 DRM_WARN("disabled ip block: %d <%s>\n",
2597 i, adev->ip_blocks[i].version->funcs->name);
2598 adev->ip_blocks[i].status.valid = false;
2599 } else {
2600 if (adev->ip_blocks[i].version->funcs->early_init) {
2601 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2602 if (r == -ENOENT) {
2603 adev->ip_blocks[i].status.valid = false;
2604 } else if (r) {
2605 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2606 adev->ip_blocks[i].version->funcs->name, r);
2607 total = false;
2608 } else {
2609 adev->ip_blocks[i].status.valid = true;
2610 }
2611 } else {
2612 adev->ip_blocks[i].status.valid = true;
2613 }
2614 }
2615 /* get the vbios after the asic_funcs are set up */
2616 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2617 r = amdgpu_device_parse_gpu_info_fw(adev);
2618 if (r)
2619 return r;
2620
2621 /* Read BIOS */
2622 if (amdgpu_device_read_bios(adev)) {
2623 if (!amdgpu_get_bios(adev))
2624 return -EINVAL;
2625
2626 r = amdgpu_atombios_init(adev);
2627 if (r) {
2628 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2629 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2630 return r;
2631 }
2632 }
2633
2634 /* get pf2vf msg info at its earliest time */
2635 if (amdgpu_sriov_vf(adev))
2636 amdgpu_virt_init_data_exchange(adev);
2637
2638 }
2639 }
2640 if (!total)
2641 return -ENODEV;
2642
2643 ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2644 if (ip_block->status.valid != false)
2645 amdgpu_amdkfd_device_probe(adev);
2646
2647 adev->cg_flags &= amdgpu_cg_mask;
2648 adev->pg_flags &= amdgpu_pg_mask;
2649
2650 return 0;
2651 }
2652
2653 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2654 {
2655 int i, r;
2656
2657 for (i = 0; i < adev->num_ip_blocks; i++) {
2658 if (!adev->ip_blocks[i].status.sw)
2659 continue;
2660 if (adev->ip_blocks[i].status.hw)
2661 continue;
2662 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2663 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2664 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2665 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2666 if (r) {
2667 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2668 adev->ip_blocks[i].version->funcs->name, r);
2669 return r;
2670 }
2671 adev->ip_blocks[i].status.hw = true;
2672 }
2673 }
2674
2675 return 0;
2676 }
2677
2678 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2679 {
2680 int i, r;
2681
2682 for (i = 0; i < adev->num_ip_blocks; i++) {
2683 if (!adev->ip_blocks[i].status.sw)
2684 continue;
2685 if (adev->ip_blocks[i].status.hw)
2686 continue;
2687 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2688 if (r) {
2689 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2690 adev->ip_blocks[i].version->funcs->name, r);
2691 return r;
2692 }
2693 adev->ip_blocks[i].status.hw = true;
2694 }
2695
2696 return 0;
2697 }
2698
2699 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2700 {
2701 int r = 0;
2702 int i;
2703 uint32_t smu_version;
2704
2705 if (adev->asic_type >= CHIP_VEGA10) {
2706 for (i = 0; i < adev->num_ip_blocks; i++) {
2707 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2708 continue;
2709
2710 if (!adev->ip_blocks[i].status.sw)
2711 continue;
2712
2713 /* no need to do the fw loading again if already done*/
2714 if (adev->ip_blocks[i].status.hw == true)
2715 break;
2716
2717 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2718 r = adev->ip_blocks[i].version->funcs->resume(adev);
2719 if (r) {
2720 DRM_ERROR("resume of IP block <%s> failed %d\n",
2721 adev->ip_blocks[i].version->funcs->name, r);
2722 return r;
2723 }
2724 } else {
2725 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2726 if (r) {
2727 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2728 adev->ip_blocks[i].version->funcs->name, r);
2729 return r;
2730 }
2731 }
2732
2733 adev->ip_blocks[i].status.hw = true;
2734 break;
2735 }
2736 }
2737
2738 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2739 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2740
2741 return r;
2742 }
2743
2744 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2745 {
2746 long timeout;
2747 int r, i;
2748
2749 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2750 struct amdgpu_ring *ring = adev->rings[i];
2751
2752 /* No need to setup the GPU scheduler for rings that don't need it */
2753 if (!ring || ring->no_scheduler)
2754 continue;
2755
2756 switch (ring->funcs->type) {
2757 case AMDGPU_RING_TYPE_GFX:
2758 timeout = adev->gfx_timeout;
2759 break;
2760 case AMDGPU_RING_TYPE_COMPUTE:
2761 timeout = adev->compute_timeout;
2762 break;
2763 case AMDGPU_RING_TYPE_SDMA:
2764 timeout = adev->sdma_timeout;
2765 break;
2766 default:
2767 timeout = adev->video_timeout;
2768 break;
2769 }
2770
2771 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2772 DRM_SCHED_PRIORITY_COUNT,
2773 ring->num_hw_submission, 0,
2774 timeout, adev->reset_domain->wq,
2775 ring->sched_score, ring->name,
2776 adev->dev);
2777 if (r) {
2778 DRM_ERROR("Failed to create scheduler on ring %s.\n",
2779 ring->name);
2780 return r;
2781 }
2782 r = amdgpu_uvd_entity_init(adev, ring);
2783 if (r) {
2784 DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2785 ring->name);
2786 return r;
2787 }
2788 r = amdgpu_vce_entity_init(adev, ring);
2789 if (r) {
2790 DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2791 ring->name);
2792 return r;
2793 }
2794 }
2795
2796 amdgpu_xcp_update_partition_sched_list(adev);
2797
2798 return 0;
2799 }
2800
2801
2802 /**
2803 * amdgpu_device_ip_init - run init for hardware IPs
2804 *
2805 * @adev: amdgpu_device pointer
2806 *
2807 * Main initialization pass for hardware IPs. The list of all the hardware
2808 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2809 * are run. sw_init initializes the software state associated with each IP
2810 * and hw_init initializes the hardware associated with each IP.
2811 * Returns 0 on success, negative error code on failure.
2812 */
2813 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2814 {
2815 int i, r;
2816
2817 r = amdgpu_ras_init(adev);
2818 if (r)
2819 return r;
2820
2821 for (i = 0; i < adev->num_ip_blocks; i++) {
2822 if (!adev->ip_blocks[i].status.valid)
2823 continue;
2824 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2825 if (r) {
2826 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2827 adev->ip_blocks[i].version->funcs->name, r);
2828 goto init_failed;
2829 }
2830 adev->ip_blocks[i].status.sw = true;
2831
2832 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2833 /* need to do common hw init early so everything is set up for gmc */
2834 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2835 if (r) {
2836 DRM_ERROR("hw_init %d failed %d\n", i, r);
2837 goto init_failed;
2838 }
2839 adev->ip_blocks[i].status.hw = true;
2840 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2841 /* need to do gmc hw init early so we can allocate gpu mem */
2842 /* Try to reserve bad pages early */
2843 if (amdgpu_sriov_vf(adev))
2844 amdgpu_virt_exchange_data(adev);
2845
2846 r = amdgpu_device_mem_scratch_init(adev);
2847 if (r) {
2848 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2849 goto init_failed;
2850 }
2851 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2852 if (r) {
2853 DRM_ERROR("hw_init %d failed %d\n", i, r);
2854 goto init_failed;
2855 }
2856 r = amdgpu_device_wb_init(adev);
2857 if (r) {
2858 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2859 goto init_failed;
2860 }
2861 adev->ip_blocks[i].status.hw = true;
2862
2863 /* right after GMC hw init, we create CSA */
2864 if (adev->gfx.mcbp) {
2865 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2866 AMDGPU_GEM_DOMAIN_VRAM |
2867 AMDGPU_GEM_DOMAIN_GTT,
2868 AMDGPU_CSA_SIZE);
2869 if (r) {
2870 DRM_ERROR("allocate CSA failed %d\n", r);
2871 goto init_failed;
2872 }
2873 }
2874
2875 r = amdgpu_seq64_init(adev);
2876 if (r) {
2877 DRM_ERROR("allocate seq64 failed %d\n", r);
2878 goto init_failed;
2879 }
2880 }
2881 }
2882
2883 if (amdgpu_sriov_vf(adev))
2884 amdgpu_virt_init_data_exchange(adev);
2885
2886 r = amdgpu_ib_pool_init(adev);
2887 if (r) {
2888 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2889 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2890 goto init_failed;
2891 }
2892
2893 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2894 if (r)
2895 goto init_failed;
2896
2897 r = amdgpu_device_ip_hw_init_phase1(adev);
2898 if (r)
2899 goto init_failed;
2900
2901 r = amdgpu_device_fw_loading(adev);
2902 if (r)
2903 goto init_failed;
2904
2905 r = amdgpu_device_ip_hw_init_phase2(adev);
2906 if (r)
2907 goto init_failed;
2908
2909 /*
2910 * retired pages will be loaded from eeprom and reserved here;
2911 * this should be called after amdgpu_device_ip_hw_init_phase2 since
2912 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2913 * functional for I2C communication, which is only true at this point.
2914 *
2915 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2916 * about failures caused by a bad gpu situation and stops the amdgpu
2917 * init process accordingly. For other failure cases it still releases
2918 * all the resources and prints an error message rather than returning
2919 * a negative value to the upper level.
2920 *
2921 * Note: theoretically, this should be called before all vram allocations
2922 * to protect retired pages from being reused.
2923 */
2924 r = amdgpu_ras_recovery_init(adev);
2925 if (r)
2926 goto init_failed;
2927
2928 /*
2929 * In case of XGMI, grab an extra reference on the reset domain for this device
2930 */
2931 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2932 if (amdgpu_xgmi_add_device(adev) == 0) {
2933 if (!amdgpu_sriov_vf(adev)) {
2934 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2935
2936 if (WARN_ON(!hive)) {
2937 r = -ENOENT;
2938 goto init_failed;
2939 }
2940
2941 if (!hive->reset_domain ||
2942 !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2943 r = -ENOENT;
2944 amdgpu_put_xgmi_hive(hive);
2945 goto init_failed;
2946 }
2947
2948 /* Drop the early temporary reset domain we created for device */
2949 amdgpu_reset_put_reset_domain(adev->reset_domain);
2950 adev->reset_domain = hive->reset_domain;
2951 amdgpu_put_xgmi_hive(hive);
2952 }
2953 }
2954 }
2955
2956 r = amdgpu_device_init_schedulers(adev);
2957 if (r)
2958 goto init_failed;
2959
2960 if (adev->mman.buffer_funcs_ring->sched.ready)
2961 amdgpu_ttm_set_buffer_funcs_status(adev, true);
2962
2963 /* Don't init kfd if whole hive need to be reset during init */
2964 if (!adev->gmc.xgmi.pending_reset) {
2965 kgd2kfd_init_zone_device(adev);
2966 amdgpu_amdkfd_device_init(adev);
2967 }
2968
2969 amdgpu_fru_get_product_info(adev);
2970
2971 init_failed:
2972
2973 return r;
2974 }
2975
2976 /**
2977 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2978 *
2979 * @adev: amdgpu_device pointer
2980 *
2981 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2982 * this function before a GPU reset. If the value is retained after a
2983 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2984 */
2985 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2986 {
2987 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2988 }
2989
2990 /**
2991 * amdgpu_device_check_vram_lost - check if vram is valid
2992 *
2993 * @adev: amdgpu_device pointer
2994 *
2995 * Checks the reset magic value written to the gart pointer in VRAM.
2996 * The driver calls this after a GPU reset to see if the contents of
2997 * VRAM have been lost or not.
2998 * returns true if vram is lost, false if not.
2999 */
3000 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3001 {
3002 if (memcmp(adev->gart.ptr, adev->reset_magic,
3003 AMDGPU_RESET_MAGIC_NUM))
3004 return true;
3005
3006 if (!amdgpu_in_reset(adev))
3007 return false;
3008
3009 /*
3010 * For all ASICs with baco/mode1 reset, the VRAM is
3011 * always assumed to be lost.
3012 */
3013 switch (amdgpu_asic_reset_method(adev)) {
3014 case AMD_RESET_METHOD_BACO:
3015 case AMD_RESET_METHOD_MODE1:
3016 return true;
3017 default:
3018 return false;
3019 }
3020 }
3021
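/*
 * Illustrative sketch (editor's example): how the two helpers above pair up.
 * The driver records the magic after a successful (re)init and consults it
 * after a reset to decide whether buffer contents must be restored; the log
 * messages here are hypothetical.
 */
static void __maybe_unused amdgpu_example_vram_lost_check(struct amdgpu_device *adev)
{
        if (amdgpu_device_check_vram_lost(adev))
                dev_info(adev->dev, "example: VRAM contents lost, restore needed\n");
        else
                dev_info(adev->dev, "example: VRAM contents retained\n");
}
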
3022 /**
3023 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3024 *
3025 * @adev: amdgpu_device pointer
3026 * @state: clockgating state (gate or ungate)
3027 *
3028 * The list of all the hardware IPs that make up the asic is walked and the
3029 * set_clockgating_state callbacks are run.
3030 * Late initialization pass enabling clockgating for hardware IPs.
3031 * Fini or suspend, pass disabling clockgating for hardware IPs.
3032 * Returns 0 on success, negative error code on failure.
3033 */
3034
3035 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3036 enum amd_clockgating_state state)
3037 {
3038 int i, j, r;
3039
3040 if (amdgpu_emu_mode == 1)
3041 return 0;
3042
3043 for (j = 0; j < adev->num_ip_blocks; j++) {
3044 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3045 if (!adev->ip_blocks[i].status.late_initialized)
3046 continue;
3047 /* skip CG for GFX, SDMA on S0ix */
3048 if (adev->in_s0ix &&
3049 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3050 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3051 continue;
3052 /* skip CG for VCE/UVD, it's handled specially */
3053 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3054 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3055 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3056 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3057 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3058 /* enable clockgating to save power */
3059 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
3060 state);
3061 if (r) {
3062 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3063 adev->ip_blocks[i].version->funcs->name, r);
3064 return r;
3065 }
3066 }
3067 }
3068
3069 return 0;
3070 }
3071
3072 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3073 enum amd_powergating_state state)
3074 {
3075 int i, j, r;
3076
3077 if (amdgpu_emu_mode == 1)
3078 return 0;
3079
3080 for (j = 0; j < adev->num_ip_blocks; j++) {
3081 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3082 if (!adev->ip_blocks[i].status.late_initialized)
3083 continue;
3084 /* skip PG for GFX, SDMA on S0ix */
3085 if (adev->in_s0ix &&
3086 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3087 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3088 continue;
3089 /* skip PG for VCE/UVD, it's handled specially */
3090 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3091 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3092 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3093 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3094 adev->ip_blocks[i].version->funcs->set_powergating_state) {
3095 /* enable powergating to save power */
3096 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3097 state);
3098 if (r) {
3099 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3100 adev->ip_blocks[i].version->funcs->name, r);
3101 return r;
3102 }
3103 }
3104 }
3105 return 0;
3106 }
3107
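/*
 * Illustrative sketch (editor's example): the late-init path enables gating
 * in this order (clockgating, then powergating), exactly as
 * amdgpu_device_ip_late_init() does further down; fini/suspend reverses it
 * with the UNGATE states.
 */
static void __maybe_unused amdgpu_example_enable_gating(struct amdgpu_device *adev)
{
        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
}
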
3108 static int amdgpu_device_enable_mgpu_fan_boost(void)
3109 {
3110 struct amdgpu_gpu_instance *gpu_ins;
3111 struct amdgpu_device *adev;
3112 int i, ret = 0;
3113
3114 mutex_lock(&mgpu_info.mutex);
3115
3116 /*
3117 * MGPU fan boost feature should be enabled
3118 * only when there are two or more dGPUs in
3119 * the system
3120 */
3121 if (mgpu_info.num_dgpu < 2)
3122 goto out;
3123
3124 for (i = 0; i < mgpu_info.num_dgpu; i++) {
3125 gpu_ins = &(mgpu_info.gpu_ins[i]);
3126 adev = gpu_ins->adev;
3127 if (!(adev->flags & AMD_IS_APU) &&
3128 !gpu_ins->mgpu_fan_enabled) {
3129 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3130 if (ret)
3131 break;
3132
3133 gpu_ins->mgpu_fan_enabled = 1;
3134 }
3135 }
3136
3137 out:
3138 mutex_unlock(&mgpu_info.mutex);
3139
3140 return ret;
3141 }
3142
3143 /**
3144 * amdgpu_device_ip_late_init - run late init for hardware IPs
3145 *
3146 * @adev: amdgpu_device pointer
3147 *
3148 * Late initialization pass for hardware IPs. The list of all the hardware
3149 * IPs that make up the asic is walked and the late_init callbacks are run.
3150 * late_init covers any special initialization that an IP requires
3151 * after all of the IPs have been initialized or something that needs to happen
3152 * late in the init process.
3153 * Returns 0 on success, negative error code on failure.
3154 */
3155 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3156 {
3157 struct amdgpu_gpu_instance *gpu_instance;
3158 int i = 0, r;
3159
3160 for (i = 0; i < adev->num_ip_blocks; i++) {
3161 if (!adev->ip_blocks[i].status.hw)
3162 continue;
3163 if (adev->ip_blocks[i].version->funcs->late_init) {
3164 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
3165 if (r) {
3166 DRM_ERROR("late_init of IP block <%s> failed %d\n",
3167 adev->ip_blocks[i].version->funcs->name, r);
3168 return r;
3169 }
3170 }
3171 adev->ip_blocks[i].status.late_initialized = true;
3172 }
3173
3174 r = amdgpu_ras_late_init(adev);
3175 if (r) {
3176 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3177 return r;
3178 }
3179
3180 if (!amdgpu_in_reset(adev))
3181 amdgpu_ras_set_error_query_ready(adev, true);
3182
3183 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3184 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3185
3186 amdgpu_device_fill_reset_magic(adev);
3187
3188 r = amdgpu_device_enable_mgpu_fan_boost();
3189 if (r)
3190 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3191
3192 /* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
3193 if (amdgpu_passthrough(adev) &&
3194 ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3195 adev->asic_type == CHIP_ALDEBARAN))
3196 amdgpu_dpm_handle_passthrough_sbr(adev, true);
3197
3198 if (adev->gmc.xgmi.num_physical_nodes > 1) {
3199 mutex_lock(&mgpu_info.mutex);
3200
3201 /*
3202 * Reset device p-state to low as this was booted with high.
3203 *
3204 * This should be performed only after all devices from the same
3205 * hive get initialized.
3206 *
3207 * However, the number of devices in a hive is not known in advance;
3208 * it is counted one by one as the devices initialize.
3209 *
3210 * So, we wait for all XGMI interlinked devices initialized.
3211 * This may bring some delays as those devices may come from
3212 * different hives. But that should be OK.
3213 */
3214 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3215 for (i = 0; i < mgpu_info.num_gpu; i++) {
3216 gpu_instance = &(mgpu_info.gpu_ins[i]);
3217 if (gpu_instance->adev->flags & AMD_IS_APU)
3218 continue;
3219
3220 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3221 AMDGPU_XGMI_PSTATE_MIN);
3222 if (r) {
3223 DRM_ERROR("pstate setting failed (%d).\n", r);
3224 break;
3225 }
3226 }
3227 }
3228
3229 mutex_unlock(&mgpu_info.mutex);
3230 }
3231
3232 return 0;
3233 }
3234
3235 /**
3236 * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3237 *
3238 * @adev: amdgpu_device pointer
3239 *
3240 * For ASICs that need to disable the SMC first
3241 */
3242 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3243 {
3244 int i, r;
3245
3246 if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3247 return;
3248
3249 for (i = 0; i < adev->num_ip_blocks; i++) {
3250 if (!adev->ip_blocks[i].status.hw)
3251 continue;
3252 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3253 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3254 /* XXX handle errors */
3255 if (r) {
3256 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3257 adev->ip_blocks[i].version->funcs->name, r);
3258 }
3259 adev->ip_blocks[i].status.hw = false;
3260 break;
3261 }
3262 }
3263 }
3264
3265 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3266 {
3267 int i, r;
3268
3269 for (i = 0; i < adev->num_ip_blocks; i++) {
3270 if (!adev->ip_blocks[i].version->funcs->early_fini)
3271 continue;
3272
3273 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
3274 if (r) {
3275 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3276 adev->ip_blocks[i].version->funcs->name, r);
3277 }
3278 }
3279
3280 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3281 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3282
3283 amdgpu_amdkfd_suspend(adev, false);
3284
3285 /* Workaround for ASICs that need to disable the SMC first */
3286 amdgpu_device_smu_fini_early(adev);
3287
3288 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3289 if (!adev->ip_blocks[i].status.hw)
3290 continue;
3291
3292 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3293 /* XXX handle errors */
3294 if (r) {
3295 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3296 adev->ip_blocks[i].version->funcs->name, r);
3297 }
3298
3299 adev->ip_blocks[i].status.hw = false;
3300 }
3301
3302 if (amdgpu_sriov_vf(adev)) {
3303 if (amdgpu_virt_release_full_gpu(adev, false))
3304 DRM_ERROR("failed to release exclusive mode on fini\n");
3305 }
3306
3307 return 0;
3308 }
3309
3310 /**
3311 * amdgpu_device_ip_fini - run fini for hardware IPs
3312 *
3313 * @adev: amdgpu_device pointer
3314 *
3315 * Main teardown pass for hardware IPs. The list of all the hardware
3316 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3317 * are run. hw_fini tears down the hardware associated with each IP
3318 * and sw_fini tears down any software state associated with each IP.
3319 * Returns 0 on success, negative error code on failure.
3320 */
3321 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3322 {
3323 int i, r;
3324
3325 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3326 amdgpu_virt_release_ras_err_handler_data(adev);
3327
3328 if (adev->gmc.xgmi.num_physical_nodes > 1)
3329 amdgpu_xgmi_remove_device(adev);
3330
3331 amdgpu_amdkfd_device_fini_sw(adev);
3332
3333 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3334 if (!adev->ip_blocks[i].status.sw)
3335 continue;
3336
3337 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3338 amdgpu_ucode_free_bo(adev);
3339 amdgpu_free_static_csa(&adev->virt.csa_obj);
3340 amdgpu_device_wb_fini(adev);
3341 amdgpu_device_mem_scratch_fini(adev);
3342 amdgpu_ib_pool_fini(adev);
3343 amdgpu_seq64_fini(adev);
3344 amdgpu_doorbell_fini(adev);
3345 }
3346
3347 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
3348 /* XXX handle errors */
3349 if (r) {
3350 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3351 adev->ip_blocks[i].version->funcs->name, r);
3352 }
3353 adev->ip_blocks[i].status.sw = false;
3354 adev->ip_blocks[i].status.valid = false;
3355 }
3356
3357 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3358 if (!adev->ip_blocks[i].status.late_initialized)
3359 continue;
3360 if (adev->ip_blocks[i].version->funcs->late_fini)
3361 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
3362 adev->ip_blocks[i].status.late_initialized = false;
3363 }
3364
3365 amdgpu_ras_fini(adev);
3366
3367 return 0;
3368 }
3369
3370 /**
3371 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3372 *
3373 * @work: work_struct.
3374 */
3375 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3376 {
3377 struct amdgpu_device *adev =
3378 container_of(work, struct amdgpu_device, delayed_init_work.work);
3379 int r;
3380
3381 r = amdgpu_ib_ring_tests(adev);
3382 if (r)
3383 DRM_ERROR("ib ring test failed (%d).\n", r);
3384 }
3385
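/*
 * Illustrative sketch (editor's example): the work handler above is armed
 * from the init/resume paths roughly as below; AMDGPU_RESUME_MS is assumed to
 * be the resume delay defined earlier in this file.
 */
static void __maybe_unused amdgpu_example_schedule_delayed_init(struct amdgpu_device *adev)
{
        schedule_delayed_work(&adev->delayed_init_work,
                              msecs_to_jiffies(AMDGPU_RESUME_MS));
}
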
3386 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3387 {
3388 struct amdgpu_device *adev =
3389 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3390
3391 WARN_ON_ONCE(adev->gfx.gfx_off_state);
3392 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3393
3394 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3395 adev->gfx.gfx_off_state = true;
3396 }
3397
3398 /**
3399 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3400 *
3401 * @adev: amdgpu_device pointer
3402 *
3403 * Main suspend function for hardware IPs. The list of all the hardware
3404 * IPs that make up the asic is walked, clockgating is disabled and the
3405 * suspend callbacks are run. suspend puts the hardware and software state
3406 * in each IP into a state suitable for suspend.
3407 * Returns 0 on success, negative error code on failure.
3408 */
3409 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3410 {
3411 int i, r;
3412
3413 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3414 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3415
3416 /*
3417 * Per the PMFW team's suggestion, the driver needs to handle disabling the
3418 * gfxoff and df cstate features for gpu reset (e.g. Mode1Reset) scenarios.
3419 * Add the missing df cstate disablement here.
3420 */
3421 if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3422 dev_warn(adev->dev, "Failed to disallow df cstate");
3423
3424 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3425 if (!adev->ip_blocks[i].status.valid)
3426 continue;
3427
3428 /* displays are handled separately */
3429 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3430 continue;
3431
3432 /* XXX handle errors */
3433 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3434 /* XXX handle errors */
3435 if (r) {
3436 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3437 adev->ip_blocks[i].version->funcs->name, r);
3438 return r;
3439 }
3440
3441 adev->ip_blocks[i].status.hw = false;
3442 }
3443
3444 return 0;
3445 }
3446
3447 /**
3448 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3449 *
3450 * @adev: amdgpu_device pointer
3451 *
3452 * Main suspend function for hardware IPs. The list of all the hardware
3453 * IPs that make up the asic is walked, clockgating is disabled and the
3454 * suspend callbacks are run. suspend puts the hardware and software state
3455 * in each IP into a state suitable for suspend.
3456 * Returns 0 on success, negative error code on failure.
3457 */
3458 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3459 {
3460 int i, r;
3461
3462 if (adev->in_s0ix)
3463 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3464
3465 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3466 if (!adev->ip_blocks[i].status.valid)
3467 continue;
3468 /* displays are handled in phase1 */
3469 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3470 continue;
3471 /* PSP lost connection when err_event_athub occurs */
3472 if (amdgpu_ras_intr_triggered() &&
3473 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3474 adev->ip_blocks[i].status.hw = false;
3475 continue;
3476 }
3477
3478 /* skip unnecessary suspend if we do not initialize them yet */
3479 if (adev->gmc.xgmi.pending_reset &&
3480 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3481 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3482 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3483 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3484 adev->ip_blocks[i].status.hw = false;
3485 continue;
3486 }
3487
3488 /* skip suspend of gfx/mes and psp for S0ix;
3489 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3490 * like at runtime. PSP is also part of the always on hardware
3491 * so no need to suspend it.
3492 */
3493 if (adev->in_s0ix &&
3494 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3495 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3496 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3497 continue;
3498
3499 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3500 if (adev->in_s0ix &&
3501 (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3502 IP_VERSION(5, 0, 0)) &&
3503 (adev->ip_blocks[i].version->type ==
3504 AMD_IP_BLOCK_TYPE_SDMA))
3505 continue;
3506
3507 /* During cold boot, swPSP provides the IMU and RLC FW binaries to the TOS.
3508 * These live in the TMR and are expected to be reused by the PSP TOS to
3509 * reload from that location; RLC autoload is also loaded from there based
3510 * on the PMFW -> PSP message during the re-init sequence. Therefore, skip
3511 * PSP suspend & resume to avoid destroying the TMR and reloading the FWs
3512 * again on IMU-enabled APU ASICs.
3513 */
3514 if (amdgpu_in_reset(adev) &&
3515 (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3516 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3517 continue;
3518
3519 /* XXX handle errors */
3520 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3521 /* XXX handle errors */
3522 if (r) {
3523 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3524 adev->ip_blocks[i].version->funcs->name, r);
3525 }
3526 adev->ip_blocks[i].status.hw = false;
3527 /* handle putting the SMC in the appropriate state */
3528 if (!amdgpu_sriov_vf(adev)) {
3529 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3530 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3531 if (r) {
3532 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3533 adev->mp1_state, r);
3534 return r;
3535 }
3536 }
3537 }
3538 }
3539
3540 return 0;
3541 }
3542
3543 /**
3544 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3545 *
3546 * @adev: amdgpu_device pointer
3547 *
3548 * Main suspend function for hardware IPs. The list of all the hardware
3549 * IPs that make up the asic is walked, clockgating is disabled and the
3550 * suspend callbacks are run. suspend puts the hardware and software state
3551 * in each IP into a state suitable for suspend.
3552 * Returns 0 on success, negative error code on failure.
3553 */
3554 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3555 {
3556 int r;
3557
3558 if (amdgpu_sriov_vf(adev)) {
3559 amdgpu_virt_fini_data_exchange(adev);
3560 amdgpu_virt_request_full_gpu(adev, false);
3561 }
3562
3563 amdgpu_ttm_set_buffer_funcs_status(adev, false);
3564
3565 r = amdgpu_device_ip_suspend_phase1(adev);
3566 if (r)
3567 return r;
3568 r = amdgpu_device_ip_suspend_phase2(adev);
3569
3570 if (amdgpu_sriov_vf(adev))
3571 amdgpu_virt_release_full_gpu(adev, false);
3572
3573 return r;
3574 }
3575
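/**
 * amdgpu_device_ip_reinit_early_sriov - reinit early hardware IPs for SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Re-runs the hw_init callbacks for the COMMON, GMC, PSP and IH blocks, in
 * that order, as the first stage of re-initializing a VF after a reset.
 * Returns 0 on success, negative error code on failure.
 */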
3576 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3577 {
3578 int i, r;
3579
3580 static enum amd_ip_block_type ip_order[] = {
3581 AMD_IP_BLOCK_TYPE_COMMON,
3582 AMD_IP_BLOCK_TYPE_GMC,
3583 AMD_IP_BLOCK_TYPE_PSP,
3584 AMD_IP_BLOCK_TYPE_IH,
3585 };
3586
3587 for (i = 0; i < adev->num_ip_blocks; i++) {
3588 int j;
3589 struct amdgpu_ip_block *block;
3590
3591 block = &adev->ip_blocks[i];
3592 block->status.hw = false;
3593
3594 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3595
3596 if (block->version->type != ip_order[j] ||
3597 !block->status.valid)
3598 continue;
3599
3600 r = block->version->funcs->hw_init(adev);
3601 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3602 if (r)
3603 return r;
3604 block->status.hw = true;
3605 }
3606 }
3607
3608 return 0;
3609 }
3610
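/**
 * amdgpu_device_ip_reinit_late_sriov - reinit remaining hardware IPs for SR-IOV
 *
 * @adev: amdgpu_device pointer
 *
 * Re-runs hw_init (or resume for the SMC block) for the SMC, DCE, GFX, SDMA,
 * MES, UVD, VCE, VCN and JPEG blocks, in that order, as the second stage of
 * re-initializing a VF after a reset.
 * Returns 0 on success, negative error code on failure.
 */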
3611 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3612 {
3613 int i, r;
3614
3615 static enum amd_ip_block_type ip_order[] = {
3616 AMD_IP_BLOCK_TYPE_SMC,
3617 AMD_IP_BLOCK_TYPE_DCE,
3618 AMD_IP_BLOCK_TYPE_GFX,
3619 AMD_IP_BLOCK_TYPE_SDMA,
3620 AMD_IP_BLOCK_TYPE_MES,
3621 AMD_IP_BLOCK_TYPE_UVD,
3622 AMD_IP_BLOCK_TYPE_VCE,
3623 AMD_IP_BLOCK_TYPE_VCN,
3624 AMD_IP_BLOCK_TYPE_JPEG
3625 };
3626
3627 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3628 int j;
3629 struct amdgpu_ip_block *block;
3630
3631 for (j = 0; j < adev->num_ip_blocks; j++) {
3632 block = &adev->ip_blocks[j];
3633
3634 if (block->version->type != ip_order[i] ||
3635 !block->status.valid ||
3636 block->status.hw)
3637 continue;
3638
3639 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3640 r = block->version->funcs->resume(adev);
3641 else
3642 r = block->version->funcs->hw_init(adev);
3643
3644 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3645 if (r)
3646 return r;
3647 block->status.hw = true;
3648 }
3649 }
3650
3651 return 0;
3652 }
3653
3654 /**
3655 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3656 *
3657 * @adev: amdgpu_device pointer
3658 *
3659 * First resume function for hardware IPs. The list of all the hardware
3660 * IPs that make up the asic is walked and the resume callbacks are run for
3661 * COMMON, GMC, IH, and (on SR-IOV) PSP. resume puts the hardware into a functional state
3662 * after a suspend and updates the software state as necessary. This
3663 * function is also used for restoring the GPU after a GPU reset.
3664 * Returns 0 on success, negative error code on failure.
3665 */
3666 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3667 {
3668 int i, r;
3669
3670 for (i = 0; i < adev->num_ip_blocks; i++) {
3671 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3672 continue;
3673 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3674 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3675 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3676 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3677
3678 r = adev->ip_blocks[i].version->funcs->resume(adev);
3679 if (r) {
3680 DRM_ERROR("resume of IP block <%s> failed %d\n",
3681 adev->ip_blocks[i].version->funcs->name, r);
3682 return r;
3683 }
3684 adev->ip_blocks[i].status.hw = true;
3685 }
3686 }
3687
3688 return 0;
3689 }
3690
3691 /**
3692 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3693 *
3694 * @adev: amdgpu_device pointer
3695 *
3696 * Second resume function for hardware IPs. The list of all the hardware
3697 * IPs that make up the asic is walked and the resume callbacks are run for
3698 * all blocks except COMMON, GMC, IH, DCE, and PSP. resume puts the hardware into a
3699 * functional state after a suspend and updates the software state as
3700 * necessary. This function is also used for restoring the GPU after a GPU
3701 * reset.
3702 * Returns 0 on success, negative error code on failure.
3703 */
3704 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3705 {
3706 int i, r;
3707
3708 for (i = 0; i < adev->num_ip_blocks; i++) {
3709 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3710 continue;
3711 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3712 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3713 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3714 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3715 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3716 continue;
3717 r = adev->ip_blocks[i].version->funcs->resume(adev);
3718 if (r) {
3719 DRM_ERROR("resume of IP block <%s> failed %d\n",
3720 adev->ip_blocks[i].version->funcs->name, r);
3721 return r;
3722 }
3723 adev->ip_blocks[i].status.hw = true;
3724 }
3725
3726 return 0;
3727 }
3728
3729 /**
3730 * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3731 *
3732 * @adev: amdgpu_device pointer
3733 *
3734 * Third resume function for hardware IPs. The list of all the hardware
3735 * IPs that make up the asic is walked and the resume callbacks are run for
3736 * the DCE (display) blocks. resume puts the hardware into a functional state after a suspend
3737 * and updates the software state as necessary. This function is also used
3738 * for restoring the GPU after a GPU reset.
3739 *
3740 * Returns 0 on success, negative error code on failure.
3741 */
3742 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3743 {
3744 int i, r;
3745
3746 for (i = 0; i < adev->num_ip_blocks; i++) {
3747 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3748 continue;
3749 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3750 r = adev->ip_blocks[i].version->funcs->resume(adev);
3751 if (r) {
3752 DRM_ERROR("resume of IP block <%s> failed %d\n",
3753 adev->ip_blocks[i].version->funcs->name, r);
3754 return r;
3755 }
3756 adev->ip_blocks[i].status.hw = true;
3757 }
3758 }
3759
3760 return 0;
3761 }
3762
3763 /**
3764 * amdgpu_device_ip_resume - run resume for hardware IPs
3765 *
3766 * @adev: amdgpu_device pointer
3767 *
3768 * Main resume function for hardware IPs. The hardware IPs
3769 * are split into several resume functions because they are
3770 * also used in recovering from a GPU reset and some additional
3771 * steps need to be taken between them. In this case (S3/S4) they are
3772 * run sequentially.
3773 * Returns 0 on success, negative error code on failure.
3774 */
3775 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3776 {
3777 int r;
3778
3779 r = amdgpu_device_ip_resume_phase1(adev);
3780 if (r)
3781 return r;
3782
3783 r = amdgpu_device_fw_loading(adev);
3784 if (r)
3785 return r;
3786
3787 r = amdgpu_device_ip_resume_phase2(adev);
3788
3789 if (adev->mman.buffer_funcs_ring->sched.ready)
3790 amdgpu_ttm_set_buffer_funcs_status(adev, true);
3791
3792 if (r)
3793 return r;
3794
3795 amdgpu_fence_driver_hw_init(adev);
3796
3797 r = amdgpu_device_ip_resume_phase3(adev);
3798
3799 return r;
3800 }
3801
3802 /**
3803 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3804 *
3805 * @adev: amdgpu_device pointer
3806 *
3807 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3808 */
3809 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3810 {
3811 if (amdgpu_sriov_vf(adev)) {
3812 if (adev->is_atom_fw) {
3813 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3814 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3815 } else {
3816 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3817 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3818 }
3819
3820 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3821 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3822 }
3823 }
3824
3825 /**
3826 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3827 *
3828 * @asic_type: AMD asic type
3829 *
3830 * Check if there is DC (new modesetting infrastructure) support for an asic.
3831 * Returns true if DC has support, false if not.
3832 */
3833 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3834 {
3835 switch (asic_type) {
3836 #ifdef CONFIG_DRM_AMDGPU_SI
3837 case CHIP_HAINAN:
3838 #endif
3839 case CHIP_TOPAZ:
3840 /* chips with no display hardware */
3841 return false;
3842 #if defined(CONFIG_DRM_AMD_DC)
3843 case CHIP_TAHITI:
3844 case CHIP_PITCAIRN:
3845 case CHIP_VERDE:
3846 case CHIP_OLAND:
3847 /*
3848 * We have systems in the wild with these ASICs that require
3849 * LVDS and VGA support which is not supported with DC.
3850 *
3851 * Fallback to the non-DC driver here by default so as not to
3852 * cause regressions.
3853 */
3854 #if defined(CONFIG_DRM_AMD_DC_SI)
3855 return amdgpu_dc > 0;
3856 #else
3857 return false;
3858 #endif
3859 case CHIP_BONAIRE:
3860 case CHIP_KAVERI:
3861 case CHIP_KABINI:
3862 case CHIP_MULLINS:
3863 /*
3864 * We have systems in the wild with these ASICs that require
3865 * VGA support which is not supported with DC.
3866 *
3867 * Fallback to the non-DC driver here by default so as not to
3868 * cause regressions.
3869 */
3870 return amdgpu_dc > 0;
3871 default:
3872 return amdgpu_dc != 0;
3873 #else
3874 default:
3875 if (amdgpu_dc > 0)
3876 DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3877 return false;
3878 #endif
3879 }
3880 }
3881
3882 /**
3883 * amdgpu_device_has_dc_support - check if dc is supported
3884 *
3885 * @adev: amdgpu_device pointer
3886 *
3887 * Returns true for supported, false for not supported
3888 */
3889 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3890 {
3891 if (adev->enable_virtual_display ||
3892 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3893 return false;
3894
3895 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3896 }
3897
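/**
 * amdgpu_device_xgmi_reset_func - reset a device that is part of an XGMI hive
 *
 * @__work: work struct embedded in the amdgpu_device
 *
 * Work handler that resets one device of an XGMI hive, synchronizing with
 * the other devices in the hive via a task barrier so that BACO entry/exit
 * or the full asic reset happens in lockstep across the hive.
 */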
3898 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3899 {
3900 struct amdgpu_device *adev =
3901 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3902 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3903
3904 /* It's a bug to not have a hive within this function */
3905 if (WARN_ON(!hive))
3906 return;
3907
3908 /*
3909 * Use task barrier to synchronize all xgmi reset works across the
3910 * hive. task_barrier_enter and task_barrier_exit will block
3911 * until all the threads running the xgmi reset works reach
3912 * those points. task_barrier_full will do both blocks.
3913 */
3914 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3915
3916 task_barrier_enter(&hive->tb);
3917 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3918
3919 if (adev->asic_reset_res)
3920 goto fail;
3921
3922 task_barrier_exit(&hive->tb);
3923 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3924
3925 if (adev->asic_reset_res)
3926 goto fail;
3927
3928 amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3929 } else {
3930
3931 task_barrier_full(&hive->tb);
3932 adev->asic_reset_res = amdgpu_asic_reset(adev);
3933 }
3934
3935 fail:
3936 if (adev->asic_reset_res)
3937 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3938 adev->asic_reset_res, adev_to_drm(adev)->unique);
3939 amdgpu_put_xgmi_hive(hive);
3940 }
3941
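/**
 * amdgpu_device_get_job_timeout_settings - parse the lockup timeout parameter
 *
 * @adev: amdgpu_device pointer
 *
 * Parses the amdgpu_lockup_timeout module parameter, a comma separated list
 * of up to four values applied in the order gfx, compute, sdma, video
 * (for example "10000,60000,10000,10000"). A value of 0 keeps the default,
 * a negative value disables the timeout, and a single value is applied to
 * all non-compute queues.
 * Returns 0 on success, negative error code on a malformed string.
 */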
3942 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3943 {
3944 char *input = amdgpu_lockup_timeout;
3945 char *timeout_setting = NULL;
3946 int index = 0;
3947 long timeout;
3948 int ret = 0;
3949
3950 /*
3951 * By default the timeout for non-compute jobs is 10000 ms
3952 * and 60000 ms for compute jobs.
3953 * For SR-IOV, the compute timeout is 60000 ms only when one VF
3954 * owns all compute resources (pp_one_vf), otherwise it is 10000 ms.
3955 */
3956 adev->gfx_timeout = msecs_to_jiffies(10000);
3957 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3958 if (amdgpu_sriov_vf(adev))
3959 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3960 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3961 else
3962 adev->compute_timeout = msecs_to_jiffies(60000);
3963
3964 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3965 while ((timeout_setting = strsep(&input, ",")) &&
3966 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3967 ret = kstrtol(timeout_setting, 0, &timeout);
3968 if (ret)
3969 return ret;
3970
3971 if (timeout == 0) {
3972 index++;
3973 continue;
3974 } else if (timeout < 0) {
3975 timeout = MAX_SCHEDULE_TIMEOUT;
3976 dev_warn(adev->dev, "lockup timeout disabled");
3977 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3978 } else {
3979 timeout = msecs_to_jiffies(timeout);
3980 }
3981
3982 switch (index++) {
3983 case 0:
3984 adev->gfx_timeout = timeout;
3985 break;
3986 case 1:
3987 adev->compute_timeout = timeout;
3988 break;
3989 case 2:
3990 adev->sdma_timeout = timeout;
3991 break;
3992 case 3:
3993 adev->video_timeout = timeout;
3994 break;
3995 default:
3996 break;
3997 }
3998 }
3999 /*
4000 * There is only one value specified and
4001 * it should apply to all non-compute jobs.
4002 */
4003 if (index == 1) {
4004 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4005 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4006 adev->compute_timeout = adev->gfx_timeout;
4007 }
4008 }
4009
4010 return ret;
4011 }
4012
4013 /**
4014 * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4015 *
4016 * @adev: amdgpu_device pointer
4017 *
4018 * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode
4019 */
4020 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4021 {
4022 struct iommu_domain *domain;
4023
4024 domain = iommu_get_domain_for_dev(adev->dev);
4025 if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4026 adev->ram_is_direct_mapped = true;
4027 }
4028
4029 #if defined(CONFIG_HSA_AMD_P2P)
4030 /**
4031 * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4032 *
4033 * @adev: amdgpu_device pointer
4034 *
4035 * Returns true if the IOMMU is remapping DMA (BAR) addresses, false otherwise.
4036 */
4037 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4038 {
4039 struct iommu_domain *domain;
4040
4041 domain = iommu_get_domain_for_dev(adev->dev);
4042 if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4043 domain->type == IOMMU_DOMAIN_DMA_FQ))
4044 return true;
4045
4046 return false;
4047 }
4048 #endif
4049
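/**
 * amdgpu_device_set_mcbp - configure mid command buffer preemption
 *
 * @adev: amdgpu_device pointer
 *
 * Enables or disables MCBP based on the amdgpu_mcbp module parameter and
 * forces it on for SR-IOV virtual functions.
 */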
4050 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4051 {
4052 if (amdgpu_mcbp == 1)
4053 adev->gfx.mcbp = true;
4054 else if (amdgpu_mcbp == 0)
4055 adev->gfx.mcbp = false;
4056
4057 if (amdgpu_sriov_vf(adev))
4058 adev->gfx.mcbp = true;
4059
4060 if (adev->gfx.mcbp)
4061 DRM_INFO("MCBP is enabled\n");
4062 }
4063
4064 /**
4065 * amdgpu_device_init - initialize the driver
4066 *
4067 * @adev: amdgpu_device pointer
4068 * @flags: driver flags
4069 *
4070 * Initializes the driver info and hw (all asics).
4071 * Returns 0 for success or an error on failure.
4072 * Called at driver startup.
4073 */
4074 int amdgpu_device_init(struct amdgpu_device *adev,
4075 uint32_t flags)
4076 {
4077 struct drm_device *ddev = adev_to_drm(adev);
4078 struct pci_dev *pdev = adev->pdev;
4079 int r, i;
4080 bool px = false;
4081 u32 max_MBps;
4082 int tmp;
4083
4084 adev->shutdown = false;
4085 adev->flags = flags;
4086
4087 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4088 adev->asic_type = amdgpu_force_asic_type;
4089 else
4090 adev->asic_type = flags & AMD_ASIC_MASK;
4091
4092 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4093 if (amdgpu_emu_mode == 1)
4094 adev->usec_timeout *= 10;
4095 adev->gmc.gart_size = 512 * 1024 * 1024;
4096 adev->accel_working = false;
4097 adev->num_rings = 0;
4098 RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4099 adev->mman.buffer_funcs = NULL;
4100 adev->mman.buffer_funcs_ring = NULL;
4101 adev->vm_manager.vm_pte_funcs = NULL;
4102 adev->vm_manager.vm_pte_num_scheds = 0;
4103 adev->gmc.gmc_funcs = NULL;
4104 adev->harvest_ip_mask = 0x0;
4105 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4106 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4107
4108 adev->smc_rreg = &amdgpu_invalid_rreg;
4109 adev->smc_wreg = &amdgpu_invalid_wreg;
4110 adev->pcie_rreg = &amdgpu_invalid_rreg;
4111 adev->pcie_wreg = &amdgpu_invalid_wreg;
4112 adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4113 adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4114 adev->pciep_rreg = &amdgpu_invalid_rreg;
4115 adev->pciep_wreg = &amdgpu_invalid_wreg;
4116 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4117 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4118 adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4119 adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4120 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4121 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4122 adev->didt_rreg = &amdgpu_invalid_rreg;
4123 adev->didt_wreg = &amdgpu_invalid_wreg;
4124 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4125 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4126 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4127 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4128
4129 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4130 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4131 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4132
4133 /* mutex initializations are all done here so we
4134 * can re-call functions without locking issues
4135 */
4136 mutex_init(&adev->firmware.mutex);
4137 mutex_init(&adev->pm.mutex);
4138 mutex_init(&adev->gfx.gpu_clock_mutex);
4139 mutex_init(&adev->srbm_mutex);
4140 mutex_init(&adev->gfx.pipe_reserve_mutex);
4141 mutex_init(&adev->gfx.gfx_off_mutex);
4142 mutex_init(&adev->gfx.partition_mutex);
4143 mutex_init(&adev->grbm_idx_mutex);
4144 mutex_init(&adev->mn_lock);
4145 mutex_init(&adev->virt.vf_errors.lock);
4146 hash_init(adev->mn_hash);
4147 mutex_init(&adev->psp.mutex);
4148 mutex_init(&adev->notifier_lock);
4149 mutex_init(&adev->pm.stable_pstate_ctx_lock);
4150 mutex_init(&adev->benchmark_mutex);
4151 mutex_init(&adev->gfx.reset_sem_mutex);
4152 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4153 mutex_init(&adev->enforce_isolation_mutex);
4154 mutex_init(&adev->gfx.kfd_sch_mutex);
4155
4156 amdgpu_device_init_apu_flags(adev);
4157
4158 r = amdgpu_device_check_arguments(adev);
4159 if (r)
4160 return r;
4161
4162 spin_lock_init(&adev->mmio_idx_lock);
4163 spin_lock_init(&adev->smc_idx_lock);
4164 spin_lock_init(&adev->pcie_idx_lock);
4165 spin_lock_init(&adev->uvd_ctx_idx_lock);
4166 spin_lock_init(&adev->didt_idx_lock);
4167 spin_lock_init(&adev->gc_cac_idx_lock);
4168 spin_lock_init(&adev->se_cac_idx_lock);
4169 spin_lock_init(&adev->audio_endpt_idx_lock);
4170 spin_lock_init(&adev->mm_stats.lock);
4171 spin_lock_init(&adev->virt.rlcg_reg_lock);
4172 spin_lock_init(&adev->wb.lock);
4173
4174 INIT_LIST_HEAD(&adev->reset_list);
4175
4176 INIT_LIST_HEAD(&adev->ras_list);
4177
4178 INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4179
4180 INIT_DELAYED_WORK(&adev->delayed_init_work,
4181 amdgpu_device_delayed_init_work_handler);
4182 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4183 amdgpu_device_delay_enable_gfx_off);
4184 /*
4185 * Initialize the enforce_isolation work structures for each XCP
4186 * partition. This work handler is responsible for enforcing shader
4187 * isolation on AMD GPUs. It counts the number of emitted fences for
4188 * each GFX and compute ring. If there are any fences, it schedules
4189 * the `enforce_isolation_work` to be run after a delay. If there are
4190 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4191 * runqueue.
4192 */
4193 for (i = 0; i < MAX_XCP; i++) {
4194 INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4195 amdgpu_gfx_enforce_isolation_handler);
4196 adev->gfx.enforce_isolation[i].adev = adev;
4197 adev->gfx.enforce_isolation[i].xcp_id = i;
4198 }
4199
4200 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4201
4202 adev->gfx.gfx_off_req_count = 1;
4203 adev->gfx.gfx_off_residency = 0;
4204 adev->gfx.gfx_off_entrycount = 0;
4205 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4206
4207 atomic_set(&adev->throttling_logging_enabled, 1);
4208 /*
4209 * If throttling continues, logging will be performed every minute
4210 * to avoid log flooding. "-1" is subtracted since the thermal
4211 * throttling interrupt comes every second. Thus, the total logging
4212 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4213 * for throttling interrupt) = 60 seconds.
4214 */
4215 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4216 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4217
4218 /* Registers mapping */
4219 /* TODO: block userspace mapping of io register */
4220 if (adev->asic_type >= CHIP_BONAIRE) {
4221 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4222 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4223 } else {
4224 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4225 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4226 }
4227
4228 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4229 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4230
4231 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4232 if (!adev->rmmio)
4233 return -ENOMEM;
4234
4235 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4236 DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4237
4238 /*
4239 * The reset domain needs to be present early, before the XGMI hive is
4240 * discovered (if any) and initialized, to use the reset semaphore and
4241 * in_gpu_reset flag early on during init and before calling RREG32.
4242 */
4243 adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4244 if (!adev->reset_domain)
4245 return -ENOMEM;
4246
4247 /* detect hw virtualization here */
4248 amdgpu_detect_virtualization(adev);
4249
4250 amdgpu_device_get_pcie_info(adev);
4251
4252 r = amdgpu_device_get_job_timeout_settings(adev);
4253 if (r) {
4254 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4255 return r;
4256 }
4257
4258 amdgpu_device_set_mcbp(adev);
4259
4260 /* early init functions */
4261 r = amdgpu_device_ip_early_init(adev);
4262 if (r)
4263 return r;
4264
4265 /* Get rid of things like offb */
4266 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
4267 if (r)
4268 return r;
4269
4270 /* Enable TMZ based on IP_VERSION */
4271 amdgpu_gmc_tmz_set(adev);
4272
4273 if (amdgpu_sriov_vf(adev) &&
4274 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4275 /* VF MMIO access (except mailbox range) from CPU
4276 * will be blocked during sriov runtime
4277 */
4278 adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4279
4280 amdgpu_gmc_noretry_set(adev);
4281 /* Need to get xgmi info early to decide the reset behavior */
4282 if (adev->gmc.xgmi.supported) {
4283 r = adev->gfxhub.funcs->get_xgmi_info(adev);
4284 if (r)
4285 return r;
4286 }
4287
4288 /* enable PCIE atomic ops */
4289 if (amdgpu_sriov_vf(adev)) {
4290 if (adev->virt.fw_reserve.p_pf2vf)
4291 adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4292 adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4293 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4294 /* APUs with gfx9 onwards don't rely on PCIe atomics; the internal
4295 * path natively supports atomics, so set have_atomics_support to true.
4296 */
4297 } else if ((adev->flags & AMD_IS_APU) &&
4298 (amdgpu_ip_version(adev, GC_HWIP, 0) >
4299 IP_VERSION(9, 0, 0))) {
4300 adev->have_atomics_support = true;
4301 } else {
4302 adev->have_atomics_support =
4303 !pci_enable_atomic_ops_to_root(adev->pdev,
4304 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4305 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4306 }
4307
4308 if (!adev->have_atomics_support)
4309 dev_info(adev->dev, "PCIE atomic ops are not supported\n");
4310
4311 /* doorbell bar mapping and doorbell index init */
4312 amdgpu_doorbell_init(adev);
4313
4314 if (amdgpu_emu_mode == 1) {
4315 /* post the asic on emulation mode */
4316 emu_soc_asic_init(adev);
4317 goto fence_driver_init;
4318 }
4319
4320 amdgpu_reset_init(adev);
4321
4322 /* detect if we are with an SRIOV vbios */
4323 if (adev->bios)
4324 amdgpu_device_detect_sriov_bios(adev);
4325
4326 /* check if we need to reset the asic
4327 * E.g., driver was not cleanly unloaded previously, etc.
4328 */
4329 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4330 if (adev->gmc.xgmi.num_physical_nodes) {
4331 dev_info(adev->dev, "Pending hive reset.\n");
4332 adev->gmc.xgmi.pending_reset = true;
4333 /* Only need to init necessary block for SMU to handle the reset */
4334 for (i = 0; i < adev->num_ip_blocks; i++) {
4335 if (!adev->ip_blocks[i].status.valid)
4336 continue;
4337 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4338 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4339 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4340 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
4341 DRM_DEBUG("IP %s disabled for hw_init.\n",
4342 adev->ip_blocks[i].version->funcs->name);
4343 adev->ip_blocks[i].status.hw = true;
4344 }
4345 }
4346 } else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4347 !amdgpu_device_has_display_hardware(adev)) {
4348 r = psp_gpu_reset(adev);
4349 } else {
4350 tmp = amdgpu_reset_method;
4351 /* It should do a default reset when loading or reloading the driver,
4352 * regardless of the module parameter reset_method.
4353 */
4354 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4355 r = amdgpu_asic_reset(adev);
4356 amdgpu_reset_method = tmp;
4357 }
4358
4359 if (r) {
4360 dev_err(adev->dev, "asic reset on init failed\n");
4361 goto failed;
4362 }
4363 }
4364
4365 /* Post card if necessary */
4366 if (amdgpu_device_need_post(adev)) {
4367 if (!adev->bios) {
4368 dev_err(adev->dev, "no vBIOS found\n");
4369 r = -EINVAL;
4370 goto failed;
4371 }
4372 DRM_INFO("GPU posting now...\n");
4373 r = amdgpu_device_asic_init(adev);
4374 if (r) {
4375 dev_err(adev->dev, "gpu post error!\n");
4376 goto failed;
4377 }
4378 }
4379
4380 if (adev->bios) {
4381 if (adev->is_atom_fw) {
4382 /* Initialize clocks */
4383 r = amdgpu_atomfirmware_get_clock_info(adev);
4384 if (r) {
4385 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4386 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4387 goto failed;
4388 }
4389 } else {
4390 /* Initialize clocks */
4391 r = amdgpu_atombios_get_clock_info(adev);
4392 if (r) {
4393 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4394 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4395 goto failed;
4396 }
4397 /* init i2c buses */
4398 if (!amdgpu_device_has_dc_support(adev))
4399 amdgpu_atombios_i2c_init(adev);
4400 }
4401 }
4402
4403 fence_driver_init:
4404 /* Fence driver */
4405 r = amdgpu_fence_driver_sw_init(adev);
4406 if (r) {
4407 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4408 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4409 goto failed;
4410 }
4411
4412 /* init the mode config */
4413 drm_mode_config_init(adev_to_drm(adev));
4414
4415 r = amdgpu_device_ip_init(adev);
4416 if (r) {
4417 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4418 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4419 goto release_ras_con;
4420 }
4421
4422 amdgpu_fence_driver_hw_init(adev);
4423
4424 dev_info(adev->dev,
4425 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4426 adev->gfx.config.max_shader_engines,
4427 adev->gfx.config.max_sh_per_se,
4428 adev->gfx.config.max_cu_per_sh,
4429 adev->gfx.cu_info.number);
4430
4431 adev->accel_working = true;
4432
4433 amdgpu_vm_check_compute_bug(adev);
4434
4435 /* Initialize the buffer migration limit. */
4436 if (amdgpu_moverate >= 0)
4437 max_MBps = amdgpu_moverate;
4438 else
4439 max_MBps = 8; /* Allow 8 MB/s. */
4440 /* Get a log2 for easy divisions. */
4441 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4442
4443 /*
4444 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4445 * Otherwise the mgpu fan boost feature will be skipped because the
4446 * gpu instance count would come up short.
4447 */
4448 amdgpu_register_gpu_instance(adev);
4449
4450 /* enable clockgating, etc. after ib tests, etc. since some blocks require
4451 * explicit gating rather than handling it automatically.
4452 */
4453 if (!adev->gmc.xgmi.pending_reset) {
4454 r = amdgpu_device_ip_late_init(adev);
4455 if (r) {
4456 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4457 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4458 goto release_ras_con;
4459 }
4460 /* must succeed. */
4461 amdgpu_ras_resume(adev);
4462 queue_delayed_work(system_wq, &adev->delayed_init_work,
4463 msecs_to_jiffies(AMDGPU_RESUME_MS));
4464 }
4465
4466 if (amdgpu_sriov_vf(adev)) {
4467 amdgpu_virt_release_full_gpu(adev, true);
4468 flush_delayed_work(&adev->delayed_init_work);
4469 }
4470
4471 /*
4472 * Place the sysfs registration after `late_init`, as some of the
4473 * operations performed in `late_init` might affect the creation of
4474 * the sysfs interfaces.
4475 */
4476 r = amdgpu_atombios_sysfs_init(adev);
4477 if (r)
4478 drm_err(&adev->ddev,
4479 "registering atombios sysfs failed (%d).\n", r);
4480
4481 r = amdgpu_pm_sysfs_init(adev);
4482 if (r)
4483 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4484
4485 r = amdgpu_ucode_sysfs_init(adev);
4486 if (r) {
4487 adev->ucode_sysfs_en = false;
4488 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4489 } else
4490 adev->ucode_sysfs_en = true;
4491
4492 r = amdgpu_device_attr_sysfs_init(adev);
4493 if (r)
4494 dev_err(adev->dev, "Could not create amdgpu device attr\n");
4495
4496 r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4497 if (r)
4498 dev_err(adev->dev,
4499 "Could not create amdgpu board attributes\n");
4500
4501 amdgpu_fru_sysfs_init(adev);
4502 amdgpu_reg_state_sysfs_init(adev);
4503
4504 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
4505 r = amdgpu_pmu_init(adev);
4506 if (r)
4507 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
}
4508
4509 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
4510 if (amdgpu_device_cache_pci_state(adev->pdev))
4511 pci_restore_state(pdev);
4512
4513 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
4514 /* this will fail for cards that aren't VGA class devices, just
4515 * ignore it
4516 */
4517 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4518 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4519
4520 px = amdgpu_device_supports_px(ddev);
4521
4522 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4523 apple_gmux_detect(NULL, NULL)))
4524 vga_switcheroo_register_client(adev->pdev,
4525 &amdgpu_switcheroo_ops, px);
4526
4527 if (px)
4528 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4529
4530 if (adev->gmc.xgmi.pending_reset)
4531 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
4532 msecs_to_jiffies(AMDGPU_RESUME_MS));
4533
4534 amdgpu_device_check_iommu_direct_map(adev);
4535
4536 adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4537 r = register_pm_notifier(&adev->pm_nb);
4538 if (r)
4539 goto failed;
4540
4541 return 0;
4542
4543 release_ras_con:
4544 if (amdgpu_sriov_vf(adev))
4545 amdgpu_virt_release_full_gpu(adev, true);
4546
4547 /* failed in exclusive mode due to timeout */
4548 if (amdgpu_sriov_vf(adev) &&
4549 !amdgpu_sriov_runtime(adev) &&
4550 amdgpu_virt_mmio_blocked(adev) &&
4551 !amdgpu_virt_wait_reset(adev)) {
4552 dev_err(adev->dev, "VF exclusive mode timeout\n");
4553 /* Don't send request since VF is inactive. */
4554 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4555 adev->virt.ops = NULL;
4556 r = -EAGAIN;
4557 }
4558 amdgpu_release_ras_context(adev);
4559
4560 failed:
4561 amdgpu_vf_error_trans_all(adev);
4562
4563 return r;
4564 }
4565
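/**
 * amdgpu_device_unmap_mmio - unmap all device MMIO resources
 *
 * @adev: amdgpu_device pointer
 *
 * Clears all CPU mappings pointing to the device and unmaps the doorbell,
 * register and VRAM apertures. Used once the device has been unplugged.
 */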
4566 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4567 {
4568
4569 /* Clear all CPU mappings pointing to this device */
4570 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4571
4572 /* Unmap all mapped bars - Doorbell, registers and VRAM */
4573 amdgpu_doorbell_fini(adev);
4574
4575 iounmap(adev->rmmio);
4576 adev->rmmio = NULL;
4577 if (adev->mman.aper_base_kaddr)
4578 iounmap(adev->mman.aper_base_kaddr);
4579 adev->mman.aper_base_kaddr = NULL;
4580
4581 /* Memory manager related */
4582 if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4583 arch_phys_wc_del(adev->gmc.vram_mtrr);
4584 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4585 }
4586 }
4587
4588 /**
4589 * amdgpu_device_fini_hw - tear down the driver
4590 *
4591 * @adev: amdgpu_device pointer
4592 *
4593 * Tear down the driver info (all asics).
4594 * Called at driver shutdown.
4595 */
4596 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4597 {
4598 dev_info(adev->dev, "amdgpu: finishing device.\n");
4599 flush_delayed_work(&adev->delayed_init_work);
4600
4601 if (adev->mman.initialized)
4602 drain_workqueue(adev->mman.bdev.wq);
4603 adev->shutdown = true;
4604
4605 unregister_pm_notifier(&adev->pm_nb);
4606
4607 /* make sure IB test finished before entering exclusive mode
4608 * to avoid preemption on IB test
4609 */
4610 if (amdgpu_sriov_vf(adev)) {
4611 amdgpu_virt_request_full_gpu(adev, false);
4612 amdgpu_virt_fini_data_exchange(adev);
4613 }
4614
4615 /* disable all interrupts */
4616 amdgpu_irq_disable_all(adev);
4617 if (adev->mode_info.mode_config_initialized) {
4618 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4619 drm_helper_force_disable_all(adev_to_drm(adev));
4620 else
4621 drm_atomic_helper_shutdown(adev_to_drm(adev));
4622 }
4623 amdgpu_fence_driver_hw_fini(adev);
4624
4625 if (adev->pm.sysfs_initialized)
4626 amdgpu_pm_sysfs_fini(adev);
4627 if (adev->ucode_sysfs_en)
4628 amdgpu_ucode_sysfs_fini(adev);
4629 amdgpu_device_attr_sysfs_fini(adev);
4630 amdgpu_fru_sysfs_fini(adev);
4631
4632 amdgpu_reg_state_sysfs_fini(adev);
4633
4634 /* RAS features must be disabled before hw fini */
4635 amdgpu_ras_pre_fini(adev);
4636
4637 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4638
4639 amdgpu_device_ip_fini_early(adev);
4640
4641 amdgpu_irq_fini_hw(adev);
4642
4643 if (adev->mman.initialized)
4644 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4645
4646 amdgpu_gart_dummy_page_fini(adev);
4647
4648 if (drm_dev_is_unplugged(adev_to_drm(adev)))
4649 amdgpu_device_unmap_mmio(adev);
4650
4651 }
4652
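/**
 * amdgpu_device_fini_sw - tear down the driver software state
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down the software state (all asics) after the hardware side has been
 * shut down in amdgpu_device_fini_hw(). Called at driver shutdown.
 */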
4653 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4654 {
4655 int idx;
4656 bool px;
4657
4658 amdgpu_device_ip_fini(adev);
4659 amdgpu_fence_driver_sw_fini(adev);
4660 amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4661 adev->accel_working = false;
4662 dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4663
4664 amdgpu_reset_fini(adev);
4665
4666 /* free i2c buses */
4667 if (!amdgpu_device_has_dc_support(adev))
4668 amdgpu_i2c_fini(adev);
4669
4670 if (amdgpu_emu_mode != 1)
4671 amdgpu_atombios_fini(adev);
4672
4673 kfree(adev->bios);
4674 adev->bios = NULL;
4675
4676 kfree(adev->fru_info);
4677 adev->fru_info = NULL;
4678
4679 kfree(adev->xcp_mgr);
4680 adev->xcp_mgr = NULL;
4681
4682 px = amdgpu_device_supports_px(adev_to_drm(adev));
4683
4684 if (px || (!dev_is_removable(&adev->pdev->dev) &&
4685 apple_gmux_detect(NULL, NULL)))
4686 vga_switcheroo_unregister_client(adev->pdev);
4687
4688 if (px)
4689 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4690
4691 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4692 vga_client_unregister(adev->pdev);
4693
4694 if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4695
4696 iounmap(adev->rmmio);
4697 adev->rmmio = NULL;
4698 drm_dev_exit(idx);
4699 }
4700
4701 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4702 amdgpu_pmu_fini(adev);
4703 if (adev->mman.discovery_bin)
4704 amdgpu_discovery_fini(adev);
4705
4706 amdgpu_reset_put_reset_domain(adev->reset_domain);
4707 adev->reset_domain = NULL;
4708
4709 kfree(adev->pci_state);
4710
4711 }
4712
4713 /**
4714 * amdgpu_device_evict_resources - evict device resources
4715 * @adev: amdgpu device object
4716 *
4717 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4718 * of the vram memory type. Mainly used for evicting device resources
4719 * at suspend time.
4720 *
4721 */
4722 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4723 {
4724 int ret;
4725
4726 /* No need to evict vram on APUs for suspend to ram or s2idle */
4727 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4728 return 0;
4729
4730 ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4731 if (ret)
4732 DRM_WARN("evicting device resources failed\n");
4733 return ret;
4734 }
4735
4736 /*
4737 * Suspend & resume.
4738 */
4739 /**
4740 * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4741 * @nb: notifier block
4742 * @mode: suspend mode
4743 * @data: data
4744 *
4745 * This function is called when the system is about to suspend or hibernate.
4746 * It is used to set the appropriate flags so that eviction can be optimized
4747 * in the pm prepare callback.
4748 */
4749 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4750 void *data)
4751 {
4752 struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4753
4754 switch (mode) {
4755 case PM_HIBERNATION_PREPARE:
4756 adev->in_s4 = true;
4757 break;
4758 case PM_POST_HIBERNATION:
4759 adev->in_s4 = false;
4760 break;
4761 }
4762
4763 return NOTIFY_DONE;
4764 }
4765
4766 /**
4767 * amdgpu_device_prepare - prepare for device suspend
4768 *
4769 * @dev: drm dev pointer
4770 *
4771 * Prepare to put the hw in the suspend state (all asics).
4772 * Returns 0 for success or an error on failure.
4773 * Called at driver suspend.
4774 */
4775 int amdgpu_device_prepare(struct drm_device *dev)
4776 {
4777 struct amdgpu_device *adev = drm_to_adev(dev);
4778 int i, r;
4779
4780 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4781 return 0;
4782
4783 /* Evict the majority of BOs before starting suspend sequence */
4784 r = amdgpu_device_evict_resources(adev);
4785 if (r)
4786 return r;
4787
4788 flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4789
4790 for (i = 0; i < adev->num_ip_blocks; i++) {
4791 if (!adev->ip_blocks[i].status.valid)
4792 continue;
4793 if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4794 continue;
4795 r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4796 if (r)
4797 return r;
4798 }
4799
4800 return 0;
4801 }
4802
4803 /**
4804 * amdgpu_device_suspend - initiate device suspend
4805 *
4806 * @dev: drm dev pointer
4807 * @fbcon: notify the fbdev of suspend
4808 *
4809 * Puts the hw in the suspend state (all asics).
4810 * Returns 0 for success or an error on failure.
4811 * Called at driver suspend.
4812 */
4813 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4814 {
4815 struct amdgpu_device *adev = drm_to_adev(dev);
4816 int r = 0;
4817
4818 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4819 return 0;
4820
4821 adev->in_suspend = true;
4822
4823 if (amdgpu_sriov_vf(adev)) {
4824 amdgpu_virt_fini_data_exchange(adev);
4825 r = amdgpu_virt_request_full_gpu(adev, false);
4826 if (r)
4827 return r;
4828 }
4829
4830 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4831 DRM_WARN("smart shift update failed\n");
4832
4833 if (fbcon)
4834 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4835
4836 cancel_delayed_work_sync(&adev->delayed_init_work);
4837
4838 amdgpu_ras_suspend(adev);
4839
4840 amdgpu_device_ip_suspend_phase1(adev);
4841
4842 if (!adev->in_s0ix)
4843 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4844
4845 r = amdgpu_device_evict_resources(adev);
4846 if (r)
4847 return r;
4848
4849 amdgpu_ttm_set_buffer_funcs_status(adev, false);
4850
4851 amdgpu_fence_driver_hw_fini(adev);
4852
4853 amdgpu_device_ip_suspend_phase2(adev);
4854
4855 if (amdgpu_sriov_vf(adev))
4856 amdgpu_virt_release_full_gpu(adev, false);
4857
4858 r = amdgpu_dpm_notify_rlc_state(adev, false);
4859 if (r)
4860 return r;
4861
4862 return 0;
4863 }
4864
4865 /**
4866 * amdgpu_device_resume - initiate device resume
4867 *
4868 * @dev: drm dev pointer
4869 * @fbcon: notify the fbdev of resume
4870 *
4871 * Bring the hw back to operating state (all asics).
4872 * Returns 0 for success or an error on failure.
4873 * Called at driver resume.
4874 */
4875 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4876 {
4877 struct amdgpu_device *adev = drm_to_adev(dev);
4878 int r = 0;
4879
4880 if (amdgpu_sriov_vf(adev)) {
4881 r = amdgpu_virt_request_full_gpu(adev, true);
4882 if (r)
4883 return r;
4884 }
4885
4886 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4887 return 0;
4888
4889 if (adev->in_s0ix)
4890 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4891
4892 /* post card */
4893 if (amdgpu_device_need_post(adev)) {
4894 r = amdgpu_device_asic_init(adev);
4895 if (r)
4896 dev_err(adev->dev, "amdgpu asic init failed\n");
4897 }
4898
4899 r = amdgpu_device_ip_resume(adev);
4900
4901 if (r) {
4902 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4903 goto exit;
4904 }
4905
4906 if (!adev->in_s0ix) {
4907 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4908 if (r)
4909 goto exit;
4910 }
4911
4912 r = amdgpu_device_ip_late_init(adev);
4913 if (r)
4914 goto exit;
4915
4916 queue_delayed_work(system_wq, &adev->delayed_init_work,
4917 msecs_to_jiffies(AMDGPU_RESUME_MS));
4918 exit:
4919 if (amdgpu_sriov_vf(adev)) {
4920 amdgpu_virt_init_data_exchange(adev);
4921 amdgpu_virt_release_full_gpu(adev, true);
4922 }
4923
4924 if (r)
4925 return r;
4926
4927 /* Make sure IB tests flushed */
4928 flush_delayed_work(&adev->delayed_init_work);
4929
4930 if (fbcon)
4931 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4932
4933 amdgpu_ras_resume(adev);
4934
4935 if (adev->mode_info.num_crtc) {
4936 /*
4937 * Most of the connector probing functions try to acquire runtime pm
4938 * refs to ensure that the GPU is powered on when connector polling is
4939 * performed. Since we're calling this from a runtime PM callback,
4940 * trying to acquire rpm refs will cause us to deadlock.
4941 *
4942 * Since we're guaranteed to be holding the rpm lock, it's safe to
4943 * temporarily disable the rpm helpers so this doesn't deadlock us.
4944 */
4945 #ifdef CONFIG_PM
4946 dev->dev->power.disable_depth++;
4947 #endif
4948 if (!adev->dc_enabled)
4949 drm_helper_hpd_irq_event(dev);
4950 else
4951 drm_kms_helper_hotplug_event(dev);
4952 #ifdef CONFIG_PM
4953 dev->dev->power.disable_depth--;
4954 #endif
4955 }
4956
4957 amdgpu_vram_mgr_clear_reset_blocks(adev);
4958 adev->in_suspend = false;
4959
4960 if (adev->enable_mes)
4961 amdgpu_mes_self_test(adev);
4962
4963 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4964 DRM_WARN("smart shift update failed\n");
4965
4966 return 0;
4967 }
4968
4969 /**
4970 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4971 *
4972 * @adev: amdgpu_device pointer
4973 *
4974 * The list of all the hardware IPs that make up the asic is walked and
4975 * the check_soft_reset callbacks are run. check_soft_reset determines
4976 * if the asic is still hung or not.
4977 * Returns true if any of the IPs are still in a hung state, false if not.
4978 */
4979 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4980 {
4981 int i;
4982 bool asic_hang = false;
4983
4984 if (amdgpu_sriov_vf(adev))
4985 return true;
4986
4987 if (amdgpu_asic_need_full_reset(adev))
4988 return true;
4989
4990 for (i = 0; i < adev->num_ip_blocks; i++) {
4991 if (!adev->ip_blocks[i].status.valid)
4992 continue;
4993 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4994 adev->ip_blocks[i].status.hang =
4995 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4996 if (adev->ip_blocks[i].status.hang) {
4997 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4998 asic_hang = true;
4999 }
5000 }
5001 return asic_hang;
5002 }
5003
5004 /**
5005 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5006 *
5007 * @adev: amdgpu_device pointer
5008 *
5009 * The list of all the hardware IPs that make up the asic is walked and the
5010 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
5011 * handles any IP specific hardware or software state changes that are
5012 * necessary for a soft reset to succeed.
5013 * Returns 0 on success, negative error code on failure.
5014 */
5015 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5016 {
5017 int i, r = 0;
5018
5019 for (i = 0; i < adev->num_ip_blocks; i++) {
5020 if (!adev->ip_blocks[i].status.valid)
5021 continue;
5022 if (adev->ip_blocks[i].status.hang &&
5023 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5024 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
5025 if (r)
5026 return r;
5027 }
5028 }
5029
5030 return 0;
5031 }
5032
5033 /**
5034 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5035 *
5036 * @adev: amdgpu_device pointer
5037 *
5038 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
5039 * reset is necessary to recover.
5040 * Returns true if a full asic reset is required, false if not.
5041 */
5042 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5043 {
5044 int i;
5045
5046 if (amdgpu_asic_need_full_reset(adev))
5047 return true;
5048
5049 for (i = 0; i < adev->num_ip_blocks; i++) {
5050 if (!adev->ip_blocks[i].status.valid)
5051 continue;
5052 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5053 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5054 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5055 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5056 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5057 if (adev->ip_blocks[i].status.hang) {
5058 dev_info(adev->dev, "Some blocks need full reset!\n");
5059 return true;
5060 }
5061 }
5062 }
5063 return false;
5064 }
5065
5066 /**
5067 * amdgpu_device_ip_soft_reset - do a soft reset
5068 *
5069 * @adev: amdgpu_device pointer
5070 *
5071 * The list of all the hardware IPs that make up the asic is walked and the
5072 * soft_reset callbacks are run if the block is hung. soft_reset handles any
5073 * IP specific hardware or software state changes that are necessary to soft
5074 * reset the IP.
5075 * Returns 0 on success, negative error code on failure.
5076 */
5077 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5078 {
5079 int i, r = 0;
5080
5081 for (i = 0; i < adev->num_ip_blocks; i++) {
5082 if (!adev->ip_blocks[i].status.valid)
5083 continue;
5084 if (adev->ip_blocks[i].status.hang &&
5085 adev->ip_blocks[i].version->funcs->soft_reset) {
5086 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
5087 if (r)
5088 return r;
5089 }
5090 }
5091
5092 return 0;
5093 }
5094
5095 /**
5096 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5097 *
5098 * @adev: amdgpu_device pointer
5099 *
5100 * The list of all the hardware IPs that make up the asic is walked and the
5101 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
5102 * handles any IP specific hardware or software state changes that are
5103 * necessary after the IP has been soft reset.
5104 * Returns 0 on success, negative error code on failure.
5105 */
5106 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5107 {
5108 int i, r = 0;
5109
5110 for (i = 0; i < adev->num_ip_blocks; i++) {
5111 if (!adev->ip_blocks[i].status.valid)
5112 continue;
5113 if (adev->ip_blocks[i].status.hang &&
5114 adev->ip_blocks[i].version->funcs->post_soft_reset)
5115 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
5116 if (r)
5117 return r;
5118 }
5119
5120 return 0;
5121 }
5122
5123 /**
5124 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5125 *
5126 * @adev: amdgpu_device pointer
5127 * @reset_context: amdgpu reset context pointer
5128 *
5129 * Perform a VF FLR and reinitialize the asic.
5130 * Returns 0 on success, negative error code on failure.
5131 */
5132 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5133 struct amdgpu_reset_context *reset_context)
5134 {
5135 int r;
5136 struct amdgpu_hive_info *hive = NULL;
5137
5138 if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5139 if (!amdgpu_ras_get_fed_status(adev))
5140 amdgpu_virt_ready_to_reset(adev);
5141 amdgpu_virt_wait_reset(adev);
5142 clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5143 r = amdgpu_virt_request_full_gpu(adev, true);
5144 } else {
5145 r = amdgpu_virt_reset_gpu(adev);
5146 }
5147 if (r)
5148 return r;
5149
5150 amdgpu_ras_set_fed(adev, false);
5151 amdgpu_irq_gpu_reset_resume_helper(adev);
5152
5153 /* some SW cleanup the VF needs to do before recovery */
5154 amdgpu_virt_post_reset(adev);
5155
5156 /* Resume IP prior to SMC */
5157 r = amdgpu_device_ip_reinit_early_sriov(adev);
5158 if (r)
5159 return r;
5160
5161 amdgpu_virt_init_data_exchange(adev);
5162
5163 r = amdgpu_device_fw_loading(adev);
5164 if (r)
5165 return r;
5166
5167 /* now we are okay to resume SMC/CP/SDMA */
5168 r = amdgpu_device_ip_reinit_late_sriov(adev);
5169 if (r)
5170 return r;
5171
5172 hive = amdgpu_get_xgmi_hive(adev);
5173 /* Update PSP FW topology after reset */
5174 if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5175 r = amdgpu_xgmi_update_topology(hive, adev);
5176 if (hive)
5177 amdgpu_put_xgmi_hive(hive);
5178 if (r)
5179 return r;
5180
5181 r = amdgpu_ib_ring_tests(adev);
5182 if (r)
5183 return r;
5184
5185 if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5186 amdgpu_inc_vram_lost(adev);
5187
5188 /* need to be called during full access so we can't do it later like
5189 * bare-metal does.
5190 */
5191 amdgpu_amdkfd_post_reset(adev);
5192 amdgpu_virt_release_full_gpu(adev, true);
5193
5194 /* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so RAS needs to be resumed during reset */
5195 if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5196 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5197 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5198 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5199 amdgpu_ras_resume(adev);
5200 return 0;
5201 }
5202
5203 /**
5204 * amdgpu_device_has_job_running - check if there is any job in mirror list
5205 *
5206 * @adev: amdgpu_device pointer
5207 *
5208 * check if there is any job in mirror list
5209 */
5210 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5211 {
5212 int i;
5213 struct drm_sched_job *job;
5214
5215 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5216 struct amdgpu_ring *ring = adev->rings[i];
5217
5218 if (!amdgpu_ring_sched_ready(ring))
5219 continue;
5220
5221 spin_lock(&ring->sched.job_list_lock);
5222 job = list_first_entry_or_null(&ring->sched.pending_list,
5223 struct drm_sched_job, list);
5224 spin_unlock(&ring->sched.job_list_lock);
5225 if (job)
5226 return true;
5227 }
5228 return false;
5229 }
5230
5231 /**
5232 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5233 *
5234 * @adev: amdgpu_device pointer
5235 *
5236 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5237 * a hung GPU.
5238 */
5239 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5240 {
5241
5242 if (amdgpu_gpu_recovery == 0)
5243 goto disabled;
5244
5245 /* Skip soft reset check in fatal error mode */
5246 if (!amdgpu_ras_is_poison_mode_supported(adev))
5247 return true;
5248
5249 if (amdgpu_sriov_vf(adev))
5250 return true;
5251
5252 if (amdgpu_gpu_recovery == -1) {
5253 switch (adev->asic_type) {
5254 #ifdef CONFIG_DRM_AMDGPU_SI
5255 case CHIP_VERDE:
5256 case CHIP_TAHITI:
5257 case CHIP_PITCAIRN:
5258 case CHIP_OLAND:
5259 case CHIP_HAINAN:
5260 #endif
5261 #ifdef CONFIG_DRM_AMDGPU_CIK
5262 case CHIP_KAVERI:
5263 case CHIP_KABINI:
5264 case CHIP_MULLINS:
5265 #endif
5266 case CHIP_CARRIZO:
5267 case CHIP_STONEY:
5268 case CHIP_CYAN_SKILLFISH:
5269 goto disabled;
5270 default:
5271 break;
5272 }
5273 }
5274
5275 return true;
5276
5277 disabled:
5278 dev_info(adev->dev, "GPU recovery disabled.\n");
5279 return false;
5280 }
5281
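/**
 * amdgpu_device_mode1_reset - perform a full ASIC mode1 reset
 *
 * @adev: amdgpu_device pointer
 *
 * Cache the PCI config space, disable bus mastering and trigger a mode1
 * reset through the SMU (if supported) or the PSP, then restore the PCI
 * state and wait for the ASIC to come back out of reset.
 * Returns 0 for success or an error on failure.
 */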
5282 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5283 {
5284 u32 i;
5285 int ret = 0;
5286
5287 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5288
5289 dev_info(adev->dev, "GPU mode1 reset\n");
5290
5291 /* Cache the state before bus master disable. The saved config space
5292 * values are used in other cases like restore after mode-2 reset.
5293 */
5294 amdgpu_device_cache_pci_state(adev->pdev);
5295
5296 /* disable BM */
5297 pci_clear_master(adev->pdev);
5298
5299 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5300 dev_info(adev->dev, "GPU smu mode1 reset\n");
5301 ret = amdgpu_dpm_mode1_reset(adev);
5302 } else {
5303 dev_info(adev->dev, "GPU psp mode1 reset\n");
5304 ret = psp_gpu_reset(adev);
5305 }
5306
5307 if (ret)
5308 goto mode1_reset_failed;
5309
5310 amdgpu_device_load_pci_state(adev->pdev);
5311 ret = amdgpu_psp_wait_for_bootloader(adev);
5312 if (ret)
5313 goto mode1_reset_failed;
5314
5315 /* wait for asic to come out of reset */
5316 for (i = 0; i < adev->usec_timeout; i++) {
5317 u32 memsize = adev->nbio.funcs->get_memsize(adev);
5318
5319 if (memsize != 0xffffffff)
5320 break;
5321 udelay(1);
5322 }
5323
5324 if (i >= adev->usec_timeout) {
5325 ret = -ETIMEDOUT;
5326 goto mode1_reset_failed;
5327 }
5328
5329 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5330
5331 return 0;
5332
5333 mode1_reset_failed:
5334 dev_err(adev->dev, "GPU mode1 reset failed\n");
5335 return ret;
5336 }
5337
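/**
 * amdgpu_device_pre_asic_reset - prepare a device for ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Stop fence processing on all rings, force-complete the old HW fences
 * and give a registered reset handler the chance to prepare the HW
 * context. On bare metal, try a soft reset first and fall back to
 * flagging a full reset (suspending the IP blocks) when needed.
 * Returns 0 for success or an error on failure.
 */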
5338 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5339 struct amdgpu_reset_context *reset_context)
5340 {
5341 int i, r = 0;
5342 struct amdgpu_job *job = NULL;
5343 struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5344 bool need_full_reset =
5345 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5346
5347 if (reset_context->reset_req_dev == adev)
5348 job = reset_context->job;
5349
5350 if (amdgpu_sriov_vf(adev))
5351 amdgpu_virt_pre_reset(adev);
5352
5353 amdgpu_fence_driver_isr_toggle(adev, true);
5354
5355 /* block all schedulers and reset given job's ring */
5356 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5357 struct amdgpu_ring *ring = adev->rings[i];
5358
5359 if (!amdgpu_ring_sched_ready(ring))
5360 continue;
5361
5362 /* Clear the job fences from the fence driver so that force_completion
5363 * does not leave NULL and VM flush fences in the fence driver.
5364 */
5365 amdgpu_fence_driver_clear_job_fences(ring);
5366
5367 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5368 amdgpu_fence_driver_force_completion(ring);
5369 }
5370
5371 amdgpu_fence_driver_isr_toggle(adev, false);
5372
5373 if (job && job->vm)
5374 drm_sched_increase_karma(&job->base);
5375
5376 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5377 /* If reset handler not implemented, continue; otherwise return */
5378 if (r == -EOPNOTSUPP)
5379 r = 0;
5380 else
5381 return r;
5382
5383 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5384 if (!amdgpu_sriov_vf(adev)) {
5385
5386 if (!need_full_reset)
5387 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5388
5389 if (!need_full_reset && amdgpu_gpu_recovery &&
5390 amdgpu_device_ip_check_soft_reset(adev)) {
5391 amdgpu_device_ip_pre_soft_reset(adev);
5392 r = amdgpu_device_ip_soft_reset(adev);
5393 amdgpu_device_ip_post_soft_reset(adev);
5394 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5395 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5396 need_full_reset = true;
5397 }
5398 }
5399
5400 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5401 dev_info(tmp_adev->dev, "Dumping IP State\n");
5402 /* Trigger ip dump before we reset the asic */
5403 for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5404 if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5405 tmp_adev->ip_blocks[i].version->funcs
5406 ->dump_ip_state((void *)tmp_adev);
5407 dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5408 }
5409
5410 if (need_full_reset)
5411 r = amdgpu_device_ip_suspend(adev);
5412 if (need_full_reset)
5413 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5414 else
5415 clear_bit(AMDGPU_NEED_FULL_RESET,
5416 &reset_context->flags);
5417 }
5418
5419 return r;
5420 }
5421
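/**
 * amdgpu_do_asic_reset - reset and re-initialize a list of devices
 *
 * @device_list_handle: list of devices to reset
 * @reset_context: amdgpu reset context pointer
 *
 * Try the registered reset handler first and fall back to the default
 * method: perform the ASIC reset (in parallel for XGMI hives), re-post
 * the cards and resume the IP blocks, firmware and RAS state.
 * Returns 0 for success or an error on failure.
 */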
5422 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5423 struct amdgpu_reset_context *reset_context)
5424 {
5425 struct amdgpu_device *tmp_adev = NULL;
5426 bool need_full_reset, skip_hw_reset, vram_lost = false;
5427 int r = 0;
5428
5429 /* Try reset handler method first */
5430 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5431 reset_list);
5432
5433 reset_context->reset_device_list = device_list_handle;
5434 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5435 /* If reset handler not implemented, continue; otherwise return */
5436 if (r == -EOPNOTSUPP)
5437 r = 0;
5438 else
5439 return r;
5440
5441 /* Reset handler not implemented, use the default method */
5442 need_full_reset =
5443 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5444 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5445
5446 /*
5447 * ASIC reset has to be done on all XGMI hive nodes ASAP
5448 * to allow proper link negotiation in FW (within 1 sec)
5449 */
5450 if (!skip_hw_reset && need_full_reset) {
5451 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5452 /* For XGMI run all resets in parallel to speed up the process */
5453 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5454 tmp_adev->gmc.xgmi.pending_reset = false;
5455 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5456 r = -EALREADY;
5457 } else
5458 r = amdgpu_asic_reset(tmp_adev);
5459
5460 if (r) {
5461 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5462 r, adev_to_drm(tmp_adev)->unique);
5463 goto out;
5464 }
5465 }
5466
5467 /* For XGMI wait for all resets to complete before proceed */
5468 if (!r) {
5469 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5470 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5471 flush_work(&tmp_adev->xgmi_reset_work);
5472 r = tmp_adev->asic_reset_res;
5473 if (r)
5474 break;
5475 }
5476 }
5477 }
5478 }
5479
5480 if (!r && amdgpu_ras_intr_triggered()) {
5481 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5482 amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
5483 }
5484
5485 amdgpu_ras_intr_cleared();
5486 }
5487
5488 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5489 if (need_full_reset) {
5490 /* post card */
5491 amdgpu_ras_set_fed(tmp_adev, false);
5492 r = amdgpu_device_asic_init(tmp_adev);
5493 if (r) {
5494 dev_warn(tmp_adev->dev, "asic atom init failed!");
5495 } else {
5496 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5497
5498 r = amdgpu_device_ip_resume_phase1(tmp_adev);
5499 if (r)
5500 goto out;
5501
5502 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5503
5504 if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5505 amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5506
5507 if (vram_lost) {
5508 DRM_INFO("VRAM is lost due to GPU reset!\n");
5509 amdgpu_inc_vram_lost(tmp_adev);
5510 }
5511
5512 r = amdgpu_device_fw_loading(tmp_adev);
5513 if (r)
5514 return r;
5515
5516 r = amdgpu_xcp_restore_partition_mode(
5517 tmp_adev->xcp_mgr);
5518 if (r)
5519 goto out;
5520
5521 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5522 if (r)
5523 goto out;
5524
5525 if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5526 amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5527
5528 r = amdgpu_device_ip_resume_phase3(tmp_adev);
5529 if (r)
5530 goto out;
5531
5532 if (vram_lost)
5533 amdgpu_device_fill_reset_magic(tmp_adev);
5534
5535 /*
5536 * Add this ASIC back as tracked since the reset
5537 * already completed successfully.
5538 */
5539 amdgpu_register_gpu_instance(tmp_adev);
5540
5541 if (!reset_context->hive &&
5542 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5543 amdgpu_xgmi_add_device(tmp_adev);
5544
5545 r = amdgpu_device_ip_late_init(tmp_adev);
5546 if (r)
5547 goto out;
5548
5549 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5550
5551 /*
5552 * The GPU enters a bad state once the number of faulty
5553 * pages retired due to ECC errors reaches the threshold,
5554 * and RAS recovery is scheduled next. So add a check here
5555 * to break out of recovery if the bad page threshold has
5556 * indeed been exceeded, and remind the user to either
5557 * retire this GPU or set a bigger bad_page_threshold
5558 * value to work around this the next time the driver
5559 * is probed.
5560 */
5561 if (!amdgpu_ras_is_rma(tmp_adev)) {
5562 /* must succeed. */
5563 amdgpu_ras_resume(tmp_adev);
5564 } else {
5565 r = -EINVAL;
5566 goto out;
5567 }
5568
5569 /* Update PSP FW topology after reset */
5570 if (reset_context->hive &&
5571 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5572 r = amdgpu_xgmi_update_topology(
5573 reset_context->hive, tmp_adev);
5574 }
5575 }
5576
5577 out:
5578 if (!r) {
5579 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5580 r = amdgpu_ib_ring_tests(tmp_adev);
5581 if (r) {
5582 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5583 need_full_reset = true;
5584 r = -EAGAIN;
5585 goto end;
5586 }
5587 }
5588
5589 if (r)
5590 tmp_adev->asic_reset_res = r;
5591 }
5592
5593 end:
5594 if (need_full_reset)
5595 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5596 else
5597 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5598 return r;
5599 }
5600
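/* Set adev->mp1_state according to the reset method that will be used. */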
5601 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5602 {
5603
5604 switch (amdgpu_asic_reset_method(adev)) {
5605 case AMD_RESET_METHOD_MODE1:
5606 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5607 break;
5608 case AMD_RESET_METHOD_MODE2:
5609 adev->mp1_state = PP_MP1_STATE_RESET;
5610 break;
5611 default:
5612 adev->mp1_state = PP_MP1_STATE_NONE;
5613 break;
5614 }
5615 }
5616
5617 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5618 {
5619 amdgpu_vf_error_trans_all(adev);
5620 adev->mp1_state = PP_MP1_STATE_NONE;
5621 }
5622
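/* Re-enable and resume runtime PM on the audio controller (function 1 of
 * the GPU's bus) that was suspended by amdgpu_device_suspend_display_audio().
 */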
5623 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5624 {
5625 struct pci_dev *p = NULL;
5626
5627 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5628 adev->pdev->bus->number, 1);
5629 if (p) {
5630 pm_runtime_enable(&(p->dev));
5631 pm_runtime_resume(&(p->dev));
5632 }
5633
5634 pci_dev_put(p);
5635 }
5636
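/* Runtime-suspend the audio controller at function 1 of the GPU's bus and
 * keep it suspended (runtime PM disabled) for the duration of the reset,
 * so the reset does not change the audio hardware behind the driver's back.
 */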
5637 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5638 {
5639 enum amd_reset_method reset_method;
5640 struct pci_dev *p = NULL;
5641 u64 expires;
5642
5643 /*
5644 * For now, only BACO and mode1 reset are confirmed to suffer
5645 * from the audio issue if the audio device is not properly suspended.
5646 */
5647 reset_method = amdgpu_asic_reset_method(adev);
5648 if ((reset_method != AMD_RESET_METHOD_BACO) &&
5649 (reset_method != AMD_RESET_METHOD_MODE1))
5650 return -EINVAL;
5651
5652 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5653 adev->pdev->bus->number, 1);
5654 if (!p)
5655 return -ENODEV;
5656
5657 expires = pm_runtime_autosuspend_expiration(&(p->dev));
5658 if (!expires)
5659 /*
5660 * If we cannot get the audio device autosuspend expiration,
5661 * a fixed 4s interval will be used. Since 3s is the audio
5662 * controller's default autosuspend delay setting, the 4s
5663 * used here is guaranteed to cover it.
5664 */
5665 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5666
5667 while (!pm_runtime_status_suspended(&(p->dev))) {
5668 if (!pm_runtime_suspend(&(p->dev)))
5669 break;
5670
5671 if (expires < ktime_get_mono_fast_ns()) {
5672 dev_warn(adev->dev, "failed to suspend display audio\n");
5673 pci_dev_put(p);
5674 /* TODO: abort the succeeding gpu reset? */
5675 return -ETIMEDOUT;
5676 }
5677 }
5678
5679 pm_runtime_disable(&(p->dev));
5680
5681 pci_dev_put(p);
5682 return 0;
5683 }
5684
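/* Cancel reset works that were queued before this reset completed. */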
5685 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5686 {
5687 struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5688
5689 #if defined(CONFIG_DEBUG_FS)
5690 if (!amdgpu_sriov_vf(adev))
5691 cancel_work(&adev->reset_work);
5692 #endif
5693
5694 if (adev->kfd.dev)
5695 cancel_work(&adev->kfd.reset_work);
5696
5697 if (amdgpu_sriov_vf(adev))
5698 cancel_work(&adev->virt.flr_work);
5699
5700 if (con && adev->ras_enabled)
5701 cancel_work(&con->recovery_work);
5702
5703 }
5704
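/* Make sure every device in the list is still reachable on the bus by
 * reading back its PCI_COMMAND register.
 */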
5705 static int amdgpu_device_health_check(struct list_head *device_list_handle)
5706 {
5707 struct amdgpu_device *tmp_adev;
5708 int ret = 0;
5709 u32 status;
5710
5711 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5712 pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5713 if (PCI_POSSIBLE_ERROR(status)) {
5714 dev_err(tmp_adev->dev, "device lost from bus!");
5715 ret = -ENODEV;
5716 }
5717 }
5718
5719 return ret;
5720 }
5721
5722 /**
5723 * amdgpu_device_gpu_recover - reset the ASIC and recover the scheduler
5724 *
5725 * @adev: amdgpu_device pointer
5726 * @job: the job which triggered the hang
5727 * @reset_context: amdgpu reset context pointer
5728 *
5729 * Attempt to reset the GPU if it has hung (all ASICs),
5730 * either via soft reset or full reset, and reinitialize the ASIC.
5731 * Returns 0 for success or an error on failure.
5732 */
5733
5734 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5735 struct amdgpu_job *job,
5736 struct amdgpu_reset_context *reset_context)
5737 {
5738 struct list_head device_list, *device_list_handle = NULL;
5739 bool job_signaled = false;
5740 struct amdgpu_hive_info *hive = NULL;
5741 struct amdgpu_device *tmp_adev = NULL;
5742 int i, r = 0;
5743 bool need_emergency_restart = false;
5744 bool audio_suspended = false;
5745 int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5746
5747 /*
5748 * Special case: RAS triggered and full reset isn't supported
5749 */
5750 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5751
5752 /*
5753 * Flush RAM to disk so that after reboot
5754 * the user can read the log and see why the system rebooted.
5755 */
5756 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5757 amdgpu_ras_get_context(adev)->reboot) {
5758 DRM_WARN("Emergency reboot.");
5759
5760 ksys_sync_helper();
5761 emergency_restart();
5762 }
5763
5764 dev_info(adev->dev, "GPU %s begin!\n",
5765 need_emergency_restart ? "jobs stop":"reset");
5766
5767 if (!amdgpu_sriov_vf(adev))
5768 hive = amdgpu_get_xgmi_hive(adev);
5769 if (hive)
5770 mutex_lock(&hive->hive_lock);
5771
5772 reset_context->job = job;
5773 reset_context->hive = hive;
5774 /*
5775 * Build list of devices to reset.
5776 * In case we are in XGMI hive mode, resort the device list
5777 * to put adev in the 1st position.
5778 */
5779 INIT_LIST_HEAD(&device_list);
5780 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5781 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5782 list_add_tail(&tmp_adev->reset_list, &device_list);
5783 if (adev->shutdown)
5784 tmp_adev->shutdown = true;
5785 }
5786 if (!list_is_first(&adev->reset_list, &device_list))
5787 list_rotate_to_front(&adev->reset_list, &device_list);
5788 device_list_handle = &device_list;
5789 } else {
5790 list_add_tail(&adev->reset_list, &device_list);
5791 device_list_handle = &device_list;
5792 }
5793
5794 if (!amdgpu_sriov_vf(adev)) {
5795 r = amdgpu_device_health_check(device_list_handle);
5796 if (r)
5797 goto end_reset;
5798 }
5799
5800 /* We need to lock reset domain only once both for XGMI and single device */
5801 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5802 reset_list);
5803 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5804
5805 /* block all schedulers and reset given job's ring */
5806 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5807
5808 amdgpu_device_set_mp1_state(tmp_adev);
5809
5810 /*
5811 * Try to put the audio codec into the suspend state
5812 * before the GPU reset starts.
5813 *
5814 * The power domain of the graphics device is shared
5815 * with the AZ power domain. Without this, we may
5816 * change the audio hardware from behind the audio
5817 * driver's back, which will trigger audio codec
5818 * errors.
5819 */
5820 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5821 audio_suspended = true;
5822
5823 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5824
5825 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5826
5827 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5828
5829 /*
5830 * Mark these ASICs to be reset as untracked first,
5831 * and add them back after the reset completes.
5832 */
5833 amdgpu_unregister_gpu_instance(tmp_adev);
5834
5835 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5836
5837 /* disable ras on ALL IPs */
5838 if (!need_emergency_restart &&
5839 amdgpu_device_ip_need_full_reset(tmp_adev))
5840 amdgpu_ras_suspend(tmp_adev);
5841
5842 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5843 struct amdgpu_ring *ring = tmp_adev->rings[i];
5844
5845 if (!amdgpu_ring_sched_ready(ring))
5846 continue;
5847
5848 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5849
5850 if (need_emergency_restart)
5851 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5852 }
5853 atomic_inc(&tmp_adev->gpu_reset_counter);
5854 }
5855
5856 if (need_emergency_restart)
5857 goto skip_sched_resume;
5858
5859 /*
5860 * Must check guilty signal here since after this point all old
5861 * HW fences are force signaled.
5862 *
5863 * job->base holds a reference to parent fence
5864 */
5865 if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
5866 job_signaled = true;
5867 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5868 goto skip_hw_reset;
5869 }
5870
5871 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5872 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5873 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5874 /* TODO: Should we stop? */
5875 if (r) {
5876 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5877 r, adev_to_drm(tmp_adev)->unique);
5878 tmp_adev->asic_reset_res = r;
5879 }
5880 }
5881
5882 /* Actual ASIC resets if needed.*/
5883 /* Host driver will handle XGMI hive reset for SRIOV */
5884 if (amdgpu_sriov_vf(adev)) {
5885 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5886 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5887 amdgpu_ras_set_fed(adev, true);
5888 set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5889 }
5890
5891 r = amdgpu_device_reset_sriov(adev, reset_context);
5892 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5893 amdgpu_virt_release_full_gpu(adev, true);
5894 goto retry;
5895 }
5896 if (r)
5897 adev->asic_reset_res = r;
5898 } else {
5899 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5900 if (r && r == -EAGAIN)
5901 goto retry;
5902 }
5903
5904 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5905 /*
5906 * Drop any pending non-scheduler resets queued before the reset is done.
5907 * Any reset scheduled after this point would be valid. Scheduler resets
5908 * were already dropped during drm_sched_stop and no new ones can come
5909 * in before drm_sched_start.
5910 */
5911 amdgpu_device_stop_pending_resets(tmp_adev);
5912 }
5913
5914 skip_hw_reset:
5915
5916 /* Post ASIC reset for all devs. */
5917 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5918
5919 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5920 struct amdgpu_ring *ring = tmp_adev->rings[i];
5921
5922 if (!amdgpu_ring_sched_ready(ring))
5923 continue;
5924
5925 drm_sched_start(&ring->sched);
5926 }
5927
5928 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5929 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5930
5931 if (tmp_adev->asic_reset_res)
5932 r = tmp_adev->asic_reset_res;
5933
5934 tmp_adev->asic_reset_res = 0;
5935
5936 if (r) {
5937 /* Bad news: how do we tell this to userspace?
5938 * For a RAS error, we should report a GPU-bad status instead
5939 * of a reset failure.
5940 */
5941 if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
5942 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
5943 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
5944 atomic_read(&tmp_adev->gpu_reset_counter));
5945 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5946 } else {
5947 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5948 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5949 DRM_WARN("smart shift update failed\n");
5950 }
5951 }
5952
5953 skip_sched_resume:
5954 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5955 /* unlock kfd: SRIOV would do it separately */
5956 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5957 amdgpu_amdkfd_post_reset(tmp_adev);
5958
5959 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5960 * so bring up kfd here if it was not initialized before.
5961 */
5962 if (!adev->kfd.init_complete)
5963 amdgpu_amdkfd_device_init(adev);
5964
5965 if (audio_suspended)
5966 amdgpu_device_resume_display_audio(tmp_adev);
5967
5968 amdgpu_device_unset_mp1_state(tmp_adev);
5969
5970 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5971 }
5972
5973 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5974 reset_list);
5975 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5976
5977 end_reset:
5978 if (hive) {
5979 mutex_unlock(&hive->hive_lock);
5980 amdgpu_put_xgmi_hive(hive);
5981 }
5982
5983 if (r)
5984 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5985
5986 atomic_set(&adev->reset_domain->reset_res, r);
5987 return r;
5988 }
5989
5990 /**
5991 * amdgpu_device_partner_bandwidth - find the bandwidth of the appropriate partner
5992 *
5993 * @adev: amdgpu_device pointer
5994 * @speed: pointer to the speed of the link
5995 * @width: pointer to the width of the link
5996 *
5997 * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
5998 * first physical partner to an AMD dGPU.
5999 * This will exclude any virtual switches and links.
6000 */
6001 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6002 enum pci_bus_speed *speed,
6003 enum pcie_link_width *width)
6004 {
6005 struct pci_dev *parent = adev->pdev;
6006
6007 if (!speed || !width)
6008 return;
6009
6010 *speed = PCI_SPEED_UNKNOWN;
6011 *width = PCIE_LNK_WIDTH_UNKNOWN;
6012
6013 if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6014 while ((parent = pci_upstream_bridge(parent))) {
6015 /* skip upstream/downstream switches internal to dGPU */
6016 if (parent->vendor == PCI_VENDOR_ID_ATI)
6017 continue;
6018 *speed = pcie_get_speed_cap(parent);
6019 *width = pcie_get_width_cap(parent);
6020 break;
6021 }
6022 } else {
6023 /* use the current speeds rather than max if switching is not supported */
6024 pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6025 }
6026 }
6027
6028 /**
6029 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
6030 *
6031 * @adev: amdgpu_device pointer
6032 *
6033 * Fetches and stores in the driver the PCIE capabilities (gen speed
6034 * and lanes) of the slot the device is in. Handles APUs and
6035 * virtualized environments where PCIE config space may not be available.
6036 */
6037 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6038 {
6039 struct pci_dev *pdev;
6040 enum pci_bus_speed speed_cap, platform_speed_cap;
6041 enum pcie_link_width platform_link_width;
6042
6043 if (amdgpu_pcie_gen_cap)
6044 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6045
6046 if (amdgpu_pcie_lane_cap)
6047 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6048
6049 /* covers APUs as well */
6050 if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6051 if (adev->pm.pcie_gen_mask == 0)
6052 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6053 if (adev->pm.pcie_mlw_mask == 0)
6054 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6055 return;
6056 }
6057
6058 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6059 return;
6060
6061 amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6062 &platform_link_width);
6063
6064 if (adev->pm.pcie_gen_mask == 0) {
6065 /* asic caps */
6066 pdev = adev->pdev;
6067 speed_cap = pcie_get_speed_cap(pdev);
6068 if (speed_cap == PCI_SPEED_UNKNOWN) {
6069 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6070 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6071 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6072 } else {
6073 if (speed_cap == PCIE_SPEED_32_0GT)
6074 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6075 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6076 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6077 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6078 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6079 else if (speed_cap == PCIE_SPEED_16_0GT)
6080 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6081 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6082 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6083 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6084 else if (speed_cap == PCIE_SPEED_8_0GT)
6085 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6086 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6087 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6088 else if (speed_cap == PCIE_SPEED_5_0GT)
6089 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6090 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6091 else
6092 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6093 }
6094 /* platform caps */
6095 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6096 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6097 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6098 } else {
6099 if (platform_speed_cap == PCIE_SPEED_32_0GT)
6100 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6101 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6102 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6103 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6104 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6105 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6106 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6107 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6108 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6109 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6110 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6111 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6112 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6113 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6114 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6115 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6116 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6117 else
6118 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6119
6120 }
6121 }
6122 if (adev->pm.pcie_mlw_mask == 0) {
6123 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6124 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6125 } else {
6126 switch (platform_link_width) {
6127 case PCIE_LNK_X32:
6128 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6129 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6130 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6131 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6132 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6133 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6134 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6135 break;
6136 case PCIE_LNK_X16:
6137 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6138 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6139 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6140 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6141 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6142 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6143 break;
6144 case PCIE_LNK_X12:
6145 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6146 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6147 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6148 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6149 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6150 break;
6151 case PCIE_LNK_X8:
6152 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6153 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6154 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6155 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6156 break;
6157 case PCIE_LNK_X4:
6158 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6159 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6160 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6161 break;
6162 case PCIE_LNK_X2:
6163 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6164 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6165 break;
6166 case PCIE_LNK_X1:
6167 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6168 break;
6169 default:
6170 break;
6171 }
6172 }
6173 }
6174 }
6175
6176 /**
6177 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6178 *
6179 * @adev: amdgpu_device pointer
6180 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6181 *
6182 * Return true if @peer_adev can access (DMA) @adev through the PCIe
6183 * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6184 * @peer_adev.
6185 */
6186 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6187 struct amdgpu_device *peer_adev)
6188 {
6189 #ifdef CONFIG_HSA_AMD_P2P
6190 bool p2p_access =
6191 !adev->gmc.xgmi.connected_to_cpu &&
6192 !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6193
6194 bool is_large_bar = adev->gmc.visible_vram_size &&
6195 adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6196 bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6197
6198 if (!p2p_addressable) {
6199 uint64_t address_mask = peer_adev->dev->dma_mask ?
6200 ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6201 resource_size_t aper_limit =
6202 adev->gmc.aper_base + adev->gmc.aper_size - 1;
6203
6204 p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6205 aper_limit & address_mask);
6206 }
6207 return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6208 #else
6209 return false;
6210 #endif
6211 }
6212
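/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Disable the doorbell interrupt when RAS is enabled and ask the DPM
 * code to put the ASIC into the BACO state.
 * Returns 0 for success or an error on failure.
 */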
6213 int amdgpu_device_baco_enter(struct drm_device *dev)
6214 {
6215 struct amdgpu_device *adev = drm_to_adev(dev);
6216 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6217
6218 if (!amdgpu_device_supports_baco(dev))
6219 return -ENOTSUPP;
6220
6221 if (ras && adev->ras_enabled &&
6222 adev->nbio.funcs->enable_doorbell_interrupt)
6223 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6224
6225 return amdgpu_dpm_baco_enter(adev);
6226 }
6227
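/**
 * amdgpu_device_baco_exit - exit BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Ask the DPM code to bring the ASIC out of BACO and re-enable the
 * doorbell interrupt when RAS is enabled.
 * Returns 0 for success or an error on failure.
 */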
6228 int amdgpu_device_baco_exit(struct drm_device *dev)
6229 {
6230 struct amdgpu_device *adev = drm_to_adev(dev);
6231 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6232 int ret = 0;
6233
6234 if (!amdgpu_device_supports_baco(dev))
6235 return -ENOTSUPP;
6236
6237 ret = amdgpu_dpm_baco_exit(adev);
6238 if (ret)
6239 return ret;
6240
6241 if (ras && adev->ras_enabled &&
6242 adev->nbio.funcs->enable_doorbell_interrupt)
6243 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6244
6245 if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6246 adev->nbio.funcs->clear_doorbell_interrupt)
6247 adev->nbio.funcs->clear_doorbell_interrupt(adev);
6248
6249 return 0;
6250 }
6251
6252 /**
6253 * amdgpu_pci_error_detected - Called when a PCI error is detected.
6254 * @pdev: PCI device struct
6255 * @state: PCI channel state
6256 *
6257 * Description: Called when a PCI error is detected.
6258 *
6259 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6260 */
6261 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6262 {
6263 struct drm_device *dev = pci_get_drvdata(pdev);
6264 struct amdgpu_device *adev = drm_to_adev(dev);
6265 int i;
6266
6267 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6268
6269 if (adev->gmc.xgmi.num_physical_nodes > 1) {
6270 DRM_WARN("No support for XGMI hive yet...");
6271 return PCI_ERS_RESULT_DISCONNECT;
6272 }
6273
6274 adev->pci_channel_state = state;
6275
6276 switch (state) {
6277 case pci_channel_io_normal:
6278 return PCI_ERS_RESULT_CAN_RECOVER;
6279 /* Fatal error, prepare for slot reset */
6280 case pci_channel_io_frozen:
6281 /*
6282 * Locking adev->reset_domain->sem will prevent any external access
6283 * to GPU during PCI error recovery
6284 */
6285 amdgpu_device_lock_reset_domain(adev->reset_domain);
6286 amdgpu_device_set_mp1_state(adev);
6287
6288 /*
6289 * Block any work scheduling as we do for regular GPU reset
6290 * for the duration of the recovery
6291 */
6292 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6293 struct amdgpu_ring *ring = adev->rings[i];
6294
6295 if (!amdgpu_ring_sched_ready(ring))
6296 continue;
6297
6298 drm_sched_stop(&ring->sched, NULL);
6299 }
6300 atomic_inc(&adev->gpu_reset_counter);
6301 return PCI_ERS_RESULT_NEED_RESET;
6302 case pci_channel_io_perm_failure:
6303 /* Permanent error, prepare for device removal */
6304 return PCI_ERS_RESULT_DISCONNECT;
6305 }
6306
6307 return PCI_ERS_RESULT_NEED_RESET;
6308 }
6309
6310 /**
6311 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6312 * @pdev: pointer to PCI device
6313 */
6314 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6315 {
6316
6317 DRM_INFO("PCI error: mmio enabled callback!!\n");
6318
6319 /* TODO - dump whatever for debugging purposes */
6320
6321 /* This is called only if amdgpu_pci_error_detected returns
6322 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6323 * works, no need to reset slot.
6324 */
6325
6326 return PCI_ERS_RESULT_RECOVERED;
6327 }
6328
6329 /**
6330 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6331 * @pdev: PCI device struct
6332 *
6333 * Description: This routine is called by the pci error recovery
6334 * code after the PCI slot has been reset, just before we
6335 * should resume normal operations.
6336 */
6337 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6338 {
6339 struct drm_device *dev = pci_get_drvdata(pdev);
6340 struct amdgpu_device *adev = drm_to_adev(dev);
6341 int r, i;
6342 struct amdgpu_reset_context reset_context;
6343 u32 memsize;
6344 struct list_head device_list;
6345
6346 /* PCI error slot reset should be skipped during RAS recovery */
6347 if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6348 amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6349 amdgpu_ras_in_recovery(adev))
6350 return PCI_ERS_RESULT_RECOVERED;
6351
6352 DRM_INFO("PCI error: slot reset callback!!\n");
6353
6354 memset(&reset_context, 0, sizeof(reset_context));
6355
6356 INIT_LIST_HEAD(&device_list);
6357 list_add_tail(&adev->reset_list, &device_list);
6358
6359 /* wait for asic to come out of reset */
6360 msleep(500);
6361
6362 /* Restore PCI confspace */
6363 amdgpu_device_load_pci_state(pdev);
6364
6365 /* confirm ASIC came out of reset */
6366 for (i = 0; i < adev->usec_timeout; i++) {
6367 memsize = amdgpu_asic_get_config_memsize(adev);
6368
6369 if (memsize != 0xffffffff)
6370 break;
6371 udelay(1);
6372 }
6373 if (memsize == 0xffffffff) {
6374 r = -ETIME;
6375 goto out;
6376 }
6377
6378 reset_context.method = AMD_RESET_METHOD_NONE;
6379 reset_context.reset_req_dev = adev;
6380 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6381 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6382
6383 adev->no_hw_access = true;
6384 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6385 adev->no_hw_access = false;
6386 if (r)
6387 goto out;
6388
6389 r = amdgpu_do_asic_reset(&device_list, &reset_context);
6390
6391 out:
6392 if (!r) {
6393 if (amdgpu_device_cache_pci_state(adev->pdev))
6394 pci_restore_state(adev->pdev);
6395
6396 DRM_INFO("PCIe error recovery succeeded\n");
6397 } else {
6398 DRM_ERROR("PCIe error recovery failed, err:%d", r);
6399 amdgpu_device_unset_mp1_state(adev);
6400 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6401 }
6402
6403 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6404 }
6405
6406 /**
6407 * amdgpu_pci_resume() - resume normal ops after PCI reset
6408 * @pdev: pointer to PCI device
6409 *
6410 * Called when the error recovery driver tells us that it's
6411 * OK to resume normal operation.
6412 */
6413 void amdgpu_pci_resume(struct pci_dev *pdev)
6414 {
6415 struct drm_device *dev = pci_get_drvdata(pdev);
6416 struct amdgpu_device *adev = drm_to_adev(dev);
6417 int i;
6418
6419
6420 DRM_INFO("PCI error: resume callback!!\n");
6421
6422 /* Only continue execution for the case of pci_channel_io_frozen */
6423 if (adev->pci_channel_state != pci_channel_io_frozen)
6424 return;
6425
6426 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6427 struct amdgpu_ring *ring = adev->rings[i];
6428
6429 if (!amdgpu_ring_sched_ready(ring))
6430 continue;
6431
6432 drm_sched_start(&ring->sched);
6433 }
6434
6435 amdgpu_device_unset_mp1_state(adev);
6436 amdgpu_device_unlock_reset_domain(adev->reset_domain);
6437 }
6438
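/* Save the PCI config space and keep a copy in adev->pci_state so it can
 * be restored after a reset. Not used under SR-IOV.
 */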
6439 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6440 {
6441 struct drm_device *dev = pci_get_drvdata(pdev);
6442 struct amdgpu_device *adev = drm_to_adev(dev);
6443 int r;
6444
6445 if (amdgpu_sriov_vf(adev))
6446 return false;
6447
6448 r = pci_save_state(pdev);
6449 if (!r) {
6450 kfree(adev->pci_state);
6451
6452 adev->pci_state = pci_store_saved_state(pdev);
6453
6454 if (!adev->pci_state) {
6455 DRM_ERROR("Failed to store PCI saved state");
6456 return false;
6457 }
6458 } else {
6459 DRM_WARN("Failed to save PCI state, err:%d\n", r);
6460 return false;
6461 }
6462
6463 return true;
6464 }
6465
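/* Restore the PCI config space from the copy saved by
 * amdgpu_device_cache_pci_state().
 */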
6466 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6467 {
6468 struct drm_device *dev = pci_get_drvdata(pdev);
6469 struct amdgpu_device *adev = drm_to_adev(dev);
6470 int r;
6471
6472 if (!adev->pci_state)
6473 return false;
6474
6475 r = pci_load_saved_state(pdev, adev->pci_state);
6476
6477 if (!r) {
6478 pci_restore_state(pdev);
6479 } else {
6480 DRM_WARN("Failed to load PCI state, err:%d\n", r);
6481 return false;
6482 }
6483
6484 return true;
6485 }
6486
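/* Flush the HDP (Host Data Path) cache, via a ring packet when the ring
 * supports it, otherwise through the ASIC callback. Skipped for APUs
 * (unless in passthrough) and for GPUs connected to the CPU.
 */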
6487 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6488 struct amdgpu_ring *ring)
6489 {
6490 #ifdef CONFIG_X86_64
6491 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6492 return;
6493 #endif
6494 if (adev->gmc.xgmi.connected_to_cpu)
6495 return;
6496
6497 if (ring && ring->funcs->emit_hdp_flush)
6498 amdgpu_ring_emit_hdp_flush(ring);
6499 else
6500 amdgpu_asic_flush_hdp(adev, ring);
6501 }
6502
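/* Invalidate the HDP (Host Data Path) cache. Like the flush variant, this
 * is skipped for APUs (unless in passthrough) and for GPUs connected to
 * the CPU.
 */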
6503 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6504 struct amdgpu_ring *ring)
6505 {
6506 #ifdef CONFIG_X86_64
6507 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6508 return;
6509 #endif
6510 if (adev->gmc.xgmi.connected_to_cpu)
6511 return;
6512
6513 amdgpu_asic_invalidate_hdp(adev, ring);
6514 }
6515
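/* Return non-zero while a GPU reset is in progress in this device's reset domain. */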
6516 int amdgpu_in_reset(struct amdgpu_device *adev)
6517 {
6518 return atomic_read(&adev->reset_domain->in_gpu_reset);
6519 }
6520
6521 /**
6522 * amdgpu_device_halt() - bring hardware to some kind of halt state
6523 *
6524 * @adev: amdgpu_device pointer
6525 *
6526 * Bring the hardware to some kind of halt state so that no one can touch it
6527 * any more. This helps to maintain the error context when an error occurs.
6528 * Compared to a simple hang, the system will stay stable at least for SSH
6529 * access. Then it should be trivial to inspect the hardware state and
6530 * see what's going on. Implemented as follows:
6531 *
6532 * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc.),
6533 * clears all CPU mappings to the device, disallows remappings through page faults
6534 * 2. amdgpu_irq_disable_all() disables all interrupts
6535 * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6536 * 4. set adev->no_hw_access to avoid potential crashes after step 5
6537 * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6538 * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6539 * flush any in-flight DMA operations
6540 */
6541 void amdgpu_device_halt(struct amdgpu_device *adev)
6542 {
6543 struct pci_dev *pdev = adev->pdev;
6544 struct drm_device *ddev = adev_to_drm(adev);
6545
6546 amdgpu_xcp_dev_unplug(adev);
6547 drm_dev_unplug(ddev);
6548
6549 amdgpu_irq_disable_all(adev);
6550
6551 amdgpu_fence_driver_hw_fini(adev);
6552
6553 adev->no_hw_access = true;
6554
6555 amdgpu_device_unmap_mmio(adev);
6556
6557 pci_disable_device(pdev);
6558 pci_wait_for_pending_transaction(pdev);
6559 }
6560
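/* Read a PCIe port register through the NBIO index/data register pair. */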
6561 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6562 u32 reg)
6563 {
6564 unsigned long flags, address, data;
6565 u32 r;
6566
6567 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6568 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6569
6570 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6571 WREG32(address, reg * 4);
6572 (void)RREG32(address);
6573 r = RREG32(data);
6574 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6575 return r;
6576 }
6577
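/* Write a PCIe port register through the NBIO index/data register pair. */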
6578 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6579 u32 reg, u32 v)
6580 {
6581 unsigned long flags, address, data;
6582
6583 address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6584 data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6585
6586 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6587 WREG32(address, reg * 4);
6588 (void)RREG32(address);
6589 WREG32(data, v);
6590 (void)RREG32(data);
6591 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6592 }
6593
6594 /**
6595 * amdgpu_device_get_gang - return a reference to the current gang
6596 * @adev: amdgpu_device pointer
6597 *
6598 * Returns: A new reference to the current gang leader.
6599 */
6600 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6601 {
6602 struct dma_fence *fence;
6603
6604 rcu_read_lock();
6605 fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6606 rcu_read_unlock();
6607 return fence;
6608 }
6609
6610 /**
6611 * amdgpu_device_switch_gang - switch to a new gang
6612 * @adev: amdgpu_device pointer
6613 * @gang: the gang to switch to
6614 *
6615 * Try to switch to a new gang.
6616 * Returns: NULL if we switched to the new gang or a reference to the current
6617 * gang leader.
6618 */
6619 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6620 struct dma_fence *gang)
6621 {
6622 struct dma_fence *old = NULL;
6623
6624 dma_fence_get(gang);
6625 do {
6626 dma_fence_put(old);
6627 old = amdgpu_device_get_gang(adev);
6628 if (old == gang)
6629 break;
6630
6631 if (!dma_fence_is_signaled(old)) {
6632 dma_fence_put(gang);
6633 return old;
6634 }
6635
6636 } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6637 old, gang) != old);
6638
6639 /*
6640 * Drop it once for the exchanged reference in adev and once for the
6641 * thread local reference acquired in amdgpu_device_get_gang().
6642 */
6643 dma_fence_put(old);
6644 dma_fence_put(old);
6645 return NULL;
6646 }
6647
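/* Return true if the ASIC has usable (non-harvested) display hardware. */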
6648 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6649 {
6650 switch (adev->asic_type) {
6651 #ifdef CONFIG_DRM_AMDGPU_SI
6652 case CHIP_HAINAN:
6653 #endif
6654 case CHIP_TOPAZ:
6655 /* chips with no display hardware */
6656 return false;
6657 #ifdef CONFIG_DRM_AMDGPU_SI
6658 case CHIP_TAHITI:
6659 case CHIP_PITCAIRN:
6660 case CHIP_VERDE:
6661 case CHIP_OLAND:
6662 #endif
6663 #ifdef CONFIG_DRM_AMDGPU_CIK
6664 case CHIP_BONAIRE:
6665 case CHIP_HAWAII:
6666 case CHIP_KAVERI:
6667 case CHIP_KABINI:
6668 case CHIP_MULLINS:
6669 #endif
6670 case CHIP_TONGA:
6671 case CHIP_FIJI:
6672 case CHIP_POLARIS10:
6673 case CHIP_POLARIS11:
6674 case CHIP_POLARIS12:
6675 case CHIP_VEGAM:
6676 case CHIP_CARRIZO:
6677 case CHIP_STONEY:
6678 /* chips with display hardware */
6679 return true;
6680 default:
6681 /* IP discovery */
6682 if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6683 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6684 return false;
6685 return true;
6686 }
6687 }
6688
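/**
 * amdgpu_device_wait_on_rreg - poll a register until it reaches a value
 *
 * @adev: amdgpu_device pointer
 * @inst: instance number, used only in the warning message
 * @reg_addr: register offset to poll
 * @reg_name: register name, used only in the warning message
 * @expected_value: value the masked register is expected to reach
 * @mask: mask applied to the register value
 *
 * Poll the register until (value & mask) == expected_value, restarting
 * the timeout whenever the value changes.
 * Returns 0 on success or -ETIMEDOUT if the timeout expires.
 */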
6689 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6690 uint32_t inst, uint32_t reg_addr, char reg_name[],
6691 uint32_t expected_value, uint32_t mask)
6692 {
6693 uint32_t ret = 0;
6694 uint32_t old_ = 0;
6695 uint32_t tmp_ = RREG32(reg_addr);
6696 uint32_t loop = adev->usec_timeout;
6697
6698 while ((tmp_ & (mask)) != (expected_value)) {
6699 if (old_ != tmp_) {
6700 loop = adev->usec_timeout;
6701 old_ = tmp_;
6702 } else
6703 udelay(1);
6704 tmp_ = RREG32(reg_addr);
6705 loop--;
6706 if (!loop) {
6707 DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6708 inst, reg_name, (uint32_t)expected_value,
6709 (uint32_t)(tmp_ & (mask)));
6710 ret = -ETIMEDOUT;
6711 break;
6712 }
6713 }
6714 return ret;
6715 }
6716