1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/iommu.h>
34 #include <linux/pci.h>
35 #include <linux/pci-p2pdma.h>
36 #include <linux/apple-gmux.h>
37 
38 #include <drm/drm_aperture.h>
39 #include <drm/drm_atomic_helper.h>
40 #include <drm/drm_crtc_helper.h>
41 #include <drm/drm_fb_helper.h>
42 #include <drm/drm_probe_helper.h>
43 #include <drm/amdgpu_drm.h>
44 #include <linux/device.h>
45 #include <linux/vgaarb.h>
46 #include <linux/vga_switcheroo.h>
47 #include <linux/efi.h>
48 #include "amdgpu.h"
49 #include "amdgpu_trace.h"
50 #include "amdgpu_i2c.h"
51 #include "atom.h"
52 #include "amdgpu_atombios.h"
53 #include "amdgpu_atomfirmware.h"
54 #include "amd_pcie.h"
55 #ifdef CONFIG_DRM_AMDGPU_SI
56 #include "si.h"
57 #endif
58 #ifdef CONFIG_DRM_AMDGPU_CIK
59 #include "cik.h"
60 #endif
61 #include "vi.h"
62 #include "soc15.h"
63 #include "nv.h"
64 #include "bif/bif_4_1_d.h"
65 #include <linux/firmware.h>
66 #include "amdgpu_vf_error.h"
67 
68 #include "amdgpu_amdkfd.h"
69 #include "amdgpu_pm.h"
70 
71 #include "amdgpu_xgmi.h"
72 #include "amdgpu_ras.h"
73 #include "amdgpu_pmu.h"
74 #include "amdgpu_fru_eeprom.h"
75 #include "amdgpu_reset.h"
76 #include "amdgpu_virt.h"
77 #include "amdgpu_dev_coredump.h"
78 
79 #include <linux/suspend.h>
80 #include <drm/task_barrier.h>
81 #include <linux/pm_runtime.h>
82 
83 #include <drm/drm_drv.h>
84 
85 #if IS_ENABLED(CONFIG_X86)
86 #include <asm/intel-family.h>
87 #endif
88 
89 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
94 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
95 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
96 
97 #define AMDGPU_RESUME_MS		2000
98 #define AMDGPU_MAX_RETRY_LIMIT		2
99 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
100 #define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
101 #define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
102 #define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
103 
104 static const struct drm_driver amdgpu_kms_driver;
105 
106 const char *amdgpu_asic_name[] = {
107 	"TAHITI",
108 	"PITCAIRN",
109 	"VERDE",
110 	"OLAND",
111 	"HAINAN",
112 	"BONAIRE",
113 	"KAVERI",
114 	"KABINI",
115 	"HAWAII",
116 	"MULLINS",
117 	"TOPAZ",
118 	"TONGA",
119 	"FIJI",
120 	"CARRIZO",
121 	"STONEY",
122 	"POLARIS10",
123 	"POLARIS11",
124 	"POLARIS12",
125 	"VEGAM",
126 	"VEGA10",
127 	"VEGA12",
128 	"VEGA20",
129 	"RAVEN",
130 	"ARCTURUS",
131 	"RENOIR",
132 	"ALDEBARAN",
133 	"NAVI10",
134 	"CYAN_SKILLFISH",
135 	"NAVI14",
136 	"NAVI12",
137 	"SIENNA_CICHLID",
138 	"NAVY_FLOUNDER",
139 	"VANGOGH",
140 	"DIMGREY_CAVEFISH",
141 	"BEIGE_GOBY",
142 	"YELLOW_CARP",
143 	"IP DISCOVERY",
144 	"LAST",
145 };
146 
147 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
148 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
149 				     void *data);
150 
151 /**
152  * DOC: pcie_replay_count
153  *
154  * The amdgpu driver provides a sysfs API for reporting the total number
155  * of PCIe replays (NAKs).
156  * The file pcie_replay_count is used for this and returns the total
157  * number of replays as the sum of NAKs generated and NAKs received.
158  */
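/*
 * Illustrative usage sketch (not part of the driver): the attribute is read
 * from userspace like any other sysfs file; the exact path depends on the
 * card index.
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 */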
159 
160 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
161 		struct device_attribute *attr, char *buf)
162 {
163 	struct drm_device *ddev = dev_get_drvdata(dev);
164 	struct amdgpu_device *adev = drm_to_adev(ddev);
165 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
166 
167 	return sysfs_emit(buf, "%llu\n", cnt);
168 }
169 
170 static DEVICE_ATTR(pcie_replay_count, 0444,
171 		amdgpu_device_get_pcie_replay_count, NULL);
172 
173 static int amdgpu_device_attr_sysfs_init(struct amdgpu_device *adev)
174 {
175 	int ret = 0;
176 
177 	if (!amdgpu_sriov_vf(adev))
178 		ret = sysfs_create_file(&adev->dev->kobj,
179 					&dev_attr_pcie_replay_count.attr);
180 
181 	return ret;
182 }
183 
184 static void amdgpu_device_attr_sysfs_fini(struct amdgpu_device *adev)
185 {
186 	if (!amdgpu_sriov_vf(adev))
187 		sysfs_remove_file(&adev->dev->kobj,
188 				  &dev_attr_pcie_replay_count.attr);
189 }
190 
191 static ssize_t amdgpu_sysfs_reg_state_get(struct file *f, struct kobject *kobj,
192 					  struct bin_attribute *attr, char *buf,
193 					  loff_t ppos, size_t count)
194 {
195 	struct device *dev = kobj_to_dev(kobj);
196 	struct drm_device *ddev = dev_get_drvdata(dev);
197 	struct amdgpu_device *adev = drm_to_adev(ddev);
198 	ssize_t bytes_read;
199 
200 	switch (ppos) {
201 	case AMDGPU_SYS_REG_STATE_XGMI:
202 		bytes_read = amdgpu_asic_get_reg_state(
203 			adev, AMDGPU_REG_STATE_TYPE_XGMI, buf, count);
204 		break;
205 	case AMDGPU_SYS_REG_STATE_WAFL:
206 		bytes_read = amdgpu_asic_get_reg_state(
207 			adev, AMDGPU_REG_STATE_TYPE_WAFL, buf, count);
208 		break;
209 	case AMDGPU_SYS_REG_STATE_PCIE:
210 		bytes_read = amdgpu_asic_get_reg_state(
211 			adev, AMDGPU_REG_STATE_TYPE_PCIE, buf, count);
212 		break;
213 	case AMDGPU_SYS_REG_STATE_USR:
214 		bytes_read = amdgpu_asic_get_reg_state(
215 			adev, AMDGPU_REG_STATE_TYPE_USR, buf, count);
216 		break;
217 	case AMDGPU_SYS_REG_STATE_USR_1:
218 		bytes_read = amdgpu_asic_get_reg_state(
219 			adev, AMDGPU_REG_STATE_TYPE_USR_1, buf, count);
220 		break;
221 	default:
222 		return -EINVAL;
223 	}
224 
225 	return bytes_read;
226 }
227 
228 BIN_ATTR(reg_state, 0444, amdgpu_sysfs_reg_state_get, NULL,
229 	 AMDGPU_SYS_REG_STATE_END);
230 
231 int amdgpu_reg_state_sysfs_init(struct amdgpu_device *adev)
232 {
233 	int ret;
234 
235 	if (!amdgpu_asic_get_reg_state_supported(adev))
236 		return 0;
237 
238 	ret = sysfs_create_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
239 
240 	return ret;
241 }
242 
243 void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
244 {
245 	if (!amdgpu_asic_get_reg_state_supported(adev))
246 		return;
247 	sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
248 }
249 
250 /**
251  * DOC: board_info
252  *
253  * The amdgpu driver provides a sysfs API for giving board related information.
254  * It provides the form factor information in the format
255  *
256  *   type : form factor
257  *
258  * Possible form factor values
259  *
260  * - "cem"		- PCIE CEM card
261  * - "oam"		- Open Compute Accelerator Module
262  * - "unknown"	- Not known
263  *
264  */
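/*
 * Illustrative usage sketch (not part of the driver): reading the attribute
 * from userspace returns a single "type : <form factor>" line; the path
 * depends on the card index.
 *
 *   cat /sys/class/drm/card0/device/board_info
 *   type : oam
 */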
265 
266 static ssize_t amdgpu_device_get_board_info(struct device *dev,
267 					    struct device_attribute *attr,
268 					    char *buf)
269 {
270 	struct drm_device *ddev = dev_get_drvdata(dev);
271 	struct amdgpu_device *adev = drm_to_adev(ddev);
272 	enum amdgpu_pkg_type pkg_type = AMDGPU_PKG_TYPE_CEM;
273 	const char *pkg;
274 
275 	if (adev->smuio.funcs && adev->smuio.funcs->get_pkg_type)
276 		pkg_type = adev->smuio.funcs->get_pkg_type(adev);
277 
278 	switch (pkg_type) {
279 	case AMDGPU_PKG_TYPE_CEM:
280 		pkg = "cem";
281 		break;
282 	case AMDGPU_PKG_TYPE_OAM:
283 		pkg = "oam";
284 		break;
285 	default:
286 		pkg = "unknown";
287 		break;
288 	}
289 
290 	return sysfs_emit(buf, "%s : %s\n", "type", pkg);
291 }
292 
293 static DEVICE_ATTR(board_info, 0444, amdgpu_device_get_board_info, NULL);
294 
295 static struct attribute *amdgpu_board_attrs[] = {
296 	&dev_attr_board_info.attr,
297 	NULL,
298 };
299 
300 static umode_t amdgpu_board_attrs_is_visible(struct kobject *kobj,
301 					     struct attribute *attr, int n)
302 {
303 	struct device *dev = kobj_to_dev(kobj);
304 	struct drm_device *ddev = dev_get_drvdata(dev);
305 	struct amdgpu_device *adev = drm_to_adev(ddev);
306 
307 	if (adev->flags & AMD_IS_APU)
308 		return 0;
309 
310 	return attr->mode;
311 }
312 
313 static const struct attribute_group amdgpu_board_attrs_group = {
314 	.attrs = amdgpu_board_attrs,
315 	.is_visible = amdgpu_board_attrs_is_visible
316 };
317 
318 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
319 
320 
321 /**
322  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
323  *
324  * @dev: drm_device pointer
325  *
326  * Returns true if the device is a dGPU with ATPX power control,
327  * otherwise returns false.
328  */
329 bool amdgpu_device_supports_px(struct drm_device *dev)
330 {
331 	struct amdgpu_device *adev = drm_to_adev(dev);
332 
333 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
334 		return true;
335 	return false;
336 }
337 
338 /**
339  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
340  *
341  * @dev: drm_device pointer
342  *
343  * Returns true if the device is a dGPU with ACPI power control,
344  * otherwise returns false.
345  */
346 bool amdgpu_device_supports_boco(struct drm_device *dev)
347 {
348 	struct amdgpu_device *adev = drm_to_adev(dev);
349 
350 	if (adev->has_pr3 ||
351 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
352 		return true;
353 	return false;
354 }
355 
356 /**
357  * amdgpu_device_supports_baco - Does the device support BACO
358  *
359  * @dev: drm_device pointer
360  *
361  * Return:
362  * 1 if the device supports BACO;
363  * 3 if the device supports MACO (only works if BACO is supported);
364  * otherwise return 0.
365  */
366 int amdgpu_device_supports_baco(struct drm_device *dev)
367 {
368 	struct amdgpu_device *adev = drm_to_adev(dev);
369 
370 	return amdgpu_asic_supports_baco(adev);
371 }
372 
373 void amdgpu_device_detect_runtime_pm_mode(struct amdgpu_device *adev)
374 {
375 	struct drm_device *dev;
376 	int bamaco_support;
377 
378 	dev = adev_to_drm(adev);
379 
380 	adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
381 	bamaco_support = amdgpu_device_supports_baco(dev);
382 
383 	switch (amdgpu_runtime_pm) {
384 	case 2:
385 		if (bamaco_support & MACO_SUPPORT) {
386 			adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
387 			dev_info(adev->dev, "Forcing BAMACO for runtime pm\n");
388 		} else if (bamaco_support == BACO_SUPPORT) {
389 			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
390 			dev_info(adev->dev, "Requested mode BAMACO not available, fallback to BACO\n");
391 		}
392 		break;
393 	case 1:
394 		if (bamaco_support & BACO_SUPPORT) {
395 			adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
396 			dev_info(adev->dev, "Forcing BACO for runtime pm\n");
397 		}
398 		break;
399 	case -1:
400 	case -2:
401 		if (amdgpu_device_supports_px(dev)) { /* enable PX as runtime mode */
402 			adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
403 			dev_info(adev->dev, "Using ATPX for runtime pm\n");
404 		} else if (amdgpu_device_supports_boco(dev)) { /* enable boco as runtime mode */
405 			adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
406 			dev_info(adev->dev, "Using BOCO for runtime pm\n");
407 		} else {
408 			if (!bamaco_support)
409 				goto no_runtime_pm;
410 
411 			switch (adev->asic_type) {
412 			case CHIP_VEGA20:
413 			case CHIP_ARCTURUS:
414 				/* BACO is not supported on vega20 and arcturus */
415 				break;
416 			case CHIP_VEGA10:
417 				/* enable BACO as runpm mode if noretry=0 */
418 				if (!adev->gmc.noretry)
419 					adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
420 				break;
421 			default:
422 				/* enable BACO as runpm mode on CI+ */
423 				adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
424 				break;
425 			}
426 
427 			if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO) {
428 				if (bamaco_support & MACO_SUPPORT) {
429 					adev->pm.rpm_mode = AMDGPU_RUNPM_BAMACO;
430 					dev_info(adev->dev, "Using BAMACO for runtime pm\n");
431 				} else {
432 					dev_info(adev->dev, "Using BACO for runtime pm\n");
433 				}
434 			}
435 		}
436 		break;
437 	case 0:
438 		dev_info(adev->dev, "runtime pm is manually disabled\n");
439 		break;
440 	default:
441 		break;
442 	}
443 
444 no_runtime_pm:
445 	if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
446 		dev_info(adev->dev, "Runtime PM not available\n");
447 }
448 /**
449  * amdgpu_device_supports_smart_shift - Is the device a dGPU with
450  * smart shift support
451  *
452  * @dev: drm_device pointer
453  *
454  * Returns true if the device is a dGPU with Smart Shift support,
455  * otherwise returns false.
456  */
457 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
458 {
459 	return (amdgpu_device_supports_boco(dev) &&
460 		amdgpu_acpi_is_power_shift_control_supported());
461 }
462 
463 /*
464  * VRAM access helper functions
465  */
466 
467 /**
468  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
469  *
470  * @adev: amdgpu_device pointer
471  * @pos: offset of the buffer in vram
472  * @buf: virtual address of the buffer in system memory
473  * @size: read/write size, the buffer at @buf must be at least @size bytes
474  * @write: true - write to vram, otherwise - read from vram
475  */
476 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
477 			     void *buf, size_t size, bool write)
478 {
479 	unsigned long flags;
480 	uint32_t hi = ~0, tmp = 0;
481 	uint32_t *data = buf;
482 	uint64_t last;
483 	int idx;
484 
485 	if (!drm_dev_enter(adev_to_drm(adev), &idx))
486 		return;
487 
488 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
489 
490 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
491 	for (last = pos + size; pos < last; pos += 4) {
492 		tmp = pos >> 31;
493 
494 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
495 		if (tmp != hi) {
496 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
497 			hi = tmp;
498 		}
499 		if (write)
500 			WREG32_NO_KIQ(mmMM_DATA, *data++);
501 		else
502 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
503 	}
504 
505 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
506 	drm_dev_exit(idx);
507 }
508 
509 /**
510  * amdgpu_device_aper_access - access vram by vram aperture
511  *
512  * @adev: amdgpu_device pointer
513  * @pos: offset of the buffer in vram
514  * @buf: virtual address of the buffer in system memory
515  * @size: read/write size, the buffer at @buf must be at least @size bytes
516  * @write: true - write to vram, otherwise - read from vram
517  *
518  * The return value means how many bytes have been transferred.
519  */
520 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
521 				 void *buf, size_t size, bool write)
522 {
523 #ifdef CONFIG_64BIT
524 	void __iomem *addr;
525 	size_t count = 0;
526 	uint64_t last;
527 
528 	if (!adev->mman.aper_base_kaddr)
529 		return 0;
530 
531 	last = min(pos + size, adev->gmc.visible_vram_size);
532 	if (last > pos) {
533 		addr = adev->mman.aper_base_kaddr + pos;
534 		count = last - pos;
535 
536 		if (write) {
537 			memcpy_toio(addr, buf, count);
538 			/* Make sure HDP write cache flush happens without any reordering
539 			 * after the system memory contents are sent over PCIe to the device
540 			 */
541 			mb();
542 			amdgpu_device_flush_hdp(adev, NULL);
543 		} else {
544 			amdgpu_device_invalidate_hdp(adev, NULL);
545 			/* Make sure HDP read cache is invalidated before issuing a read
546 			 * to the PCIe device
547 			 */
548 			mb();
549 			memcpy_fromio(buf, addr, count);
550 		}
551 
552 	}
553 
554 	return count;
555 #else
556 	return 0;
557 #endif
558 }
559 
560 /**
561  * amdgpu_device_vram_access - read/write a buffer in vram
562  *
563  * @adev: amdgpu_device pointer
564  * @pos: offset of the buffer in vram
565  * @buf: virtual address of the buffer in system memory
566  * @size: read/write size, the buffer at @buf must be at least @size bytes
567  * @write: true - write to vram, otherwise - read from vram
568  */
569 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
570 			       void *buf, size_t size, bool write)
571 {
572 	size_t count;
573 
574 	/* try to use the vram aperture to access vram first */
575 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
576 	size -= count;
577 	if (size) {
578 		/* use MM to access the rest of vram */
579 		pos += count;
580 		buf += count;
581 		amdgpu_device_mm_access(adev, pos, buf, size, write);
582 	}
583 }
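/*
 * Illustrative usage sketch (not from this file): read one dword at a
 * dword-aligned VRAM offset into a stack variable with the helper above;
 * "offset" is a placeholder.
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, offset, &val, sizeof(val), false);
 */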
584 
585 /*
586  * register access helper functions.
587  */
588 
589 /* Check if hw access should be skipped because of hotplug or device error */
590 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
591 {
592 	if (adev->no_hw_access)
593 		return true;
594 
595 #ifdef CONFIG_LOCKDEP
596 	/*
597 	 * This is a bit complicated to understand, so worth a comment. What we assert
598 	 * here is that the GPU reset is not running on another thread in parallel.
599 	 *
600 	 * For this we trylock the read side of the reset semaphore; if that succeeds,
601 	 * we know that the reset is not running in parallel.
602 	 *
603 	 * If the trylock fails we assert that we are either already holding the read
604 	 * side of the lock or are the reset thread itself and hold the write side of
605 	 * the lock.
606 	 */
607 	if (in_task()) {
608 		if (down_read_trylock(&adev->reset_domain->sem))
609 			up_read(&adev->reset_domain->sem);
610 		else
611 			lockdep_assert_held(&adev->reset_domain->sem);
612 	}
613 #endif
614 	return false;
615 }
616 
617 /**
618  * amdgpu_device_rreg - read a memory mapped IO or indirect register
619  *
620  * @adev: amdgpu_device pointer
621  * @reg: dword aligned register offset
622  * @acc_flags: access flags which require special behavior
623  *
624  * Returns the 32 bit value from the offset specified.
625  */
626 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
627 			    uint32_t reg, uint32_t acc_flags)
628 {
629 	uint32_t ret;
630 
631 	if (amdgpu_device_skip_hw_access(adev))
632 		return 0;
633 
634 	if ((reg * 4) < adev->rmmio_size) {
635 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
636 		    amdgpu_sriov_runtime(adev) &&
637 		    down_read_trylock(&adev->reset_domain->sem)) {
638 			ret = amdgpu_kiq_rreg(adev, reg, 0);
639 			up_read(&adev->reset_domain->sem);
640 		} else {
641 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
642 		}
643 	} else {
644 		ret = adev->pcie_rreg(adev, reg * 4);
645 	}
646 
647 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
648 
649 	return ret;
650 }
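/*
 * Illustrative usage sketch (not from this file): a plain read of a
 * dword-aligned register offset with no special access flags; "reg" is a
 * placeholder offset.
 *
 *   uint32_t val = amdgpu_device_rreg(adev, reg, 0);
 */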
651 
652 /*
653  * MMIO register read with byte offset helper functions
654  * @offset: byte offset from MMIO start
655  */
656 
657 /**
658  * amdgpu_mm_rreg8 - read a memory mapped IO register
659  *
660  * @adev: amdgpu_device pointer
661  * @offset: byte aligned register offset
662  *
663  * Returns the 8 bit value from the offset specified.
664  */
665 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
666 {
667 	if (amdgpu_device_skip_hw_access(adev))
668 		return 0;
669 
670 	if (offset < adev->rmmio_size)
671 		return (readb(adev->rmmio + offset));
672 	BUG();
673 }
674 
675 
676 /**
677  * amdgpu_device_xcc_rreg - read a memory mapped IO or indirect register with specific XCC
678  *
679  * @adev: amdgpu_device pointer
680  * @reg: dword aligned register offset
681  * @acc_flags: access flags which require special behavior
682  * @xcc_id: xcc accelerated compute core id
683  *
684  * Returns the 32 bit value from the offset specified.
685  */
686 uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
687 				uint32_t reg, uint32_t acc_flags,
688 				uint32_t xcc_id)
689 {
690 	uint32_t ret, rlcg_flag;
691 
692 	if (amdgpu_device_skip_hw_access(adev))
693 		return 0;
694 
695 	if ((reg * 4) < adev->rmmio_size) {
696 		if (amdgpu_sriov_vf(adev) &&
697 		    !amdgpu_sriov_runtime(adev) &&
698 		    adev->gfx.rlc.rlcg_reg_access_supported &&
699 		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
700 							 GC_HWIP, false,
701 							 &rlcg_flag)) {
702 			ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
703 		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
704 		    amdgpu_sriov_runtime(adev) &&
705 		    down_read_trylock(&adev->reset_domain->sem)) {
706 			ret = amdgpu_kiq_rreg(adev, reg, xcc_id);
707 			up_read(&adev->reset_domain->sem);
708 		} else {
709 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
710 		}
711 	} else {
712 		ret = adev->pcie_rreg(adev, reg * 4);
713 	}
714 
715 	return ret;
716 }
717 
718 /*
719  * MMIO register write with byte offset helper functions
720  * @offset: byte offset from MMIO start
721  * @value: the value to be written to the register
722  */
723 
724 /**
725  * amdgpu_mm_wreg8 - write a memory mapped IO register
726  *
727  * @adev: amdgpu_device pointer
728  * @offset: byte aligned register offset
729  * @value: 8 bit value to write
730  *
731  * Writes the value specified to the offset specified.
732  */
733 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
734 {
735 	if (amdgpu_device_skip_hw_access(adev))
736 		return;
737 
738 	if (offset < adev->rmmio_size)
739 		writeb(value, adev->rmmio + offset);
740 	else
741 		BUG();
742 }
743 
744 /**
745  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
746  *
747  * @adev: amdgpu_device pointer
748  * @reg: dword aligned register offset
749  * @v: 32 bit value to write to the register
750  * @acc_flags: access flags which require special behavior
751  *
752  * Writes the value specified to the offset specified.
753  */
754 void amdgpu_device_wreg(struct amdgpu_device *adev,
755 			uint32_t reg, uint32_t v,
756 			uint32_t acc_flags)
757 {
758 	if (amdgpu_device_skip_hw_access(adev))
759 		return;
760 
761 	if ((reg * 4) < adev->rmmio_size) {
762 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
763 		    amdgpu_sriov_runtime(adev) &&
764 		    down_read_trylock(&adev->reset_domain->sem)) {
765 			amdgpu_kiq_wreg(adev, reg, v, 0);
766 			up_read(&adev->reset_domain->sem);
767 		} else {
768 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
769 		}
770 	} else {
771 		adev->pcie_wreg(adev, reg * 4, v);
772 	}
773 
774 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
775 }
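/*
 * Illustrative usage sketch (not from this file): the matching write path,
 * storing a 32 bit value at a dword-aligned register offset; "reg" and "val"
 * are placeholders.
 *
 *   amdgpu_device_wreg(adev, reg, val, 0);
 */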
776 
777 /**
778  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
779  *
780  * @adev: amdgpu_device pointer
781  * @reg: mmio/rlc register
782  * @v: value to write
783  * @xcc_id: xcc accelerated compute core id
784  *
785  * this function is invoked only for the debugfs register access
786  */
787 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
788 			     uint32_t reg, uint32_t v,
789 			     uint32_t xcc_id)
790 {
791 	if (amdgpu_device_skip_hw_access(adev))
792 		return;
793 
794 	if (amdgpu_sriov_fullaccess(adev) &&
795 	    adev->gfx.rlc.funcs &&
796 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
797 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
798 			return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
799 	} else if ((reg * 4) >= adev->rmmio_size) {
800 		adev->pcie_wreg(adev, reg * 4, v);
801 	} else {
802 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
803 	}
804 }
805 
806 /**
807  * amdgpu_device_xcc_wreg - write to a memory mapped IO or indirect register with specific XCC
808  *
809  * @adev: amdgpu_device pointer
810  * @reg: dword aligned register offset
811  * @v: 32 bit value to write to the register
812  * @acc_flags: access flags which require special behavior
813  * @xcc_id: xcc accelerated compute core id
814  *
815  * Writes the value specified to the offset specified.
816  */
817 void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
818 			uint32_t reg, uint32_t v,
819 			uint32_t acc_flags, uint32_t xcc_id)
820 {
821 	uint32_t rlcg_flag;
822 
823 	if (amdgpu_device_skip_hw_access(adev))
824 		return;
825 
826 	if ((reg * 4) < adev->rmmio_size) {
827 		if (amdgpu_sriov_vf(adev) &&
828 		    !amdgpu_sriov_runtime(adev) &&
829 		    adev->gfx.rlc.rlcg_reg_access_supported &&
830 		    amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
831 							 GC_HWIP, true,
832 							 &rlcg_flag)) {
833 			amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
834 		} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
835 		    amdgpu_sriov_runtime(adev) &&
836 		    down_read_trylock(&adev->reset_domain->sem)) {
837 			amdgpu_kiq_wreg(adev, reg, v, xcc_id);
838 			up_read(&adev->reset_domain->sem);
839 		} else {
840 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
841 		}
842 	} else {
843 		adev->pcie_wreg(adev, reg * 4, v);
844 	}
845 }
846 
847 /**
848  * amdgpu_device_indirect_rreg - read an indirect register
849  *
850  * @adev: amdgpu_device pointer
851  * @reg_addr: indirect register address to read from
852  *
853  * Returns the value of indirect register @reg_addr
854  */
855 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
856 				u32 reg_addr)
857 {
858 	unsigned long flags, pcie_index, pcie_data;
859 	void __iomem *pcie_index_offset;
860 	void __iomem *pcie_data_offset;
861 	u32 r;
862 
863 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
864 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
865 
866 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
867 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
868 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
869 
870 	writel(reg_addr, pcie_index_offset);
871 	readl(pcie_index_offset);
872 	r = readl(pcie_data_offset);
873 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
874 
875 	return r;
876 }
877 
878 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
879 				    u64 reg_addr)
880 {
881 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
882 	u32 r;
883 	void __iomem *pcie_index_offset;
884 	void __iomem *pcie_index_hi_offset;
885 	void __iomem *pcie_data_offset;
886 
887 	if (unlikely(!adev->nbio.funcs)) {
888 		pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
889 		pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
890 	} else {
891 		pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
892 		pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
893 	}
894 
895 	if (reg_addr >> 32) {
896 		if (unlikely(!adev->nbio.funcs))
897 			pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
898 		else
899 			pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
900 	} else {
901 		pcie_index_hi = 0;
902 	}
903 
904 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
905 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
906 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
907 	if (pcie_index_hi != 0)
908 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
909 				pcie_index_hi * 4;
910 
911 	writel(reg_addr, pcie_index_offset);
912 	readl(pcie_index_offset);
913 	if (pcie_index_hi != 0) {
914 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
915 		readl(pcie_index_hi_offset);
916 	}
917 	r = readl(pcie_data_offset);
918 
919 	/* clear the high bits */
920 	if (pcie_index_hi != 0) {
921 		writel(0, pcie_index_hi_offset);
922 		readl(pcie_index_hi_offset);
923 	}
924 
925 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
926 
927 	return r;
928 }
929 
930 /**
931  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
932  *
933  * @adev: amdgpu_device pointer
934  * @reg_addr: indirect register address to read from
935  *
936  * Returns the value of indirect register @reg_addr
937  */
938 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
939 				  u32 reg_addr)
940 {
941 	unsigned long flags, pcie_index, pcie_data;
942 	void __iomem *pcie_index_offset;
943 	void __iomem *pcie_data_offset;
944 	u64 r;
945 
946 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
947 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
948 
949 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
950 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
951 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
952 
953 	/* read low 32 bits */
954 	writel(reg_addr, pcie_index_offset);
955 	readl(pcie_index_offset);
956 	r = readl(pcie_data_offset);
957 	/* read high 32 bits */
958 	writel(reg_addr + 4, pcie_index_offset);
959 	readl(pcie_index_offset);
960 	r |= ((u64)readl(pcie_data_offset) << 32);
961 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
962 
963 	return r;
964 }
965 
966 u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
967 				  u64 reg_addr)
968 {
969 	unsigned long flags, pcie_index, pcie_data;
970 	unsigned long pcie_index_hi = 0;
971 	void __iomem *pcie_index_offset;
972 	void __iomem *pcie_index_hi_offset;
973 	void __iomem *pcie_data_offset;
974 	u64 r;
975 
976 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
977 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
978 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
979 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
980 
981 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
982 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
983 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
984 	if (pcie_index_hi != 0)
985 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
986 			pcie_index_hi * 4;
987 
988 	/* read low 32 bits */
989 	writel(reg_addr, pcie_index_offset);
990 	readl(pcie_index_offset);
991 	if (pcie_index_hi != 0) {
992 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
993 		readl(pcie_index_hi_offset);
994 	}
995 	r = readl(pcie_data_offset);
996 	/* read high 32 bits */
997 	writel(reg_addr + 4, pcie_index_offset);
998 	readl(pcie_index_offset);
999 	if (pcie_index_hi != 0) {
1000 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1001 		readl(pcie_index_hi_offset);
1002 	}
1003 	r |= ((u64)readl(pcie_data_offset) << 32);
1004 
1005 	/* clear the high bits */
1006 	if (pcie_index_hi != 0) {
1007 		writel(0, pcie_index_hi_offset);
1008 		readl(pcie_index_hi_offset);
1009 	}
1010 
1011 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1012 
1013 	return r;
1014 }
1015 
1016 /**
1017  * amdgpu_device_indirect_wreg - write an indirect register
1018  *
1019  * @adev: amdgpu_device pointer
1020  * @reg_addr: indirect register offset
1021  * @reg_data: indirect register data
1022  *
1023  */
1024 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
1025 				 u32 reg_addr, u32 reg_data)
1026 {
1027 	unsigned long flags, pcie_index, pcie_data;
1028 	void __iomem *pcie_index_offset;
1029 	void __iomem *pcie_data_offset;
1030 
1031 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1032 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1033 
1034 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1035 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1036 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1037 
1038 	writel(reg_addr, pcie_index_offset);
1039 	readl(pcie_index_offset);
1040 	writel(reg_data, pcie_data_offset);
1041 	readl(pcie_data_offset);
1042 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1043 }
1044 
1045 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
1046 				     u64 reg_addr, u32 reg_data)
1047 {
1048 	unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
1049 	void __iomem *pcie_index_offset;
1050 	void __iomem *pcie_index_hi_offset;
1051 	void __iomem *pcie_data_offset;
1052 
1053 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1054 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1055 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1056 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1057 	else
1058 		pcie_index_hi = 0;
1059 
1060 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1061 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1062 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1063 	if (pcie_index_hi != 0)
1064 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1065 				pcie_index_hi * 4;
1066 
1067 	writel(reg_addr, pcie_index_offset);
1068 	readl(pcie_index_offset);
1069 	if (pcie_index_hi != 0) {
1070 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1071 		readl(pcie_index_hi_offset);
1072 	}
1073 	writel(reg_data, pcie_data_offset);
1074 	readl(pcie_data_offset);
1075 
1076 	/* clear the high bits */
1077 	if (pcie_index_hi != 0) {
1078 		writel(0, pcie_index_hi_offset);
1079 		readl(pcie_index_hi_offset);
1080 	}
1081 
1082 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1083 }
1084 
1085 /**
1086  * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
1087  *
1088  * @adev: amdgpu_device pointer
1089  * @reg_addr: indirect register offset
1090  * @reg_data: indirect register data
1091  *
1092  */
1093 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
1094 				   u32 reg_addr, u64 reg_data)
1095 {
1096 	unsigned long flags, pcie_index, pcie_data;
1097 	void __iomem *pcie_index_offset;
1098 	void __iomem *pcie_data_offset;
1099 
1100 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1101 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1102 
1103 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1104 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1105 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1106 
1107 	/* write low 32 bits */
1108 	writel(reg_addr, pcie_index_offset);
1109 	readl(pcie_index_offset);
1110 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1111 	readl(pcie_data_offset);
1112 	/* write high 32 bits */
1113 	writel(reg_addr + 4, pcie_index_offset);
1114 	readl(pcie_index_offset);
1115 	writel((u32)(reg_data >> 32), pcie_data_offset);
1116 	readl(pcie_data_offset);
1117 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1118 }
1119 
1120 void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
1121 				   u64 reg_addr, u64 reg_data)
1122 {
1123 	unsigned long flags, pcie_index, pcie_data;
1124 	unsigned long pcie_index_hi = 0;
1125 	void __iomem *pcie_index_offset;
1126 	void __iomem *pcie_index_hi_offset;
1127 	void __iomem *pcie_data_offset;
1128 
1129 	pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
1130 	pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
1131 	if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
1132 		pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
1133 
1134 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
1135 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
1136 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
1137 	if (pcie_index_hi != 0)
1138 		pcie_index_hi_offset = (void __iomem *)adev->rmmio +
1139 				pcie_index_hi * 4;
1140 
1141 	/* write low 32 bits */
1142 	writel(reg_addr, pcie_index_offset);
1143 	readl(pcie_index_offset);
1144 	if (pcie_index_hi != 0) {
1145 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1146 		readl(pcie_index_hi_offset);
1147 	}
1148 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
1149 	readl(pcie_data_offset);
1150 	/* write high 32 bits */
1151 	writel(reg_addr + 4, pcie_index_offset);
1152 	readl(pcie_index_offset);
1153 	if (pcie_index_hi != 0) {
1154 		writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
1155 		readl(pcie_index_hi_offset);
1156 	}
1157 	writel((u32)(reg_data >> 32), pcie_data_offset);
1158 	readl(pcie_data_offset);
1159 
1160 	/* clear the high bits */
1161 	if (pcie_index_hi != 0) {
1162 		writel(0, pcie_index_hi_offset);
1163 		readl(pcie_index_hi_offset);
1164 	}
1165 
1166 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
1167 }
1168 
1169 /**
1170  * amdgpu_device_get_rev_id - query device rev_id
1171  *
1172  * @adev: amdgpu_device pointer
1173  *
1174  * Return device rev_id
1175  */
1176 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
1177 {
1178 	return adev->nbio.funcs->get_rev_id(adev);
1179 }
1180 
1181 /**
1182  * amdgpu_invalid_rreg - dummy reg read function
1183  *
1184  * @adev: amdgpu_device pointer
1185  * @reg: offset of register
1186  *
1187  * Dummy register read function.  Used for register blocks
1188  * that certain asics don't have (all asics).
1189  * Returns the value in the register.
1190  */
1191 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
1192 {
1193 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
1194 	BUG();
1195 	return 0;
1196 }
1197 
1198 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
1199 {
1200 	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1201 	BUG();
1202 	return 0;
1203 }
1204 
1205 /**
1206  * amdgpu_invalid_wreg - dummy reg write function
1207  *
1208  * @adev: amdgpu_device pointer
1209  * @reg: offset of register
1210  * @v: value to write to the register
1211  *
1212  * Dummy register write function.  Used for register blocks
1213  * that certain asics don't have (all asics).
1214  */
1215 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
1216 {
1217 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
1218 		  reg, v);
1219 	BUG();
1220 }
1221 
1222 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
1223 {
1224 	DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
1225 		  reg, v);
1226 	BUG();
1227 }
1228 
1229 /**
1230  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
1231  *
1232  * @adev: amdgpu_device pointer
1233  * @reg: offset of register
1234  *
1235  * Dummy register read function.  Used for register blocks
1236  * that certain asics don't have (all asics).
1237  * Returns the value in the register.
1238  */
1239 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
1240 {
1241 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
1242 	BUG();
1243 	return 0;
1244 }
1245 
1246 static uint64_t amdgpu_invalid_rreg64_ext(struct amdgpu_device *adev, uint64_t reg)
1247 {
1248 	DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
1249 	BUG();
1250 	return 0;
1251 }
1252 
1253 /**
1254  * amdgpu_invalid_wreg64 - dummy reg write function
1255  *
1256  * @adev: amdgpu_device pointer
1257  * @reg: offset of register
1258  * @v: value to write to the register
1259  *
1260  * Dummy register write function.  Used for register blocks
1261  * that certain asics don't have (all asics).
1262  */
1263 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
1264 {
1265 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
1266 		  reg, v);
1267 	BUG();
1268 }
1269 
1270 static void amdgpu_invalid_wreg64_ext(struct amdgpu_device *adev, uint64_t reg, uint64_t v)
1271 {
1272 	DRM_ERROR("Invalid callback to write 64 bit register 0x%llX with 0x%08llX\n",
1273 		  reg, v);
1274 	BUG();
1275 }
1276 
1277 /**
1278  * amdgpu_block_invalid_rreg - dummy reg read function
1279  *
1280  * @adev: amdgpu_device pointer
1281  * @block: offset of instance
1282  * @reg: offset of register
1283  *
1284  * Dummy register read function.  Used for register blocks
1285  * that certain asics don't have (all asics).
1286  * Returns the value in the register.
1287  */
1288 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
1289 					  uint32_t block, uint32_t reg)
1290 {
1291 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
1292 		  reg, block);
1293 	BUG();
1294 	return 0;
1295 }
1296 
1297 /**
1298  * amdgpu_block_invalid_wreg - dummy reg write function
1299  *
1300  * @adev: amdgpu_device pointer
1301  * @block: offset of instance
1302  * @reg: offset of register
1303  * @v: value to write to the register
1304  *
1305  * Dummy register write function.  Used for register blocks
1306  * that certain asics don't have (all asics).
1307  */
1308 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
1309 				      uint32_t block,
1310 				      uint32_t reg, uint32_t v)
1311 {
1312 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
1313 		  reg, block, v);
1314 	BUG();
1315 }
1316 
1317 /**
1318  * amdgpu_device_asic_init - Wrapper for atom asic_init
1319  *
1320  * @adev: amdgpu_device pointer
1321  *
1322  * Does any asic specific work and then calls atom asic init.
1323  */
1324 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
1325 {
1326 	int ret;
1327 
1328 	amdgpu_asic_pre_asic_init(adev);
1329 
1330 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
1331 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
1332 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
1333 		amdgpu_psp_wait_for_bootloader(adev);
1334 		ret = amdgpu_atomfirmware_asic_init(adev, true);
1335 		return ret;
1336 	} else {
1337 		return amdgpu_atom_asic_init(adev->mode_info.atom_context);
1338 	}
1339 
1340 	return 0;
1341 }
1342 
1343 /**
1344  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
1345  *
1346  * @adev: amdgpu_device pointer
1347  *
1348  * Allocates a scratch page of VRAM for use by various things in the
1349  * driver.
1350  */
1351 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
1352 {
1353 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
1354 				       AMDGPU_GEM_DOMAIN_VRAM |
1355 				       AMDGPU_GEM_DOMAIN_GTT,
1356 				       &adev->mem_scratch.robj,
1357 				       &adev->mem_scratch.gpu_addr,
1358 				       (void **)&adev->mem_scratch.ptr);
1359 }
1360 
1361 /**
1362  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
1363  *
1364  * @adev: amdgpu_device pointer
1365  *
1366  * Frees the VRAM scratch page.
1367  */
1368 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
1369 {
1370 	amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
1371 }
1372 
1373 /**
1374  * amdgpu_device_program_register_sequence - program an array of registers.
1375  *
1376  * @adev: amdgpu_device pointer
1377  * @registers: pointer to the register array
1378  * @array_size: size of the register array
1379  *
1380  * Programs an array of registers with AND and OR masks.
1381  * This is a helper for setting golden registers.
1382  */
1383 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
1384 					     const u32 *registers,
1385 					     const u32 array_size)
1386 {
1387 	u32 tmp, reg, and_mask, or_mask;
1388 	int i;
1389 
1390 	if (array_size % 3)
1391 		return;
1392 
1393 	for (i = 0; i < array_size; i += 3) {
1394 		reg = registers[i + 0];
1395 		and_mask = registers[i + 1];
1396 		or_mask = registers[i + 2];
1397 
1398 		if (and_mask == 0xffffffff) {
1399 			tmp = or_mask;
1400 		} else {
1401 			tmp = RREG32(reg);
1402 			tmp &= ~and_mask;
1403 			if (adev->family >= AMDGPU_FAMILY_AI)
1404 				tmp |= (or_mask & and_mask);
1405 			else
1406 				tmp |= or_mask;
1407 		}
1408 		WREG32(reg, tmp);
1409 	}
1410 }
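/*
 * Illustrative usage sketch (not from this file): golden register tables are
 * flat arrays of {offset, and_mask, or_mask} triplets; mmEXAMPLE_REG is a
 * placeholder register offset.
 *
 *   static const u32 golden_settings_example[] = {
 *           mmEXAMPLE_REG, 0x0000000f, 0x00000001,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *                                           ARRAY_SIZE(golden_settings_example));
 */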
1411 
1412 /**
1413  * amdgpu_device_pci_config_reset - reset the GPU
1414  *
1415  * @adev: amdgpu_device pointer
1416  *
1417  * Resets the GPU using the pci config reset sequence.
1418  * Only applicable to asics prior to vega10.
1419  */
1420 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1421 {
1422 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1423 }
1424 
1425 /**
1426  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1427  *
1428  * @adev: amdgpu_device pointer
1429  *
1430  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1431  */
1432 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1433 {
1434 	return pci_reset_function(adev->pdev);
1435 }
1436 
1437 /*
1438  * amdgpu_device_wb_*()
1439  * Writeback is the method by which the GPU updates special pages in memory
1440  * with the status of certain GPU events (fences, ring pointers,etc.).
1441  */
1442 
1443 /**
1444  * amdgpu_device_wb_fini - Disable Writeback and free memory
1445  *
1446  * @adev: amdgpu_device pointer
1447  *
1448  * Disables Writeback and frees the Writeback memory (all asics).
1449  * Used at driver shutdown.
1450  */
1451 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1452 {
1453 	if (adev->wb.wb_obj) {
1454 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1455 				      &adev->wb.gpu_addr,
1456 				      (void **)&adev->wb.wb);
1457 		adev->wb.wb_obj = NULL;
1458 	}
1459 }
1460 
1461 /**
1462  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1463  *
1464  * @adev: amdgpu_device pointer
1465  *
1466  * Initializes writeback and allocates writeback memory (all asics).
1467  * Used at driver startup.
1468  * Returns 0 on success or a negative error code on failure.
1469  */
1470 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1471 {
1472 	int r;
1473 
1474 	if (adev->wb.wb_obj == NULL) {
1475 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1476 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1477 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1478 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1479 					    (void **)&adev->wb.wb);
1480 		if (r) {
1481 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1482 			return r;
1483 		}
1484 
1485 		adev->wb.num_wb = AMDGPU_MAX_WB;
1486 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1487 
1488 		/* clear wb memory */
1489 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1490 	}
1491 
1492 	return 0;
1493 }
1494 
1495 /**
1496  * amdgpu_device_wb_get - Allocate a wb entry
1497  *
1498  * @adev: amdgpu_device pointer
1499  * @wb: wb index
1500  *
1501  * Allocate a wb slot for use by the driver (all asics).
1502  * Returns 0 on success or -EINVAL on failure.
1503  */
1504 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1505 {
1506 	unsigned long flags, offset;
1507 
1508 	spin_lock_irqsave(&adev->wb.lock, flags);
1509 	offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1510 	if (offset < adev->wb.num_wb) {
1511 		__set_bit(offset, adev->wb.used);
1512 		spin_unlock_irqrestore(&adev->wb.lock, flags);
1513 		*wb = offset << 3; /* convert to dw offset */
1514 		return 0;
1515 	} else {
1516 		spin_unlock_irqrestore(&adev->wb.lock, flags);
1517 		return -EINVAL;
1518 	}
1519 }
1520 
1521 /**
1522  * amdgpu_device_wb_free - Free a wb entry
1523  *
1524  * @adev: amdgpu_device pointer
1525  * @wb: wb index
1526  *
1527  * Free a wb slot allocated for use by the driver (all asics)
1528  */
1529 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1530 {
1531 	unsigned long flags;
1532 
1533 	wb >>= 3;
1534 	spin_lock_irqsave(&adev->wb.lock, flags);
1535 	if (wb < adev->wb.num_wb)
1536 		__clear_bit(wb, adev->wb.used);
1537 	spin_unlock_irqrestore(&adev->wb.lock, flags);
1538 }
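/*
 * Illustrative usage sketch (not from this file): a caller allocates a
 * writeback slot, derives the CPU pointer and GPU address from the dword
 * index returned by amdgpu_device_wb_get(), and frees the slot when done.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_addr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           ... let the GPU write event status to gpu_addr, poll cpu_addr ...
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */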
1539 
1540 /**
1541  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1542  *
1543  * @adev: amdgpu_device pointer
1544  *
1545  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1546  * to fail, but if any of the BARs is not accessible after the resize we abort
1547  * driver loading by returning -ENODEV.
1548  */
1549 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1550 {
1551 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1552 	struct pci_bus *root;
1553 	struct resource *res;
1554 	unsigned int i;
1555 	u16 cmd;
1556 	int r;
1557 
1558 	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1559 		return 0;
1560 
1561 	/* Bypass for VF */
1562 	if (amdgpu_sriov_vf(adev))
1563 		return 0;
1564 
1565 	/* resizing on Dell G5 SE platforms causes problems with runtime pm */
1566 	if ((amdgpu_runtime_pm != 0) &&
1567 	    adev->pdev->vendor == PCI_VENDOR_ID_ATI &&
1568 	    adev->pdev->device == 0x731f &&
1569 	    adev->pdev->subsystem_vendor == PCI_VENDOR_ID_DELL)
1570 		return 0;
1571 
1572 	/* PCI_EXT_CAP_ID_VNDR extended capability is located at 0x100 */
1573 	if (!pci_find_ext_capability(adev->pdev, PCI_EXT_CAP_ID_VNDR))
1574 		DRM_WARN("System can't access extended configuration space, please check!!\n");
1575 
1576 	/* skip if the bios has already enabled large BAR */
1577 	if (adev->gmc.real_vram_size &&
1578 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1579 		return 0;
1580 
1581 	/* Check if the root BUS has 64bit memory resources */
1582 	root = adev->pdev->bus;
1583 	while (root->parent)
1584 		root = root->parent;
1585 
1586 	pci_bus_for_each_resource(root, res, i) {
1587 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1588 		    res->start > 0x100000000ull)
1589 			break;
1590 	}
1591 
1592 	/* Trying to resize is pointless without a root hub window above 4GB */
1593 	if (!res)
1594 		return 0;
1595 
1596 	/* Limit the BAR size to what is available */
1597 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1598 			rbar_size);
1599 
1600 	/* Disable memory decoding while we change the BAR addresses and size */
1601 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1602 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1603 			      cmd & ~PCI_COMMAND_MEMORY);
1604 
1605 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1606 	amdgpu_doorbell_fini(adev);
1607 	if (adev->asic_type >= CHIP_BONAIRE)
1608 		pci_release_resource(adev->pdev, 2);
1609 
1610 	pci_release_resource(adev->pdev, 0);
1611 
1612 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1613 	if (r == -ENOSPC)
1614 		DRM_INFO("Not enough PCI address space for a large BAR.");
1615 	else if (r && r != -ENOTSUPP)
1616 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1617 
1618 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1619 
1620 	/* When the doorbell or fb BAR isn't available we have no chance of
1621 	 * using the device.
1622 	 */
1623 	r = amdgpu_doorbell_init(adev);
1624 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1625 		return -ENODEV;
1626 
1627 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1628 
1629 	return 0;
1630 }
1631 
1632 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1633 {
1634 	if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU))
1635 		return false;
1636 
1637 	return true;
1638 }
1639 
1640 /*
1641  * GPU helpers function.
1642  */
1643 /**
1644  * amdgpu_device_need_post - check if the hw need post or not
1645  *
1646  * @adev: amdgpu_device pointer
1647  *
1648  * Check if the asic has been initialized (all asics) at driver startup
1649  * or whether post is needed because a hw reset was performed.
1650  * Returns true if post is needed, false if not.
1651  */
1652 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1653 {
1654 	uint32_t reg;
1655 
1656 	if (amdgpu_sriov_vf(adev))
1657 		return false;
1658 
1659 	if (!amdgpu_device_read_bios(adev))
1660 		return false;
1661 
1662 	if (amdgpu_passthrough(adev)) {
1663 		/* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
1664 		 * reboot some old SMC firmware still needs the driver to do vPost, otherwise
1665 		 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1666 		 * we force vPost for SMC firmware versions below 22.15
1667 		 */
1668 		if (adev->asic_type == CHIP_FIJI) {
1669 			int err;
1670 			uint32_t fw_ver;
1671 
1672 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1673 			/* force vPost if error occurred */
1674 			if (err)
1675 				return true;
1676 
1677 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1678 			release_firmware(adev->pm.fw);
1679 			if (fw_ver < 0x00160e00)
1680 				return true;
1681 		}
1682 	}
1683 
1684 	/* Don't post if we need to reset whole hive on init */
1685 	if (adev->gmc.xgmi.pending_reset)
1686 		return false;
1687 
1688 	if (adev->has_hw_reset) {
1689 		adev->has_hw_reset = false;
1690 		return true;
1691 	}
1692 
1693 	/* bios scratch used on CIK+ */
1694 	if (adev->asic_type >= CHIP_BONAIRE)
1695 		return amdgpu_atombios_scratch_need_asic_init(adev);
1696 
1697 	/* check MEM_SIZE for older asics */
1698 	reg = amdgpu_asic_get_config_memsize(adev);
1699 
1700 	if ((reg != 0) && (reg != 0xffffffff))
1701 		return false;
1702 
1703 	return true;
1704 }
1705 
1706 /*
1707  * Check whether seamless boot is supported.
1708  *
1709  * So far we only support seamless boot on DCE 3.0 or later.
1710  * If users report that it works on older ASICs as well, we may
1711  * loosen this.
1712  */
1713 bool amdgpu_device_seamless_boot_supported(struct amdgpu_device *adev)
1714 {
1715 	switch (amdgpu_seamless) {
1716 	case -1:
1717 		break;
1718 	case 1:
1719 		return true;
1720 	case 0:
1721 		return false;
1722 	default:
1723 		DRM_ERROR("Invalid value for amdgpu.seamless: %d\n",
1724 			  amdgpu_seamless);
1725 		return false;
1726 	}
1727 
1728 	if (!(adev->flags & AMD_IS_APU))
1729 		return false;
1730 
1731 	if (adev->mman.keep_stolen_vga_memory)
1732 		return false;
1733 
1734 	return amdgpu_ip_version(adev, DCE_HWIP, 0) >= IP_VERSION(3, 0, 0);
1735 }
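/* Summary of the checks above (editor's note): amdgpu.seamless = 1 forces
 * seamless boot on, 0 forces it off, and -1 (auto) allows it only on APUs
 * that don't keep stolen VGA memory and whose display IP (DCE_HWIP) is
 * version 3.0.0 or newer.
 */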
1736 
1737 /*
1738  * Intel hosts such as Rocket Lake, Alder Lake, Raptor Lake and Sapphire Rapids
1739  * don't support dynamic speed switching. Until we have confirmation from Intel
1740  * that a specific host supports it, it's safer that we keep it disabled for all.
1741  *
1742  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1743  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1744  */
1745 static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device *adev)
1746 {
1747 #if IS_ENABLED(CONFIG_X86)
1748 	struct cpuinfo_x86 *c = &cpu_data(0);
1749 
1750 	/* eGPU change speeds based on USB4 fabric conditions */
1751 	if (dev_is_removable(adev->dev))
1752 		return true;
1753 
1754 	if (c->x86_vendor == X86_VENDOR_INTEL)
1755 		return false;
1756 #endif
1757 	return true;
1758 }
1759 
1760 /**
1761  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1762  *
1763  * @adev: amdgpu_device pointer
1764  *
1765  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1766  * be set for this device.
1767  *
1768  * Returns true if it should be used or false if not.
1769  */
1770 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1771 {
1772 	switch (amdgpu_aspm) {
1773 	case -1:
1774 		break;
1775 	case 0:
1776 		return false;
1777 	case 1:
1778 		return true;
1779 	default:
1780 		return false;
1781 	}
1782 	if (adev->flags & AMD_IS_APU)
1783 		return false;
1784 	if (!(adev->pm.pp_feature & PP_PCIE_DPM_MASK))
1785 		return false;
1786 	return pcie_aspm_enabled(adev->pdev);
1787 }
1788 
1789 /* if we get transitioned to only one device, take VGA back */
1790 /**
1791  * amdgpu_device_vga_set_decode - enable/disable vga decode
1792  *
1793  * @pdev: PCI device pointer
1794  * @state: enable/disable vga decode
1795  *
1796  * Enable/disable vga decode (all asics).
1797  * Returns VGA resource flags.
1798  */
1799 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1800 		bool state)
1801 {
1802 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1803 
1804 	amdgpu_asic_set_vga_state(adev, state);
1805 	if (state)
1806 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1807 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1808 	else
1809 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1810 }
1811 
1812 /**
1813  * amdgpu_device_check_block_size - validate the vm block size
1814  *
1815  * @adev: amdgpu_device pointer
1816  *
1817  * Validates the vm block size specified via module parameter.
1818  * The vm block size defines the number of bits in the page table versus the
1819  * page directory. A page is 4KB, so we have a 12-bit offset, a minimum of 9
1820  * bits in the page table, and the remaining bits in the page directory.
1821  */
1822 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1823 {
1824 	/* defines number of bits in page table versus page directory,
1825 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1826 	 * page table and the remaining bits are in the page directory
1827 	 */
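	/* Worked example (editor's illustration): with 4KB pages the low 12 bits
	 * are the in-page offset; amdgpu_vm_block_size = 9 means one page table
	 * covers 2^9 entries * 4KB = 2MB per page directory entry, and the
	 * remaining address bits index the page directory levels.
	 */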
1828 	if (amdgpu_vm_block_size == -1)
1829 		return;
1830 
1831 	if (amdgpu_vm_block_size < 9) {
1832 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1833 			 amdgpu_vm_block_size);
1834 		amdgpu_vm_block_size = -1;
1835 	}
1836 }
1837 
1838 /**
1839  * amdgpu_device_check_vm_size - validate the vm size
1840  *
1841  * @adev: amdgpu_device pointer
1842  *
1843  * Validates the vm size in GB specified via module parameter.
1844  * The VM size is the size of the GPU virtual memory space in GB.
1845  */
1846 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1847 {
1848 	/* no need to check the default value */
1849 	if (amdgpu_vm_size == -1)
1850 		return;
1851 
1852 	if (amdgpu_vm_size < 1) {
1853 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1854 			 amdgpu_vm_size);
1855 		amdgpu_vm_size = -1;
1856 	}
1857 }
1858 
1859 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1860 {
1861 	struct sysinfo si;
1862 	bool is_os_64 = (sizeof(void *) == 8);
1863 	uint64_t total_memory;
1864 	uint64_t dram_size_seven_GB = 0x1B8000000;
1865 	uint64_t dram_size_three_GB = 0xB8000000;
1866 
1867 	if (amdgpu_smu_memory_pool_size == 0)
1868 		return;
1869 
1870 	if (!is_os_64) {
1871 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1872 		goto def_value;
1873 	}
1874 	si_meminfo(&si);
1875 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1876 
1877 	if ((amdgpu_smu_memory_pool_size == 1) ||
1878 		(amdgpu_smu_memory_pool_size == 2)) {
1879 		if (total_memory < dram_size_three_GB)
1880 			goto def_value1;
1881 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1882 		(amdgpu_smu_memory_pool_size == 8)) {
1883 		if (total_memory < dram_size_seven_GB)
1884 			goto def_value1;
1885 	} else {
1886 		DRM_WARN("Smu memory pool size not supported\n");
1887 		goto def_value;
1888 	}
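	/* One pool unit is 256MB, hence the shift by 28 below; e.g. a module
	 * parameter of 2 reserves 2 << 28 bytes = 512MB for the SMU pool
	 * (editor's illustrative note).
	 */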
1889 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1890 
1891 	return;
1892 
1893 def_value1:
1894 	DRM_WARN("Not enough system memory\n");
1895 def_value:
1896 	adev->pm.smu_prv_buffer_size = 0;
1897 }
1898 
1899 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1900 {
1901 	if (!(adev->flags & AMD_IS_APU) ||
1902 	    adev->asic_type < CHIP_RAVEN)
1903 		return 0;
1904 
1905 	switch (adev->asic_type) {
1906 	case CHIP_RAVEN:
1907 		if (adev->pdev->device == 0x15dd)
1908 			adev->apu_flags |= AMD_APU_IS_RAVEN;
1909 		if (adev->pdev->device == 0x15d8)
1910 			adev->apu_flags |= AMD_APU_IS_PICASSO;
1911 		break;
1912 	case CHIP_RENOIR:
1913 		if ((adev->pdev->device == 0x1636) ||
1914 		    (adev->pdev->device == 0x164c))
1915 			adev->apu_flags |= AMD_APU_IS_RENOIR;
1916 		else
1917 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1918 		break;
1919 	case CHIP_VANGOGH:
1920 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
1921 		break;
1922 	case CHIP_YELLOW_CARP:
1923 		break;
1924 	case CHIP_CYAN_SKILLFISH:
1925 		if ((adev->pdev->device == 0x13FE) ||
1926 		    (adev->pdev->device == 0x143F))
1927 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1928 		break;
1929 	default:
1930 		break;
1931 	}
1932 
1933 	return 0;
1934 }
1935 
1936 /**
1937  * amdgpu_device_check_arguments - validate module params
1938  *
1939  * @adev: amdgpu_device pointer
1940  *
1941  * Validates certain module parameters and updates
1942  * the associated values used by the driver (all asics).
1943  */
1944 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1945 {
1946 	int i;
1947 
1948 	if (amdgpu_sched_jobs < 4) {
1949 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1950 			 amdgpu_sched_jobs);
1951 		amdgpu_sched_jobs = 4;
1952 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
1953 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1954 			 amdgpu_sched_jobs);
1955 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1956 	}
1957 
1958 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1959 		/* gart size must be greater or equal to 32M */
1960 		dev_warn(adev->dev, "gart size (%d) too small\n",
1961 			 amdgpu_gart_size);
1962 		amdgpu_gart_size = -1;
1963 	}
1964 
1965 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1966 		/* gtt size must be greater or equal to 32M */
1967 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1968 				 amdgpu_gtt_size);
1969 		amdgpu_gtt_size = -1;
1970 	}
1971 
1972 	/* valid range is between 4 and 9 inclusive */
1973 	if (amdgpu_vm_fragment_size != -1 &&
1974 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1975 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1976 		amdgpu_vm_fragment_size = -1;
1977 	}
1978 
1979 	if (amdgpu_sched_hw_submission < 2) {
1980 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1981 			 amdgpu_sched_hw_submission);
1982 		amdgpu_sched_hw_submission = 2;
1983 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1984 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1985 			 amdgpu_sched_hw_submission);
1986 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1987 	}
1988 
1989 	if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1990 		dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1991 		amdgpu_reset_method = -1;
1992 	}
1993 
1994 	amdgpu_device_check_smu_prv_buffer_size(adev);
1995 
1996 	amdgpu_device_check_vm_size(adev);
1997 
1998 	amdgpu_device_check_block_size(adev);
1999 
2000 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
2001 
2002 	for (i = 0; i < MAX_XCP; i++)
2003 		adev->enforce_isolation[i] = !!enforce_isolation;
2004 
2005 	return 0;
2006 }
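/* Example of the parameter fixups above (editor's illustration, not from the
 * original source): amdgpu_sched_jobs=6 is rounded up to 8 via
 * roundup_pow_of_two(); amdgpu_gart_size=16 is rejected as below the 32MB
 * minimum and reset to -1 (auto).
 */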
2007 
2008 /**
2009  * amdgpu_switcheroo_set_state - set switcheroo state
2010  *
2011  * @pdev: pci dev pointer
2012  * @state: vga_switcheroo state
2013  *
2014  * Callback for the switcheroo driver.  Suspends or resumes
2015  * the asics before or after it is powered up using ACPI methods.
2016  */
2017 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
2018 					enum vga_switcheroo_state state)
2019 {
2020 	struct drm_device *dev = pci_get_drvdata(pdev);
2021 	int r;
2022 
2023 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
2024 		return;
2025 
2026 	if (state == VGA_SWITCHEROO_ON) {
2027 		pr_info("switched on\n");
2028 		/* don't suspend or resume card normally */
2029 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2030 
2031 		pci_set_power_state(pdev, PCI_D0);
2032 		amdgpu_device_load_pci_state(pdev);
2033 		r = pci_enable_device(pdev);
2034 		if (r)
2035 			DRM_WARN("pci_enable_device failed (%d)\n", r);
2036 		amdgpu_device_resume(dev, true);
2037 
2038 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
2039 	} else {
2040 		pr_info("switched off\n");
2041 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
2042 		amdgpu_device_prepare(dev);
2043 		amdgpu_device_suspend(dev, true);
2044 		amdgpu_device_cache_pci_state(pdev);
2045 		/* Shut down the device */
2046 		pci_disable_device(pdev);
2047 		pci_set_power_state(pdev, PCI_D3cold);
2048 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
2049 	}
2050 }
2051 
2052 /**
2053  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
2054  *
2055  * @pdev: pci dev pointer
2056  *
2057  * Callback for the switcheroo driver.  Checks if the switcheroo
2058  * state can be changed.
2059  * Returns true if the state can be changed, false if not.
2060  */
2061 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
2062 {
2063 	struct drm_device *dev = pci_get_drvdata(pdev);
2064 
2065        /*
2066 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
2067 	* locking inversion with the driver load path. And the access here is
2068 	* completely racy anyway. So don't bother with locking for now.
2069 	*/
2070 	return atomic_read(&dev->open_count) == 0;
2071 }
2072 
2073 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
2074 	.set_gpu_state = amdgpu_switcheroo_set_state,
2075 	.reprobe = NULL,
2076 	.can_switch = amdgpu_switcheroo_can_switch,
2077 };
2078 
2079 /**
2080  * amdgpu_device_ip_set_clockgating_state - set the CG state
2081  *
2082  * @dev: amdgpu_device pointer
2083  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2084  * @state: clockgating state (gate or ungate)
2085  *
2086  * Sets the requested clockgating state for all instances of
2087  * the hardware IP specified.
2088  * Returns the error code from the last instance.
2089  */
2090 int amdgpu_device_ip_set_clockgating_state(void *dev,
2091 					   enum amd_ip_block_type block_type,
2092 					   enum amd_clockgating_state state)
2093 {
2094 	struct amdgpu_device *adev = dev;
2095 	int i, r = 0;
2096 
2097 	for (i = 0; i < adev->num_ip_blocks; i++) {
2098 		if (!adev->ip_blocks[i].status.valid)
2099 			continue;
2100 		if (adev->ip_blocks[i].version->type != block_type)
2101 			continue;
2102 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
2103 			continue;
2104 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
2105 			(void *)adev, state);
2106 		if (r)
2107 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2108 				  adev->ip_blocks[i].version->funcs->name, r);
2109 	}
2110 	return r;
2111 }
2112 
2113 /**
2114  * amdgpu_device_ip_set_powergating_state - set the PG state
2115  *
2116  * @dev: amdgpu_device pointer
2117  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2118  * @state: powergating state (gate or ungate)
2119  *
2120  * Sets the requested powergating state for all instances of
2121  * the hardware IP specified.
2122  * Returns the error code from the last instance.
2123  */
2124 int amdgpu_device_ip_set_powergating_state(void *dev,
2125 					   enum amd_ip_block_type block_type,
2126 					   enum amd_powergating_state state)
2127 {
2128 	struct amdgpu_device *adev = dev;
2129 	int i, r = 0;
2130 
2131 	for (i = 0; i < adev->num_ip_blocks; i++) {
2132 		if (!adev->ip_blocks[i].status.valid)
2133 			continue;
2134 		if (adev->ip_blocks[i].version->type != block_type)
2135 			continue;
2136 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
2137 			continue;
2138 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
2139 			(void *)adev, state);
2140 		if (r)
2141 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2142 				  adev->ip_blocks[i].version->funcs->name, r);
2143 	}
2144 	return r;
2145 }
2146 
2147 /**
2148  * amdgpu_device_ip_get_clockgating_state - get the CG state
2149  *
2150  * @adev: amdgpu_device pointer
2151  * @flags: clockgating feature flags
2152  *
2153  * Walks the list of IPs on the device and updates the clockgating
2154  * flags for each IP.
2155  * Updates @flags with the feature flags for each hardware IP where
2156  * clockgating is enabled.
2157  */
2158 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
2159 					    u64 *flags)
2160 {
2161 	int i;
2162 
2163 	for (i = 0; i < adev->num_ip_blocks; i++) {
2164 		if (!adev->ip_blocks[i].status.valid)
2165 			continue;
2166 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
2167 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
2168 	}
2169 }
2170 
2171 /**
2172  * amdgpu_device_ip_wait_for_idle - wait for idle
2173  *
2174  * @adev: amdgpu_device pointer
2175  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2176  *
2177  * Waits for the requested hardware IP to be idle.
2178  * Returns 0 for success or a negative error code on failure.
2179  */
2180 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
2181 				   enum amd_ip_block_type block_type)
2182 {
2183 	int i, r;
2184 
2185 	for (i = 0; i < adev->num_ip_blocks; i++) {
2186 		if (!adev->ip_blocks[i].status.valid)
2187 			continue;
2188 		if (adev->ip_blocks[i].version->type == block_type) {
2189 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
2190 			if (r)
2191 				return r;
2192 			break;
2193 		}
2194 	}
2195 	return 0;
2196 
2197 }
2198 
2199 /**
2200  * amdgpu_device_ip_is_idle - is the hardware IP idle
2201  *
2202  * @adev: amdgpu_device pointer
2203  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
2204  *
2205  * Check if the hardware IP is idle or not.
2206  * Returns true if the IP is idle, false if not.
2207  */
2208 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
2209 			      enum amd_ip_block_type block_type)
2210 {
2211 	int i;
2212 
2213 	for (i = 0; i < adev->num_ip_blocks; i++) {
2214 		if (!adev->ip_blocks[i].status.valid)
2215 			continue;
2216 		if (adev->ip_blocks[i].version->type == block_type)
2217 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
2218 	}
2219 	return true;
2220 
2221 }
2222 
2223 /**
2224  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
2225  *
2226  * @adev: amdgpu_device pointer
2227  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
2228  *
2229  * Returns a pointer to the hardware IP block structure
2230  * if it exists for the asic, otherwise NULL.
2231  */
2232 struct amdgpu_ip_block *
2233 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
2234 			      enum amd_ip_block_type type)
2235 {
2236 	int i;
2237 
2238 	for (i = 0; i < adev->num_ip_blocks; i++)
2239 		if (adev->ip_blocks[i].version->type == type)
2240 			return &adev->ip_blocks[i];
2241 
2242 	return NULL;
2243 }
2244 
2245 /**
2246  * amdgpu_device_ip_block_version_cmp
2247  *
2248  * @adev: amdgpu_device pointer
2249  * @type: enum amd_ip_block_type
2250  * @major: major version
2251  * @minor: minor version
2252  *
2253  * return 0 if equal or greater
2254  * return 1 if smaller or the ip_block doesn't exist
2255  */
2256 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
2257 				       enum amd_ip_block_type type,
2258 				       u32 major, u32 minor)
2259 {
2260 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
2261 
2262 	if (ip_block && ((ip_block->version->major > major) ||
2263 			((ip_block->version->major == major) &&
2264 			(ip_block->version->minor >= minor))))
2265 		return 0;
2266 
2267 	return 1;
2268 }
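/* Usage sketch for the comparison helper above (editor's illustration):
 * amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 0) == 0
 * indicates the GFX IP block exists at version 8.0 or newer; 1 means it is
 * older or absent.
 */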
2269 
2270 /**
2271  * amdgpu_device_ip_block_add
2272  *
2273  * @adev: amdgpu_device pointer
2274  * @ip_block_version: pointer to the IP to add
2275  *
2276  * Adds the IP block driver information to the collection of IPs
2277  * on the asic.
2278  */
2279 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
2280 			       const struct amdgpu_ip_block_version *ip_block_version)
2281 {
2282 	if (!ip_block_version)
2283 		return -EINVAL;
2284 
2285 	switch (ip_block_version->type) {
2286 	case AMD_IP_BLOCK_TYPE_VCN:
2287 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
2288 			return 0;
2289 		break;
2290 	case AMD_IP_BLOCK_TYPE_JPEG:
2291 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
2292 			return 0;
2293 		break;
2294 	default:
2295 		break;
2296 	}
2297 
2298 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
2299 		  ip_block_version->funcs->name);
2300 
2301 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
2302 
2303 	return 0;
2304 }
2305 
2306 /**
2307  * amdgpu_device_enable_virtual_display - enable virtual display feature
2308  *
2309  * @adev: amdgpu_device pointer
2310  *
2311  * Enables the virtual display feature if the user has enabled it via
2312  * the module parameter virtual_display.  This feature provides a virtual
2313  * display hardware on headless boards or in virtualized environments.
2314  * This function parses and validates the configuration string specified by
2315  * the user and configures the virtual display configuration (number of
2316  * virtual connectors, crtcs, etc.) specified.
2317  */
2318 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
2319 {
2320 	adev->enable_virtual_display = false;
2321 
2322 	if (amdgpu_virtual_display) {
2323 		const char *pci_address_name = pci_name(adev->pdev);
2324 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
2325 
2326 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
2327 		pciaddstr_tmp = pciaddstr;
2328 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
2329 			pciaddname = strsep(&pciaddname_tmp, ",");
2330 			if (!strcmp("all", pciaddname)
2331 			    || !strcmp(pci_address_name, pciaddname)) {
2332 				long num_crtc;
2333 				int res = -1;
2334 
2335 				adev->enable_virtual_display = true;
2336 
2337 				if (pciaddname_tmp)
2338 					res = kstrtol(pciaddname_tmp, 10,
2339 						      &num_crtc);
2340 
2341 				if (!res) {
2342 					if (num_crtc < 1)
2343 						num_crtc = 1;
2344 					if (num_crtc > 6)
2345 						num_crtc = 6;
2346 					adev->mode_info.num_crtc = num_crtc;
2347 				} else {
2348 					adev->mode_info.num_crtc = 1;
2349 				}
2350 				break;
2351 			}
2352 		}
2353 
2354 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
2355 			 amdgpu_virtual_display, pci_address_name,
2356 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
2357 
2358 		kfree(pciaddstr);
2359 	}
2360 }
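/* Illustrative format of the virtual_display parameter parsed above (editor's
 * note): entries are separated by ';' and each entry is
 * "<pci address>[,<num crtcs>]", e.g. "0000:03:00.0,2" or "all,1"; num_crtc
 * is clamped to 1..6 and defaults to 1 when omitted or unparsable.
 */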
2361 
2362 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
2363 {
2364 	if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
2365 		adev->mode_info.num_crtc = 1;
2366 		adev->enable_virtual_display = true;
2367 		DRM_INFO("virtual_display:%d, num_crtc:%d\n",
2368 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
2369 	}
2370 }
2371 
2372 /**
2373  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
2374  *
2375  * @adev: amdgpu_device pointer
2376  *
2377  * Parses the asic configuration parameters specified in the gpu info
2378  * firmware and makes them available to the driver for use in configuring
2379  * the asic.
2380  * Returns 0 on success, -EINVAL on failure.
2381  */
2382 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
2383 {
2384 	const char *chip_name;
2385 	int err;
2386 	const struct gpu_info_firmware_header_v1_0 *hdr;
2387 
2388 	adev->firmware.gpu_info_fw = NULL;
2389 
2390 	switch (adev->asic_type) {
2391 	default:
2392 		return 0;
2393 	case CHIP_VEGA10:
2394 		chip_name = "vega10";
2395 		break;
2396 	case CHIP_VEGA12:
2397 		chip_name = "vega12";
2398 		break;
2399 	case CHIP_RAVEN:
2400 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2401 			chip_name = "raven2";
2402 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2403 			chip_name = "picasso";
2404 		else
2405 			chip_name = "raven";
2406 		break;
2407 	case CHIP_ARCTURUS:
2408 		chip_name = "arcturus";
2409 		break;
2410 	case CHIP_NAVI12:
2411 		if (adev->mman.discovery_bin)
2412 			return 0;
2413 		chip_name = "navi12";
2414 		break;
2415 	}
2416 
2417 	err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
2418 				   "amdgpu/%s_gpu_info.bin", chip_name);
2419 	if (err) {
2420 		dev_err(adev->dev,
2421 			"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
2422 			chip_name);
2423 		goto out;
2424 	}
2425 
2426 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2427 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2428 
2429 	switch (hdr->version_major) {
2430 	case 1:
2431 	{
2432 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2433 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2434 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2435 
2436 		/*
2437 		 * Should be dropped when DAL no longer needs it.
2438 		 */
2439 		if (adev->asic_type == CHIP_NAVI12)
2440 			goto parse_soc_bounding_box;
2441 
2442 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2443 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2444 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2445 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2446 		adev->gfx.config.max_texture_channel_caches =
2447 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
2448 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2449 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2450 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2451 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2452 		adev->gfx.config.double_offchip_lds_buf =
2453 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2454 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2455 		adev->gfx.cu_info.max_waves_per_simd =
2456 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2457 		adev->gfx.cu_info.max_scratch_slots_per_cu =
2458 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2459 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2460 		if (hdr->version_minor >= 1) {
2461 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2462 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2463 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2464 			adev->gfx.config.num_sc_per_sh =
2465 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2466 			adev->gfx.config.num_packer_per_sc =
2467 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2468 		}
2469 
2470 parse_soc_bounding_box:
2471 		/*
2472 		 * soc bounding box info is not integrated in the discovery table,
2473 		 * we always need to parse it from gpu info firmware if needed.
2474 		 */
2475 		if (hdr->version_minor == 2) {
2476 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2477 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2478 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2479 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2480 		}
2481 		break;
2482 	}
2483 	default:
2484 		dev_err(adev->dev,
2485 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2486 		err = -EINVAL;
2487 		goto out;
2488 	}
2489 out:
2490 	return err;
2491 }
2492 
2493 /**
2494  * amdgpu_device_ip_early_init - run early init for hardware IPs
2495  *
2496  * @adev: amdgpu_device pointer
2497  *
2498  * Early initialization pass for hardware IPs.  The hardware IPs that make
2499  * up each asic are discovered and each IP's early_init callback is run.  This
2500  * is the first stage in initializing the asic.
2501  * Returns 0 on success, negative error code on failure.
2502  */
2503 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2504 {
2505 	struct amdgpu_ip_block *ip_block;
2506 	struct pci_dev *parent;
2507 	int i, r;
2508 	bool total;
2509 
2510 	amdgpu_device_enable_virtual_display(adev);
2511 
2512 	if (amdgpu_sriov_vf(adev)) {
2513 		r = amdgpu_virt_request_full_gpu(adev, true);
2514 		if (r)
2515 			return r;
2516 	}
2517 
2518 	switch (adev->asic_type) {
2519 #ifdef CONFIG_DRM_AMDGPU_SI
2520 	case CHIP_VERDE:
2521 	case CHIP_TAHITI:
2522 	case CHIP_PITCAIRN:
2523 	case CHIP_OLAND:
2524 	case CHIP_HAINAN:
2525 		adev->family = AMDGPU_FAMILY_SI;
2526 		r = si_set_ip_blocks(adev);
2527 		if (r)
2528 			return r;
2529 		break;
2530 #endif
2531 #ifdef CONFIG_DRM_AMDGPU_CIK
2532 	case CHIP_BONAIRE:
2533 	case CHIP_HAWAII:
2534 	case CHIP_KAVERI:
2535 	case CHIP_KABINI:
2536 	case CHIP_MULLINS:
2537 		if (adev->flags & AMD_IS_APU)
2538 			adev->family = AMDGPU_FAMILY_KV;
2539 		else
2540 			adev->family = AMDGPU_FAMILY_CI;
2541 
2542 		r = cik_set_ip_blocks(adev);
2543 		if (r)
2544 			return r;
2545 		break;
2546 #endif
2547 	case CHIP_TOPAZ:
2548 	case CHIP_TONGA:
2549 	case CHIP_FIJI:
2550 	case CHIP_POLARIS10:
2551 	case CHIP_POLARIS11:
2552 	case CHIP_POLARIS12:
2553 	case CHIP_VEGAM:
2554 	case CHIP_CARRIZO:
2555 	case CHIP_STONEY:
2556 		if (adev->flags & AMD_IS_APU)
2557 			adev->family = AMDGPU_FAMILY_CZ;
2558 		else
2559 			adev->family = AMDGPU_FAMILY_VI;
2560 
2561 		r = vi_set_ip_blocks(adev);
2562 		if (r)
2563 			return r;
2564 		break;
2565 	default:
2566 		r = amdgpu_discovery_set_ip_blocks(adev);
2567 		if (r)
2568 			return r;
2569 		break;
2570 	}
2571 
2572 	if (amdgpu_has_atpx() &&
2573 	    (amdgpu_is_atpx_hybrid() ||
2574 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2575 	    ((adev->flags & AMD_IS_APU) == 0) &&
2576 	    !dev_is_removable(&adev->pdev->dev))
2577 		adev->flags |= AMD_IS_PX;
2578 
2579 	if (!(adev->flags & AMD_IS_APU)) {
2580 		parent = pcie_find_root_port(adev->pdev);
2581 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2582 	}
2583 
2584 
2585 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2586 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2587 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2588 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2589 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2590 	if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
2591 		adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
2592 
2593 	total = true;
2594 	for (i = 0; i < adev->num_ip_blocks; i++) {
2595 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2596 			DRM_WARN("disabled ip block: %d <%s>\n",
2597 				  i, adev->ip_blocks[i].version->funcs->name);
2598 			adev->ip_blocks[i].status.valid = false;
2599 		} else {
2600 			if (adev->ip_blocks[i].version->funcs->early_init) {
2601 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2602 				if (r == -ENOENT) {
2603 					adev->ip_blocks[i].status.valid = false;
2604 				} else if (r) {
2605 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2606 						  adev->ip_blocks[i].version->funcs->name, r);
2607 					total = false;
2608 				} else {
2609 					adev->ip_blocks[i].status.valid = true;
2610 				}
2611 			} else {
2612 				adev->ip_blocks[i].status.valid = true;
2613 			}
2614 		}
2615 		/* get the vbios after the asic_funcs are set up */
2616 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2617 			r = amdgpu_device_parse_gpu_info_fw(adev);
2618 			if (r)
2619 				return r;
2620 
2621 			/* Read BIOS */
2622 			if (amdgpu_device_read_bios(adev)) {
2623 				if (!amdgpu_get_bios(adev))
2624 					return -EINVAL;
2625 
2626 				r = amdgpu_atombios_init(adev);
2627 				if (r) {
2628 					dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2629 					amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2630 					return r;
2631 				}
2632 			}
2633 
2634 			/* get pf2vf msg info at its earliest time */
2635 			if (amdgpu_sriov_vf(adev))
2636 				amdgpu_virt_init_data_exchange(adev);
2637 
2638 		}
2639 	}
2640 	if (!total)
2641 		return -ENODEV;
2642 
2643 	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
2644 	if (ip_block->status.valid != false)
2645 		amdgpu_amdkfd_device_probe(adev);
2646 
2647 	adev->cg_flags &= amdgpu_cg_mask;
2648 	adev->pg_flags &= amdgpu_pg_mask;
2649 
2650 	return 0;
2651 }
2652 
2653 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2654 {
2655 	int i, r;
2656 
2657 	for (i = 0; i < adev->num_ip_blocks; i++) {
2658 		if (!adev->ip_blocks[i].status.sw)
2659 			continue;
2660 		if (adev->ip_blocks[i].status.hw)
2661 			continue;
2662 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2663 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2664 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2665 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2666 			if (r) {
2667 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2668 					  adev->ip_blocks[i].version->funcs->name, r);
2669 				return r;
2670 			}
2671 			adev->ip_blocks[i].status.hw = true;
2672 		}
2673 	}
2674 
2675 	return 0;
2676 }
2677 
2678 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2679 {
2680 	int i, r;
2681 
2682 	for (i = 0; i < adev->num_ip_blocks; i++) {
2683 		if (!adev->ip_blocks[i].status.sw)
2684 			continue;
2685 		if (adev->ip_blocks[i].status.hw)
2686 			continue;
2687 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2688 		if (r) {
2689 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2690 				  adev->ip_blocks[i].version->funcs->name, r);
2691 			return r;
2692 		}
2693 		adev->ip_blocks[i].status.hw = true;
2694 	}
2695 
2696 	return 0;
2697 }
2698 
2699 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2700 {
2701 	int r = 0;
2702 	int i;
2703 	uint32_t smu_version;
2704 
2705 	if (adev->asic_type >= CHIP_VEGA10) {
2706 		for (i = 0; i < adev->num_ip_blocks; i++) {
2707 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2708 				continue;
2709 
2710 			if (!adev->ip_blocks[i].status.sw)
2711 				continue;
2712 
2713 			/* no need to do the fw loading again if already done */
2714 			if (adev->ip_blocks[i].status.hw == true)
2715 				break;
2716 
2717 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2718 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2719 				if (r) {
2720 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2721 							  adev->ip_blocks[i].version->funcs->name, r);
2722 					return r;
2723 				}
2724 			} else {
2725 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2726 				if (r) {
2727 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2728 							  adev->ip_blocks[i].version->funcs->name, r);
2729 					return r;
2730 				}
2731 			}
2732 
2733 			adev->ip_blocks[i].status.hw = true;
2734 			break;
2735 		}
2736 	}
2737 
2738 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2739 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2740 
2741 	return r;
2742 }
2743 
2744 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2745 {
2746 	long timeout;
2747 	int r, i;
2748 
2749 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2750 		struct amdgpu_ring *ring = adev->rings[i];
2751 
2752 		/* No need to setup the GPU scheduler for rings that don't need it */
2753 		if (!ring || ring->no_scheduler)
2754 			continue;
2755 
2756 		switch (ring->funcs->type) {
2757 		case AMDGPU_RING_TYPE_GFX:
2758 			timeout = adev->gfx_timeout;
2759 			break;
2760 		case AMDGPU_RING_TYPE_COMPUTE:
2761 			timeout = adev->compute_timeout;
2762 			break;
2763 		case AMDGPU_RING_TYPE_SDMA:
2764 			timeout = adev->sdma_timeout;
2765 			break;
2766 		default:
2767 			timeout = adev->video_timeout;
2768 			break;
2769 		}
2770 
2771 		r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, NULL,
2772 				   DRM_SCHED_PRIORITY_COUNT,
2773 				   ring->num_hw_submission, 0,
2774 				   timeout, adev->reset_domain->wq,
2775 				   ring->sched_score, ring->name,
2776 				   adev->dev);
2777 		if (r) {
2778 			DRM_ERROR("Failed to create scheduler on ring %s.\n",
2779 				  ring->name);
2780 			return r;
2781 		}
2782 		r = amdgpu_uvd_entity_init(adev, ring);
2783 		if (r) {
2784 			DRM_ERROR("Failed to create UVD scheduling entity on ring %s.\n",
2785 				  ring->name);
2786 			return r;
2787 		}
2788 		r = amdgpu_vce_entity_init(adev, ring);
2789 		if (r) {
2790 			DRM_ERROR("Failed to create VCE scheduling entity on ring %s.\n",
2791 				  ring->name);
2792 			return r;
2793 		}
2794 	}
2795 
2796 	amdgpu_xcp_update_partition_sched_list(adev);
2797 
2798 	return 0;
2799 }
2800 
2801 
2802 /**
2803  * amdgpu_device_ip_init - run init for hardware IPs
2804  *
2805  * @adev: amdgpu_device pointer
2806  *
2807  * Main initialization pass for hardware IPs.  The list of all the hardware
2808  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2809  * are run.  sw_init initializes the software state associated with each IP
2810  * and hw_init initializes the hardware associated with each IP.
2811  * Returns 0 on success, negative error code on failure.
2812  */
2813 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2814 {
2815 	int i, r;
2816 
2817 	r = amdgpu_ras_init(adev);
2818 	if (r)
2819 		return r;
2820 
2821 	for (i = 0; i < adev->num_ip_blocks; i++) {
2822 		if (!adev->ip_blocks[i].status.valid)
2823 			continue;
2824 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2825 		if (r) {
2826 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2827 				  adev->ip_blocks[i].version->funcs->name, r);
2828 			goto init_failed;
2829 		}
2830 		adev->ip_blocks[i].status.sw = true;
2831 
2832 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2833 			/* need to do common hw init early so everything is set up for gmc */
2834 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2835 			if (r) {
2836 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2837 				goto init_failed;
2838 			}
2839 			adev->ip_blocks[i].status.hw = true;
2840 		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2841 			/* need to do gmc hw init early so we can allocate gpu mem */
2842 			/* Try to reserve bad pages early */
2843 			if (amdgpu_sriov_vf(adev))
2844 				amdgpu_virt_exchange_data(adev);
2845 
2846 			r = amdgpu_device_mem_scratch_init(adev);
2847 			if (r) {
2848 				DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2849 				goto init_failed;
2850 			}
2851 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2852 			if (r) {
2853 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2854 				goto init_failed;
2855 			}
2856 			r = amdgpu_device_wb_init(adev);
2857 			if (r) {
2858 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2859 				goto init_failed;
2860 			}
2861 			adev->ip_blocks[i].status.hw = true;
2862 
2863 			/* right after GMC hw init, we create CSA */
2864 			if (adev->gfx.mcbp) {
2865 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2866 							       AMDGPU_GEM_DOMAIN_VRAM |
2867 							       AMDGPU_GEM_DOMAIN_GTT,
2868 							       AMDGPU_CSA_SIZE);
2869 				if (r) {
2870 					DRM_ERROR("allocate CSA failed %d\n", r);
2871 					goto init_failed;
2872 				}
2873 			}
2874 
2875 			r = amdgpu_seq64_init(adev);
2876 			if (r) {
2877 				DRM_ERROR("allocate seq64 failed %d\n", r);
2878 				goto init_failed;
2879 			}
2880 		}
2881 	}
2882 
2883 	if (amdgpu_sriov_vf(adev))
2884 		amdgpu_virt_init_data_exchange(adev);
2885 
2886 	r = amdgpu_ib_pool_init(adev);
2887 	if (r) {
2888 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2889 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2890 		goto init_failed;
2891 	}
2892 
2893 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2894 	if (r)
2895 		goto init_failed;
2896 
2897 	r = amdgpu_device_ip_hw_init_phase1(adev);
2898 	if (r)
2899 		goto init_failed;
2900 
2901 	r = amdgpu_device_fw_loading(adev);
2902 	if (r)
2903 		goto init_failed;
2904 
2905 	r = amdgpu_device_ip_hw_init_phase2(adev);
2906 	if (r)
2907 		goto init_failed;
2908 
2909 	/*
2910 	 * retired pages will be loaded from eeprom and reserved here,
2911 	 * it should be called after amdgpu_device_ip_hw_init_phase2, since
2912 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2913 	 * functional for I2C communication, which is only true at this point.
2914 	 *
2915 	 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2916 	 * about failures caused by a bad GPU situation and stops the amdgpu
2917 	 * init process accordingly. For other failure cases, it still releases
2918 	 * all the resources and prints an error message, rather than returning
2919 	 * a negative value to the upper level.
2920 	 *
2921 	 * Note: theoretically, this should be called before all vram allocations
2922 	 * to protect retired pages from being abused.
2923 	 */
2924 	r = amdgpu_ras_recovery_init(adev);
2925 	if (r)
2926 		goto init_failed;
2927 
2928 	/*
2929 	 * In the case of XGMI, grab an extra reference to the reset domain for this device
2930 	 */
2931 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2932 		if (amdgpu_xgmi_add_device(adev) == 0) {
2933 			if (!amdgpu_sriov_vf(adev)) {
2934 				struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2935 
2936 				if (WARN_ON(!hive)) {
2937 					r = -ENOENT;
2938 					goto init_failed;
2939 				}
2940 
2941 				if (!hive->reset_domain ||
2942 				    !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2943 					r = -ENOENT;
2944 					amdgpu_put_xgmi_hive(hive);
2945 					goto init_failed;
2946 				}
2947 
2948 				/* Drop the early temporary reset domain we created for device */
2949 				amdgpu_reset_put_reset_domain(adev->reset_domain);
2950 				adev->reset_domain = hive->reset_domain;
2951 				amdgpu_put_xgmi_hive(hive);
2952 			}
2953 		}
2954 	}
2955 
2956 	r = amdgpu_device_init_schedulers(adev);
2957 	if (r)
2958 		goto init_failed;
2959 
2960 	if (adev->mman.buffer_funcs_ring->sched.ready)
2961 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
2962 
2963 	/* Don't init kfd if whole hive need to be reset during init */
2964 	if (!adev->gmc.xgmi.pending_reset) {
2965 		kgd2kfd_init_zone_device(adev);
2966 		amdgpu_amdkfd_device_init(adev);
2967 	}
2968 
2969 	amdgpu_fru_get_product_info(adev);
2970 
2971 init_failed:
2972 
2973 	return r;
2974 }
2975 
2976 /**
2977  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2978  *
2979  * @adev: amdgpu_device pointer
2980  *
2981  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2982  * this function before a GPU reset.  If the value is retained after a
2983  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2984  */
2985 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2986 {
2987 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2988 }
2989 
2990 /**
2991  * amdgpu_device_check_vram_lost - check if vram is valid
2992  *
2993  * @adev: amdgpu_device pointer
2994  *
2995  * Checks the reset magic value written to the gart pointer in VRAM.
2996  * The driver calls this after a GPU reset to see if the contents of
2997  * VRAM have been lost or not.
2998  * returns true if vram is lost, false if not.
2999  */
3000 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
3001 {
3002 	if (memcmp(adev->gart.ptr, adev->reset_magic,
3003 			AMDGPU_RESET_MAGIC_NUM))
3004 		return true;
3005 
3006 	if (!amdgpu_in_reset(adev))
3007 		return false;
3008 
3009 	/*
3010 	 * For all ASICs with baco/mode1 reset, the VRAM is
3011 	 * always assumed to be lost.
3012 	 */
3013 	switch (amdgpu_asic_reset_method(adev)) {
3014 	case AMD_RESET_METHOD_BACO:
3015 	case AMD_RESET_METHOD_MODE1:
3016 		return true;
3017 	default:
3018 		return false;
3019 	}
3020 }
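/* Editor's sketch of the reset-magic flow implemented above: before a reset,
 * amdgpu_device_fill_reset_magic() snapshots AMDGPU_RESET_MAGIC_NUM bytes from
 * the CPU mapping of the GART table into adev->reset_magic; after the reset, a
 * mismatch in the memcmp() means the contents changed and VRAM is treated as
 * lost. During a reset, BACO/mode1 methods are additionally assumed to lose
 * VRAM even when the magic still matches.
 */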
3021 
3022 /**
3023  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
3024  *
3025  * @adev: amdgpu_device pointer
3026  * @state: clockgating state (gate or ungate)
3027  *
3028  * The list of all the hardware IPs that make up the asic is walked and the
3029  * set_clockgating_state callbacks are run.
3030  * The late initialization pass enables clockgating for hardware IPs;
3031  * the fini or suspend pass disables it.
3032  * Returns 0 on success, negative error code on failure.
3033  */
3034 
3035 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
3036 			       enum amd_clockgating_state state)
3037 {
3038 	int i, j, r;
3039 
3040 	if (amdgpu_emu_mode == 1)
3041 		return 0;
3042 
3043 	for (j = 0; j < adev->num_ip_blocks; j++) {
3044 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
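		/* Gate in IP registration order, ungate in reverse order (editor's
		 * note); e.g. with blocks [COMMON, GMC, GFX], gating walks
		 * COMMON -> GMC -> GFX and ungating walks GFX -> GMC -> COMMON.
		 */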
3045 		if (!adev->ip_blocks[i].status.late_initialized)
3046 			continue;
3047 		/* skip CG for GFX, SDMA on S0ix */
3048 		if (adev->in_s0ix &&
3049 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3050 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3051 			continue;
3052 		/* skip CG for VCE/UVD, it's handled specially */
3053 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3054 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3055 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3056 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3057 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
3058 			/* enable clockgating to save power */
3059 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
3060 										     state);
3061 			if (r) {
3062 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
3063 					  adev->ip_blocks[i].version->funcs->name, r);
3064 				return r;
3065 			}
3066 		}
3067 	}
3068 
3069 	return 0;
3070 }
3071 
3072 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
3073 			       enum amd_powergating_state state)
3074 {
3075 	int i, j, r;
3076 
3077 	if (amdgpu_emu_mode == 1)
3078 		return 0;
3079 
3080 	for (j = 0; j < adev->num_ip_blocks; j++) {
3081 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
3082 		if (!adev->ip_blocks[i].status.late_initialized)
3083 			continue;
3084 		/* skip PG for GFX, SDMA on S0ix */
3085 		if (adev->in_s0ix &&
3086 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3087 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3088 			continue;
3089 		/* skip PG for VCE/UVD, it's handled specially */
3090 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
3091 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
3092 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
3093 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
3094 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
3095 			/* enable powergating to save power */
3096 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
3097 											state);
3098 			if (r) {
3099 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
3100 					  adev->ip_blocks[i].version->funcs->name, r);
3101 				return r;
3102 			}
3103 		}
3104 	}
3105 	return 0;
3106 }
3107 
3108 static int amdgpu_device_enable_mgpu_fan_boost(void)
3109 {
3110 	struct amdgpu_gpu_instance *gpu_ins;
3111 	struct amdgpu_device *adev;
3112 	int i, ret = 0;
3113 
3114 	mutex_lock(&mgpu_info.mutex);
3115 
3116 	/*
3117 	 * MGPU fan boost feature should be enabled
3118 	 * only when there are two or more dGPUs in
3119 	 * the system
3120 	 */
3121 	if (mgpu_info.num_dgpu < 2)
3122 		goto out;
3123 
3124 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
3125 		gpu_ins = &(mgpu_info.gpu_ins[i]);
3126 		adev = gpu_ins->adev;
3127 		if (!(adev->flags & AMD_IS_APU) &&
3128 		    !gpu_ins->mgpu_fan_enabled) {
3129 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
3130 			if (ret)
3131 				break;
3132 
3133 			gpu_ins->mgpu_fan_enabled = 1;
3134 		}
3135 	}
3136 
3137 out:
3138 	mutex_unlock(&mgpu_info.mutex);
3139 
3140 	return ret;
3141 }
3142 
3143 /**
3144  * amdgpu_device_ip_late_init - run late init for hardware IPs
3145  *
3146  * @adev: amdgpu_device pointer
3147  *
3148  * Late initialization pass for hardware IPs.  The list of all the hardware
3149  * IPs that make up the asic is walked and the late_init callbacks are run.
3150  * late_init covers any special initialization that an IP requires
3151  * after all of the IPs have been initialized or something that needs to happen
3152  * late in the init process.
3153  * Returns 0 on success, negative error code on failure.
3154  */
3155 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
3156 {
3157 	struct amdgpu_gpu_instance *gpu_instance;
3158 	int i = 0, r;
3159 
3160 	for (i = 0; i < adev->num_ip_blocks; i++) {
3161 		if (!adev->ip_blocks[i].status.hw)
3162 			continue;
3163 		if (adev->ip_blocks[i].version->funcs->late_init) {
3164 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
3165 			if (r) {
3166 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
3167 					  adev->ip_blocks[i].version->funcs->name, r);
3168 				return r;
3169 			}
3170 		}
3171 		adev->ip_blocks[i].status.late_initialized = true;
3172 	}
3173 
3174 	r = amdgpu_ras_late_init(adev);
3175 	if (r) {
3176 		DRM_ERROR("amdgpu_ras_late_init failed %d", r);
3177 		return r;
3178 	}
3179 
3180 	if (!amdgpu_in_reset(adev))
3181 		amdgpu_ras_set_error_query_ready(adev, true);
3182 
3183 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
3184 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
3185 
3186 	amdgpu_device_fill_reset_magic(adev);
3187 
3188 	r = amdgpu_device_enable_mgpu_fan_boost();
3189 	if (r)
3190 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
3191 
3192 	/* For passthrough configuration on arcturus and aldebaran, enable special handling for SBR */
3193 	if (amdgpu_passthrough(adev) &&
3194 	    ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
3195 	     adev->asic_type == CHIP_ALDEBARAN))
3196 		amdgpu_dpm_handle_passthrough_sbr(adev, true);
3197 
3198 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
3199 		mutex_lock(&mgpu_info.mutex);
3200 
3201 		/*
3202 		 * Reset device p-state to low as this was booted with high.
3203 		 *
3204 		 * This should be performed only after all devices from the same
3205 		 * hive get initialized.
3206 		 *
3207 		 * However, it's unknown in advance how many devices are in the hive,
3208 		 * as this is counted one by one during device initialization.
3209 		 *
3210 		 * So, we wait for all XGMI interlinked devices initialized.
3211 		 * This may bring some delays as those devices may come from
3212 		 * different hives. But that should be OK.
3213 		 */
3214 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
3215 			for (i = 0; i < mgpu_info.num_gpu; i++) {
3216 				gpu_instance = &(mgpu_info.gpu_ins[i]);
3217 				if (gpu_instance->adev->flags & AMD_IS_APU)
3218 					continue;
3219 
3220 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
3221 						AMDGPU_XGMI_PSTATE_MIN);
3222 				if (r) {
3223 					DRM_ERROR("pstate setting failed (%d).\n", r);
3224 					break;
3225 				}
3226 			}
3227 		}
3228 
3229 		mutex_unlock(&mgpu_info.mutex);
3230 	}
3231 
3232 	return 0;
3233 }
3234 
3235 /**
3236  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
3237  *
3238  * @adev: amdgpu_device pointer
3239  *
3240  * For ASICs that need to disable the SMC first
3241  */
3242 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
3243 {
3244 	int i, r;
3245 
3246 	if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
3247 		return;
3248 
3249 	for (i = 0; i < adev->num_ip_blocks; i++) {
3250 		if (!adev->ip_blocks[i].status.hw)
3251 			continue;
3252 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3253 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3254 			/* XXX handle errors */
3255 			if (r) {
3256 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3257 					  adev->ip_blocks[i].version->funcs->name, r);
3258 			}
3259 			adev->ip_blocks[i].status.hw = false;
3260 			break;
3261 		}
3262 	}
3263 }
3264 
3265 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
3266 {
3267 	int i, r;
3268 
3269 	for (i = 0; i < adev->num_ip_blocks; i++) {
3270 		if (!adev->ip_blocks[i].version->funcs->early_fini)
3271 			continue;
3272 
3273 		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
3274 		if (r) {
3275 			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
3276 				  adev->ip_blocks[i].version->funcs->name, r);
3277 		}
3278 	}
3279 
3280 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3281 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3282 
3283 	amdgpu_amdkfd_suspend(adev, false);
3284 
3285 	/* Workaround for ASICs that need to disable the SMC first */
3286 	amdgpu_device_smu_fini_early(adev);
3287 
3288 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3289 		if (!adev->ip_blocks[i].status.hw)
3290 			continue;
3291 
3292 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
3293 		/* XXX handle errors */
3294 		if (r) {
3295 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
3296 				  adev->ip_blocks[i].version->funcs->name, r);
3297 		}
3298 
3299 		adev->ip_blocks[i].status.hw = false;
3300 	}
3301 
3302 	if (amdgpu_sriov_vf(adev)) {
3303 		if (amdgpu_virt_release_full_gpu(adev, false))
3304 			DRM_ERROR("failed to release exclusive mode on fini\n");
3305 	}
3306 
3307 	return 0;
3308 }
3309 
3310 /**
3311  * amdgpu_device_ip_fini - run fini for hardware IPs
3312  *
3313  * @adev: amdgpu_device pointer
3314  *
3315  * Main teardown pass for hardware IPs.  The list of all the hardware
3316  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
3317  * are run.  hw_fini tears down the hardware associated with each IP
3318  * and sw_fini tears down any software state associated with each IP.
3319  * Returns 0 on success, negative error code on failure.
3320  */
3321 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
3322 {
3323 	int i, r;
3324 
3325 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
3326 		amdgpu_virt_release_ras_err_handler_data(adev);
3327 
3328 	if (adev->gmc.xgmi.num_physical_nodes > 1)
3329 		amdgpu_xgmi_remove_device(adev);
3330 
3331 	amdgpu_amdkfd_device_fini_sw(adev);
3332 
3333 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3334 		if (!adev->ip_blocks[i].status.sw)
3335 			continue;
3336 
3337 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
3338 			amdgpu_ucode_free_bo(adev);
3339 			amdgpu_free_static_csa(&adev->virt.csa_obj);
3340 			amdgpu_device_wb_fini(adev);
3341 			amdgpu_device_mem_scratch_fini(adev);
3342 			amdgpu_ib_pool_fini(adev);
3343 			amdgpu_seq64_fini(adev);
3344 			amdgpu_doorbell_fini(adev);
3345 		}
3346 
3347 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
3348 		/* XXX handle errors */
3349 		if (r) {
3350 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
3351 				  adev->ip_blocks[i].version->funcs->name, r);
3352 		}
3353 		adev->ip_blocks[i].status.sw = false;
3354 		adev->ip_blocks[i].status.valid = false;
3355 	}
3356 
3357 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3358 		if (!adev->ip_blocks[i].status.late_initialized)
3359 			continue;
3360 		if (adev->ip_blocks[i].version->funcs->late_fini)
3361 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
3362 		adev->ip_blocks[i].status.late_initialized = false;
3363 	}
3364 
3365 	amdgpu_ras_fini(adev);
3366 
3367 	return 0;
3368 }
3369 
3370 /**
3371  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
3372  *
3373  * @work: work_struct.
3374  */
3375 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
3376 {
3377 	struct amdgpu_device *adev =
3378 		container_of(work, struct amdgpu_device, delayed_init_work.work);
3379 	int r;
3380 
3381 	r = amdgpu_ib_ring_tests(adev);
3382 	if (r)
3383 		DRM_ERROR("ib ring test failed (%d).\n", r);
3384 }
3385 
3386 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
3387 {
3388 	struct amdgpu_device *adev =
3389 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
3390 
3391 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
3392 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
3393 
3394 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
3395 		adev->gfx.gfx_off_state = true;
3396 }
3397 
3398 /**
3399  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
3400  *
3401  * @adev: amdgpu_device pointer
3402  *
3403  * First phase of the main suspend path for hardware IPs.  The list of all
3404  * the hardware IPs that make up the asic is walked, clockgating is disabled
3405  * and the suspend callbacks are run for the display (DCE) blocks only.
3406  * suspend puts the hardware and software state in each IP into a state suitable for suspend.
3407  * Returns 0 on success, negative error code on failure.
3408  */
3409 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
3410 {
3411 	int i, r;
3412 
3413 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
3414 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
3415 
3416 	/*
3417 	 * Per the PMFW team's suggestion, the driver needs to disable the
3418 	 * gfxoff and df cstate features in the gpu reset (e.g. Mode1Reset)
3419 	 * scenario. Handle the missing df cstate disablement here.
3420 	 */
3421 	if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
3422 		dev_warn(adev->dev, "Failed to disallow df cstate");
3423 
3424 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3425 		if (!adev->ip_blocks[i].status.valid)
3426 			continue;
3427 
3428 		/* displays are handled separately */
3429 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
3430 			continue;
3431 
3432 		/* XXX handle errors */
3433 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
3434 		/* XXX handle errors */
3435 		if (r) {
3436 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
3437 				  adev->ip_blocks[i].version->funcs->name, r);
3438 			return r;
3439 		}
3440 
3441 		adev->ip_blocks[i].status.hw = false;
3442 	}
3443 
3444 	return 0;
3445 }
3446 
3447 /**
3448  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
3449  *
3450  * @adev: amdgpu_device pointer
3451  *
3452  * Second phase of the main suspend path for hardware IPs.  The list of all
3453  * the hardware IPs that make up the asic is walked and the suspend
3454  * callbacks are run for every block except the displays, which were
3455  * handled in phase 1.  suspend puts each IP into a state suitable for suspend.
3456  * Returns 0 on success, negative error code on failure.
3457  */
3458 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
3459 {
3460 	int i, r;
3461 
3462 	if (adev->in_s0ix)
3463 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3464 
3465 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3466 		if (!adev->ip_blocks[i].status.valid)
3467 			continue;
3468 		/* displays are handled in phase1 */
3469 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3470 			continue;
3471 		/* PSP lost connection when err_event_athub occurs */
3472 		if (amdgpu_ras_intr_triggered() &&
3473 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3474 			adev->ip_blocks[i].status.hw = false;
3475 			continue;
3476 		}
3477 
3478 		/* skip unnecessary suspend if we have not initialized them yet */
3479 		if (adev->gmc.xgmi.pending_reset &&
3480 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3481 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3482 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3483 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3484 			adev->ip_blocks[i].status.hw = false;
3485 			continue;
3486 		}
3487 
3488 		/* skip suspend of gfx/mes and psp for S0ix:
3489 		 * gfx is in the gfxoff state, so on resume it will exit gfxoff just
3490 		 * like at runtime. PSP is also part of the always-on hardware,
3491 		 * so there is no need to suspend it.
3492 		 */
3493 		if (adev->in_s0ix &&
3494 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3495 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3496 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3497 			continue;
3498 
3499 		/* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3500 		if (adev->in_s0ix &&
3501 		    (amdgpu_ip_version(adev, SDMA0_HWIP, 0) >=
3502 		     IP_VERSION(5, 0, 0)) &&
3503 		    (adev->ip_blocks[i].version->type ==
3504 		     AMD_IP_BLOCK_TYPE_SDMA))
3505 			continue;
3506 
3507 		/* Once swPSP provides the IMU and RLC FW binaries to TOS during cold boot,
3508 		 * they reside in the TMR and are expected to be reused by PSP-TOS, which
3509 		 * reloads them from that location; RLC autoload is likewise triggered from
3510 		 * there based on the PMFW -> PSP message during the re-init sequence.
3511 		 * Therefore, PSP suspend & resume should be skipped to avoid destroying
3512 		 * the TMR and reloading the FWs again for IMU-enabled APU ASICs.
3513 		 */
3514 		if (amdgpu_in_reset(adev) &&
3515 		    (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3516 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3517 			continue;
3518 
3519 		/* XXX handle errors */
3520 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
3521 		/* XXX handle errors */
3522 		if (r) {
3523 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
3524 				  adev->ip_blocks[i].version->funcs->name, r);
3525 		}
3526 		adev->ip_blocks[i].status.hw = false;
3527 		/* handle putting the SMC in the appropriate state */
3528 		if (!amdgpu_sriov_vf(adev)) {
3529 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3530 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3531 				if (r) {
3532 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3533 							adev->mp1_state, r);
3534 					return r;
3535 				}
3536 			}
3537 		}
3538 	}
3539 
3540 	return 0;
3541 }
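
/*
 * Summary of the skip rules in phase 2 above: displays (done in phase 1),
 * PSP after an err_event_athub RAS interrupt, blocks never brought up while
 * an XGMI hive reset is pending, GFX/MES/PSP (and SDMA 5.x+) while in S0ix,
 * and PSP on IMU-enabled APUs during a reset.  Every other block gets its
 * suspend callback, and on bare metal the SMC additionally has its mp1 state
 * programmed.
 */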
3542 
3543 /**
3544  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3545  *
3546  * @adev: amdgpu_device pointer
3547  *
3548  * Main suspend function for hardware IPs.  The list of all the hardware
3549  * IPs that make up the asic is walked, clockgating is disabled and the
3550  * suspend callbacks are run.  suspend puts the hardware and software state
3551  * in each IP into a state suitable for suspend.
3552  * Returns 0 on success, negative error code on failure.
3553  */
3554 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3555 {
3556 	int r;
3557 
3558 	if (amdgpu_sriov_vf(adev)) {
3559 		amdgpu_virt_fini_data_exchange(adev);
3560 		amdgpu_virt_request_full_gpu(adev, false);
3561 	}
3562 
3563 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
3564 
3565 	r = amdgpu_device_ip_suspend_phase1(adev);
3566 	if (r)
3567 		return r;
3568 	r = amdgpu_device_ip_suspend_phase2(adev);
3569 
3570 	if (amdgpu_sriov_vf(adev))
3571 		amdgpu_virt_release_full_gpu(adev, false);
3572 
3573 	return r;
3574 }
3575 
3576 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3577 {
3578 	int i, r;
3579 
3580 	static enum amd_ip_block_type ip_order[] = {
3581 		AMD_IP_BLOCK_TYPE_COMMON,
3582 		AMD_IP_BLOCK_TYPE_GMC,
3583 		AMD_IP_BLOCK_TYPE_PSP,
3584 		AMD_IP_BLOCK_TYPE_IH,
3585 	};
3586 
3587 	for (i = 0; i < adev->num_ip_blocks; i++) {
3588 		int j;
3589 		struct amdgpu_ip_block *block;
3590 
3591 		block = &adev->ip_blocks[i];
3592 		block->status.hw = false;
3593 
3594 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3595 
3596 			if (block->version->type != ip_order[j] ||
3597 				!block->status.valid)
3598 				continue;
3599 
3600 			r = block->version->funcs->hw_init(adev);
3601 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3602 			if (r)
3603 				return r;
3604 			block->status.hw = true;
3605 		}
3606 	}
3607 
3608 	return 0;
3609 }
3610 
3611 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3612 {
3613 	int i, r;
3614 
3615 	static enum amd_ip_block_type ip_order[] = {
3616 		AMD_IP_BLOCK_TYPE_SMC,
3617 		AMD_IP_BLOCK_TYPE_DCE,
3618 		AMD_IP_BLOCK_TYPE_GFX,
3619 		AMD_IP_BLOCK_TYPE_SDMA,
3620 		AMD_IP_BLOCK_TYPE_MES,
3621 		AMD_IP_BLOCK_TYPE_UVD,
3622 		AMD_IP_BLOCK_TYPE_VCE,
3623 		AMD_IP_BLOCK_TYPE_VCN,
3624 		AMD_IP_BLOCK_TYPE_JPEG
3625 	};
3626 
3627 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3628 		int j;
3629 		struct amdgpu_ip_block *block;
3630 
3631 		for (j = 0; j < adev->num_ip_blocks; j++) {
3632 			block = &adev->ip_blocks[j];
3633 
3634 			if (block->version->type != ip_order[i] ||
3635 				!block->status.valid ||
3636 				block->status.hw)
3637 				continue;
3638 
3639 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3640 				r = block->version->funcs->resume(adev);
3641 			else
3642 				r = block->version->funcs->hw_init(adev);
3643 
3644 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3645 			if (r)
3646 				return r;
3647 			block->status.hw = true;
3648 		}
3649 	}
3650 
3651 	return 0;
3652 }
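
/*
 * For SR-IOV re-init, the early pass above brings up COMMON, GMC, PSP and IH,
 * and this late pass walks the remaining blocks in the fixed ip_order[]
 * sequence; note that the SMC block is brought back via its resume callback
 * while every other block uses hw_init.
 */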
3653 
3654 /**
3655  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3656  *
3657  * @adev: amdgpu_device pointer
3658  *
3659  * First resume function for hardware IPs.  The list of all the hardware
3660  * IPs that make up the asic is walked and the resume callbacks are run for
3661  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3662  * after a suspend and updates the software state as necessary.  This
3663  * function is also used for restoring the GPU after a GPU reset.
3664  * Returns 0 on success, negative error code on failure.
3665  */
3666 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3667 {
3668 	int i, r;
3669 
3670 	for (i = 0; i < adev->num_ip_blocks; i++) {
3671 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3672 			continue;
3673 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3674 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3675 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3676 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3677 
3678 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3679 			if (r) {
3680 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3681 					  adev->ip_blocks[i].version->funcs->name, r);
3682 				return r;
3683 			}
3684 			adev->ip_blocks[i].status.hw = true;
3685 		}
3686 	}
3687 
3688 	return 0;
3689 }
3690 
3691 /**
3692  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3693  *
3694  * @adev: amdgpu_device pointer
3695  *
3696  * Second resume function for hardware IPs.  The list of all the hardware
3697  * IPs that make up the asic is walked and the resume callbacks are run for
3698  * all blocks except COMMON, GMC, IH, DCE, and PSP.  resume puts the hardware into a
3699  * functional state after a suspend and updates the software state as
3700  * necessary.  This function is also used for restoring the GPU after a GPU
3701  * reset.
3702  * Returns 0 on success, negative error code on failure.
3703  */
3704 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3705 {
3706 	int i, r;
3707 
3708 	for (i = 0; i < adev->num_ip_blocks; i++) {
3709 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3710 			continue;
3711 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3712 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3713 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3714 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE ||
3715 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3716 			continue;
3717 		r = adev->ip_blocks[i].version->funcs->resume(adev);
3718 		if (r) {
3719 			DRM_ERROR("resume of IP block <%s> failed %d\n",
3720 				  adev->ip_blocks[i].version->funcs->name, r);
3721 			return r;
3722 		}
3723 		adev->ip_blocks[i].status.hw = true;
3724 	}
3725 
3726 	return 0;
3727 }
3728 
3729 /**
3730  * amdgpu_device_ip_resume_phase3 - run resume for hardware IPs
3731  *
3732  * @adev: amdgpu_device pointer
3733  *
3734  * Third resume function for hardware IPs.  The list of all the hardware
3735  * IPs that make up the asic is walked and the resume callbacks are run for
3736  * the DCE (display) blocks.  resume puts the hardware into a functional state after a suspend
3737  * and updates the software state as necessary.  This function is also used
3738  * for restoring the GPU after a GPU reset.
3739  *
3740  * Returns 0 on success, negative error code on failure.
3741  */
3742 static int amdgpu_device_ip_resume_phase3(struct amdgpu_device *adev)
3743 {
3744 	int i, r;
3745 
3746 	for (i = 0; i < adev->num_ip_blocks; i++) {
3747 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3748 			continue;
3749 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
3750 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3751 			if (r) {
3752 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3753 					  adev->ip_blocks[i].version->funcs->name, r);
3754 				return r;
3755 			}
3756 			adev->ip_blocks[i].status.hw = true;
3757 		}
3758 	}
3759 
3760 	return 0;
3761 }
3762 
3763 /**
3764  * amdgpu_device_ip_resume - run resume for hardware IPs
3765  *
3766  * @adev: amdgpu_device pointer
3767  *
3768  * Main resume function for hardware IPs.  The hardware IPs
3769  * are split into multiple resume phases because they are
3770  * also used in recovering from a GPU reset and some additional
3771  * steps need to be taken between them.  In this case (S3/S4) they are
3772  * run sequentially.
3773  * Returns 0 on success, negative error code on failure.
3774  */
3775 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3776 {
3777 	int r;
3778 
3779 	r = amdgpu_device_ip_resume_phase1(adev);
3780 	if (r)
3781 		return r;
3782 
3783 	r = amdgpu_device_fw_loading(adev);
3784 	if (r)
3785 		return r;
3786 
3787 	r = amdgpu_device_ip_resume_phase2(adev);
3788 
3789 	if (adev->mman.buffer_funcs_ring->sched.ready)
3790 		amdgpu_ttm_set_buffer_funcs_status(adev, true);
3791 
3792 	if (r)
3793 		return r;
3794 
3795 	amdgpu_fence_driver_hw_init(adev);
3796 
3797 	r = amdgpu_device_ip_resume_phase3(adev);
3798 
3799 	return r;
3800 }
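
/*
 * Resume ordering as implemented above: phase 1 (COMMON, GMC, IH, plus PSP
 * for SR-IOV), firmware loading, phase 2 (everything except displays and
 * PSP), re-enabling the TTM buffer functions once the SDMA ring is ready,
 * fence driver hw init, and finally phase 3 for the display (DCE) blocks.
 */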
3801 
3802 /**
3803  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3804  *
3805  * @adev: amdgpu_device pointer
3806  *
3807  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3808  */
3809 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3810 {
3811 	if (amdgpu_sriov_vf(adev)) {
3812 		if (adev->is_atom_fw) {
3813 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3814 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3815 		} else {
3816 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3817 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3818 		}
3819 
3820 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3821 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3822 	}
3823 }
3824 
3825 /**
3826  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3827  *
3828  * @asic_type: AMD asic type
3829  *
3830  * Check if there is DC (the new display modesetting infrastructure) support for an asic.
3831  * Returns true if DC has support, false if not.
3832  */
3833 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3834 {
3835 	switch (asic_type) {
3836 #ifdef CONFIG_DRM_AMDGPU_SI
3837 	case CHIP_HAINAN:
3838 #endif
3839 	case CHIP_TOPAZ:
3840 		/* chips with no display hardware */
3841 		return false;
3842 #if defined(CONFIG_DRM_AMD_DC)
3843 	case CHIP_TAHITI:
3844 	case CHIP_PITCAIRN:
3845 	case CHIP_VERDE:
3846 	case CHIP_OLAND:
3847 		/*
3848 		 * We have systems in the wild with these ASICs that require
3849 		 * LVDS and VGA support which is not supported with DC.
3850 		 *
3851 		 * Fallback to the non-DC driver here by default so as not to
3852 		 * cause regressions.
3853 		 */
3854 #if defined(CONFIG_DRM_AMD_DC_SI)
3855 		return amdgpu_dc > 0;
3856 #else
3857 		return false;
3858 #endif
3859 	case CHIP_BONAIRE:
3860 	case CHIP_KAVERI:
3861 	case CHIP_KABINI:
3862 	case CHIP_MULLINS:
3863 		/*
3864 		 * We have systems in the wild with these ASICs that require
3865 		 * VGA support which is not supported with DC.
3866 		 *
3867 		 * Fallback to the non-DC driver here by default so as not to
3868 		 * cause regressions.
3869 		 */
3870 		return amdgpu_dc > 0;
3871 	default:
3872 		return amdgpu_dc != 0;
3873 #else
3874 	default:
3875 		if (amdgpu_dc > 0)
3876 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter but isn't supported by ASIC, ignoring\n");
3877 		return false;
3878 #endif
3879 	}
3880 }
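
/*
 * Illustrative note on the amdgpu_dc checks above: a positive value means the
 * user explicitly requested Display Core on the kernel command line (e.g.
 * something like "amdgpu.dc=1", assuming the usual module parameter
 * spelling), which is what opts the SI/CIK parts listed above into DC; with
 * the default auto setting (amdgpu_dc != 0 but not > 0) DC is used on the
 * other DC-capable ASICs while those legacy parts fall back to the non-DC path.
 */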
3881 
3882 /**
3883  * amdgpu_device_has_dc_support - check if dc is supported
3884  *
3885  * @adev: amdgpu_device pointer
3886  *
3887  * Returns true for supported, false for not supported
3888  */
3889 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3890 {
3891 	if (adev->enable_virtual_display ||
3892 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3893 		return false;
3894 
3895 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3896 }
3897 
3898 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3899 {
3900 	struct amdgpu_device *adev =
3901 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3902 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3903 
3904 	/* It's a bug to not have a hive within this function */
3905 	if (WARN_ON(!hive))
3906 		return;
3907 
3908 	/*
3909 	 * Use task barrier to synchronize all xgmi reset works across the
3910 	 * hive. task_barrier_enter and task_barrier_exit will block
3911 	 * until all the threads running the xgmi reset works reach
3912 	 * those points. task_barrier_full will do both blocks.
3913 	 */
3914 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3915 
3916 		task_barrier_enter(&hive->tb);
3917 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3918 
3919 		if (adev->asic_reset_res)
3920 			goto fail;
3921 
3922 		task_barrier_exit(&hive->tb);
3923 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3924 
3925 		if (adev->asic_reset_res)
3926 			goto fail;
3927 
3928 		amdgpu_ras_reset_error_count(adev, AMDGPU_RAS_BLOCK__MMHUB);
3929 	} else {
3930 
3931 		task_barrier_full(&hive->tb);
3932 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3933 	}
3934 
3935 fail:
3936 	if (adev->asic_reset_res)
3937 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3938 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3939 	amdgpu_put_xgmi_hive(hive);
3940 }
3941 
3942 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3943 {
3944 	char *input = amdgpu_lockup_timeout;
3945 	char *timeout_setting = NULL;
3946 	int index = 0;
3947 	long timeout;
3948 	int ret = 0;
3949 
3950 	/*
3951 	 * By default the timeout for non-compute jobs is 10000 ms
3952 	 * and 60000 ms for compute jobs.
3953 	 * Under SR-IOV, the compute timeout defaults to 60000 ms only when a
3954 	 * single VF owns all the compute resources, otherwise 10000 ms.
3955 	 */
3956 	adev->gfx_timeout = msecs_to_jiffies(10000);
3957 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3958 	if (amdgpu_sriov_vf(adev))
3959 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3960 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3961 	else
3962 		adev->compute_timeout =  msecs_to_jiffies(60000);
3963 
3964 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3965 		while ((timeout_setting = strsep(&input, ",")) &&
3966 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3967 			ret = kstrtol(timeout_setting, 0, &timeout);
3968 			if (ret)
3969 				return ret;
3970 
3971 			if (timeout == 0) {
3972 				index++;
3973 				continue;
3974 			} else if (timeout < 0) {
3975 				timeout = MAX_SCHEDULE_TIMEOUT;
3976 				dev_warn(adev->dev, "lockup timeout disabled");
3977 				add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3978 			} else {
3979 				timeout = msecs_to_jiffies(timeout);
3980 			}
3981 
3982 			switch (index++) {
3983 			case 0:
3984 				adev->gfx_timeout = timeout;
3985 				break;
3986 			case 1:
3987 				adev->compute_timeout = timeout;
3988 				break;
3989 			case 2:
3990 				adev->sdma_timeout = timeout;
3991 				break;
3992 			case 3:
3993 				adev->video_timeout = timeout;
3994 				break;
3995 			default:
3996 				break;
3997 			}
3998 		}
3999 		/*
4000 		 * If only one value was specified, it applies to all non-compute
4001 		 * jobs (and also to compute jobs under SR-IOV or passthrough).
4002 		 */
4003 		if (index == 1) {
4004 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
4005 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
4006 				adev->compute_timeout = adev->gfx_timeout;
4007 		}
4008 	}
4009 
4010 	return ret;
4011 }
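
/*
 * Illustrative example of the parsing above (assuming the string comes from
 * the amdgpu lockup_timeout module option): a comma-separated list such as
 * "10000,60000,10000,10000" sets the GFX, compute, SDMA and video timeouts in
 * milliseconds; 0 keeps the default for that slot and a negative value
 * disables the timeout entirely.  A single value such as "5000" applies to
 * all non-compute queues, and to compute as well under SR-IOV or passthrough.
 */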
4012 
4013 /**
4014  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
4015  *
4016  * @adev: amdgpu_device pointer
4017  *
4018  * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in passthrough mode.
4019  */
4020 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
4021 {
4022 	struct iommu_domain *domain;
4023 
4024 	domain = iommu_get_domain_for_dev(adev->dev);
4025 	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
4026 		adev->ram_is_direct_mapped = true;
4027 }
4028 
4029 #if defined(CONFIG_HSA_AMD_P2P)
4030 /**
4031  * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
4032  *
4033  * @adev: amdgpu_device pointer
4034  *
4035  * Returns true if the IOMMU is remapping the BAR address.
4036  */
4037 static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
4038 {
4039 	struct iommu_domain *domain;
4040 
4041 	domain = iommu_get_domain_for_dev(adev->dev);
4042 	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
4043 		domain->type ==	IOMMU_DOMAIN_DMA_FQ))
4044 		return true;
4045 
4046 	return false;
4047 }
4048 #endif
4049 
4050 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
4051 {
4052 	if (amdgpu_mcbp == 1)
4053 		adev->gfx.mcbp = true;
4054 	else if (amdgpu_mcbp == 0)
4055 		adev->gfx.mcbp = false;
4056 
4057 	if (amdgpu_sriov_vf(adev))
4058 		adev->gfx.mcbp = true;
4059 
4060 	if (adev->gfx.mcbp)
4061 		DRM_INFO("MCBP is enabled\n");
4062 }
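
/*
 * Illustrative note on the logic above: amdgpu_mcbp == 1 forces mid-command
 * buffer preemption on and amdgpu_mcbp == 0 forces it off; any other value
 * (presumably the auto default) leaves the setting untouched here, and MCBP
 * is always enabled for SR-IOV VFs regardless of the parameter.
 */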
4063 
4064 /**
4065  * amdgpu_device_init - initialize the driver
4066  *
4067  * @adev: amdgpu_device pointer
4068  * @flags: driver flags
4069  *
4070  * Initializes the driver info and hw (all asics).
4071  * Returns 0 for success or an error on failure.
4072  * Called at driver startup.
4073  */
4074 int amdgpu_device_init(struct amdgpu_device *adev,
4075 		       uint32_t flags)
4076 {
4077 	struct drm_device *ddev = adev_to_drm(adev);
4078 	struct pci_dev *pdev = adev->pdev;
4079 	int r, i;
4080 	bool px = false;
4081 	u32 max_MBps;
4082 	int tmp;
4083 
4084 	adev->shutdown = false;
4085 	adev->flags = flags;
4086 
4087 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
4088 		adev->asic_type = amdgpu_force_asic_type;
4089 	else
4090 		adev->asic_type = flags & AMD_ASIC_MASK;
4091 
4092 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
4093 	if (amdgpu_emu_mode == 1)
4094 		adev->usec_timeout *= 10;
4095 	adev->gmc.gart_size = 512 * 1024 * 1024;
4096 	adev->accel_working = false;
4097 	adev->num_rings = 0;
4098 	RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
4099 	adev->mman.buffer_funcs = NULL;
4100 	adev->mman.buffer_funcs_ring = NULL;
4101 	adev->vm_manager.vm_pte_funcs = NULL;
4102 	adev->vm_manager.vm_pte_num_scheds = 0;
4103 	adev->gmc.gmc_funcs = NULL;
4104 	adev->harvest_ip_mask = 0x0;
4105 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
4106 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
4107 
4108 	adev->smc_rreg = &amdgpu_invalid_rreg;
4109 	adev->smc_wreg = &amdgpu_invalid_wreg;
4110 	adev->pcie_rreg = &amdgpu_invalid_rreg;
4111 	adev->pcie_wreg = &amdgpu_invalid_wreg;
4112 	adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
4113 	adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
4114 	adev->pciep_rreg = &amdgpu_invalid_rreg;
4115 	adev->pciep_wreg = &amdgpu_invalid_wreg;
4116 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
4117 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
4118 	adev->pcie_rreg64_ext = &amdgpu_invalid_rreg64_ext;
4119 	adev->pcie_wreg64_ext = &amdgpu_invalid_wreg64_ext;
4120 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
4121 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
4122 	adev->didt_rreg = &amdgpu_invalid_rreg;
4123 	adev->didt_wreg = &amdgpu_invalid_wreg;
4124 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
4125 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
4126 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
4127 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
4128 
4129 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
4130 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
4131 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
4132 
4133 	/* mutex initializations are all done here so we
4134 	 * can call these functions again without locking issues
4135 	 */
4136 	mutex_init(&adev->firmware.mutex);
4137 	mutex_init(&adev->pm.mutex);
4138 	mutex_init(&adev->gfx.gpu_clock_mutex);
4139 	mutex_init(&adev->srbm_mutex);
4140 	mutex_init(&adev->gfx.pipe_reserve_mutex);
4141 	mutex_init(&adev->gfx.gfx_off_mutex);
4142 	mutex_init(&adev->gfx.partition_mutex);
4143 	mutex_init(&adev->grbm_idx_mutex);
4144 	mutex_init(&adev->mn_lock);
4145 	mutex_init(&adev->virt.vf_errors.lock);
4146 	hash_init(adev->mn_hash);
4147 	mutex_init(&adev->psp.mutex);
4148 	mutex_init(&adev->notifier_lock);
4149 	mutex_init(&adev->pm.stable_pstate_ctx_lock);
4150 	mutex_init(&adev->benchmark_mutex);
4151 	mutex_init(&adev->gfx.reset_sem_mutex);
4152 	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
4153 	mutex_init(&adev->enforce_isolation_mutex);
4154 	mutex_init(&adev->gfx.kfd_sch_mutex);
4155 
4156 	amdgpu_device_init_apu_flags(adev);
4157 
4158 	r = amdgpu_device_check_arguments(adev);
4159 	if (r)
4160 		return r;
4161 
4162 	spin_lock_init(&adev->mmio_idx_lock);
4163 	spin_lock_init(&adev->smc_idx_lock);
4164 	spin_lock_init(&adev->pcie_idx_lock);
4165 	spin_lock_init(&adev->uvd_ctx_idx_lock);
4166 	spin_lock_init(&adev->didt_idx_lock);
4167 	spin_lock_init(&adev->gc_cac_idx_lock);
4168 	spin_lock_init(&adev->se_cac_idx_lock);
4169 	spin_lock_init(&adev->audio_endpt_idx_lock);
4170 	spin_lock_init(&adev->mm_stats.lock);
4171 	spin_lock_init(&adev->virt.rlcg_reg_lock);
4172 	spin_lock_init(&adev->wb.lock);
4173 
4174 	INIT_LIST_HEAD(&adev->reset_list);
4175 
4176 	INIT_LIST_HEAD(&adev->ras_list);
4177 
4178 	INIT_LIST_HEAD(&adev->pm.od_kobj_list);
4179 
4180 	INIT_DELAYED_WORK(&adev->delayed_init_work,
4181 			  amdgpu_device_delayed_init_work_handler);
4182 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
4183 			  amdgpu_device_delay_enable_gfx_off);
4184 	/*
4185 	 * Initialize the enforce_isolation work structures for each XCP
4186 	 * partition.  This work handler is responsible for enforcing shader
4187 	 * isolation on AMD GPUs.  It counts the number of emitted fences for
4188 	 * each GFX and compute ring.  If there are any fences, it schedules
4189 	 * the `enforce_isolation_work` to be run after a delay.  If there are
4190 	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
4191 	 * runqueue.
4192 	 */
4193 	for (i = 0; i < MAX_XCP; i++) {
4194 		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
4195 				  amdgpu_gfx_enforce_isolation_handler);
4196 		adev->gfx.enforce_isolation[i].adev = adev;
4197 		adev->gfx.enforce_isolation[i].xcp_id = i;
4198 	}
4199 
4200 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
4201 
4202 	adev->gfx.gfx_off_req_count = 1;
4203 	adev->gfx.gfx_off_residency = 0;
4204 	adev->gfx.gfx_off_entrycount = 0;
4205 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
4206 
4207 	atomic_set(&adev->throttling_logging_enabled, 1);
4208 	/*
4209 	 * If throttling continues, logging will be performed every minute
4210 	 * to avoid log flooding. "-1" is subtracted since the thermal
4211 	 * throttling interrupt comes every second. Thus, the total logging
4212 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
4213 	 * for the throttling interrupt) = 60 seconds.
4214 	 */
4215 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4216 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4217 
4218 	/* Registers mapping */
4219 	/* TODO: block userspace mapping of io register */
4220 	if (adev->asic_type >= CHIP_BONAIRE) {
4221 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
4222 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
4223 	} else {
4224 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
4225 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
4226 	}
4227 
4228 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
4229 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
4230 
4231 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
4232 	if (!adev->rmmio)
4233 		return -ENOMEM;
4234 
4235 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
4236 	DRM_INFO("register mmio size: %u\n", (unsigned int)adev->rmmio_size);
4237 
4238 	/*
4239 	 * The reset domain needs to be present early, before the XGMI hive is
4240 	 * discovered (if any) and initialized, so that the reset sem and the
4241 	 * in-GPU-reset flag can be used early during init and before any call to RREG32.
4242 	 */
4243 	adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
4244 	if (!adev->reset_domain)
4245 		return -ENOMEM;
4246 
4247 	/* detect hw virtualization here */
4248 	amdgpu_detect_virtualization(adev);
4249 
4250 	amdgpu_device_get_pcie_info(adev);
4251 
4252 	r = amdgpu_device_get_job_timeout_settings(adev);
4253 	if (r) {
4254 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
4255 		return r;
4256 	}
4257 
4258 	amdgpu_device_set_mcbp(adev);
4259 
4260 	/* early init functions */
4261 	r = amdgpu_device_ip_early_init(adev);
4262 	if (r)
4263 		return r;
4264 
4265 	/* Get rid of things like offb */
4266 	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
4267 	if (r)
4268 		return r;
4269 
4270 	/* Enable TMZ based on IP_VERSION */
4271 	amdgpu_gmc_tmz_set(adev);
4272 
4273 	if (amdgpu_sriov_vf(adev) &&
4274 	    amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(10, 3, 0))
4275 		/* VF MMIO access (except the mailbox range) from the CPU
4276 		 * will be blocked during SR-IOV runtime
4277 		 */
4278 		adev->virt.caps |= AMDGPU_VF_MMIO_ACCESS_PROTECT;
4279 
4280 	amdgpu_gmc_noretry_set(adev);
4281 	/* Need to get xgmi info early to decide the reset behavior */
4282 	if (adev->gmc.xgmi.supported) {
4283 		r = adev->gfxhub.funcs->get_xgmi_info(adev);
4284 		if (r)
4285 			return r;
4286 	}
4287 
4288 	/* enable PCIE atomic ops */
4289 	if (amdgpu_sriov_vf(adev)) {
4290 		if (adev->virt.fw_reserve.p_pf2vf)
4291 			adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
4292 						      adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
4293 				(PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4294 	/* APUs with gfx9 onwards don't rely on PCIe atomics; rather, an internal
4295 	 * path natively supports atomics, so set have_atomics_support to true.
4296 	 */
4297 	} else if ((adev->flags & AMD_IS_APU) &&
4298 		   (amdgpu_ip_version(adev, GC_HWIP, 0) >
4299 		    IP_VERSION(9, 0, 0))) {
4300 		adev->have_atomics_support = true;
4301 	} else {
4302 		adev->have_atomics_support =
4303 			!pci_enable_atomic_ops_to_root(adev->pdev,
4304 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
4305 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
4306 	}
4307 
4308 	if (!adev->have_atomics_support)
4309 		dev_info(adev->dev, "PCIE atomic ops is not supported\n");
4310 
4311 	/* doorbell bar mapping and doorbell index init*/
4312 	amdgpu_doorbell_init(adev);
4313 
4314 	if (amdgpu_emu_mode == 1) {
4315 		/* post the asic on emulation mode */
4316 		emu_soc_asic_init(adev);
4317 		goto fence_driver_init;
4318 	}
4319 
4320 	amdgpu_reset_init(adev);
4321 
4322 	/* detect if we are with an SRIOV vbios */
4323 	if (adev->bios)
4324 		amdgpu_device_detect_sriov_bios(adev);
4325 
4326 	/* check if we need to reset the asic
4327 	 *  E.g., driver was not cleanly unloaded previously, etc.
4328 	 */
4329 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
4330 		if (adev->gmc.xgmi.num_physical_nodes) {
4331 			dev_info(adev->dev, "Pending hive reset.\n");
4332 			adev->gmc.xgmi.pending_reset = true;
4333 			/* Only need to init necessary block for SMU to handle the reset */
4334 			for (i = 0; i < adev->num_ip_blocks; i++) {
4335 				if (!adev->ip_blocks[i].status.valid)
4336 					continue;
4337 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
4338 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
4339 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
4340 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
4341 					DRM_DEBUG("IP %s disabled for hw_init.\n",
4342 						adev->ip_blocks[i].version->funcs->name);
4343 					adev->ip_blocks[i].status.hw = true;
4344 				}
4345 			}
4346 		} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
4347 			   !amdgpu_device_has_display_hardware(adev)) {
4348 			r = psp_gpu_reset(adev);
4349 		} else {
4350 			tmp = amdgpu_reset_method;
4351 			/* It should do a default reset when loading or reloading the driver,
4352 			 * regardless of the module parameter reset_method.
4353 			 */
4354 			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
4355 			r = amdgpu_asic_reset(adev);
4356 			amdgpu_reset_method = tmp;
4357 		}
4358 
4359 		if (r) {
4360 			dev_err(adev->dev, "asic reset on init failed\n");
4361 			goto failed;
4362 		}
4363 	}
4364 
4365 	/* Post card if necessary */
4366 	if (amdgpu_device_need_post(adev)) {
4367 		if (!adev->bios) {
4368 			dev_err(adev->dev, "no vBIOS found\n");
4369 			r = -EINVAL;
4370 			goto failed;
4371 		}
4372 		DRM_INFO("GPU posting now...\n");
4373 		r = amdgpu_device_asic_init(adev);
4374 		if (r) {
4375 			dev_err(adev->dev, "gpu post error!\n");
4376 			goto failed;
4377 		}
4378 	}
4379 
4380 	if (adev->bios) {
4381 		if (adev->is_atom_fw) {
4382 			/* Initialize clocks */
4383 			r = amdgpu_atomfirmware_get_clock_info(adev);
4384 			if (r) {
4385 				dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
4386 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4387 				goto failed;
4388 			}
4389 		} else {
4390 			/* Initialize clocks */
4391 			r = amdgpu_atombios_get_clock_info(adev);
4392 			if (r) {
4393 				dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
4394 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
4395 				goto failed;
4396 			}
4397 			/* init i2c buses */
4398 			if (!amdgpu_device_has_dc_support(adev))
4399 				amdgpu_atombios_i2c_init(adev);
4400 		}
4401 	}
4402 
4403 fence_driver_init:
4404 	/* Fence driver */
4405 	r = amdgpu_fence_driver_sw_init(adev);
4406 	if (r) {
4407 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
4408 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
4409 		goto failed;
4410 	}
4411 
4412 	/* init the mode config */
4413 	drm_mode_config_init(adev_to_drm(adev));
4414 
4415 	r = amdgpu_device_ip_init(adev);
4416 	if (r) {
4417 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
4418 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
4419 		goto release_ras_con;
4420 	}
4421 
4422 	amdgpu_fence_driver_hw_init(adev);
4423 
4424 	dev_info(adev->dev,
4425 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
4426 			adev->gfx.config.max_shader_engines,
4427 			adev->gfx.config.max_sh_per_se,
4428 			adev->gfx.config.max_cu_per_sh,
4429 			adev->gfx.cu_info.number);
4430 
4431 	adev->accel_working = true;
4432 
4433 	amdgpu_vm_check_compute_bug(adev);
4434 
4435 	/* Initialize the buffer migration limit. */
4436 	if (amdgpu_moverate >= 0)
4437 		max_MBps = amdgpu_moverate;
4438 	else
4439 		max_MBps = 8; /* Allow 8 MB/s. */
4440 	/* Get a log2 for easy divisions. */
4441 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
4442 
4443 	/*
4444 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
4445 	 * Otherwise the mgpu fan boost feature will be skipped because the
4446 	 * gpu instance count would be too low.
4447 	 */
4448 	amdgpu_register_gpu_instance(adev);
4449 
4450 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
4451 	 * explicit gating rather than handling it automatically.
4452 	 */
4453 	if (!adev->gmc.xgmi.pending_reset) {
4454 		r = amdgpu_device_ip_late_init(adev);
4455 		if (r) {
4456 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
4457 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
4458 			goto release_ras_con;
4459 		}
4460 		/* must succeed. */
4461 		amdgpu_ras_resume(adev);
4462 		queue_delayed_work(system_wq, &adev->delayed_init_work,
4463 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
4464 	}
4465 
4466 	if (amdgpu_sriov_vf(adev)) {
4467 		amdgpu_virt_release_full_gpu(adev, true);
4468 		flush_delayed_work(&adev->delayed_init_work);
4469 	}
4470 
4471 	/*
4472 	 * Register these sysfs interfaces after `late_init`, since some of the
4473 	 * operations performed in `late_init` might affect how the sysfs
4474 	 * interfaces are created.
4475 	 */
4476 	r = amdgpu_atombios_sysfs_init(adev);
4477 	if (r)
4478 		drm_err(&adev->ddev,
4479 			"registering atombios sysfs failed (%d).\n", r);
4480 
4481 	r = amdgpu_pm_sysfs_init(adev);
4482 	if (r)
4483 		DRM_ERROR("registering pm sysfs failed (%d).\n", r);
4484 
4485 	r = amdgpu_ucode_sysfs_init(adev);
4486 	if (r) {
4487 		adev->ucode_sysfs_en = false;
4488 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
4489 	} else
4490 		adev->ucode_sysfs_en = true;
4491 
4492 	r = amdgpu_device_attr_sysfs_init(adev);
4493 	if (r)
4494 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
4495 
4496 	r = devm_device_add_group(adev->dev, &amdgpu_board_attrs_group);
4497 	if (r)
4498 		dev_err(adev->dev,
4499 			"Could not create amdgpu board attributes\n");
4500 
4501 	amdgpu_fru_sysfs_init(adev);
4502 	amdgpu_reg_state_sysfs_init(adev);
4503 
4504 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4505 		r = amdgpu_pmu_init(adev);
4506 	if (r)
4507 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
4508 
4509 	/* Keep the stored PCI config space at hand for restore in case of a sudden PCI error */
4510 	if (amdgpu_device_cache_pci_state(adev->pdev))
4511 		pci_restore_state(pdev);
4512 
4513 	/* if we have more than one VGA card, then disable the amdgpu VGA resources */
4514 	/* this will fail for cards that aren't VGA class devices, just
4515 	 * ignore it
4516 	 */
4517 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4518 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
4519 
4520 	px = amdgpu_device_supports_px(ddev);
4521 
4522 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4523 				apple_gmux_detect(NULL, NULL)))
4524 		vga_switcheroo_register_client(adev->pdev,
4525 					       &amdgpu_switcheroo_ops, px);
4526 
4527 	if (px)
4528 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
4529 
4530 	if (adev->gmc.xgmi.pending_reset)
4531 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
4532 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
4533 
4534 	amdgpu_device_check_iommu_direct_map(adev);
4535 
4536 	adev->pm_nb.notifier_call = amdgpu_device_pm_notifier;
4537 	r = register_pm_notifier(&adev->pm_nb);
4538 	if (r)
4539 		goto failed;
4540 
4541 	return 0;
4542 
4543 release_ras_con:
4544 	if (amdgpu_sriov_vf(adev))
4545 		amdgpu_virt_release_full_gpu(adev, true);
4546 
4547 	/* failed in exclusive mode due to timeout */
4548 	if (amdgpu_sriov_vf(adev) &&
4549 		!amdgpu_sriov_runtime(adev) &&
4550 		amdgpu_virt_mmio_blocked(adev) &&
4551 		!amdgpu_virt_wait_reset(adev)) {
4552 		dev_err(adev->dev, "VF exclusive mode timeout\n");
4553 		/* Don't send request since VF is inactive. */
4554 		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
4555 		adev->virt.ops = NULL;
4556 		r = -EAGAIN;
4557 	}
4558 	amdgpu_release_ras_context(adev);
4559 
4560 failed:
4561 	amdgpu_vf_error_trans_all(adev);
4562 
4563 	return r;
4564 }
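
/*
 * Error handling in amdgpu_device_init(), in short: the release_ras_con path
 * drops SR-IOV exclusive mode, turns a VF exclusive-mode timeout into -EAGAIN
 * so the probe can be retried, and releases the RAS context; the failed label
 * only flushes any pending VF error records before returning.
 */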
4565 
4566 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4567 {
4568 
4569 	/* Clear all CPU mappings pointing to this device */
4570 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4571 
4572 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
4573 	amdgpu_doorbell_fini(adev);
4574 
4575 	iounmap(adev->rmmio);
4576 	adev->rmmio = NULL;
4577 	if (adev->mman.aper_base_kaddr)
4578 		iounmap(adev->mman.aper_base_kaddr);
4579 	adev->mman.aper_base_kaddr = NULL;
4580 
4581 	/* Memory manager related */
4582 	if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4583 		arch_phys_wc_del(adev->gmc.vram_mtrr);
4584 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4585 	}
4586 }
4587 
4588 /**
4589  * amdgpu_device_fini_hw - tear down the driver
4590  *
4591  * @adev: amdgpu_device pointer
4592  *
4593  * Tear down the driver info (all asics).
4594  * Called at driver shutdown.
4595  */
4596 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4597 {
4598 	dev_info(adev->dev, "amdgpu: finishing device.\n");
4599 	flush_delayed_work(&adev->delayed_init_work);
4600 
4601 	if (adev->mman.initialized)
4602 		drain_workqueue(adev->mman.bdev.wq);
4603 	adev->shutdown = true;
4604 
4605 	unregister_pm_notifier(&adev->pm_nb);
4606 
4607 	/* make sure the IB tests have finished before entering exclusive mode
4608 	 * to avoid preemption during the IB tests
4609 	 */
4610 	if (amdgpu_sriov_vf(adev)) {
4611 		amdgpu_virt_request_full_gpu(adev, false);
4612 		amdgpu_virt_fini_data_exchange(adev);
4613 	}
4614 
4615 	/* disable all interrupts */
4616 	amdgpu_irq_disable_all(adev);
4617 	if (adev->mode_info.mode_config_initialized) {
4618 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4619 			drm_helper_force_disable_all(adev_to_drm(adev));
4620 		else
4621 			drm_atomic_helper_shutdown(adev_to_drm(adev));
4622 	}
4623 	amdgpu_fence_driver_hw_fini(adev);
4624 
4625 	if (adev->pm.sysfs_initialized)
4626 		amdgpu_pm_sysfs_fini(adev);
4627 	if (adev->ucode_sysfs_en)
4628 		amdgpu_ucode_sysfs_fini(adev);
4629 	amdgpu_device_attr_sysfs_fini(adev);
4630 	amdgpu_fru_sysfs_fini(adev);
4631 
4632 	amdgpu_reg_state_sysfs_fini(adev);
4633 
4634 	/* the ras feature must be disabled before hw fini */
4635 	amdgpu_ras_pre_fini(adev);
4636 
4637 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
4638 
4639 	amdgpu_device_ip_fini_early(adev);
4640 
4641 	amdgpu_irq_fini_hw(adev);
4642 
4643 	if (adev->mman.initialized)
4644 		ttm_device_clear_dma_mappings(&adev->mman.bdev);
4645 
4646 	amdgpu_gart_dummy_page_fini(adev);
4647 
4648 	if (drm_dev_is_unplugged(adev_to_drm(adev)))
4649 		amdgpu_device_unmap_mmio(adev);
4650 
4651 }
4652 
4653 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4654 {
4655 	int idx;
4656 	bool px;
4657 
4658 	amdgpu_device_ip_fini(adev);
4659 	amdgpu_fence_driver_sw_fini(adev);
4660 	amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4661 	adev->accel_working = false;
4662 	dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4663 
4664 	amdgpu_reset_fini(adev);
4665 
4666 	/* free i2c buses */
4667 	if (!amdgpu_device_has_dc_support(adev))
4668 		amdgpu_i2c_fini(adev);
4669 
4670 	if (amdgpu_emu_mode != 1)
4671 		amdgpu_atombios_fini(adev);
4672 
4673 	kfree(adev->bios);
4674 	adev->bios = NULL;
4675 
4676 	kfree(adev->fru_info);
4677 	adev->fru_info = NULL;
4678 
4679 	kfree(adev->xcp_mgr);
4680 	adev->xcp_mgr = NULL;
4681 
4682 	px = amdgpu_device_supports_px(adev_to_drm(adev));
4683 
4684 	if (px || (!dev_is_removable(&adev->pdev->dev) &&
4685 				apple_gmux_detect(NULL, NULL)))
4686 		vga_switcheroo_unregister_client(adev->pdev);
4687 
4688 	if (px)
4689 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4690 
4691 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4692 		vga_client_unregister(adev->pdev);
4693 
4694 	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4695 
4696 		iounmap(adev->rmmio);
4697 		adev->rmmio = NULL;
4698 		drm_dev_exit(idx);
4699 	}
4700 
4701 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4702 		amdgpu_pmu_fini(adev);
4703 	if (adev->mman.discovery_bin)
4704 		amdgpu_discovery_fini(adev);
4705 
4706 	amdgpu_reset_put_reset_domain(adev->reset_domain);
4707 	adev->reset_domain = NULL;
4708 
4709 	kfree(adev->pci_state);
4710 
4711 }
4712 
4713 /**
4714  * amdgpu_device_evict_resources - evict device resources
4715  * @adev: amdgpu device object
4716  *
4717  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4718  * of the vram memory type. Mainly used for evicting device resources
4719  * at suspend time.
4720  *
4721  */
4722 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4723 {
4724 	int ret;
4725 
4726 	/* No need to evict vram on APUs for suspend to ram or s2idle */
4727 	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4728 		return 0;
4729 
4730 	ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4731 	if (ret)
4732 		DRM_WARN("evicting device resources failed\n");
4733 	return ret;
4734 }
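
/*
 * Note on the early return above: eviction is skipped on APUs for S3/s2idle,
 * presumably because their carve-out memory is preserved across
 * suspend-to-RAM, but it still runs in the other cases (e.g. hibernation)
 * where the contents would otherwise be lost.
 */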
4735 
4736 /*
4737  * Suspend & resume.
4738  */
4739 /**
4740  * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events
4741  * @nb: notifier block
4742  * @mode: suspend mode
4743  * @data: data
4744  *
4745  * This function is called when the system is about to suspend or hibernate.
4746  * It is used to set the appropriate flags so that eviction can be optimized
4747  * in the pm prepare callback.
4748  */
4749 static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode,
4750 				     void *data)
4751 {
4752 	struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb);
4753 
4754 	switch (mode) {
4755 	case PM_HIBERNATION_PREPARE:
4756 		adev->in_s4 = true;
4757 		break;
4758 	case PM_POST_HIBERNATION:
4759 		adev->in_s4 = false;
4760 		break;
4761 	}
4762 
4763 	return NOTIFY_DONE;
4764 }
4765 
4766 /**
4767  * amdgpu_device_prepare - prepare for device suspend
4768  *
4769  * @dev: drm dev pointer
4770  *
4771  * Prepare to put the hw in the suspend state (all asics).
4772  * Returns 0 for success or an error on failure.
4773  * Called at driver suspend.
4774  */
4775 int amdgpu_device_prepare(struct drm_device *dev)
4776 {
4777 	struct amdgpu_device *adev = drm_to_adev(dev);
4778 	int i, r;
4779 
4780 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4781 		return 0;
4782 
4783 	/* Evict the majority of BOs before starting suspend sequence */
4784 	r = amdgpu_device_evict_resources(adev);
4785 	if (r)
4786 		return r;
4787 
4788 	flush_delayed_work(&adev->gfx.gfx_off_delay_work);
4789 
4790 	for (i = 0; i < adev->num_ip_blocks; i++) {
4791 		if (!adev->ip_blocks[i].status.valid)
4792 			continue;
4793 		if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
4794 			continue;
4795 		r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
4796 		if (r)
4797 			return r;
4798 	}
4799 
4800 	return 0;
4801 }
4802 
4803 /**
4804  * amdgpu_device_suspend - initiate device suspend
4805  *
4806  * @dev: drm dev pointer
4807  * @fbcon: notify the fbdev of suspend
4808  *
4809  * Puts the hw in the suspend state (all asics).
4810  * Returns 0 for success or an error on failure.
4811  * Called at driver suspend.
4812  */
4813 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4814 {
4815 	struct amdgpu_device *adev = drm_to_adev(dev);
4816 	int r = 0;
4817 
4818 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4819 		return 0;
4820 
4821 	adev->in_suspend = true;
4822 
4823 	if (amdgpu_sriov_vf(adev)) {
4824 		amdgpu_virt_fini_data_exchange(adev);
4825 		r = amdgpu_virt_request_full_gpu(adev, false);
4826 		if (r)
4827 			return r;
4828 	}
4829 
4830 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4831 		DRM_WARN("smart shift update failed\n");
4832 
4833 	if (fbcon)
4834 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4835 
4836 	cancel_delayed_work_sync(&adev->delayed_init_work);
4837 
4838 	amdgpu_ras_suspend(adev);
4839 
4840 	amdgpu_device_ip_suspend_phase1(adev);
4841 
4842 	if (!adev->in_s0ix)
4843 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4844 
4845 	r = amdgpu_device_evict_resources(adev);
4846 	if (r)
4847 		return r;
4848 
4849 	amdgpu_ttm_set_buffer_funcs_status(adev, false);
4850 
4851 	amdgpu_fence_driver_hw_fini(adev);
4852 
4853 	amdgpu_device_ip_suspend_phase2(adev);
4854 
4855 	if (amdgpu_sriov_vf(adev))
4856 		amdgpu_virt_release_full_gpu(adev, false);
4857 
4858 	r = amdgpu_dpm_notify_rlc_state(adev, false);
4859 	if (r)
4860 		return r;
4861 
4862 	return 0;
4863 }
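
/*
 * Suspend ordering above, in short: request full GPU access for SR-IOV,
 * notify smart shift and the fbdev, cancel the delayed init work, suspend
 * RAS, run phase 1 (displays), suspend KFD unless in S0ix, evict VRAM, stop
 * the TTM buffer functions and the fence driver, run phase 2, then release
 * the GPU for SR-IOV and notify the RLC of the state change.
 */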
4864 
4865 /**
4866  * amdgpu_device_resume - initiate device resume
4867  *
4868  * @dev: drm dev pointer
4869  * @fbcon: notify the fbdev of resume
4870  *
4871  * Bring the hw back to operating state (all asics).
4872  * Returns 0 for success or an error on failure.
4873  * Called at driver resume.
4874  */
4875 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4876 {
4877 	struct amdgpu_device *adev = drm_to_adev(dev);
4878 	int r = 0;
4879 
4880 	if (amdgpu_sriov_vf(adev)) {
4881 		r = amdgpu_virt_request_full_gpu(adev, true);
4882 		if (r)
4883 			return r;
4884 	}
4885 
4886 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4887 		return 0;
4888 
4889 	if (adev->in_s0ix)
4890 		amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);
4891 
4892 	/* post card */
4893 	if (amdgpu_device_need_post(adev)) {
4894 		r = amdgpu_device_asic_init(adev);
4895 		if (r)
4896 			dev_err(adev->dev, "amdgpu asic init failed\n");
4897 	}
4898 
4899 	r = amdgpu_device_ip_resume(adev);
4900 
4901 	if (r) {
4902 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4903 		goto exit;
4904 	}
4905 
4906 	if (!adev->in_s0ix) {
4907 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4908 		if (r)
4909 			goto exit;
4910 	}
4911 
4912 	r = amdgpu_device_ip_late_init(adev);
4913 	if (r)
4914 		goto exit;
4915 
4916 	queue_delayed_work(system_wq, &adev->delayed_init_work,
4917 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
4918 exit:
4919 	if (amdgpu_sriov_vf(adev)) {
4920 		amdgpu_virt_init_data_exchange(adev);
4921 		amdgpu_virt_release_full_gpu(adev, true);
4922 	}
4923 
4924 	if (r)
4925 		return r;
4926 
4927 	/* Make sure IB tests flushed */
4928 	flush_delayed_work(&adev->delayed_init_work);
4929 
4930 	if (fbcon)
4931 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
4932 
4933 	amdgpu_ras_resume(adev);
4934 
4935 	if (adev->mode_info.num_crtc) {
4936 		/*
4937 		 * Most of the connector probing functions try to acquire runtime pm
4938 		 * refs to ensure that the GPU is powered on when connector polling is
4939 		 * performed. Since we're calling this from a runtime PM callback,
4940 		 * trying to acquire rpm refs will cause us to deadlock.
4941 		 *
4942 		 * Since we're guaranteed to be holding the rpm lock, it's safe to
4943 		 * temporarily disable the rpm helpers so this doesn't deadlock us.
4944 		 */
4945 #ifdef CONFIG_PM
4946 		dev->dev->power.disable_depth++;
4947 #endif
4948 		if (!adev->dc_enabled)
4949 			drm_helper_hpd_irq_event(dev);
4950 		else
4951 			drm_kms_helper_hotplug_event(dev);
4952 #ifdef CONFIG_PM
4953 		dev->dev->power.disable_depth--;
4954 #endif
4955 	}
4956 
4957 	amdgpu_vram_mgr_clear_reset_blocks(adev);
4958 	adev->in_suspend = false;
4959 
4960 	if (adev->enable_mes)
4961 		amdgpu_mes_self_test(adev);
4962 
4963 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4964 		DRM_WARN("smart shift update failed\n");
4965 
4966 	return 0;
4967 }
4968 
4969 /**
4970  * amdgpu_device_ip_check_soft_reset - check if the asic is still hung
4971  *
4972  * @adev: amdgpu_device pointer
4973  *
4974  * The list of all the hardware IPs that make up the asic is walked and
4975  * the check_soft_reset callbacks are run.  check_soft_reset determines
4976  * if the asic is still hung or not.
4977  * Returns true if any of the IPs are still in a hung state, false if not.
4978  */
4979 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4980 {
4981 	int i;
4982 	bool asic_hang = false;
4983 
4984 	if (amdgpu_sriov_vf(adev))
4985 		return true;
4986 
4987 	if (amdgpu_asic_need_full_reset(adev))
4988 		return true;
4989 
4990 	for (i = 0; i < adev->num_ip_blocks; i++) {
4991 		if (!adev->ip_blocks[i].status.valid)
4992 			continue;
4993 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4994 			adev->ip_blocks[i].status.hang =
4995 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4996 		if (adev->ip_blocks[i].status.hang) {
4997 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4998 			asic_hang = true;
4999 		}
5000 	}
5001 	return asic_hang;
5002 }
5003 
5004 /**
5005  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
5006  *
5007  * @adev: amdgpu_device pointer
5008  *
5009  * The list of all the hardware IPs that make up the asic is walked and the
5010  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
5011  * handles any IP specific hardware or software state changes that are
5012  * necessary for a soft reset to succeed.
5013  * Returns 0 on success, negative error code on failure.
5014  */
5015 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
5016 {
5017 	int i, r = 0;
5018 
5019 	for (i = 0; i < adev->num_ip_blocks; i++) {
5020 		if (!adev->ip_blocks[i].status.valid)
5021 			continue;
5022 		if (adev->ip_blocks[i].status.hang &&
5023 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
5024 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
5025 			if (r)
5026 				return r;
5027 		}
5028 	}
5029 
5030 	return 0;
5031 }
5032 
5033 /**
5034  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
5035  *
5036  * @adev: amdgpu_device pointer
5037  *
5038  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
5039  * reset is necessary to recover.
5040  * Returns true if a full asic reset is required, false if not.
5041  */
5042 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
5043 {
5044 	int i;
5045 
5046 	if (amdgpu_asic_need_full_reset(adev))
5047 		return true;
5048 
5049 	for (i = 0; i < adev->num_ip_blocks; i++) {
5050 		if (!adev->ip_blocks[i].status.valid)
5051 			continue;
5052 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
5053 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
5054 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
5055 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
5056 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
5057 			if (adev->ip_blocks[i].status.hang) {
5058 				dev_info(adev->dev, "Some blocks need full reset!\n");
5059 				return true;
5060 			}
5061 		}
5062 	}
5063 	return false;
5064 }
5065 
5066 /**
5067  * amdgpu_device_ip_soft_reset - do a soft reset
5068  *
5069  * @adev: amdgpu_device pointer
5070  *
5071  * The list of all the hardware IPs that make up the asic is walked and the
5072  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
5073  * IP specific hardware or software state changes that are necessary to soft
5074  * reset the IP.
5075  * Returns 0 on success, negative error code on failure.
5076  */
5077 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
5078 {
5079 	int i, r = 0;
5080 
5081 	for (i = 0; i < adev->num_ip_blocks; i++) {
5082 		if (!adev->ip_blocks[i].status.valid)
5083 			continue;
5084 		if (adev->ip_blocks[i].status.hang &&
5085 		    adev->ip_blocks[i].version->funcs->soft_reset) {
5086 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
5087 			if (r)
5088 				return r;
5089 		}
5090 	}
5091 
5092 	return 0;
5093 }
5094 
5095 /**
5096  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
5097  *
5098  * @adev: amdgpu_device pointer
5099  *
5100  * The list of all the hardware IPs that make up the asic is walked and the
5101  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
5102  * handles any IP specific hardware or software state changes that are
5103  * necessary after the IP has been soft reset.
5104  * Returns 0 on success, negative error code on failure.
5105  */
5106 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
5107 {
5108 	int i, r = 0;
5109 
5110 	for (i = 0; i < adev->num_ip_blocks; i++) {
5111 		if (!adev->ip_blocks[i].status.valid)
5112 			continue;
5113 		if (adev->ip_blocks[i].status.hang &&
5114 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
5115 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
5116 		if (r)
5117 			return r;
5118 	}
5119 
5120 	return 0;
5121 }
5122 
5123 /**
5124  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
5125  *
5126  * @adev: amdgpu_device pointer
5127  * @reset_context: amdgpu reset context pointer
5128  *
5129  * Do a VF FLR and reinitialize the ASIC.
5130  * Returns 0 on success, negative error code on failure.
5131  */
5132 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
5133 				     struct amdgpu_reset_context *reset_context)
5134 {
5135 	int r;
5136 	struct amdgpu_hive_info *hive = NULL;
5137 
5138 	if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
5139 		if (!amdgpu_ras_get_fed_status(adev))
5140 			amdgpu_virt_ready_to_reset(adev);
5141 		amdgpu_virt_wait_reset(adev);
5142 		clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5143 		r = amdgpu_virt_request_full_gpu(adev, true);
5144 	} else {
5145 		r = amdgpu_virt_reset_gpu(adev);
5146 	}
5147 	if (r)
5148 		return r;
5149 
5150 	amdgpu_ras_set_fed(adev, false);
5151 	amdgpu_irq_gpu_reset_resume_helper(adev);
5152 
5153 	/* some SW cleanup the VF needs to do before recovery */
5154 	amdgpu_virt_post_reset(adev);
5155 
5156 	/* Resume IP prior to SMC */
5157 	r = amdgpu_device_ip_reinit_early_sriov(adev);
5158 	if (r)
5159 		return r;
5160 
5161 	amdgpu_virt_init_data_exchange(adev);
5162 
5163 	r = amdgpu_device_fw_loading(adev);
5164 	if (r)
5165 		return r;
5166 
5167 	/* now we are okay to resume SMC/CP/SDMA */
5168 	r = amdgpu_device_ip_reinit_late_sriov(adev);
5169 	if (r)
5170 		return r;
5171 
5172 	hive = amdgpu_get_xgmi_hive(adev);
5173 	/* Update PSP FW topology after reset */
5174 	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
5175 		r = amdgpu_xgmi_update_topology(hive, adev);
5176 	if (hive)
5177 		amdgpu_put_xgmi_hive(hive);
5178 	if (r)
5179 		return r;
5180 
5181 	r = amdgpu_ib_ring_tests(adev);
5182 	if (r)
5183 		return r;
5184 
5185 	if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST)
5186 		amdgpu_inc_vram_lost(adev);
5187 
5188 	/* Needs to be called while we still have full GPU access, so we can't
5189 	 * defer it like bare-metal does.
5190 	 */
5191 	amdgpu_amdkfd_post_reset(adev);
5192 	amdgpu_virt_release_full_gpu(adev, true);
5193 
5194 	/* Aldebaran and gfx_11_0_3 support RAS in SR-IOV, so resume RAS during reset */
5195 	if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
5196 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
5197 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
5198 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
5199 		amdgpu_ras_resume(adev);
5200 	return 0;
5201 }
5202 
5203 /**
5204  * amdgpu_device_has_job_running - check if there is any job in mirror list
5205  *
5206  * @adev: amdgpu_device pointer
5207  *
5208  * Check whether any job is still pending in any ring's scheduler (mirror) list.
5209  */
5210 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
5211 {
5212 	int i;
5213 	struct drm_sched_job *job;
5214 
5215 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5216 		struct amdgpu_ring *ring = adev->rings[i];
5217 
5218 		if (!amdgpu_ring_sched_ready(ring))
5219 			continue;
5220 
5221 		spin_lock(&ring->sched.job_list_lock);
5222 		job = list_first_entry_or_null(&ring->sched.pending_list,
5223 					       struct drm_sched_job, list);
5224 		spin_unlock(&ring->sched.job_list_lock);
5225 		if (job)
5226 			return true;
5227 	}
5228 	return false;
5229 }
5230 
5231 /**
5232  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
5233  *
5234  * @adev: amdgpu_device pointer
5235  *
5236  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
5237  * a hung GPU.
5238  */
5239 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
5240 {
5241 
5242 	if (amdgpu_gpu_recovery == 0)
5243 		goto disabled;
5244 
5245 	/* Skip soft reset check in fatal error mode */
5246 	if (!amdgpu_ras_is_poison_mode_supported(adev))
5247 		return true;
5248 
5249 	if (amdgpu_sriov_vf(adev))
5250 		return true;
5251 
5252 	if (amdgpu_gpu_recovery == -1) {
5253 		switch (adev->asic_type) {
5254 #ifdef CONFIG_DRM_AMDGPU_SI
5255 		case CHIP_VERDE:
5256 		case CHIP_TAHITI:
5257 		case CHIP_PITCAIRN:
5258 		case CHIP_OLAND:
5259 		case CHIP_HAINAN:
5260 #endif
5261 #ifdef CONFIG_DRM_AMDGPU_CIK
5262 		case CHIP_KAVERI:
5263 		case CHIP_KABINI:
5264 		case CHIP_MULLINS:
5265 #endif
5266 		case CHIP_CARRIZO:
5267 		case CHIP_STONEY:
5268 		case CHIP_CYAN_SKILLFISH:
5269 			goto disabled;
5270 		default:
5271 			break;
5272 		}
5273 	}
5274 
5275 	return true;
5276 
5277 disabled:
5278 		dev_info(adev->dev, "GPU recovery disabled.\n");
5279 		return false;
5280 }
5281 
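/**
 * amdgpu_device_mode1_reset - perform a full mode1 (whole GPU) reset
 *
 * @adev: amdgpu_device pointer
 *
 * Caches the PCI config space, disables bus mastering, triggers the reset
 * through either the SMU or the PSP depending on what the ASIC supports,
 * then restores the PCI state and waits for the ASIC to come back by
 * polling the memory size register.
 * Returns 0 on success, negative error code on failure.
 */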
5282 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
5283 {
5284 	u32 i;
5285 	int ret = 0;
5286 
5287 	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
5288 
5289 	dev_info(adev->dev, "GPU mode1 reset\n");
5290 
5291 	/* Cache the state before bus master disable. The saved config space
5292 	 * values are used in other cases like restore after mode-2 reset.
5293 	 */
5294 	amdgpu_device_cache_pci_state(adev->pdev);
5295 
5296 	/* disable BM */
5297 	pci_clear_master(adev->pdev);
5298 
5299 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
5300 		dev_info(adev->dev, "GPU smu mode1 reset\n");
5301 		ret = amdgpu_dpm_mode1_reset(adev);
5302 	} else {
5303 		dev_info(adev->dev, "GPU psp mode1 reset\n");
5304 		ret = psp_gpu_reset(adev);
5305 	}
5306 
5307 	if (ret)
5308 		goto mode1_reset_failed;
5309 
5310 	amdgpu_device_load_pci_state(adev->pdev);
5311 	ret = amdgpu_psp_wait_for_bootloader(adev);
5312 	if (ret)
5313 		goto mode1_reset_failed;
5314 
5315 	/* wait for asic to come out of reset */
5316 	for (i = 0; i < adev->usec_timeout; i++) {
5317 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
5318 
5319 		if (memsize != 0xffffffff)
5320 			break;
5321 		udelay(1);
5322 	}
5323 
5324 	if (i >= adev->usec_timeout) {
5325 		ret = -ETIMEDOUT;
5326 		goto mode1_reset_failed;
5327 	}
5328 
5329 	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
5330 
5331 	return 0;
5332 
5333 mode1_reset_failed:
5334 	dev_err(adev->dev, "GPU mode1 reset failed\n");
5335 	return ret;
5336 }
5337 
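/**
 * amdgpu_device_pre_asic_reset - prepare a device for ASIC reset
 *
 * @adev: amdgpu_device pointer
 * @reset_context: amdgpu reset context pointer
 *
 * Clears and force-completes the hardware fences on all rings, bumps the
 * karma of the guilty job and, on bare metal, attempts an IP soft reset
 * first.  When a full reset is required the IP state is dumped, the IP
 * blocks are suspended and AMDGPU_NEED_FULL_RESET is set in the reset
 * context flags.
 * Returns 0 on success, negative error code on failure.
 */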
5338 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
5339 				 struct amdgpu_reset_context *reset_context)
5340 {
5341 	int i, r = 0;
5342 	struct amdgpu_job *job = NULL;
5343 	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
5344 	bool need_full_reset =
5345 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5346 
5347 	if (reset_context->reset_req_dev == adev)
5348 		job = reset_context->job;
5349 
5350 	if (amdgpu_sriov_vf(adev))
5351 		amdgpu_virt_pre_reset(adev);
5352 
5353 	amdgpu_fence_driver_isr_toggle(adev, true);
5354 
5355 	/* block all schedulers and reset given job's ring */
5356 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5357 		struct amdgpu_ring *ring = adev->rings[i];
5358 
5359 		if (!amdgpu_ring_sched_ready(ring))
5360 			continue;
5361 
5362 		/* Clear the job fences from the fence drv so that force_completion
5363 		 * does not leave NULL or vm flush fences in the fence drv.
5364 		 */
5365 		amdgpu_fence_driver_clear_job_fences(ring);
5366 
5367 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
5368 		amdgpu_fence_driver_force_completion(ring);
5369 	}
5370 
5371 	amdgpu_fence_driver_isr_toggle(adev, false);
5372 
5373 	if (job && job->vm)
5374 		drm_sched_increase_karma(&job->base);
5375 
5376 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
5377 	/* If reset handler not implemented, continue; otherwise return */
5378 	if (r == -EOPNOTSUPP)
5379 		r = 0;
5380 	else
5381 		return r;
5382 
5383 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
5384 	if (!amdgpu_sriov_vf(adev)) {
5385 
5386 		if (!need_full_reset)
5387 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
5388 
5389 		if (!need_full_reset && amdgpu_gpu_recovery &&
5390 		    amdgpu_device_ip_check_soft_reset(adev)) {
5391 			amdgpu_device_ip_pre_soft_reset(adev);
5392 			r = amdgpu_device_ip_soft_reset(adev);
5393 			amdgpu_device_ip_post_soft_reset(adev);
5394 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
5395 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
5396 				need_full_reset = true;
5397 			}
5398 		}
5399 
5400 		if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
5401 			dev_info(tmp_adev->dev, "Dumping IP State\n");
5402 			/* Trigger ip dump before we reset the asic */
5403 			for (i = 0; i < tmp_adev->num_ip_blocks; i++)
5404 				if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
5405 					tmp_adev->ip_blocks[i].version->funcs
5406 						->dump_ip_state((void *)tmp_adev);
5407 			dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
5408 		}
5409 
5410 		if (need_full_reset)
5411 			r = amdgpu_device_ip_suspend(adev);
5412 		if (need_full_reset)
5413 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5414 		else
5415 			clear_bit(AMDGPU_NEED_FULL_RESET,
5416 				  &reset_context->flags);
5417 	}
5418 
5419 	return r;
5420 }
5421 
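/**
 * amdgpu_do_asic_reset - perform the actual ASIC reset and re-init
 *
 * @device_list_handle: list of devices to reset (a single device or an XGMI hive)
 * @reset_context: amdgpu reset context pointer
 *
 * Tries the ASIC specific reset handler first and otherwise falls back to
 * the default flow: full ASIC reset (run in parallel for XGMI hives), ATOM
 * re-post, staged IP resume, firmware loading and IB ring tests.
 * Returns 0 on success, negative error code on failure.
 */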
5422 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
5423 			 struct amdgpu_reset_context *reset_context)
5424 {
5425 	struct amdgpu_device *tmp_adev = NULL;
5426 	bool need_full_reset, skip_hw_reset, vram_lost = false;
5427 	int r = 0;
5428 
5429 	/* Try reset handler method first */
5430 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5431 				    reset_list);
5432 
5433 	reset_context->reset_device_list = device_list_handle;
5434 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
5435 	/* If reset handler not implemented, continue; otherwise return */
5436 	if (r == -EOPNOTSUPP)
5437 		r = 0;
5438 	else
5439 		return r;
5440 
5441 	/* Reset handler not implemented, use the default method */
5442 	need_full_reset =
5443 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5444 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
5445 
5446 	/*
5447 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
5448 	 * to allow proper link negotiation in FW (within 1 sec)
5449 	 */
5450 	if (!skip_hw_reset && need_full_reset) {
5451 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5452 			/* For XGMI run all resets in parallel to speed up the process */
5453 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5454 				tmp_adev->gmc.xgmi.pending_reset = false;
5455 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
5456 					r = -EALREADY;
5457 			} else
5458 				r = amdgpu_asic_reset(tmp_adev);
5459 
5460 			if (r) {
5461 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
5462 					 r, adev_to_drm(tmp_adev)->unique);
5463 				goto out;
5464 			}
5465 		}
5466 
5467 		/* For XGMI wait for all resets to complete before proceed */
5468 		if (!r) {
5469 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5470 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
5471 					flush_work(&tmp_adev->xgmi_reset_work);
5472 					r = tmp_adev->asic_reset_res;
5473 					if (r)
5474 						break;
5475 				}
5476 			}
5477 		}
5478 	}
5479 
5480 	if (!r && amdgpu_ras_intr_triggered()) {
5481 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5482 			amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
5483 		}
5484 
5485 		amdgpu_ras_intr_cleared();
5486 	}
5487 
5488 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5489 		if (need_full_reset) {
5490 			/* post card */
5491 			amdgpu_ras_set_fed(tmp_adev, false);
5492 			r = amdgpu_device_asic_init(tmp_adev);
5493 			if (r) {
5494 				dev_warn(tmp_adev->dev, "asic atom init failed!");
5495 			} else {
5496 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
5497 
5498 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
5499 				if (r)
5500 					goto out;
5501 
5502 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
5503 
5504 				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5505 					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
5506 
5507 				if (vram_lost) {
5508 					DRM_INFO("VRAM is lost due to GPU reset!\n");
5509 					amdgpu_inc_vram_lost(tmp_adev);
5510 				}
5511 
5512 				r = amdgpu_device_fw_loading(tmp_adev);
5513 				if (r)
5514 					return r;
5515 
5516 				r = amdgpu_xcp_restore_partition_mode(
5517 					tmp_adev->xcp_mgr);
5518 				if (r)
5519 					goto out;
5520 
5521 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
5522 				if (r)
5523 					goto out;
5524 
5525 				if (tmp_adev->mman.buffer_funcs_ring->sched.ready)
5526 					amdgpu_ttm_set_buffer_funcs_status(tmp_adev, true);
5527 
5528 				r = amdgpu_device_ip_resume_phase3(tmp_adev);
5529 				if (r)
5530 					goto out;
5531 
5532 				if (vram_lost)
5533 					amdgpu_device_fill_reset_magic(tmp_adev);
5534 
5535 				/*
5536 				 * Add this ASIC back as tracked, since the reset
5537 				 * already completed successfully.
5538 				 */
5539 				amdgpu_register_gpu_instance(tmp_adev);
5540 
5541 				if (!reset_context->hive &&
5542 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5543 					amdgpu_xgmi_add_device(tmp_adev);
5544 
5545 				r = amdgpu_device_ip_late_init(tmp_adev);
5546 				if (r)
5547 					goto out;
5548 
5549 				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5550 
5551 				/*
5552 				 * The GPU enters a bad state once the number of
5553 				 * faulty pages reported by ECC reaches the
5554 				 * threshold, and RAS recovery is scheduled next.
5555 				 * So add a check here to break recovery if the
5556 				 * bad page threshold has indeed been exceeded,
5557 				 * and remind the user to either retire this GPU
5558 				 * or set a bigger bad_page_threshold value the
5559 				 * next time the driver is probed.
5560 				 */
5561 				if (!amdgpu_ras_is_rma(tmp_adev)) {
5562 					/* must succeed. */
5563 					amdgpu_ras_resume(tmp_adev);
5564 				} else {
5565 					r = -EINVAL;
5566 					goto out;
5567 				}
5568 
5569 				/* Update PSP FW topology after reset */
5570 				if (reset_context->hive &&
5571 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5572 					r = amdgpu_xgmi_update_topology(
5573 						reset_context->hive, tmp_adev);
5574 			}
5575 		}
5576 
5577 out:
5578 		if (!r) {
5579 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5580 			r = amdgpu_ib_ring_tests(tmp_adev);
5581 			if (r) {
5582 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5583 				need_full_reset = true;
5584 				r = -EAGAIN;
5585 				goto end;
5586 			}
5587 		}
5588 
5589 		if (r)
5590 			tmp_adev->asic_reset_res = r;
5591 	}
5592 
5593 end:
5594 	if (need_full_reset)
5595 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5596 	else
5597 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5598 	return r;
5599 }
5600 
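/**
 * amdgpu_device_set_mp1_state - set the MP1 state for the pending reset
 *
 * @adev: amdgpu_device pointer
 *
 * Records in adev->mp1_state which reset method is about to be used:
 * SHUTDOWN for mode1, RESET for mode2, NONE otherwise.
 */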
5601 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5602 {
5603 
5604 	switch (amdgpu_asic_reset_method(adev)) {
5605 	case AMD_RESET_METHOD_MODE1:
5606 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5607 		break;
5608 	case AMD_RESET_METHOD_MODE2:
5609 		adev->mp1_state = PP_MP1_STATE_RESET;
5610 		break;
5611 	default:
5612 		adev->mp1_state = PP_MP1_STATE_NONE;
5613 		break;
5614 	}
5615 }
5616 
5617 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5618 {
5619 	amdgpu_vf_error_trans_all(adev);
5620 	adev->mp1_state = PP_MP1_STATE_NONE;
5621 }
5622 
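/**
 * amdgpu_device_resume_display_audio - resume the display audio function
 *
 * @adev: amdgpu_device pointer
 *
 * Re-enables runtime PM for the display audio device (PCI function 1 of
 * the GPU) and resumes it once the GPU reset has completed.
 */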
5623 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5624 {
5625 	struct pci_dev *p = NULL;
5626 
5627 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5628 			adev->pdev->bus->number, 1);
5629 	if (p) {
5630 		pm_runtime_enable(&(p->dev));
5631 		pm_runtime_resume(&(p->dev));
5632 	}
5633 
5634 	pci_dev_put(p);
5635 }
5636 
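/**
 * amdgpu_device_suspend_display_audio - suspend the display audio function
 *
 * @adev: amdgpu_device pointer
 *
 * Puts the display audio device (PCI function 1 of the GPU) into runtime
 * suspend before a BACO or mode1 reset so the audio driver is not surprised
 * by the hardware going away underneath it.
 * Returns 0 once the audio device is suspended, negative error code if
 * suspending is not needed for this reset method or did not complete in time.
 */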
5637 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5638 {
5639 	enum amd_reset_method reset_method;
5640 	struct pci_dev *p = NULL;
5641 	u64 expires;
5642 
5643 	/*
5644 	 * For now, only BACO and mode1 reset are confirmed to suffer
5645 	 * from the audio issue if the audio device is not properly suspended.
5646 	 */
5647 	reset_method = amdgpu_asic_reset_method(adev);
5648 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
5649 	     (reset_method != AMD_RESET_METHOD_MODE1))
5650 		return -EINVAL;
5651 
5652 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5653 			adev->pdev->bus->number, 1);
5654 	if (!p)
5655 		return -ENODEV;
5656 
5657 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
5658 	if (!expires)
5659 		/*
5660 		 * If we cannot get the audio device autosuspend delay, use a
5661 		 * fixed 4s interval instead. The audio controller's default
5662 		 * autosuspend delay is 3s, so the 4s used here is guaranteed
5663 		 * to cover it.
5664 		 */
5665 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5666 
5667 	while (!pm_runtime_status_suspended(&(p->dev))) {
5668 		if (!pm_runtime_suspend(&(p->dev)))
5669 			break;
5670 
5671 		if (expires < ktime_get_mono_fast_ns()) {
5672 			dev_warn(adev->dev, "failed to suspend display audio\n");
5673 			pci_dev_put(p);
5674 			/* TODO: abort the succeeding gpu reset? */
5675 			return -ETIMEDOUT;
5676 		}
5677 	}
5678 
5679 	pm_runtime_disable(&(p->dev));
5680 
5681 	pci_dev_put(p);
5682 	return 0;
5683 }
5684 
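/**
 * amdgpu_device_stop_pending_resets - cancel queued reset work
 *
 * @adev: amdgpu_device pointer
 *
 * Cancels any reset work queued before the current reset completed:
 * debugfs triggered resets, KFD reset work, SR-IOV FLR work and RAS
 * recovery work.
 */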
5685 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5686 {
5687 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5688 
5689 #if defined(CONFIG_DEBUG_FS)
5690 	if (!amdgpu_sriov_vf(adev))
5691 		cancel_work(&adev->reset_work);
5692 #endif
5693 
5694 	if (adev->kfd.dev)
5695 		cancel_work(&adev->kfd.reset_work);
5696 
5697 	if (amdgpu_sriov_vf(adev))
5698 		cancel_work(&adev->virt.flr_work);
5699 
5700 	if (con && adev->ras_enabled)
5701 		cancel_work(&con->recovery_work);
5702 
5703 }
5704 
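/**
 * amdgpu_device_health_check - verify the devices are still on the bus
 *
 * @device_list_handle: list of devices about to be reset
 *
 * Reads PCI_COMMAND from every device in the list and returns -ENODEV if
 * any of them appears to have dropped off the bus.
 */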
5705 static int amdgpu_device_health_check(struct list_head *device_list_handle)
5706 {
5707 	struct amdgpu_device *tmp_adev;
5708 	int ret = 0;
5709 	u32 status;
5710 
5711 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5712 		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
5713 		if (PCI_POSSIBLE_ERROR(status)) {
5714 			dev_err(tmp_adev->dev, "device lost from bus!");
5715 			ret = -ENODEV;
5716 		}
5717 	}
5718 
5719 	return ret;
5720 }
5721 
5722 /**
5723  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5724  *
5725  * @adev: amdgpu_device pointer
5726  * @job: which job trigger hang
5727  * @reset_context: amdgpu reset context pointer
5728  *
5729  * Attempt to reset the GPU if it has hung (all ASICs).
5730  * Performs either a soft reset or a full reset and reinitializes the ASIC.
5731  * Returns 0 on success or a negative error code on failure.
5732  */
5733 
5734 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5735 			      struct amdgpu_job *job,
5736 			      struct amdgpu_reset_context *reset_context)
5737 {
5738 	struct list_head device_list, *device_list_handle =  NULL;
5739 	bool job_signaled = false;
5740 	struct amdgpu_hive_info *hive = NULL;
5741 	struct amdgpu_device *tmp_adev = NULL;
5742 	int i, r = 0;
5743 	bool need_emergency_restart = false;
5744 	bool audio_suspended = false;
5745 	int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
5746 
5747 	/*
5748 	 * Special case: RAS triggered and full reset isn't supported
5749 	 */
5750 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5751 
5752 	/*
5753 	 * Flush RAM to disk so that after the reboot
5754 	 * the user can read the log and see why the system rebooted.
5755 	 */
5756 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5757 		amdgpu_ras_get_context(adev)->reboot) {
5758 		DRM_WARN("Emergency reboot.");
5759 
5760 		ksys_sync_helper();
5761 		emergency_restart();
5762 	}
5763 
5764 	dev_info(adev->dev, "GPU %s begin!\n",
5765 		need_emergency_restart ? "jobs stop":"reset");
5766 
5767 	if (!amdgpu_sriov_vf(adev))
5768 		hive = amdgpu_get_xgmi_hive(adev);
5769 	if (hive)
5770 		mutex_lock(&hive->hive_lock);
5771 
5772 	reset_context->job = job;
5773 	reset_context->hive = hive;
5774 	/*
5775 	 * Build list of devices to reset.
5776 	 * In case we are in XGMI hive mode, resort the device list
5777 	 * to put adev in the 1st position.
5778 	 */
5779 	INIT_LIST_HEAD(&device_list);
5780 	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
5781 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5782 			list_add_tail(&tmp_adev->reset_list, &device_list);
5783 			if (adev->shutdown)
5784 				tmp_adev->shutdown = true;
5785 		}
5786 		if (!list_is_first(&adev->reset_list, &device_list))
5787 			list_rotate_to_front(&adev->reset_list, &device_list);
5788 		device_list_handle = &device_list;
5789 	} else {
5790 		list_add_tail(&adev->reset_list, &device_list);
5791 		device_list_handle = &device_list;
5792 	}
5793 
5794 	if (!amdgpu_sriov_vf(adev)) {
5795 		r = amdgpu_device_health_check(device_list_handle);
5796 		if (r)
5797 			goto end_reset;
5798 	}
5799 
5800 	/* We need to lock reset domain only once both for XGMI and single device */
5801 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5802 				    reset_list);
5803 	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5804 
5805 	/* block all schedulers and reset given job's ring */
5806 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5807 
5808 		amdgpu_device_set_mp1_state(tmp_adev);
5809 
5810 		/*
5811 		 * Try to put the audio codec into suspend state
5812 		 * before the gpu reset starts.
5813 		 *
5814 		 * The power domain of the graphics device is shared
5815 		 * with the AZ (audio) power domain. Without this,
5816 		 * we may change the audio hardware from behind
5817 		 * the audio driver's back, which will trigger
5818 		 * audio codec errors.
5819 		 */
5820 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
5821 			audio_suspended = true;
5822 
5823 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
5824 
5825 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5826 
5827 		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
5828 
5829 		/*
5830 		 * Mark the ASICs to be reset as untracked first,
5831 		 * and add them back after the reset completes.
5832 		 */
5833 		amdgpu_unregister_gpu_instance(tmp_adev);
5834 
5835 		drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5836 
5837 		/* disable ras on ALL IPs */
5838 		if (!need_emergency_restart &&
5839 		      amdgpu_device_ip_need_full_reset(tmp_adev))
5840 			amdgpu_ras_suspend(tmp_adev);
5841 
5842 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5843 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5844 
5845 			if (!amdgpu_ring_sched_ready(ring))
5846 				continue;
5847 
5848 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5849 
5850 			if (need_emergency_restart)
5851 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5852 		}
5853 		atomic_inc(&tmp_adev->gpu_reset_counter);
5854 	}
5855 
5856 	if (need_emergency_restart)
5857 		goto skip_sched_resume;
5858 
5859 	/*
5860 	 * Must check guilty signal here since after this point all old
5861 	 * HW fences are force signaled.
5862 	 *
5863 	 * job->base holds a reference to parent fence
5864 	 */
5865 	if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
5866 		job_signaled = true;
5867 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5868 		goto skip_hw_reset;
5869 	}
5870 
5871 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
5872 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5873 		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5874 		/* TODO: Should we stop? */
5875 		if (r) {
5876 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5877 				  r, adev_to_drm(tmp_adev)->unique);
5878 			tmp_adev->asic_reset_res = r;
5879 		}
5880 	}
5881 
5882 	/* Actual ASIC resets if needed. */
5883 	/* Host driver will handle XGMI hive reset for SRIOV */
5884 	if (amdgpu_sriov_vf(adev)) {
5885 		if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
5886 			dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
5887 			amdgpu_ras_set_fed(adev, true);
5888 			set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
5889 		}
5890 
5891 		r = amdgpu_device_reset_sriov(adev, reset_context);
5892 		if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
5893 			amdgpu_virt_release_full_gpu(adev, true);
5894 			goto retry;
5895 		}
5896 		if (r)
5897 			adev->asic_reset_res = r;
5898 	} else {
5899 		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5900 		if (r && r == -EAGAIN)
5901 			goto retry;
5902 	}
5903 
5904 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5905 		/*
5906 		 * Drop any pending non-scheduler resets queued before the reset is done.
5907 		 * Any reset scheduled after this point would be valid. Scheduler resets
5908 		 * were already dropped during drm_sched_stop and no new ones can come
5909 		 * in before drm_sched_start.
5910 		 */
5911 		amdgpu_device_stop_pending_resets(tmp_adev);
5912 	}
5913 
5914 skip_hw_reset:
5915 
5916 	/* Post ASIC reset for all devs. */
5917 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5918 
5919 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5920 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5921 
5922 			if (!amdgpu_ring_sched_ready(ring))
5923 				continue;
5924 
5925 			drm_sched_start(&ring->sched);
5926 		}
5927 
5928 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
5929 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5930 
5931 		if (tmp_adev->asic_reset_res)
5932 			r = tmp_adev->asic_reset_res;
5933 
5934 		tmp_adev->asic_reset_res = 0;
5935 
5936 		if (r) {
5937 			/* Bad news: how do we tell userspace?
5938 			 * For a RAS error, we should report the GPU bad status
5939 			 * instead of a reset failure.
5940 			 */
5941 			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
5942 			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
5943 				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
5944 					atomic_read(&tmp_adev->gpu_reset_counter));
5945 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5946 		} else {
5947 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5948 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5949 				DRM_WARN("smart shift update failed\n");
5950 		}
5951 	}
5952 
5953 skip_sched_resume:
5954 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5955 		/* unlock kfd: SRIOV would do it separately */
5956 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5957 			amdgpu_amdkfd_post_reset(tmp_adev);
5958 
5959 		/* kfd_post_reset will do nothing if the kfd device is not initialized,
5960 		 * so bring up kfd here if it was not initialized before.
5961 		 */
5962 		if (!adev->kfd.init_complete)
5963 			amdgpu_amdkfd_device_init(adev);
5964 
5965 		if (audio_suspended)
5966 			amdgpu_device_resume_display_audio(tmp_adev);
5967 
5968 		amdgpu_device_unset_mp1_state(tmp_adev);
5969 
5970 		amdgpu_ras_set_error_query_ready(tmp_adev, true);
5971 	}
5972 
5973 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5974 					    reset_list);
5975 	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5976 
5977 end_reset:
5978 	if (hive) {
5979 		mutex_unlock(&hive->hive_lock);
5980 		amdgpu_put_xgmi_hive(hive);
5981 	}
5982 
5983 	if (r)
5984 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5985 
5986 	atomic_set(&adev->reset_domain->reset_res, r);
5987 	return r;
5988 }
5989 
5990 /**
5991  * amdgpu_device_partner_bandwidth - find the bandwidth of appropriate partner
5992  *
5993  * @adev: amdgpu_device pointer
5994  * @speed: pointer to the speed of the link
5995  * @width: pointer to the width of the link
5996  *
5997  * Evaluate the hierarchy to find the speed and bandwidth capabilities of the
5998  * first physical partner to an AMD dGPU.
5999  * This will exclude any virtual switches and links.
6000  */
6001 static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
6002 					    enum pci_bus_speed *speed,
6003 					    enum pcie_link_width *width)
6004 {
6005 	struct pci_dev *parent = adev->pdev;
6006 
6007 	if (!speed || !width)
6008 		return;
6009 
6010 	*speed = PCI_SPEED_UNKNOWN;
6011 	*width = PCIE_LNK_WIDTH_UNKNOWN;
6012 
6013 	if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
6014 		while ((parent = pci_upstream_bridge(parent))) {
6015 			/* skip upstream/downstream switches internal to dGPU */
6016 			if (parent->vendor == PCI_VENDOR_ID_ATI)
6017 				continue;
6018 			*speed = pcie_get_speed_cap(parent);
6019 			*width = pcie_get_width_cap(parent);
6020 			break;
6021 		}
6022 	} else {
6023 		/* use the current speeds rather than max if switching is not supported */
6024 		pcie_bandwidth_available(adev->pdev, NULL, speed, width);
6025 	}
6026 }
6027 
6028 /**
6029  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
6030  *
6031  * @adev: amdgpu_device pointer
6032  *
6033  * Fetches and stores in the driver the PCIE capabilities (gen speed
6034  * and lanes) of the slot the device is in. Handles APUs and
6035  * virtualized environments where PCIE config space may not be available.
6036  */
6037 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
6038 {
6039 	struct pci_dev *pdev;
6040 	enum pci_bus_speed speed_cap, platform_speed_cap;
6041 	enum pcie_link_width platform_link_width;
6042 
6043 	if (amdgpu_pcie_gen_cap)
6044 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
6045 
6046 	if (amdgpu_pcie_lane_cap)
6047 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
6048 
6049 	/* covers APUs as well */
6050 	if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
6051 		if (adev->pm.pcie_gen_mask == 0)
6052 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
6053 		if (adev->pm.pcie_mlw_mask == 0)
6054 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
6055 		return;
6056 	}
6057 
6058 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
6059 		return;
6060 
6061 	amdgpu_device_partner_bandwidth(adev, &platform_speed_cap,
6062 					&platform_link_width);
6063 
6064 	if (adev->pm.pcie_gen_mask == 0) {
6065 		/* asic caps */
6066 		pdev = adev->pdev;
6067 		speed_cap = pcie_get_speed_cap(pdev);
6068 		if (speed_cap == PCI_SPEED_UNKNOWN) {
6069 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6070 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6071 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6072 		} else {
6073 			if (speed_cap == PCIE_SPEED_32_0GT)
6074 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6075 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6076 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6077 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6078 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
6079 			else if (speed_cap == PCIE_SPEED_16_0GT)
6080 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6081 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6082 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6083 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
6084 			else if (speed_cap == PCIE_SPEED_8_0GT)
6085 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6086 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6087 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
6088 			else if (speed_cap == PCIE_SPEED_5_0GT)
6089 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6090 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
6091 			else
6092 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
6093 		}
6094 		/* platform caps */
6095 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
6096 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6097 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6098 		} else {
6099 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
6100 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6101 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6102 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6103 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
6104 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
6105 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
6106 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6107 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6108 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
6109 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
6110 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
6111 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6112 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
6113 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
6114 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
6115 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
6116 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
6117 			else
6118 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
6119 
6120 		}
6121 	}
6122 	if (adev->pm.pcie_mlw_mask == 0) {
6123 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
6124 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
6125 		} else {
6126 			switch (platform_link_width) {
6127 			case PCIE_LNK_X32:
6128 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
6129 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6130 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6131 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6132 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6133 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6134 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6135 				break;
6136 			case PCIE_LNK_X16:
6137 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
6138 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6139 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6140 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6141 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6142 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6143 				break;
6144 			case PCIE_LNK_X12:
6145 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
6146 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6147 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6148 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6149 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6150 				break;
6151 			case PCIE_LNK_X8:
6152 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
6153 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6154 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6155 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6156 				break;
6157 			case PCIE_LNK_X4:
6158 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
6159 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6160 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6161 				break;
6162 			case PCIE_LNK_X2:
6163 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
6164 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
6165 				break;
6166 			case PCIE_LNK_X1:
6167 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
6168 				break;
6169 			default:
6170 				break;
6171 			}
6172 		}
6173 	}
6174 }
6175 
6176 /**
6177  * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
6178  *
6179  * @adev: amdgpu_device pointer
6180  * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
6181  *
6182  * Return true if @peer_adev can access (DMA) @adev through the PCIe
6183  * BAR, i.e. @adev is "large BAR" and the BAR matches the DMA mask of
6184  * @peer_adev.
6185  */
6186 bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
6187 				      struct amdgpu_device *peer_adev)
6188 {
6189 #ifdef CONFIG_HSA_AMD_P2P
6190 	bool p2p_access =
6191 		!adev->gmc.xgmi.connected_to_cpu &&
6192 		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
6193 
6194 	bool is_large_bar = adev->gmc.visible_vram_size &&
6195 		adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
6196 	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
6197 
6198 	if (!p2p_addressable) {
6199 		uint64_t address_mask = peer_adev->dev->dma_mask ?
6200 			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
6201 		resource_size_t aper_limit =
6202 			adev->gmc.aper_base + adev->gmc.aper_size - 1;
6203 
6204 		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
6205 				     aper_limit & address_mask);
6206 	}
6207 	return pcie_p2p && is_large_bar && p2p_access && p2p_addressable;
6208 #else
6209 	return false;
6210 #endif
6211 }
6212 
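/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Disables the doorbell interrupt when RAS is enabled and asks the SMU to
 * enter the BACO state.
 * Returns 0 on success, negative error code on failure.
 */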
6213 int amdgpu_device_baco_enter(struct drm_device *dev)
6214 {
6215 	struct amdgpu_device *adev = drm_to_adev(dev);
6216 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6217 
6218 	if (!amdgpu_device_supports_baco(dev))
6219 		return -ENOTSUPP;
6220 
6221 	if (ras && adev->ras_enabled &&
6222 	    adev->nbio.funcs->enable_doorbell_interrupt)
6223 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
6224 
6225 	return amdgpu_dpm_baco_enter(adev);
6226 }
6227 
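/**
 * amdgpu_device_baco_exit - exit BACO (Bus Active, Chip Off)
 *
 * @dev: drm_device pointer
 *
 * Asks the SMU to leave the BACO state and re-enables the doorbell
 * interrupt when RAS is enabled.
 * Returns 0 on success, negative error code on failure.
 */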
6228 int amdgpu_device_baco_exit(struct drm_device *dev)
6229 {
6230 	struct amdgpu_device *adev = drm_to_adev(dev);
6231 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
6232 	int ret = 0;
6233 
6234 	if (!amdgpu_device_supports_baco(dev))
6235 		return -ENOTSUPP;
6236 
6237 	ret = amdgpu_dpm_baco_exit(adev);
6238 	if (ret)
6239 		return ret;
6240 
6241 	if (ras && adev->ras_enabled &&
6242 	    adev->nbio.funcs->enable_doorbell_interrupt)
6243 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
6244 
6245 	if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
6246 	    adev->nbio.funcs->clear_doorbell_interrupt)
6247 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
6248 
6249 	return 0;
6250 }
6251 
6252 /**
6253  * amdgpu_pci_error_detected - Called when a PCI error is detected.
6254  * @pdev: PCI device struct
6255  * @state: PCI channel state
6256  *
6257  * Description: Called when a PCI error is detected.
6258  *
6259  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
6260  */
6261 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
6262 {
6263 	struct drm_device *dev = pci_get_drvdata(pdev);
6264 	struct amdgpu_device *adev = drm_to_adev(dev);
6265 	int i;
6266 
6267 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
6268 
6269 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
6270 		DRM_WARN("No support for XGMI hive yet...");
6271 		return PCI_ERS_RESULT_DISCONNECT;
6272 	}
6273 
6274 	adev->pci_channel_state = state;
6275 
6276 	switch (state) {
6277 	case pci_channel_io_normal:
6278 		return PCI_ERS_RESULT_CAN_RECOVER;
6279 	/* Fatal error, prepare for slot reset */
6280 	case pci_channel_io_frozen:
6281 		/*
6282 		 * Locking adev->reset_domain->sem will prevent any external access
6283 		 * to GPU during PCI error recovery
6284 		 */
6285 		amdgpu_device_lock_reset_domain(adev->reset_domain);
6286 		amdgpu_device_set_mp1_state(adev);
6287 
6288 		/*
6289 		 * Block any work scheduling as we do for regular GPU reset
6290 		 * for the duration of the recovery
6291 		 */
6292 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6293 			struct amdgpu_ring *ring = adev->rings[i];
6294 
6295 			if (!amdgpu_ring_sched_ready(ring))
6296 				continue;
6297 
6298 			drm_sched_stop(&ring->sched, NULL);
6299 		}
6300 		atomic_inc(&adev->gpu_reset_counter);
6301 		return PCI_ERS_RESULT_NEED_RESET;
6302 	case pci_channel_io_perm_failure:
6303 		/* Permanent error, prepare for device removal */
6304 		return PCI_ERS_RESULT_DISCONNECT;
6305 	}
6306 
6307 	return PCI_ERS_RESULT_NEED_RESET;
6308 }
6309 
6310 /**
6311  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
6312  * @pdev: pointer to PCI device
6313  */
6314 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
6315 {
6316 
6317 	DRM_INFO("PCI error: mmio enabled callback!!\n");
6318 
6319 	/* TODO - dump whatever for debugging purposes */
6320 
6321 	/* This is called only if amdgpu_pci_error_detected returns
6322 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
6323 	 * works, no need to reset slot.
6324 	 */
6325 
6326 	return PCI_ERS_RESULT_RECOVERED;
6327 }
6328 
6329 /**
6330  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
6331  * @pdev: PCI device struct
6332  *
6333  * Description: This routine is called by the pci error recovery
6334  * code after the PCI slot has been reset, just before we
6335  * should resume normal operations.
6336  */
6337 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
6338 {
6339 	struct drm_device *dev = pci_get_drvdata(pdev);
6340 	struct amdgpu_device *adev = drm_to_adev(dev);
6341 	int r, i;
6342 	struct amdgpu_reset_context reset_context;
6343 	u32 memsize;
6344 	struct list_head device_list;
6345 
6346 	/* PCI error slot reset should be skipped during RAS recovery */
6347 	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
6348 	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
6349 	    amdgpu_ras_in_recovery(adev))
6350 		return PCI_ERS_RESULT_RECOVERED;
6351 
6352 	DRM_INFO("PCI error: slot reset callback!!\n");
6353 
6354 	memset(&reset_context, 0, sizeof(reset_context));
6355 
6356 	INIT_LIST_HEAD(&device_list);
6357 	list_add_tail(&adev->reset_list, &device_list);
6358 
6359 	/* wait for asic to come out of reset */
6360 	msleep(500);
6361 
6362 	/* Restore PCI confspace */
6363 	amdgpu_device_load_pci_state(pdev);
6364 
6365 	/* confirm ASIC came out of reset */
6366 	for (i = 0; i < adev->usec_timeout; i++) {
6367 		memsize = amdgpu_asic_get_config_memsize(adev);
6368 
6369 		if (memsize != 0xffffffff)
6370 			break;
6371 		udelay(1);
6372 	}
6373 	if (memsize == 0xffffffff) {
6374 		r = -ETIME;
6375 		goto out;
6376 	}
6377 
6378 	reset_context.method = AMD_RESET_METHOD_NONE;
6379 	reset_context.reset_req_dev = adev;
6380 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
6381 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
6382 
6383 	adev->no_hw_access = true;
6384 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
6385 	adev->no_hw_access = false;
6386 	if (r)
6387 		goto out;
6388 
6389 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
6390 
6391 out:
6392 	if (!r) {
6393 		if (amdgpu_device_cache_pci_state(adev->pdev))
6394 			pci_restore_state(adev->pdev);
6395 
6396 		DRM_INFO("PCIe error recovery succeeded\n");
6397 	} else {
6398 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
6399 		amdgpu_device_unset_mp1_state(adev);
6400 		amdgpu_device_unlock_reset_domain(adev->reset_domain);
6401 	}
6402 
6403 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
6404 }
6405 
6406 /**
6407  * amdgpu_pci_resume() - resume normal ops after PCI reset
6408  * @pdev: pointer to PCI device
6409  *
6410  * Called when the error recovery driver tells us that it's
6411  * OK to resume normal operation.
6412  */
6413 void amdgpu_pci_resume(struct pci_dev *pdev)
6414 {
6415 	struct drm_device *dev = pci_get_drvdata(pdev);
6416 	struct amdgpu_device *adev = drm_to_adev(dev);
6417 	int i;
6418 
6419 
6420 	DRM_INFO("PCI error: resume callback!!\n");
6421 
6422 	/* Only continue execution for the case of pci_channel_io_frozen */
6423 	if (adev->pci_channel_state != pci_channel_io_frozen)
6424 		return;
6425 
6426 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
6427 		struct amdgpu_ring *ring = adev->rings[i];
6428 
6429 		if (!amdgpu_ring_sched_ready(ring))
6430 			continue;
6431 
6432 		drm_sched_start(&ring->sched);
6433 	}
6434 
6435 	amdgpu_device_unset_mp1_state(adev);
6436 	amdgpu_device_unlock_reset_domain(adev->reset_domain);
6437 }
6438 
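/**
 * amdgpu_device_cache_pci_state - save the PCI config space
 *
 * @pdev: PCI device struct
 *
 * Saves and stores the PCI config space so it can be restored after a GPU
 * reset.  Not used for SR-IOV VFs.
 * Returns true on success, false otherwise.
 */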
6439 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
6440 {
6441 	struct drm_device *dev = pci_get_drvdata(pdev);
6442 	struct amdgpu_device *adev = drm_to_adev(dev);
6443 	int r;
6444 
6445 	if (amdgpu_sriov_vf(adev))
6446 		return false;
6447 
6448 	r = pci_save_state(pdev);
6449 	if (!r) {
6450 		kfree(adev->pci_state);
6451 
6452 		adev->pci_state = pci_store_saved_state(pdev);
6453 
6454 		if (!adev->pci_state) {
6455 			DRM_ERROR("Failed to store PCI saved state");
6456 			return false;
6457 		}
6458 	} else {
6459 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
6460 		return false;
6461 	}
6462 
6463 	return true;
6464 }
6465 
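/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Loads the previously cached PCI config space back into the device and
 * restores it.
 * Returns true on success, false otherwise.
 */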
6466 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
6467 {
6468 	struct drm_device *dev = pci_get_drvdata(pdev);
6469 	struct amdgpu_device *adev = drm_to_adev(dev);
6470 	int r;
6471 
6472 	if (!adev->pci_state)
6473 		return false;
6474 
6475 	r = pci_load_saved_state(pdev, adev->pci_state);
6476 
6477 	if (!r) {
6478 		pci_restore_state(pdev);
6479 	} else {
6480 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
6481 		return false;
6482 	}
6483 
6484 	return true;
6485 }
6486 
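/**
 * amdgpu_device_flush_hdp - flush the HDP cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring to emit the flush on
 *
 * Flushes the HDP cache either via a ring packet, when a ring with an
 * emit_hdp_flush callback is provided, or via the ASIC callback.  Skipped
 * on x86-64 APUs (unless in passthrough) and on GPUs connected to the CPU
 * over XGMI, where no HDP flush is needed.
 */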
6487 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
6488 		struct amdgpu_ring *ring)
6489 {
6490 #ifdef CONFIG_X86_64
6491 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6492 		return;
6493 #endif
6494 	if (adev->gmc.xgmi.connected_to_cpu)
6495 		return;
6496 
6497 	if (ring && ring->funcs->emit_hdp_flush)
6498 		amdgpu_ring_emit_hdp_flush(ring);
6499 	else
6500 		amdgpu_asic_flush_hdp(adev, ring);
6501 }
6502 
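/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP cache
 *
 * @adev: amdgpu_device pointer
 * @ring: ring the invalidation is issued for
 *
 * Invalidates the HDP cache via the ASIC callback, with the same APU/XGMI
 * exceptions as amdgpu_device_flush_hdp().
 */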
6503 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
6504 		struct amdgpu_ring *ring)
6505 {
6506 #ifdef CONFIG_X86_64
6507 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
6508 		return;
6509 #endif
6510 	if (adev->gmc.xgmi.connected_to_cpu)
6511 		return;
6512 
6513 	amdgpu_asic_invalidate_hdp(adev, ring);
6514 }
6515 
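/**
 * amdgpu_in_reset - check whether a GPU reset is in progress
 *
 * @adev: amdgpu_device pointer
 *
 * Returns non-zero while the reset domain this device belongs to is
 * performing a GPU reset.
 */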
6516 int amdgpu_in_reset(struct amdgpu_device *adev)
6517 {
6518 	return atomic_read(&adev->reset_domain->in_gpu_reset);
6519 }
6520 
6521 /**
6522  * amdgpu_device_halt() - bring hardware to some kind of halt state
6523  *
6524  * @adev: amdgpu_device pointer
6525  *
6526  * Bring the hardware to some kind of halt state so that nothing can touch it
6527  * any more. This helps to preserve the error context when an error occurs.
6528  * Compared to a simple hang, the system stays stable at least for SSH
6529  * access, so it should be trivial to inspect the hardware state and
6530  * see what's going on. Implemented as follows:
6531  *
6532  * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
6533  *    clears all CPU mappings to the device, and disallows remapping through page faults
6534  * 2. amdgpu_irq_disable_all() disables all interrupts
6535  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
6536  * 4. set adev->no_hw_access to avoid potential crashes after step 5
6537  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
6538  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
6539  *    flush any in flight DMA operations
6540  */
6541 void amdgpu_device_halt(struct amdgpu_device *adev)
6542 {
6543 	struct pci_dev *pdev = adev->pdev;
6544 	struct drm_device *ddev = adev_to_drm(adev);
6545 
6546 	amdgpu_xcp_dev_unplug(adev);
6547 	drm_dev_unplug(ddev);
6548 
6549 	amdgpu_irq_disable_all(adev);
6550 
6551 	amdgpu_fence_driver_hw_fini(adev);
6552 
6553 	adev->no_hw_access = true;
6554 
6555 	amdgpu_device_unmap_mmio(adev);
6556 
6557 	pci_disable_device(pdev);
6558 	pci_wait_for_pending_transaction(pdev);
6559 }
6560 
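/**
 * amdgpu_device_pcie_port_rreg - read a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword offset of the register
 *
 * Reads a PCIe port register through the NBIO index/data pair; the index
 * register takes a byte offset, hence the reg * 4 conversion below.
 */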
6561 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
6562 				u32 reg)
6563 {
6564 	unsigned long flags, address, data;
6565 	u32 r;
6566 
6567 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6568 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6569 
6570 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6571 	WREG32(address, reg * 4);
6572 	(void)RREG32(address);
6573 	r = RREG32(data);
6574 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6575 	return r;
6576 }
6577 
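/**
 * amdgpu_device_pcie_port_wreg - write a PCIe port register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword offset of the register
 * @v: value to write
 *
 * Writes a PCIe port register through the NBIO index/data pair, reading
 * the registers back to post the writes before dropping the lock.
 *
 * Illustrative call (hypothetical register name, shown only as a usage
 * sketch): amdgpu_device_pcie_port_wreg(adev, smnFOO_CNTL >> 2, value);
 */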
6578 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
6579 				u32 reg, u32 v)
6580 {
6581 	unsigned long flags, address, data;
6582 
6583 	address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
6584 	data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
6585 
6586 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
6587 	WREG32(address, reg * 4);
6588 	(void)RREG32(address);
6589 	WREG32(data, v);
6590 	(void)RREG32(data);
6591 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6592 }
6593 
6594 /**
6595  * amdgpu_device_get_gang - return a reference to the current gang
6596  * @adev: amdgpu_device pointer
6597  *
6598  * Returns: A new reference to the current gang leader.
6599  */
6600 struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
6601 {
6602 	struct dma_fence *fence;
6603 
6604 	rcu_read_lock();
6605 	fence = dma_fence_get_rcu_safe(&adev->gang_submit);
6606 	rcu_read_unlock();
6607 	return fence;
6608 }
6609 
6610 /**
6611  * amdgpu_device_switch_gang - switch to a new gang
6612  * @adev: amdgpu_device pointer
6613  * @gang: the gang to switch to
6614  *
6615  * Try to switch to a new gang.
6616  * Returns: NULL if we switched to the new gang or a reference to the current
6617  * gang leader.
6618  */
6619 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6620 					    struct dma_fence *gang)
6621 {
6622 	struct dma_fence *old = NULL;
6623 
6624 	dma_fence_get(gang);
6625 	do {
6626 		dma_fence_put(old);
6627 		old = amdgpu_device_get_gang(adev);
6628 		if (old == gang)
6629 			break;
6630 
6631 		if (!dma_fence_is_signaled(old)) {
6632 			dma_fence_put(gang);
6633 			return old;
6634 		}
6635 
6636 	} while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6637 			 old, gang) != old);
6638 
6639 	/*
6640 	 * Drop it once for the exchanged reference in adev and once for the
6641 	 * thread local reference acquired in amdgpu_device_get_gang().
6642 	 */
6643 	dma_fence_put(old);
6644 	dma_fence_put(old);
6645 	return NULL;
6646 }
6647 
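/**
 * amdgpu_device_has_display_hardware - check for usable display hardware
 *
 * @adev: amdgpu_device pointer
 *
 * Returns true if the ASIC has non-harvested display hardware.  Older
 * ASICs are matched by name; newer ones rely on IP discovery reporting a
 * DCE IP version and on the DMU block not being harvested.
 */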
6648 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6649 {
6650 	switch (adev->asic_type) {
6651 #ifdef CONFIG_DRM_AMDGPU_SI
6652 	case CHIP_HAINAN:
6653 #endif
6654 	case CHIP_TOPAZ:
6655 		/* chips with no display hardware */
6656 		return false;
6657 #ifdef CONFIG_DRM_AMDGPU_SI
6658 	case CHIP_TAHITI:
6659 	case CHIP_PITCAIRN:
6660 	case CHIP_VERDE:
6661 	case CHIP_OLAND:
6662 #endif
6663 #ifdef CONFIG_DRM_AMDGPU_CIK
6664 	case CHIP_BONAIRE:
6665 	case CHIP_HAWAII:
6666 	case CHIP_KAVERI:
6667 	case CHIP_KABINI:
6668 	case CHIP_MULLINS:
6669 #endif
6670 	case CHIP_TONGA:
6671 	case CHIP_FIJI:
6672 	case CHIP_POLARIS10:
6673 	case CHIP_POLARIS11:
6674 	case CHIP_POLARIS12:
6675 	case CHIP_VEGAM:
6676 	case CHIP_CARRIZO:
6677 	case CHIP_STONEY:
6678 		/* chips with display hardware */
6679 		return true;
6680 	default:
6681 		/* IP discovery */
6682 		if (!amdgpu_ip_version(adev, DCE_HWIP, 0) ||
6683 		    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6684 			return false;
6685 		return true;
6686 	}
6687 }
6688 
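/**
 * amdgpu_device_wait_on_rreg - poll a register until it reaches an expected value
 *
 * @adev: amdgpu_device pointer
 * @inst: instance number, used only in the warning message
 * @reg_addr: register offset to poll
 * @reg_name: human readable register name for the warning message
 * @expected_value: value the masked register is expected to reach
 * @mask: mask applied to the register before the comparison
 *
 * Re-reads the register with a 1us delay between reads; the timeout counter
 * restarts whenever the value changes.  Returns 0 once
 * (value & mask) == expected_value, or -ETIMEDOUT if the value stops
 * changing for adev->usec_timeout iterations.
 *
 * Illustrative call (hypothetical register and mask names, shown only as a
 * usage sketch):
 *   r = amdgpu_device_wait_on_rreg(adev, 0, regFOO_STATUS, "FOO_STATUS",
 *                                  FOO_STATUS__IDLE_MASK, FOO_STATUS__IDLE_MASK);
 */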
6689 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6690 		uint32_t inst, uint32_t reg_addr, char reg_name[],
6691 		uint32_t expected_value, uint32_t mask)
6692 {
6693 	uint32_t ret = 0;
6694 	uint32_t old_ = 0;
6695 	uint32_t tmp_ = RREG32(reg_addr);
6696 	uint32_t loop = adev->usec_timeout;
6697 
6698 	while ((tmp_ & (mask)) != (expected_value)) {
6699 		if (old_ != tmp_) {
6700 			loop = adev->usec_timeout;
6701 			old_ = tmp_;
6702 		} else
6703 			udelay(1);
6704 		tmp_ = RREG32(reg_addr);
6705 		loop--;
6706 		if (!loop) {
6707 			DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08x\n",
6708 				  inst, reg_name, (uint32_t)expected_value,
6709 				  (uint32_t)(tmp_ & (mask)));
6710 			ret = -ETIMEDOUT;
6711 			break;
6712 		}
6713 	}
6714 	return ret;
6715 }
6716