1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/pci.h>
34 
35 #include <drm/drm_aperture.h>
36 #include <drm/drm_atomic_helper.h>
37 #include <drm/drm_probe_helper.h>
38 #include <drm/amdgpu_drm.h>
39 #include <linux/vgaarb.h>
40 #include <linux/vga_switcheroo.h>
41 #include <linux/efi.h>
42 #include "amdgpu.h"
43 #include "amdgpu_trace.h"
44 #include "amdgpu_i2c.h"
45 #include "atom.h"
46 #include "amdgpu_atombios.h"
47 #include "amdgpu_atomfirmware.h"
48 #include "amd_pcie.h"
49 #ifdef CONFIG_DRM_AMDGPU_SI
50 #include "si.h"
51 #endif
52 #ifdef CONFIG_DRM_AMDGPU_CIK
53 #include "cik.h"
54 #endif
55 #include "vi.h"
56 #include "soc15.h"
57 #include "nv.h"
58 #include "bif/bif_4_1_d.h"
60 #include <linux/firmware.h>
61 #include "amdgpu_vf_error.h"
62 
63 #include "amdgpu_amdkfd.h"
64 #include "amdgpu_pm.h"
65 
66 #include "amdgpu_xgmi.h"
67 #include "amdgpu_ras.h"
68 #include "amdgpu_pmu.h"
69 #include "amdgpu_fru_eeprom.h"
70 #include "amdgpu_reset.h"
71 
72 #include <linux/suspend.h>
73 #include <drm/task_barrier.h>
74 #include <linux/pm_runtime.h>
75 
76 #include <drm/drm_drv.h>
77 
78 #if IS_ENABLED(CONFIG_X86)
79 #include <asm/intel-family.h>
80 #endif
81 
82 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
85 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
86 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
87 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
88 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
89 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
94 
95 #define AMDGPU_RESUME_MS		2000
96 
97 static const struct drm_driver amdgpu_kms_driver;
98 
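/* human readable asic names, indexed by enum amd_asic_type */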
99 const char *amdgpu_asic_name[] = {
100 	"TAHITI",
101 	"PITCAIRN",
102 	"VERDE",
103 	"OLAND",
104 	"HAINAN",
105 	"BONAIRE",
106 	"KAVERI",
107 	"KABINI",
108 	"HAWAII",
109 	"MULLINS",
110 	"TOPAZ",
111 	"TONGA",
112 	"FIJI",
113 	"CARRIZO",
114 	"STONEY",
115 	"POLARIS10",
116 	"POLARIS11",
117 	"POLARIS12",
118 	"VEGAM",
119 	"VEGA10",
120 	"VEGA12",
121 	"VEGA20",
122 	"RAVEN",
123 	"ARCTURUS",
124 	"RENOIR",
125 	"ALDEBARAN",
126 	"NAVI10",
127 	"CYAN_SKILLFISH",
128 	"NAVI14",
129 	"NAVI12",
130 	"SIENNA_CICHLID",
131 	"NAVY_FLOUNDER",
132 	"VANGOGH",
133 	"DIMGREY_CAVEFISH",
134 	"BEIGE_GOBY",
135 	"YELLOW_CARP",
136 	"LAST",
137 };
138 
139 /**
140  * DOC: pcie_replay_count
141  *
142  * The amdgpu driver provides a sysfs API for reporting the total number
143  * of PCIe replays (NAKs).
144  * The file pcie_replay_count is used for this and returns the total
145  * number of replays as the sum of the NAKs generated and NAKs received.
146  */
147 
148 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
149 		struct device_attribute *attr, char *buf)
150 {
151 	struct drm_device *ddev = dev_get_drvdata(dev);
152 	struct amdgpu_device *adev = drm_to_adev(ddev);
153 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
154 
155 	return sysfs_emit(buf, "%llu\n", cnt);
156 }
157 
158 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
159 		amdgpu_device_get_pcie_replay_count, NULL);
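
/*
 * Illustrative userspace usage (not part of the driver): assuming the GPU is
 * exposed as card0, the count can be read with e.g.
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 */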
160 
161 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
162 
163 /**
164  * DOC: product_name
165  *
166  * The amdgpu driver provides a sysfs API for reporting the product name
167  * for the device.
168  * The file product_name is used for this and returns the product name
169  * as returned from the FRU.
170  * NOTE: This is only available for certain server cards
171  */
172 
173 static ssize_t amdgpu_device_get_product_name(struct device *dev,
174 		struct device_attribute *attr, char *buf)
175 {
176 	struct drm_device *ddev = dev_get_drvdata(dev);
177 	struct amdgpu_device *adev = drm_to_adev(ddev);
178 
179 	return sysfs_emit(buf, "%s\n", adev->product_name);
180 }
181 
182 static DEVICE_ATTR(product_name, S_IRUGO,
183 		amdgpu_device_get_product_name, NULL);
184 
185 /**
186  * DOC: product_number
187  *
188  * The amdgpu driver provides a sysfs API for reporting the part number
189  * for the device.
190  * The file product_number is used for this and returns the part number
191  * as returned from the FRU.
192  * NOTE: This is only available for certain server cards
193  */
194 
195 static ssize_t amdgpu_device_get_product_number(struct device *dev,
196 		struct device_attribute *attr, char *buf)
197 {
198 	struct drm_device *ddev = dev_get_drvdata(dev);
199 	struct amdgpu_device *adev = drm_to_adev(ddev);
200 
201 	return sysfs_emit(buf, "%s\n", adev->product_number);
202 }
203 
204 static DEVICE_ATTR(product_number, S_IRUGO,
205 		amdgpu_device_get_product_number, NULL);
206 
207 /**
208  * DOC: serial_number
209  *
210  * The amdgpu driver provides a sysfs API for reporting the serial number
211  * for the device.
212  * The file serial_number is used for this and returns the serial number
213  * as returned from the FRU.
214  * NOTE: This is only available for certain server cards
215  */
216 
217 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
218 		struct device_attribute *attr, char *buf)
219 {
220 	struct drm_device *ddev = dev_get_drvdata(dev);
221 	struct amdgpu_device *adev = drm_to_adev(ddev);
222 
223 	return sysfs_emit(buf, "%s\n", adev->serial);
224 }
225 
226 static DEVICE_ATTR(serial_number, S_IRUGO,
227 		amdgpu_device_get_serial_number, NULL);
228 
229 /**
230  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
231  *
232  * @dev: drm_device pointer
233  *
234  * Returns true if the device is a dGPU with ATPX power control,
235  * otherwise return false.
236  */
237 bool amdgpu_device_supports_px(struct drm_device *dev)
238 {
239 	struct amdgpu_device *adev = drm_to_adev(dev);
240 
241 	if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
242 		return true;
243 	return false;
244 }
245 
246 /**
247  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
248  *
249  * @dev: drm_device pointer
250  *
251  * Returns true if the device is a dGPU with ACPI power control,
252  * otherwise return false.
253  */
254 bool amdgpu_device_supports_boco(struct drm_device *dev)
255 {
256 	struct amdgpu_device *adev = drm_to_adev(dev);
257 
258 	if (adev->has_pr3 ||
259 	    ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
260 		return true;
261 	return false;
262 }
263 
264 /**
265  * amdgpu_device_supports_baco - Does the device support BACO
266  *
267  * @dev: drm_device pointer
268  *
269  * Returns true if the device supports BACO,
270  * otherwise return false.
271  */
272 bool amdgpu_device_supports_baco(struct drm_device *dev)
273 {
274 	struct amdgpu_device *adev = drm_to_adev(dev);
275 
276 	return amdgpu_asic_supports_baco(adev);
277 }
278 
279 /**
280  * amdgpu_device_supports_smart_shift - Is the device dGPU with
281  * smart shift support
282  *
283  * @dev: drm_device pointer
284  *
285  * Returns true if the device is a dGPU with Smart Shift support,
286  * otherwise returns false.
287  */
288 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
289 {
290 	return (amdgpu_device_supports_boco(dev) &&
291 		amdgpu_acpi_is_power_shift_control_supported());
292 }
293 
294 /*
295  * VRAM access helper functions
296  */
297 
298 /**
299  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
300  *
301  * @adev: amdgpu_device pointer
302  * @pos: offset of the buffer in vram
303  * @buf: virtual address of the buffer in system memory
304  * @size: read/write size; @buf must be at least @size bytes
305  * @write: true - write to vram, otherwise - read from vram
306  */
307 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
308 			     void *buf, size_t size, bool write)
309 {
310 	unsigned long flags;
311 	uint32_t hi = ~0, tmp = 0;
312 	uint32_t *data = buf;
313 	uint64_t last;
314 	int idx;
315 
316 	if (!drm_dev_enter(&adev->ddev, &idx))
317 		return;
318 
319 	BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
320 
321 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
322 	for (last = pos + size; pos < last; pos += 4) {
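		/*
		 * MM_INDEX takes the low 31 bits of the offset, with bit 31 set
		 * to select memory rather than register access; MM_INDEX_HI
		 * carries the upper bits and is only rewritten when they change.
		 */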
323 		tmp = pos >> 31;
324 
325 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
326 		if (tmp != hi) {
327 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
328 			hi = tmp;
329 		}
330 		if (write)
331 			WREG32_NO_KIQ(mmMM_DATA, *data++);
332 		else
333 			*data++ = RREG32_NO_KIQ(mmMM_DATA);
334 	}
335 
336 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
337 	drm_dev_exit(idx);
338 }
339 
340 /**
341  * amdgpu_device_aper_access - access vram via the vram aperture
342  *
343  * @adev: amdgpu_device pointer
344  * @pos: offset of the buffer in vram
345  * @buf: virtual address of the buffer in system memory
346  * @size: read/write size; @buf must be at least @size bytes
347  * @write: true - write to vram, otherwise - read from vram
348  *
349  * Returns the number of bytes that have been transferred.
350  */
351 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
352 				 void *buf, size_t size, bool write)
353 {
354 #ifdef CONFIG_64BIT
355 	void __iomem *addr;
356 	size_t count = 0;
357 	uint64_t last;
358 
359 	if (!adev->mman.aper_base_kaddr)
360 		return 0;
361 
362 	last = min(pos + size, adev->gmc.visible_vram_size);
363 	if (last > pos) {
364 		addr = adev->mman.aper_base_kaddr + pos;
365 		count = last - pos;
366 
367 		if (write) {
368 			memcpy_toio(addr, buf, count);
369 			mb();
370 			amdgpu_device_flush_hdp(adev, NULL);
371 		} else {
372 			amdgpu_device_invalidate_hdp(adev, NULL);
373 			mb();
374 			memcpy_fromio(buf, addr, count);
375 		}
376 
377 	}
378 
379 	return count;
380 #else
381 	return 0;
382 #endif
383 }
384 
385 /**
386  * amdgpu_device_vram_access - read/write a buffer in vram
387  *
388  * @adev: amdgpu_device pointer
389  * @pos: offset of the buffer in vram
390  * @buf: virtual address of the buffer in system memory
391  * @size: read/write size; @buf must be at least @size bytes
392  * @write: true - write to vram, otherwise - read from vram
393  */
394 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
395 			       void *buf, size_t size, bool write)
396 {
397 	size_t count;
398 
399 	/* try using the vram aperture to access vram first */
400 	count = amdgpu_device_aper_access(adev, pos, buf, size, write);
401 	size -= count;
402 	if (size) {
403 		/* use MM access for the rest of vram */
404 		pos += count;
405 		buf += count;
406 		amdgpu_device_mm_access(adev, pos, buf, size, write);
407 	}
408 }
409 
410 /*
411  * register access helper functions.
412  */
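
/*
 * Note: most callers do not use these helpers directly but go through the
 * RREG32()/WREG32() family of macros, which wrap them.
 */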
413 
414 /* Check if hw access should be skipped because of hotplug or device error */
415 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
416 {
417 	if (adev->no_hw_access)
418 		return true;
419 
420 #ifdef CONFIG_LOCKDEP
421 	/*
422 	 * This is a bit complicated to understand, so worth a comment. What we assert
423 	 * here is that the GPU reset is not running on another thread in parallel.
424 	 *
425 	 * For this we trylock the read side of the reset semaphore; if that succeeds
426 	 * we know that the reset is not running in parallel.
427 	 *
428 	 * If the trylock fails we assert that we are either already holding the read
429 	 * side of the lock or are the reset thread itself and hold the write side of
430 	 * the lock.
431 	 */
432 	if (in_task()) {
433 		if (down_read_trylock(&adev->reset_sem))
434 			up_read(&adev->reset_sem);
435 		else
436 			lockdep_assert_held(&adev->reset_sem);
437 	}
438 #endif
439 	return false;
440 }
441 
442 /**
443  * amdgpu_device_rreg - read a memory mapped IO or indirect register
444  *
445  * @adev: amdgpu_device pointer
446  * @reg: dword aligned register offset
447  * @acc_flags: access flags which require special behavior
448  *
449  * Returns the 32 bit value from the offset specified.
450  */
451 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
452 			    uint32_t reg, uint32_t acc_flags)
453 {
454 	uint32_t ret;
455 
456 	if (amdgpu_device_skip_hw_access(adev))
457 		return 0;
458 
459 	if ((reg * 4) < adev->rmmio_size) {
460 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
461 		    amdgpu_sriov_runtime(adev) &&
462 		    down_read_trylock(&adev->reset_sem)) {
463 			ret = amdgpu_kiq_rreg(adev, reg);
464 			up_read(&adev->reset_sem);
465 		} else {
466 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
467 		}
468 	} else {
469 		ret = adev->pcie_rreg(adev, reg * 4);
470 	}
471 
472 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
473 
474 	return ret;
475 }
476 
477 /*
478  * MMIO register read with byte offset helper function
479  * @offset: byte offset from MMIO start
480  *
481  */
482 
483 /**
484  * amdgpu_mm_rreg8 - read a memory mapped IO register
485  *
486  * @adev: amdgpu_device pointer
487  * @offset: byte aligned register offset
488  *
489  * Returns the 8 bit value from the offset specified.
490  */
491 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
492 {
493 	if (amdgpu_device_skip_hw_access(adev))
494 		return 0;
495 
496 	if (offset < adev->rmmio_size)
497 		return (readb(adev->rmmio + offset));
498 	BUG();
499 }
500 
501 /*
502  * MMIO register write with byte offset helper function
503  * @offset: byte offset from MMIO start
504  * @value: the value to be written to the register
505  *
506  */
507 /**
508  * amdgpu_mm_wreg8 - write a memory mapped IO register
509  *
510  * @adev: amdgpu_device pointer
511  * @offset: byte aligned register offset
512  * @value: 8 bit value to write
513  *
514  * Writes the value specified to the offset specified.
515  */
516 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
517 {
518 	if (amdgpu_device_skip_hw_access(adev))
519 		return;
520 
521 	if (offset < adev->rmmio_size)
522 		writeb(value, adev->rmmio + offset);
523 	else
524 		BUG();
525 }
526 
527 /**
528  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
529  *
530  * @adev: amdgpu_device pointer
531  * @reg: dword aligned register offset
532  * @v: 32 bit value to write to the register
533  * @acc_flags: access flags which require special behavior
534  *
535  * Writes the value specified to the offset specified.
536  */
537 void amdgpu_device_wreg(struct amdgpu_device *adev,
538 			uint32_t reg, uint32_t v,
539 			uint32_t acc_flags)
540 {
541 	if (amdgpu_device_skip_hw_access(adev))
542 		return;
543 
544 	if ((reg * 4) < adev->rmmio_size) {
545 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
546 		    amdgpu_sriov_runtime(adev) &&
547 		    down_read_trylock(&adev->reset_sem)) {
548 			amdgpu_kiq_wreg(adev, reg, v);
549 			up_read(&adev->reset_sem);
550 		} else {
551 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
552 		}
553 	} else {
554 		adev->pcie_wreg(adev, reg * 4, v);
555 	}
556 
557 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
558 }
559 
560 /*
561  * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
562  *
563  * This function is invoked only for debugfs register access.
564  */
565 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
566 			     uint32_t reg, uint32_t v)
567 {
568 	if (amdgpu_device_skip_hw_access(adev))
569 		return;
570 
571 	if (amdgpu_sriov_fullaccess(adev) &&
572 	    adev->gfx.rlc.funcs &&
573 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
574 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
575 			return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);
576 	} else {
577 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
578 	}
579 }
580 
581 /**
582  * amdgpu_mm_rdoorbell - read a doorbell dword
583  *
584  * @adev: amdgpu_device pointer
585  * @index: doorbell index
586  *
587  * Returns the value in the doorbell aperture at the
588  * requested doorbell index (CIK).
589  */
590 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
591 {
592 	if (amdgpu_device_skip_hw_access(adev))
593 		return 0;
594 
595 	if (index < adev->doorbell.num_doorbells) {
596 		return readl(adev->doorbell.ptr + index);
597 	} else {
598 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
599 		return 0;
600 	}
601 }
602 
603 /**
604  * amdgpu_mm_wdoorbell - write a doorbell dword
605  *
606  * @adev: amdgpu_device pointer
607  * @index: doorbell index
608  * @v: value to write
609  *
610  * Writes @v to the doorbell aperture at the
611  * requested doorbell index (CIK).
612  */
613 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
614 {
615 	if (amdgpu_device_skip_hw_access(adev))
616 		return;
617 
618 	if (index < adev->doorbell.num_doorbells) {
619 		writel(v, adev->doorbell.ptr + index);
620 	} else {
621 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
622 	}
623 }
624 
625 /**
626  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
627  *
628  * @adev: amdgpu_device pointer
629  * @index: doorbell index
630  *
631  * Returns the value in the doorbell aperture at the
632  * requested doorbell index (VEGA10+).
633  */
634 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
635 {
636 	if (amdgpu_device_skip_hw_access(adev))
637 		return 0;
638 
639 	if (index < adev->doorbell.num_doorbells) {
640 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
641 	} else {
642 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
643 		return 0;
644 	}
645 }
646 
647 /**
648  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
649  *
650  * @adev: amdgpu_device pointer
651  * @index: doorbell index
652  * @v: value to write
653  *
654  * Writes @v to the doorbell aperture at the
655  * requested doorbell index (VEGA10+).
656  */
657 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
658 {
659 	if (amdgpu_device_skip_hw_access(adev))
660 		return;
661 
662 	if (index < adev->doorbell.num_doorbells) {
663 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
664 	} else {
665 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
666 	}
667 }
668 
669 /**
670  * amdgpu_device_indirect_rreg - read an indirect register
671  *
672  * @adev: amdgpu_device pointer
673  * @pcie_index: mmio register offset
674  * @pcie_data: mmio register offset
675  * @reg_addr: indirect register address to read from
676  *
677  * Returns the value of indirect register @reg_addr
678  */
679 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
680 				u32 pcie_index, u32 pcie_data,
681 				u32 reg_addr)
682 {
683 	unsigned long flags;
684 	u32 r;
685 	void __iomem *pcie_index_offset;
686 	void __iomem *pcie_data_offset;
687 
688 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
689 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
690 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
691 
692 	writel(reg_addr, pcie_index_offset);
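	/* read the index register back so the write is posted before reading data */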
693 	readl(pcie_index_offset);
694 	r = readl(pcie_data_offset);
695 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
696 
697 	return r;
698 }
699 
700 /**
701  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
702  *
703  * @adev: amdgpu_device pointer
704  * @pcie_index: mmio register offset
705  * @pcie_data: mmio register offset
706  * @reg_addr: indirect register address to read from
707  *
708  * Returns the value of indirect register @reg_addr
709  */
710 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
711 				  u32 pcie_index, u32 pcie_data,
712 				  u32 reg_addr)
713 {
714 	unsigned long flags;
715 	u64 r;
716 	void __iomem *pcie_index_offset;
717 	void __iomem *pcie_data_offset;
718 
719 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
720 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
721 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
722 
723 	/* read low 32 bits */
724 	writel(reg_addr, pcie_index_offset);
725 	readl(pcie_index_offset);
726 	r = readl(pcie_data_offset);
727 	/* read high 32 bits */
728 	writel(reg_addr + 4, pcie_index_offset);
729 	readl(pcie_index_offset);
730 	r |= ((u64)readl(pcie_data_offset) << 32);
731 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
732 
733 	return r;
734 }
735 
736 /**
737  * amdgpu_device_indirect_wreg - write an indirect register address
738  *
739  * @adev: amdgpu_device pointer
740  * @pcie_index: mmio register offset
741  * @pcie_data: mmio register offset
742  * @reg_addr: indirect register offset
743  * @reg_data: indirect register data
744  *
745  */
746 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
747 				 u32 pcie_index, u32 pcie_data,
748 				 u32 reg_addr, u32 reg_data)
749 {
750 	unsigned long flags;
751 	void __iomem *pcie_index_offset;
752 	void __iomem *pcie_data_offset;
753 
754 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
755 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
756 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
757 
758 	writel(reg_addr, pcie_index_offset);
759 	readl(pcie_index_offset);
760 	writel(reg_data, pcie_data_offset);
761 	readl(pcie_data_offset);
762 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
763 }
764 
765 /**
766  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
767  *
768  * @adev: amdgpu_device pointer
769  * @pcie_index: mmio register offset
770  * @pcie_data: mmio register offset
771  * @reg_addr: indirect register offset
772  * @reg_data: indirect register data
773  *
774  */
775 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
776 				   u32 pcie_index, u32 pcie_data,
777 				   u32 reg_addr, u64 reg_data)
778 {
779 	unsigned long flags;
780 	void __iomem *pcie_index_offset;
781 	void __iomem *pcie_data_offset;
782 
783 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
784 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
785 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
786 
787 	/* write low 32 bits */
788 	writel(reg_addr, pcie_index_offset);
789 	readl(pcie_index_offset);
790 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
791 	readl(pcie_data_offset);
792 	/* write high 32 bits */
793 	writel(reg_addr + 4, pcie_index_offset);
794 	readl(pcie_index_offset);
795 	writel((u32)(reg_data >> 32), pcie_data_offset);
796 	readl(pcie_data_offset);
797 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
798 }
799 
800 /**
801  * amdgpu_invalid_rreg - dummy reg read function
802  *
803  * @adev: amdgpu_device pointer
804  * @reg: offset of register
805  *
806  * Dummy register read function.  Used for register blocks
807  * that certain asics don't have (all asics).
808  * Returns the value in the register.
809  */
810 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
811 {
812 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
813 	BUG();
814 	return 0;
815 }
816 
817 /**
818  * amdgpu_invalid_wreg - dummy reg write function
819  *
820  * @adev: amdgpu_device pointer
821  * @reg: offset of register
822  * @v: value to write to the register
823  *
824  * Dummy register write function.  Used for register blocks
825  * that certain asics don't have (all asics).
826  */
827 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
828 {
829 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
830 		  reg, v);
831 	BUG();
832 }
833 
834 /**
835  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
836  *
837  * @adev: amdgpu_device pointer
838  * @reg: offset of register
839  *
840  * Dummy register read function.  Used for register blocks
841  * that certain asics don't have (all asics).
842  * Returns the value in the register.
843  */
844 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
845 {
846 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
847 	BUG();
848 	return 0;
849 }
850 
851 /**
852  * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
853  *
854  * @adev: amdgpu_device pointer
855  * @reg: offset of register
856  * @v: value to write to the register
857  *
858  * Dummy register write function.  Used for register blocks
859  * that certain asics don't have (all asics).
860  */
861 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
862 {
863 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
864 		  reg, v);
865 	BUG();
866 }
867 
868 /**
869  * amdgpu_block_invalid_rreg - dummy reg read function
870  *
871  * @adev: amdgpu_device pointer
872  * @block: offset of instance
873  * @reg: offset of register
874  *
875  * Dummy register read function.  Used for register blocks
876  * that certain asics don't have (all asics).
877  * Returns the value in the register.
878  */
879 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
880 					  uint32_t block, uint32_t reg)
881 {
882 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
883 		  reg, block);
884 	BUG();
885 	return 0;
886 }
887 
888 /**
889  * amdgpu_block_invalid_wreg - dummy reg write function
890  *
891  * @adev: amdgpu_device pointer
892  * @block: offset of instance
893  * @reg: offset of register
894  * @v: value to write to the register
895  *
896  * Dummy register write function.  Used for register blocks
897  * that certain asics don't have (all asics).
898  */
899 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
900 				      uint32_t block,
901 				      uint32_t reg, uint32_t v)
902 {
903 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
904 		  reg, block, v);
905 	BUG();
906 }
907 
908 /**
909  * amdgpu_device_asic_init - Wrapper for atom asic_init
910  *
911  * @adev: amdgpu_device pointer
912  *
913  * Does any asic specific work and then calls atom asic init.
914  */
915 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
916 {
917 	amdgpu_asic_pre_asic_init(adev);
918 
919 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
920 }
921 
922 /**
923  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
924  *
925  * @adev: amdgpu_device pointer
926  *
927  * Allocates a scratch page of VRAM for use by various things in the
928  * driver.
929  */
930 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
931 {
932 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
933 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
934 				       &adev->vram_scratch.robj,
935 				       &adev->vram_scratch.gpu_addr,
936 				       (void **)&adev->vram_scratch.ptr);
937 }
938 
939 /**
940  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
941  *
942  * @adev: amdgpu_device pointer
943  *
944  * Frees the VRAM scratch page.
945  */
946 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
947 {
948 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
949 }
950 
951 /**
952  * amdgpu_device_program_register_sequence - program an array of registers.
953  *
954  * @adev: amdgpu_device pointer
955  * @registers: pointer to the register array
956  * @array_size: size of the register array
957  *
958  * Programs an array of registers with AND and OR masks.
959  * This is a helper for setting golden registers.
960  */
961 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
962 					     const u32 *registers,
963 					     const u32 array_size)
964 {
965 	u32 tmp, reg, and_mask, or_mask;
966 	int i;
967 
968 	if (array_size % 3)
969 		return;
970 
971 	for (i = 0; i < array_size; i += 3) {
972 		reg = registers[i + 0];
973 		and_mask = registers[i + 1];
974 		or_mask = registers[i + 2];
975 
976 		if (and_mask == 0xffffffff) {
977 			tmp = or_mask;
978 		} else {
979 			tmp = RREG32(reg);
980 			tmp &= ~and_mask;
981 			if (adev->family >= AMDGPU_FAMILY_AI)
982 				tmp |= (or_mask & and_mask);
983 			else
984 				tmp |= or_mask;
985 		}
986 		WREG32(reg, tmp);
987 	}
988 }
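
/*
 * The register array is consumed as {offset, and_mask, or_mask} triplets.  An
 * illustrative (hypothetical, not from this file) golden-settings table:
 *
 *   static const u32 example_golden_settings[] = {
 *           mmSOME_REG, 0xffffffff, 0x00000001,
 *   };
 */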
989 
990 /**
991  * amdgpu_device_pci_config_reset - reset the GPU
992  *
993  * @adev: amdgpu_device pointer
994  *
995  * Resets the GPU using the pci config reset sequence.
996  * Only applicable to asics prior to vega10.
997  */
998 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
999 {
1000 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1001 }
1002 
1003 /**
1004  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1005  *
1006  * @adev: amdgpu_device pointer
1007  *
1008  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1009  */
1010 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1011 {
1012 	return pci_reset_function(adev->pdev);
1013 }
1014 
1015 /*
1016  * GPU doorbell aperture helper functions.
1017  */
1018 /**
1019  * amdgpu_device_doorbell_init - Init doorbell driver information.
1020  *
1021  * @adev: amdgpu_device pointer
1022  *
1023  * Init doorbell driver information (CIK)
1024  * Returns 0 on success, error on failure.
1025  */
1026 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1027 {
1028 
1029 	/* No doorbell on SI hardware generation */
1030 	if (adev->asic_type < CHIP_BONAIRE) {
1031 		adev->doorbell.base = 0;
1032 		adev->doorbell.size = 0;
1033 		adev->doorbell.num_doorbells = 0;
1034 		adev->doorbell.ptr = NULL;
1035 		return 0;
1036 	}
1037 
1038 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1039 		return -EINVAL;
1040 
1041 	amdgpu_asic_init_doorbell_index(adev);
1042 
1043 	/* doorbell bar mapping */
1044 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1045 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1046 
1047 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
1048 					     adev->doorbell_index.max_assignment+1);
1049 	if (adev->doorbell.num_doorbells == 0)
1050 		return -EINVAL;
1051 
1052 	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
1053 	 * paging queue doorbell use the second page. The
1054 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1055 	 * doorbells are in the first page. So with paging queue enabled,
1056 	 * the max num_doorbells should be increased by one page (0x400 dwords).
1057 	 */
1058 	if (adev->asic_type >= CHIP_VEGA10)
1059 		adev->doorbell.num_doorbells += 0x400;
1060 
1061 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
1062 				     adev->doorbell.num_doorbells *
1063 				     sizeof(u32));
1064 	if (adev->doorbell.ptr == NULL)
1065 		return -ENOMEM;
1066 
1067 	return 0;
1068 }
1069 
1070 /**
1071  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1072  *
1073  * @adev: amdgpu_device pointer
1074  *
1075  * Tear down doorbell driver information (CIK)
1076  */
1077 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1078 {
1079 	iounmap(adev->doorbell.ptr);
1080 	adev->doorbell.ptr = NULL;
1081 }
1082 
1083 
1084 
1085 /*
1086  * amdgpu_device_wb_*()
1087  * Writeback is the method by which the GPU updates special pages in memory
1088  * with the status of certain GPU events (fences, ring pointers, etc.).
1089  */
1090 
1091 /**
1092  * amdgpu_device_wb_fini - Disable Writeback and free memory
1093  *
1094  * @adev: amdgpu_device pointer
1095  *
1096  * Disables Writeback and frees the Writeback memory (all asics).
1097  * Used at driver shutdown.
1098  */
1099 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1100 {
1101 	if (adev->wb.wb_obj) {
1102 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1103 				      &adev->wb.gpu_addr,
1104 				      (void **)&adev->wb.wb);
1105 		adev->wb.wb_obj = NULL;
1106 	}
1107 }
1108 
1109 /**
1110  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1111  *
1112  * @adev: amdgpu_device pointer
1113  *
1114  * Initializes writeback and allocates writeback memory (all asics).
1115  * Used at driver startup.
1116  * Returns 0 on success or an -error on failure.
1117  */
1118 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1119 {
1120 	int r;
1121 
1122 	if (adev->wb.wb_obj == NULL) {
1123 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1124 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1125 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1126 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1127 					    (void **)&adev->wb.wb);
1128 		if (r) {
1129 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1130 			return r;
1131 		}
1132 
1133 		adev->wb.num_wb = AMDGPU_MAX_WB;
1134 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1135 
1136 		/* clear wb memory */
1137 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1138 	}
1139 
1140 	return 0;
1141 }
1142 
1143 /**
1144  * amdgpu_device_wb_get - Allocate a wb entry
1145  *
1146  * @adev: amdgpu_device pointer
1147  * @wb: wb index
1148  *
1149  * Allocate a wb slot for use by the driver (all asics).
1150  * Returns 0 on success or -EINVAL on failure.
1151  */
1152 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1153 {
1154 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1155 
1156 	if (offset < adev->wb.num_wb) {
1157 		__set_bit(offset, adev->wb.used);
1158 		*wb = offset << 3; /* convert to dw offset */
1159 		return 0;
1160 	} else {
1161 		return -EINVAL;
1162 	}
1163 }
1164 
1165 /**
1166  * amdgpu_device_wb_free - Free a wb entry
1167  *
1168  * @adev: amdgpu_device pointer
1169  * @wb: wb index
1170  *
1171  * Free a wb slot allocated for use by the driver (all asics)
1172  */
1173 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1174 {
1175 	wb >>= 3;
1176 	if (wb < adev->wb.num_wb)
1177 		__clear_bit(wb, adev->wb.used);
1178 }
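
/*
 * Typical usage sketch (illustrative only): a caller allocates a slot, derives
 * the GPU and CPU addresses from the returned dword index, and frees the slot
 * when done.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 wb_gpu_addr = adev->wb.gpu_addr + wb * 4;
 *           u32 *wb_cpu_addr = &adev->wb.wb[wb];
 *           ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */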
1179 
1180 /**
1181  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1182  *
1183  * @adev: amdgpu_device pointer
1184  *
1185  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1186  * to fail, but if any of the BARs is not accessible after the size we abort
1187  * driver loading by returning -ENODEV.
1188  */
1189 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1190 {
1191 	int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1192 	struct pci_bus *root;
1193 	struct resource *res;
1194 	unsigned i;
1195 	u16 cmd;
1196 	int r;
1197 
1198 	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1199 		return 0;
1200 
1201 	/* Bypass for VF */
1202 	if (amdgpu_sriov_vf(adev))
1203 		return 0;
1204 
1205 	/* skip if the bios has already enabled large BAR */
1206 	if (adev->gmc.real_vram_size &&
1207 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1208 		return 0;
1209 
1210 	/* Check if the root BUS has 64bit memory resources */
1211 	root = adev->pdev->bus;
1212 	while (root->parent)
1213 		root = root->parent;
1214 
1215 	pci_bus_for_each_resource(root, res, i) {
1216 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1217 		    res->start > 0x100000000ull)
1218 			break;
1219 	}
1220 
1221 	/* Trying to resize is pointless without a root hub window above 4GB */
1222 	if (!res)
1223 		return 0;
1224 
1225 	/* Limit the BAR size to what is available */
1226 	rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1227 			rbar_size);
1228 
1229 	/* Disable memory decoding while we change the BAR addresses and size */
1230 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1231 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1232 			      cmd & ~PCI_COMMAND_MEMORY);
1233 
1234 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1235 	amdgpu_device_doorbell_fini(adev);
1236 	if (adev->asic_type >= CHIP_BONAIRE)
1237 		pci_release_resource(adev->pdev, 2);
1238 
1239 	pci_release_resource(adev->pdev, 0);
1240 
1241 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1242 	if (r == -ENOSPC)
1243 		DRM_INFO("Not enough PCI address space for a large BAR.");
1244 	else if (r && r != -ENOTSUPP)
1245 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1246 
1247 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1248 
1249 	/* When the doorbell or fb BAR isn't available we have no chance of
1250 	 * using the device.
1251 	 */
1252 	r = amdgpu_device_doorbell_init(adev);
1253 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1254 		return -ENODEV;
1255 
1256 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1257 
1258 	return 0;
1259 }
1260 
1261 /*
1262  * GPU helpers function.
1263  */
1264 /**
1265  * amdgpu_device_need_post - check if the hw need post or not
1266  *
1267  * @adev: amdgpu_device pointer
1268  *
1269  * Check if the asic has been initialized (all asics) at driver startup,
1270  * or whether a post is needed because a hw reset was performed.
1271  * Returns true if a post is needed, false if not.
1272  */
1273 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1274 {
1275 	uint32_t reg;
1276 
1277 	if (amdgpu_sriov_vf(adev))
1278 		return false;
1279 
1280 	if (amdgpu_passthrough(adev)) {
1281 		/* for FIJI: in the whole-GPU pass-through virtualization case, after a VM
1282 		 * reboot some old smc fw still needs the driver to do a vPost or the gpu
1283 		 * hangs, while smc fw versions above 22.15 don't have this flaw, so we
1284 		 * force a vPost for smc versions below 22.15
1285 		 */
1286 		if (adev->asic_type == CHIP_FIJI) {
1287 			int err;
1288 			uint32_t fw_ver;
1289 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1290 			/* force vPost if an error occurred */
1291 			if (err)
1292 				return true;
1293 
1294 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1295 			release_firmware(adev->pm.fw);
1296 			if (fw_ver < 0x00160e00)
1297 				return true;
1298 		}
1299 	}
1300 
1301 	/* Don't post if we need to reset whole hive on init */
1302 	if (adev->gmc.xgmi.pending_reset)
1303 		return false;
1304 
1305 	if (adev->has_hw_reset) {
1306 		adev->has_hw_reset = false;
1307 		return true;
1308 	}
1309 
1310 	/* bios scratch used on CIK+ */
1311 	if (adev->asic_type >= CHIP_BONAIRE)
1312 		return amdgpu_atombios_scratch_need_asic_init(adev);
1313 
1314 	/* check MEM_SIZE for older asics */
1315 	reg = amdgpu_asic_get_config_memsize(adev);
1316 
1317 	if ((reg != 0) && (reg != 0xffffffff))
1318 		return false;
1319 
1320 	return true;
1321 }
1322 
1323 /*
1324  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1325  * speed switching. Until we have confirmation from Intel that a specific host
1326  * supports it, it's safer that we keep it disabled for all.
1327  *
1328  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1329  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1330  */
1331 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1332 {
1333 #if IS_ENABLED(CONFIG_X86)
1334 	struct cpuinfo_x86 *c = &cpu_data(0);
1335 
1336 	if (c->x86_vendor == X86_VENDOR_INTEL)
1337 		return false;
1338 #endif
1339 	return true;
1340 }
1341 
1342 /**
1343  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1344  *
1345  * @adev: amdgpu_device pointer
1346  *
1347  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1348  * be set for this device.
1349  *
1350  * Returns true if it should be used or false if not.
1351  */
1352 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1353 {
1354 	switch (amdgpu_aspm) {
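	/* -1 (the default) means auto: defer to what the PCIe core already set up */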
1355 	case -1:
1356 		break;
1357 	case 0:
1358 		return false;
1359 	case 1:
1360 		return true;
1361 	default:
1362 		return false;
1363 	}
1364 	return pcie_aspm_enabled(adev->pdev);
1365 }
1366 
1367 bool amdgpu_device_aspm_support_quirk(void)
1368 {
1369 #if IS_ENABLED(CONFIG_X86)
1370 	struct cpuinfo_x86 *c = &cpu_data(0);
1371 
1372 	return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1373 #else
1374 	return true;
1375 #endif
1376 }
1377 
1378 /* if we get transitioned to only one device, take VGA back */
1379 /**
1380  * amdgpu_device_vga_set_decode - enable/disable vga decode
1381  *
1382  * @pdev: PCI device pointer
1383  * @state: enable/disable vga decode
1384  *
1385  * Enable/disable vga decode (all asics).
1386  * Returns VGA resource flags.
1387  */
1388 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1389 		bool state)
1390 {
1391 	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1392 	amdgpu_asic_set_vga_state(adev, state);
1393 	if (state)
1394 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1395 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1396 	else
1397 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1398 }
1399 
1400 /**
1401  * amdgpu_device_check_block_size - validate the vm block size
1402  *
1403  * @adev: amdgpu_device pointer
1404  *
1405  * Validates the vm block size specified via module parameter.
1406  * The vm block size defines number of bits in page table versus page directory,
1407  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1408  * page table and the remaining bits are in the page directory.
1409  */
1410 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1411 {
1412 	/* defines number of bits in page table versus page directory,
1413 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1414 	 * page table and the remaining bits are in the page directory */
1415 	if (amdgpu_vm_block_size == -1)
1416 		return;
1417 
1418 	if (amdgpu_vm_block_size < 9) {
1419 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1420 			 amdgpu_vm_block_size);
1421 		amdgpu_vm_block_size = -1;
1422 	}
1423 }
1424 
1425 /**
1426  * amdgpu_device_check_vm_size - validate the vm size
1427  *
1428  * @adev: amdgpu_device pointer
1429  *
1430  * Validates the vm size in GB specified via module parameter.
1431  * The VM size is the size of the GPU virtual memory space in GB.
1432  */
1433 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1434 {
1435 	/* no need to check the default value */
1436 	if (amdgpu_vm_size == -1)
1437 		return;
1438 
1439 	if (amdgpu_vm_size < 1) {
1440 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1441 			 amdgpu_vm_size);
1442 		amdgpu_vm_size = -1;
1443 	}
1444 }
1445 
1446 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1447 {
1448 	struct sysinfo si;
1449 	bool is_os_64 = (sizeof(void *) == 8);
1450 	uint64_t total_memory;
1451 	uint64_t dram_size_seven_GB = 0x1B8000000;
1452 	uint64_t dram_size_three_GB = 0xB8000000;
1453 
1454 	if (amdgpu_smu_memory_pool_size == 0)
1455 		return;
1456 
1457 	if (!is_os_64) {
1458 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1459 		goto def_value;
1460 	}
1461 	si_meminfo(&si);
1462 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1463 
1464 	if ((amdgpu_smu_memory_pool_size == 1) ||
1465 		(amdgpu_smu_memory_pool_size == 2)) {
1466 		if (total_memory < dram_size_three_GB)
1467 			goto def_value1;
1468 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1469 		(amdgpu_smu_memory_pool_size == 8)) {
1470 		if (total_memory < dram_size_seven_GB)
1471 			goto def_value1;
1472 	} else {
1473 		DRM_WARN("Smu memory pool size not supported\n");
1474 		goto def_value;
1475 	}
1476 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1477 
1478 	return;
1479 
1480 def_value1:
1481 	DRM_WARN("Not enough system memory\n");
1482 def_value:
1483 	adev->pm.smu_prv_buffer_size = 0;
1484 }
1485 
1486 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1487 {
1488 	if (!(adev->flags & AMD_IS_APU) ||
1489 	    adev->asic_type < CHIP_RAVEN)
1490 		return 0;
1491 
1492 	switch (adev->asic_type) {
1493 	case CHIP_RAVEN:
1494 		if (adev->pdev->device == 0x15dd)
1495 			adev->apu_flags |= AMD_APU_IS_RAVEN;
1496 		if (adev->pdev->device == 0x15d8)
1497 			adev->apu_flags |= AMD_APU_IS_PICASSO;
1498 		break;
1499 	case CHIP_RENOIR:
1500 		if ((adev->pdev->device == 0x1636) ||
1501 		    (adev->pdev->device == 0x164c))
1502 			adev->apu_flags |= AMD_APU_IS_RENOIR;
1503 		else
1504 			adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1505 		break;
1506 	case CHIP_VANGOGH:
1507 		adev->apu_flags |= AMD_APU_IS_VANGOGH;
1508 		break;
1509 	case CHIP_YELLOW_CARP:
1510 		break;
1511 	case CHIP_CYAN_SKILLFISH:
1512 		if (adev->pdev->device == 0x13FE)
1513 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1514 		break;
1515 	default:
1516 		return -EINVAL;
1517 	}
1518 
1519 	return 0;
1520 }
1521 
1522 /**
1523  * amdgpu_device_check_arguments - validate module params
1524  *
1525  * @adev: amdgpu_device pointer
1526  *
1527  * Validates certain module parameters and updates
1528  * the associated values used by the driver (all asics).
1529  */
1530 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1531 {
1532 	if (amdgpu_sched_jobs < 4) {
1533 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1534 			 amdgpu_sched_jobs);
1535 		amdgpu_sched_jobs = 4;
1536 	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
1537 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1538 			 amdgpu_sched_jobs);
1539 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1540 	}
1541 
1542 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1543 		/* gart size must be greater than or equal to 32M */
1544 		dev_warn(adev->dev, "gart size (%d) too small\n",
1545 			 amdgpu_gart_size);
1546 		amdgpu_gart_size = -1;
1547 	}
1548 
1549 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1550 		/* gtt size must be greater than or equal to 32M */
1551 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1552 				 amdgpu_gtt_size);
1553 		amdgpu_gtt_size = -1;
1554 	}
1555 
1556 	/* valid range is between 4 and 9 inclusive */
1557 	if (amdgpu_vm_fragment_size != -1 &&
1558 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1559 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1560 		amdgpu_vm_fragment_size = -1;
1561 	}
1562 
1563 	if (amdgpu_sched_hw_submission < 2) {
1564 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1565 			 amdgpu_sched_hw_submission);
1566 		amdgpu_sched_hw_submission = 2;
1567 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1568 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1569 			 amdgpu_sched_hw_submission);
1570 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1571 	}
1572 
1573 	amdgpu_device_check_smu_prv_buffer_size(adev);
1574 
1575 	amdgpu_device_check_vm_size(adev);
1576 
1577 	amdgpu_device_check_block_size(adev);
1578 
1579 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1580 
1581 	amdgpu_gmc_tmz_set(adev);
1582 
1583 	amdgpu_gmc_noretry_set(adev);
1584 
1585 	return 0;
1586 }
1587 
1588 /**
1589  * amdgpu_switcheroo_set_state - set switcheroo state
1590  *
1591  * @pdev: pci dev pointer
1592  * @state: vga_switcheroo state
1593  *
1594  * Callback for the switcheroo driver.  Suspends or resumes the
1595  * asic before or after it is powered up using ACPI methods.
1596  */
1597 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1598 					enum vga_switcheroo_state state)
1599 {
1600 	struct drm_device *dev = pci_get_drvdata(pdev);
1601 	int r;
1602 
1603 	if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1604 		return;
1605 
1606 	if (state == VGA_SWITCHEROO_ON) {
1607 		pr_info("switched on\n");
1608 		/* don't suspend or resume card normally */
1609 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1610 
1611 		pci_set_power_state(pdev, PCI_D0);
1612 		amdgpu_device_load_pci_state(pdev);
1613 		r = pci_enable_device(pdev);
1614 		if (r)
1615 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1616 		amdgpu_device_resume(dev, true);
1617 
1618 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1619 	} else {
1620 		pr_info("switched off\n");
1621 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1622 		amdgpu_device_suspend(dev, true);
1623 		amdgpu_device_cache_pci_state(pdev);
1624 		/* Shut down the device */
1625 		pci_disable_device(pdev);
1626 		pci_set_power_state(pdev, PCI_D3cold);
1627 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1628 	}
1629 }
1630 
1631 /**
1632  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1633  *
1634  * @pdev: pci dev pointer
1635  *
1636  * Callback for the switcheroo driver.  Checks if the switcheroo
1637  * state can be changed.
1638  * Returns true if the state can be changed, false if not.
1639  */
1640 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1641 {
1642 	struct drm_device *dev = pci_get_drvdata(pdev);
1643 
1644 	/*
1645 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1646 	* locking inversion with the driver load path. And the access here is
1647 	* completely racy anyway. So don't bother with locking for now.
1648 	*/
1649 	return atomic_read(&dev->open_count) == 0;
1650 }
1651 
1652 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1653 	.set_gpu_state = amdgpu_switcheroo_set_state,
1654 	.reprobe = NULL,
1655 	.can_switch = amdgpu_switcheroo_can_switch,
1656 };
1657 
1658 /**
1659  * amdgpu_device_ip_set_clockgating_state - set the CG state
1660  *
1661  * @dev: amdgpu_device pointer
1662  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1663  * @state: clockgating state (gate or ungate)
1664  *
1665  * Sets the requested clockgating state for all instances of
1666  * the hardware IP specified.
1667  * Returns the error code from the last instance.
1668  */
1669 int amdgpu_device_ip_set_clockgating_state(void *dev,
1670 					   enum amd_ip_block_type block_type,
1671 					   enum amd_clockgating_state state)
1672 {
1673 	struct amdgpu_device *adev = dev;
1674 	int i, r = 0;
1675 
1676 	for (i = 0; i < adev->num_ip_blocks; i++) {
1677 		if (!adev->ip_blocks[i].status.valid)
1678 			continue;
1679 		if (adev->ip_blocks[i].version->type != block_type)
1680 			continue;
1681 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1682 			continue;
1683 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1684 			(void *)adev, state);
1685 		if (r)
1686 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1687 				  adev->ip_blocks[i].version->funcs->name, r);
1688 	}
1689 	return r;
1690 }
1691 
1692 /**
1693  * amdgpu_device_ip_set_powergating_state - set the PG state
1694  *
1695  * @dev: amdgpu_device pointer
1696  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1697  * @state: powergating state (gate or ungate)
1698  *
1699  * Sets the requested powergating state for all instances of
1700  * the hardware IP specified.
1701  * Returns the error code from the last instance.
1702  */
1703 int amdgpu_device_ip_set_powergating_state(void *dev,
1704 					   enum amd_ip_block_type block_type,
1705 					   enum amd_powergating_state state)
1706 {
1707 	struct amdgpu_device *adev = dev;
1708 	int i, r = 0;
1709 
1710 	for (i = 0; i < adev->num_ip_blocks; i++) {
1711 		if (!adev->ip_blocks[i].status.valid)
1712 			continue;
1713 		if (adev->ip_blocks[i].version->type != block_type)
1714 			continue;
1715 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1716 			continue;
1717 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1718 			(void *)adev, state);
1719 		if (r)
1720 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1721 				  adev->ip_blocks[i].version->funcs->name, r);
1722 	}
1723 	return r;
1724 }
1725 
1726 /**
1727  * amdgpu_device_ip_get_clockgating_state - get the CG state
1728  *
1729  * @adev: amdgpu_device pointer
1730  * @flags: clockgating feature flags
1731  *
1732  * Walks the list of IPs on the device and updates the clockgating
1733  * flags for each IP.
1734  * Updates @flags with the feature flags for each hardware IP where
1735  * clockgating is enabled.
1736  */
1737 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1738 					    u32 *flags)
1739 {
1740 	int i;
1741 
1742 	for (i = 0; i < adev->num_ip_blocks; i++) {
1743 		if (!adev->ip_blocks[i].status.valid)
1744 			continue;
1745 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1746 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1747 	}
1748 }
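
/*
 * Usage sketch (illustrative): the returned flags are a bitmask of
 * AMD_CG_SUPPORT_* features that are currently gated, so a caller can test
 * an individual feature, e.g. GFX medium-grain clockgating (assuming the
 * AMD_CG_SUPPORT_GFX_MGCG bit from amd_shared.h):
 *
 *	u32 flags = 0;
 *
 *	amdgpu_device_ip_get_clockgating_state(adev, &flags);
 *	if (flags & AMD_CG_SUPPORT_GFX_MGCG)
 *		dev_info(adev->dev, "GFX MGCG is enabled\n");
 */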
1749 
1750 /**
1751  * amdgpu_device_ip_wait_for_idle - wait for idle
1752  *
1753  * @adev: amdgpu_device pointer
1754  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1755  *
1756  * Waits for the requested hardware IP to be idle.
1757  * Returns 0 for success or a negative error code on failure.
1758  */
1759 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1760 				   enum amd_ip_block_type block_type)
1761 {
1762 	int i, r;
1763 
1764 	for (i = 0; i < adev->num_ip_blocks; i++) {
1765 		if (!adev->ip_blocks[i].status.valid)
1766 			continue;
1767 		if (adev->ip_blocks[i].version->type == block_type) {
1768 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1769 			if (r)
1770 				return r;
1771 			break;
1772 		}
1773 	}
1774 	return 0;
1775 
1776 }
1777 
1778 /**
1779  * amdgpu_device_ip_is_idle - is the hardware IP idle
1780  *
1781  * @adev: amdgpu_device pointer
1782  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1783  *
1784  * Check if the hardware IP is idle or not.
1785  * Returns true if the IP is idle, false if not.
1786  */
1787 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1788 			      enum amd_ip_block_type block_type)
1789 {
1790 	int i;
1791 
1792 	for (i = 0; i < adev->num_ip_blocks; i++) {
1793 		if (!adev->ip_blocks[i].status.valid)
1794 			continue;
1795 		if (adev->ip_blocks[i].version->type == block_type)
1796 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1797 	}
1798 	return true;
1799 
1800 }
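
/*
 * Usage sketch (illustrative): a caller that wants the GFX block quiesced
 * could do a cheap check first and only then block until idle:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX)) {
 *		int r = amdgpu_device_ip_wait_for_idle(adev,
 *						       AMD_IP_BLOCK_TYPE_GFX);
 *		if (r)
 *			dev_err(adev->dev, "GFX did not go idle (%d)\n", r);
 *	}
 */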
1801 
1802 /**
1803  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1804  *
1805  * @adev: amdgpu_device pointer
1806  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1807  *
1808  * Returns a pointer to the hardware IP block structure
1809  * if it exists for the asic, otherwise NULL.
1810  */
1811 struct amdgpu_ip_block *
1812 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1813 			      enum amd_ip_block_type type)
1814 {
1815 	int i;
1816 
1817 	for (i = 0; i < adev->num_ip_blocks; i++)
1818 		if (adev->ip_blocks[i].version->type == type)
1819 			return &adev->ip_blocks[i];
1820 
1821 	return NULL;
1822 }
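
/*
 * Usage sketch (illustrative): look up an IP block and read its version,
 * e.g. to log which GFX generation was discovered on this asic:
 *
 *	struct amdgpu_ip_block *gfx =
 *		amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
 *
 *	if (gfx)
 *		dev_info(adev->dev, "GFX IP v%u.%u\n",
 *			 gfx->version->major, gfx->version->minor);
 */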
1823 
1824 /**
1825  * amdgpu_device_ip_block_version_cmp
1826  *
1827  * @adev: amdgpu_device pointer
1828  * @type: enum amd_ip_block_type
1829  * @major: major version
1830  * @minor: minor version
1831  *
1832  * Returns 0 if the installed version is equal to or greater than the requested one,
1833  * 1 if it is smaller or the ip_block doesn't exist.
1834  */
1835 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1836 				       enum amd_ip_block_type type,
1837 				       u32 major, u32 minor)
1838 {
1839 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1840 
1841 	if (ip_block && ((ip_block->version->major > major) ||
1842 			((ip_block->version->major == major) &&
1843 			(ip_block->version->minor >= minor))))
1844 		return 0;
1845 
1846 	return 1;
1847 }
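
/*
 * Usage sketch (illustrative): because 0 means "equal or greater", the helper
 * reads naturally as a feature gate, e.g. only enable something on GFX 9.0 or
 * newer (the helper called below is hypothetical):
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *						9, 0))
 *		enable_gfx9_feature(adev);
 */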
1848 
1849 /**
1850  * amdgpu_device_ip_block_add
1851  *
1852  * @adev: amdgpu_device pointer
1853  * @ip_block_version: pointer to the IP to add
1854  *
1855  * Adds the IP block driver information to the collection of IPs
1856  * on the asic.
1857  */
1858 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1859 			       const struct amdgpu_ip_block_version *ip_block_version)
1860 {
1861 	if (!ip_block_version)
1862 		return -EINVAL;
1863 
1864 	switch (ip_block_version->type) {
1865 	case AMD_IP_BLOCK_TYPE_VCN:
1866 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1867 			return 0;
1868 		break;
1869 	case AMD_IP_BLOCK_TYPE_JPEG:
1870 		if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1871 			return 0;
1872 		break;
1873 	default:
1874 		break;
1875 	}
1876 
1877 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1878 		  ip_block_version->funcs->name);
1879 
1880 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1881 
1882 	return 0;
1883 }
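
/*
 * Usage sketch (illustrative): the per-ASIC set_ip_blocks() functions call
 * this once per IP, in bring-up order, roughly like the following (the block
 * names here are hypothetical placeholders):
 *
 *	amdgpu_device_ip_block_add(adev, &my_asic_common_ip_block);
 *	amdgpu_device_ip_block_add(adev, &my_asic_gmc_ip_block);
 *	amdgpu_device_ip_block_add(adev, &my_asic_ih_ip_block);
 *
 * Harvested VCN/JPEG instances are silently skipped by the checks above.
 */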
1884 
1885 /**
1886  * amdgpu_device_enable_virtual_display - enable virtual display feature
1887  *
1888  * @adev: amdgpu_device pointer
1889  *
1890  * Enables the virtual display feature if the user has enabled it via
1891  * the module parameter virtual_display.  This feature provides virtual
1892  * display hardware on headless boards or in virtualized environments.
1893  * This function parses and validates the configuration string specified by
1894  * the user and configures the virtual display configuration (number of
1895  * virtual connectors, crtcs, etc.) specified.
1896  */
1897 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1898 {
1899 	adev->enable_virtual_display = false;
1900 
1901 	if (amdgpu_virtual_display) {
1902 		const char *pci_address_name = pci_name(adev->pdev);
1903 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1904 
1905 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1906 		pciaddstr_tmp = pciaddstr;
1907 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1908 			pciaddname = strsep(&pciaddname_tmp, ",");
1909 			if (!strcmp("all", pciaddname)
1910 			    || !strcmp(pci_address_name, pciaddname)) {
1911 				long num_crtc;
1912 				int res = -1;
1913 
1914 				adev->enable_virtual_display = true;
1915 
1916 				if (pciaddname_tmp)
1917 					res = kstrtol(pciaddname_tmp, 10,
1918 						      &num_crtc);
1919 
1920 				if (!res) {
1921 					if (num_crtc < 1)
1922 						num_crtc = 1;
1923 					if (num_crtc > 6)
1924 						num_crtc = 6;
1925 					adev->mode_info.num_crtc = num_crtc;
1926 				} else {
1927 					adev->mode_info.num_crtc = 1;
1928 				}
1929 				break;
1930 			}
1931 		}
1932 
1933 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1934 			 amdgpu_virtual_display, pci_address_name,
1935 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1936 
1937 		kfree(pciaddstr);
1938 	}
1939 }
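
/*
 * Example of the string this parses (derived from the strsep() logic above;
 * the PCI address is of course board specific): entries are separated by ';',
 * and each entry is "<pci address>,<num_crtc>" or "all":
 *
 *	amdgpu.virtual_display=0000:01:00.0,2
 *	amdgpu.virtual_display=all
 *
 * num_crtc is clamped to the range 1..6 and defaults to 1 when omitted.
 */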
1940 
1941 /**
1942  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1943  *
1944  * @adev: amdgpu_device pointer
1945  *
1946  * Parses the asic configuration parameters specified in the gpu info
1947  * firmware and makes them available to the driver for use in configuring
1948  * the asic.
1949  * Returns 0 on success, -EINVAL on failure.
1950  */
1951 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1952 {
1953 	const char *chip_name;
1954 	char fw_name[40];
1955 	int err;
1956 	const struct gpu_info_firmware_header_v1_0 *hdr;
1957 
1958 	adev->firmware.gpu_info_fw = NULL;
1959 
1960 	if (adev->mman.discovery_bin) {
1961 		amdgpu_discovery_get_gfx_info(adev);
1962 
1963 		/*
1964 		 * FIXME: The bounding box is still needed by Navi12, so
1965 		 * temporarily read it from gpu_info firmware. Should be dropped
1966 		 * when DAL no longer needs it.
1967 		 */
1968 		if (adev->asic_type != CHIP_NAVI12)
1969 			return 0;
1970 	}
1971 
1972 	switch (adev->asic_type) {
1973 #ifdef CONFIG_DRM_AMDGPU_SI
1974 	case CHIP_VERDE:
1975 	case CHIP_TAHITI:
1976 	case CHIP_PITCAIRN:
1977 	case CHIP_OLAND:
1978 	case CHIP_HAINAN:
1979 #endif
1980 #ifdef CONFIG_DRM_AMDGPU_CIK
1981 	case CHIP_BONAIRE:
1982 	case CHIP_HAWAII:
1983 	case CHIP_KAVERI:
1984 	case CHIP_KABINI:
1985 	case CHIP_MULLINS:
1986 #endif
1987 	case CHIP_TOPAZ:
1988 	case CHIP_TONGA:
1989 	case CHIP_FIJI:
1990 	case CHIP_POLARIS10:
1991 	case CHIP_POLARIS11:
1992 	case CHIP_POLARIS12:
1993 	case CHIP_VEGAM:
1994 	case CHIP_CARRIZO:
1995 	case CHIP_STONEY:
1996 	case CHIP_VEGA20:
1997 	case CHIP_ALDEBARAN:
1998 	case CHIP_SIENNA_CICHLID:
1999 	case CHIP_NAVY_FLOUNDER:
2000 	case CHIP_DIMGREY_CAVEFISH:
2001 	case CHIP_BEIGE_GOBY:
2002 	default:
2003 		return 0;
2004 	case CHIP_VEGA10:
2005 		chip_name = "vega10";
2006 		break;
2007 	case CHIP_VEGA12:
2008 		chip_name = "vega12";
2009 		break;
2010 	case CHIP_RAVEN:
2011 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2012 			chip_name = "raven2";
2013 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2014 			chip_name = "picasso";
2015 		else
2016 			chip_name = "raven";
2017 		break;
2018 	case CHIP_ARCTURUS:
2019 		chip_name = "arcturus";
2020 		break;
2021 	case CHIP_RENOIR:
2022 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
2023 			chip_name = "renoir";
2024 		else
2025 			chip_name = "green_sardine";
2026 		break;
2027 	case CHIP_NAVI10:
2028 		chip_name = "navi10";
2029 		break;
2030 	case CHIP_NAVI14:
2031 		chip_name = "navi14";
2032 		break;
2033 	case CHIP_NAVI12:
2034 		chip_name = "navi12";
2035 		break;
2036 	case CHIP_VANGOGH:
2037 		chip_name = "vangogh";
2038 		break;
2039 	case CHIP_YELLOW_CARP:
2040 		chip_name = "yellow_carp";
2041 		break;
2042 	}
2043 
2044 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
2045 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
2046 	if (err) {
2047 		dev_err(adev->dev,
2048 			"Failed to load gpu_info firmware \"%s\"\n",
2049 			fw_name);
2050 		goto out;
2051 	}
2052 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
2053 	if (err) {
2054 		dev_err(adev->dev,
2055 			"Failed to validate gpu_info firmware \"%s\"\n",
2056 			fw_name);
2057 		goto out;
2058 	}
2059 
2060 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2061 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2062 
2063 	switch (hdr->version_major) {
2064 	case 1:
2065 	{
2066 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2067 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2068 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2069 
2070 		/*
2071 		 * Should be dropped when DAL no longer needs it.
2072 		 */
2073 		if (adev->asic_type == CHIP_NAVI12)
2074 			goto parse_soc_bounding_box;
2075 
2076 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2077 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2078 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2079 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2080 		adev->gfx.config.max_texture_channel_caches =
2081 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
2082 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2083 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2084 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2085 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2086 		adev->gfx.config.double_offchip_lds_buf =
2087 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2088 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2089 		adev->gfx.cu_info.max_waves_per_simd =
2090 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2091 		adev->gfx.cu_info.max_scratch_slots_per_cu =
2092 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2093 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2094 		if (hdr->version_minor >= 1) {
2095 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2096 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2097 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2098 			adev->gfx.config.num_sc_per_sh =
2099 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2100 			adev->gfx.config.num_packer_per_sc =
2101 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2102 		}
2103 
2104 parse_soc_bounding_box:
2105 		/*
2106 		 * soc bounding box info is not integrated in the discovery table,
2107 		 * so we always need to parse it from the gpu_info firmware when needed.
2108 		 */
2109 		if (hdr->version_minor == 2) {
2110 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2111 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2112 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2113 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2114 		}
2115 		break;
2116 	}
2117 	default:
2118 		dev_err(adev->dev,
2119 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2120 		err = -EINVAL;
2121 		goto out;
2122 	}
2123 out:
2124 	return err;
2125 }
2126 
2127 /**
2128  * amdgpu_device_ip_early_init - run early init for hardware IPs
2129  *
2130  * @adev: amdgpu_device pointer
2131  *
2132  * Early initialization pass for hardware IPs.  The hardware IPs that make
2133  * up each asic are discovered and each IP's early_init callback is run.  This
2134  * is the first stage in initializing the asic.
2135  * Returns 0 on success, negative error code on failure.
2136  */
2137 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2138 {
2139 	struct drm_device *dev = adev_to_drm(adev);
2140 	struct pci_dev *parent;
2141 	int i, r;
2142 
2143 	amdgpu_device_enable_virtual_display(adev);
2144 
2145 	if (amdgpu_sriov_vf(adev)) {
2146 		r = amdgpu_virt_request_full_gpu(adev, true);
2147 		if (r)
2148 			return r;
2149 	}
2150 
2151 	switch (adev->asic_type) {
2152 #ifdef CONFIG_DRM_AMDGPU_SI
2153 	case CHIP_VERDE:
2154 	case CHIP_TAHITI:
2155 	case CHIP_PITCAIRN:
2156 	case CHIP_OLAND:
2157 	case CHIP_HAINAN:
2158 		adev->family = AMDGPU_FAMILY_SI;
2159 		r = si_set_ip_blocks(adev);
2160 		if (r)
2161 			return r;
2162 		break;
2163 #endif
2164 #ifdef CONFIG_DRM_AMDGPU_CIK
2165 	case CHIP_BONAIRE:
2166 	case CHIP_HAWAII:
2167 	case CHIP_KAVERI:
2168 	case CHIP_KABINI:
2169 	case CHIP_MULLINS:
2170 		if (adev->flags & AMD_IS_APU)
2171 			adev->family = AMDGPU_FAMILY_KV;
2172 		else
2173 			adev->family = AMDGPU_FAMILY_CI;
2174 
2175 		r = cik_set_ip_blocks(adev);
2176 		if (r)
2177 			return r;
2178 		break;
2179 #endif
2180 	case CHIP_TOPAZ:
2181 	case CHIP_TONGA:
2182 	case CHIP_FIJI:
2183 	case CHIP_POLARIS10:
2184 	case CHIP_POLARIS11:
2185 	case CHIP_POLARIS12:
2186 	case CHIP_VEGAM:
2187 	case CHIP_CARRIZO:
2188 	case CHIP_STONEY:
2189 		if (adev->flags & AMD_IS_APU)
2190 			adev->family = AMDGPU_FAMILY_CZ;
2191 		else
2192 			adev->family = AMDGPU_FAMILY_VI;
2193 
2194 		r = vi_set_ip_blocks(adev);
2195 		if (r)
2196 			return r;
2197 		break;
2198 	case CHIP_VEGA10:
2199 	case CHIP_VEGA12:
2200 	case CHIP_VEGA20:
2201 	case CHIP_RAVEN:
2202 	case CHIP_ARCTURUS:
2203 	case CHIP_RENOIR:
2204 	case CHIP_ALDEBARAN:
2205 		if (adev->flags & AMD_IS_APU)
2206 			adev->family = AMDGPU_FAMILY_RV;
2207 		else
2208 			adev->family = AMDGPU_FAMILY_AI;
2209 
2210 		r = soc15_set_ip_blocks(adev);
2211 		if (r)
2212 			return r;
2213 		break;
2214 	case  CHIP_NAVI10:
2215 	case  CHIP_NAVI14:
2216 	case  CHIP_NAVI12:
2217 	case  CHIP_SIENNA_CICHLID:
2218 	case  CHIP_NAVY_FLOUNDER:
2219 	case  CHIP_DIMGREY_CAVEFISH:
2220 	case  CHIP_BEIGE_GOBY:
2221 	case CHIP_VANGOGH:
2222 	case CHIP_YELLOW_CARP:
2223 	case CHIP_CYAN_SKILLFISH:
2224 		if (adev->asic_type == CHIP_VANGOGH)
2225 			adev->family = AMDGPU_FAMILY_VGH;
2226 		else if (adev->asic_type == CHIP_YELLOW_CARP)
2227 			adev->family = AMDGPU_FAMILY_YC;
2228 		else
2229 			adev->family = AMDGPU_FAMILY_NV;
2230 
2231 		r = nv_set_ip_blocks(adev);
2232 		if (r)
2233 			return r;
2234 		break;
2235 	default:
2236 		/* FIXME: not supported yet */
2237 		return -EINVAL;
2238 	}
2239 
2240 	if (amdgpu_has_atpx() &&
2241 	    (amdgpu_is_atpx_hybrid() ||
2242 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
2243 	    ((adev->flags & AMD_IS_APU) == 0) &&
2244 	    !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2245 		adev->flags |= AMD_IS_PX;
2246 
2247 	if (!(adev->flags & AMD_IS_APU)) {
2248 		parent = pcie_find_root_port(adev->pdev);
2249 		adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2250 	}
2251 
2252 	amdgpu_amdkfd_device_probe(adev);
2253 
2254 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2255 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2256 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2257 	if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2258 		adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2259 
2260 	for (i = 0; i < adev->num_ip_blocks; i++) {
2261 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2262 			DRM_ERROR("disabled ip block: %d <%s>\n",
2263 				  i, adev->ip_blocks[i].version->funcs->name);
2264 			adev->ip_blocks[i].status.valid = false;
2265 		} else {
2266 			if (adev->ip_blocks[i].version->funcs->early_init) {
2267 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2268 				if (r == -ENOENT) {
2269 					adev->ip_blocks[i].status.valid = false;
2270 				} else if (r) {
2271 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2272 						  adev->ip_blocks[i].version->funcs->name, r);
2273 					return r;
2274 				} else {
2275 					adev->ip_blocks[i].status.valid = true;
2276 				}
2277 			} else {
2278 				adev->ip_blocks[i].status.valid = true;
2279 			}
2280 		}
2281 		/* get the vbios after the asic_funcs are set up */
2282 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2283 			r = amdgpu_device_parse_gpu_info_fw(adev);
2284 			if (r)
2285 				return r;
2286 
2287 			/* Read BIOS */
2288 			if (!amdgpu_get_bios(adev))
2289 				return -EINVAL;
2290 
2291 			r = amdgpu_atombios_init(adev);
2292 			if (r) {
2293 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2294 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2295 				return r;
2296 			}
2297 
2298 			/* get pf2vf msg info at its earliest time */
2299 			if (amdgpu_sriov_vf(adev))
2300 				amdgpu_virt_init_data_exchange(adev);
2301 
2302 		}
2303 	}
2304 
2305 	adev->cg_flags &= amdgpu_cg_mask;
2306 	adev->pg_flags &= amdgpu_pg_mask;
2307 
2308 	return 0;
2309 }
2310 
2311 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2312 {
2313 	int i, r;
2314 
2315 	for (i = 0; i < adev->num_ip_blocks; i++) {
2316 		if (!adev->ip_blocks[i].status.sw)
2317 			continue;
2318 		if (adev->ip_blocks[i].status.hw)
2319 			continue;
2320 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2321 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2322 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2323 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2324 			if (r) {
2325 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2326 					  adev->ip_blocks[i].version->funcs->name, r);
2327 				return r;
2328 			}
2329 			adev->ip_blocks[i].status.hw = true;
2330 		}
2331 	}
2332 
2333 	return 0;
2334 }
2335 
2336 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2337 {
2338 	int i, r;
2339 
2340 	for (i = 0; i < adev->num_ip_blocks; i++) {
2341 		if (!adev->ip_blocks[i].status.sw)
2342 			continue;
2343 		if (adev->ip_blocks[i].status.hw)
2344 			continue;
2345 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2346 		if (r) {
2347 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2348 				  adev->ip_blocks[i].version->funcs->name, r);
2349 			return r;
2350 		}
2351 		adev->ip_blocks[i].status.hw = true;
2352 	}
2353 
2354 	return 0;
2355 }
2356 
2357 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2358 {
2359 	int r = 0;
2360 	int i;
2361 	uint32_t smu_version;
2362 
2363 	if (adev->asic_type >= CHIP_VEGA10) {
2364 		for (i = 0; i < adev->num_ip_blocks; i++) {
2365 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2366 				continue;
2367 
2368 			if (!adev->ip_blocks[i].status.sw)
2369 				continue;
2370 
2371 			/* no need to do the fw loading again if already done */
2372 			if (adev->ip_blocks[i].status.hw == true)
2373 				break;
2374 
2375 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2376 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2377 				if (r) {
2378 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2379 							  adev->ip_blocks[i].version->funcs->name, r);
2380 					return r;
2381 				}
2382 			} else {
2383 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2384 				if (r) {
2385 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2386 							  adev->ip_blocks[i].version->funcs->name, r);
2387 					return r;
2388 				}
2389 			}
2390 
2391 			adev->ip_blocks[i].status.hw = true;
2392 			break;
2393 		}
2394 	}
2395 
2396 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2397 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2398 
2399 	return r;
2400 }
2401 
2402 /**
2403  * amdgpu_device_ip_init - run init for hardware IPs
2404  *
2405  * @adev: amdgpu_device pointer
2406  *
2407  * Main initialization pass for hardware IPs.  The list of all the hardware
2408  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2409  * are run.  sw_init initializes the software state associated with each IP
2410  * and hw_init initializes the hardware associated with each IP.
2411  * Returns 0 on success, negative error code on failure.
2412  */
2413 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2414 {
2415 	int i, r;
2416 
2417 	r = amdgpu_ras_init(adev);
2418 	if (r)
2419 		return r;
2420 
2421 	for (i = 0; i < adev->num_ip_blocks; i++) {
2422 		if (!adev->ip_blocks[i].status.valid)
2423 			continue;
2424 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2425 		if (r) {
2426 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2427 				  adev->ip_blocks[i].version->funcs->name, r);
2428 			goto init_failed;
2429 		}
2430 		adev->ip_blocks[i].status.sw = true;
2431 
2432 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2433 			/* need to do common hw init early so everything is set up for gmc */
2434 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2435 			if (r) {
2436 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2437 				goto init_failed;
2438 			}
2439 			adev->ip_blocks[i].status.hw = true;
2440 		} else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2441 			/* need to do gmc hw init early so we can allocate gpu mem */
2442 			/* Try to reserve bad pages early */
2443 			if (amdgpu_sriov_vf(adev))
2444 				amdgpu_virt_exchange_data(adev);
2445 
2446 			r = amdgpu_device_vram_scratch_init(adev);
2447 			if (r) {
2448 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2449 				goto init_failed;
2450 			}
2451 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2452 			if (r) {
2453 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2454 				goto init_failed;
2455 			}
2456 			r = amdgpu_device_wb_init(adev);
2457 			if (r) {
2458 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2459 				goto init_failed;
2460 			}
2461 			adev->ip_blocks[i].status.hw = true;
2462 
2463 			/* right after GMC hw init, we create CSA */
2464 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2465 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2466 								AMDGPU_GEM_DOMAIN_VRAM,
2467 								AMDGPU_CSA_SIZE);
2468 				if (r) {
2469 					DRM_ERROR("allocate CSA failed %d\n", r);
2470 					goto init_failed;
2471 				}
2472 			}
2473 		}
2474 	}
2475 
2476 	if (amdgpu_sriov_vf(adev))
2477 		amdgpu_virt_init_data_exchange(adev);
2478 
2479 	r = amdgpu_ib_pool_init(adev);
2480 	if (r) {
2481 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2482 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2483 		goto init_failed;
2484 	}
2485 
2486 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2487 	if (r)
2488 		goto init_failed;
2489 
2490 	r = amdgpu_device_ip_hw_init_phase1(adev);
2491 	if (r)
2492 		goto init_failed;
2493 
2494 	r = amdgpu_device_fw_loading(adev);
2495 	if (r)
2496 		goto init_failed;
2497 
2498 	r = amdgpu_device_ip_hw_init_phase2(adev);
2499 	if (r)
2500 		goto init_failed;
2501 
2502 	/*
2503 	 * Retired pages will be loaded from eeprom and reserved here.
2504 	 * This should be called after amdgpu_device_ip_hw_init_phase2() since
2505 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2506 	 * functional for I2C communication, which is only true at this point.
2507 	 *
2508 	 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2509 	 * about failures caused by a bad GPU state and stop the amdgpu init
2510 	 * process accordingly. For other failures, it still releases all
2511 	 * the resources and prints an error message rather than returning a
2512 	 * negative value to the upper level.
2513 	 *
2514 	 * Note: theoretically, this should be called before all VRAM
2515 	 * allocations to protect retired pages from being abused.
2516 	 */
2517 	r = amdgpu_ras_recovery_init(adev);
2518 	if (r)
2519 		goto init_failed;
2520 
2521 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2522 		amdgpu_xgmi_add_device(adev);
2523 
2524 	/* Don't init kfd if whole hive need to be reset during init */
2525 	if (!adev->gmc.xgmi.pending_reset)
2526 		amdgpu_amdkfd_device_init(adev);
2527 
2528 	r = amdgpu_amdkfd_resume_iommu(adev);
2529 	if (r)
2530 		goto init_failed;
2531 
2532 	amdgpu_fru_get_product_info(adev);
2533 
2534 init_failed:
2535 
2536 	return r;
2537 }
2538 
2539 /**
2540  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2541  *
2542  * @adev: amdgpu_device pointer
2543  *
2544  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2545  * this function before a GPU reset.  If the value is retained after a
2546  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2547  */
2548 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2549 {
2550 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2551 }
2552 
2553 /**
2554  * amdgpu_device_check_vram_lost - check if vram is valid
2555  *
2556  * @adev: amdgpu_device pointer
2557  *
2558  * Checks the reset magic value written to the gart pointer in VRAM.
2559  * The driver calls this after a GPU reset to see if the contents of
2560  * VRAM are lost or not.
2561  * Returns true if VRAM is lost, false if not.
2562  */
2563 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2564 {
2565 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2566 			AMDGPU_RESET_MAGIC_NUM))
2567 		return true;
2568 
2569 	if (!amdgpu_in_reset(adev))
2570 		return false;
2571 
2572 	/*
2573 	 * For all ASICs with baco/mode1 reset, the VRAM is
2574 	 * always assumed to be lost.
2575 	 */
2576 	switch (amdgpu_asic_reset_method(adev)) {
2577 	case AMD_RESET_METHOD_BACO:
2578 	case AMD_RESET_METHOD_MODE1:
2579 		return true;
2580 	default:
2581 		return false;
2582 	}
2583 }
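
/*
 * Usage sketch (illustrative): the two helpers above are meant to be used as
 * a pair around a reset, roughly:
 *
 *	amdgpu_device_fill_reset_magic(adev);	(before the reset)
 *	...reset the ASIC...
 *	if (amdgpu_device_check_vram_lost(adev))
 *		re-upload VRAM-resident state (GART table, firmware, etc.)
 */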
2584 
2585 /**
2586  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2587  *
2588  * @adev: amdgpu_device pointer
2589  * @state: clockgating state (gate or ungate)
2590  *
2591  * The list of all the hardware IPs that make up the asic is walked and the
2592  * set_clockgating_state callbacks are run.
2593  * During late init this is used to enable clockgating for hardware IPs;
2594  * during fini or suspend it is used to disable clockgating.
2595  * Returns 0 on success, negative error code on failure.
2596  */
2597 
2598 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2599 			       enum amd_clockgating_state state)
2600 {
2601 	int i, j, r;
2602 
2603 	if (amdgpu_emu_mode == 1)
2604 		return 0;
2605 
2606 	for (j = 0; j < adev->num_ip_blocks; j++) {
2607 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2608 		if (!adev->ip_blocks[i].status.late_initialized)
2609 			continue;
2610 		/* skip CG for GFX on S0ix */
2611 		if (adev->in_s0ix &&
2612 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2613 			continue;
2614 		/* skip CG for VCE/UVD, it's handled specially */
2615 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2616 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2617 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2618 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2619 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2620 			/* enable clockgating to save power */
2621 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2622 										     state);
2623 			if (r) {
2624 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2625 					  adev->ip_blocks[i].version->funcs->name, r);
2626 				return r;
2627 			}
2628 		}
2629 	}
2630 
2631 	return 0;
2632 }
2633 
2634 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2635 			       enum amd_powergating_state state)
2636 {
2637 	int i, j, r;
2638 
2639 	if (amdgpu_emu_mode == 1)
2640 		return 0;
2641 
2642 	for (j = 0; j < adev->num_ip_blocks; j++) {
2643 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2644 		if (!adev->ip_blocks[i].status.late_initialized)
2645 			continue;
2646 		/* skip PG for GFX on S0ix */
2647 		if (adev->in_s0ix &&
2648 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2649 			continue;
2650 		/* skip PG for VCE/UVD, it's handled specially */
2651 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2652 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2653 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2654 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2655 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2656 			/* enable powergating to save power */
2657 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2658 											state);
2659 			if (r) {
2660 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2661 					  adev->ip_blocks[i].version->funcs->name, r);
2662 				return r;
2663 			}
2664 		}
2665 	}
2666 	return 0;
2667 }
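
/*
 * Usage sketch (illustrative): later in this file these two helpers are
 * called as a pair, gating after late init and ungating before teardown or
 * suspend, e.g.:
 *
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
 *	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 *	...
 *	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
 *	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 *
 * Note the reversed order on the ungate side.
 */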
2668 
2669 static int amdgpu_device_enable_mgpu_fan_boost(void)
2670 {
2671 	struct amdgpu_gpu_instance *gpu_ins;
2672 	struct amdgpu_device *adev;
2673 	int i, ret = 0;
2674 
2675 	mutex_lock(&mgpu_info.mutex);
2676 
2677 	/*
2678 	 * MGPU fan boost feature should be enabled
2679 	 * only when there are two or more dGPUs in
2680 	 * the system
2681 	 */
2682 	if (mgpu_info.num_dgpu < 2)
2683 		goto out;
2684 
2685 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2686 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2687 		adev = gpu_ins->adev;
2688 		if (!(adev->flags & AMD_IS_APU) &&
2689 		    !gpu_ins->mgpu_fan_enabled) {
2690 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2691 			if (ret)
2692 				break;
2693 
2694 			gpu_ins->mgpu_fan_enabled = 1;
2695 		}
2696 	}
2697 
2698 out:
2699 	mutex_unlock(&mgpu_info.mutex);
2700 
2701 	return ret;
2702 }
2703 
2704 /**
2705  * amdgpu_device_ip_late_init - run late init for hardware IPs
2706  *
2707  * @adev: amdgpu_device pointer
2708  *
2709  * Late initialization pass for hardware IPs.  The list of all the hardware
2710  * IPs that make up the asic is walked and the late_init callbacks are run.
2711  * late_init covers any special initialization that an IP requires
2712  * after all of the IPs have been initialized or something that needs to happen
2713  * late in the init process.
2714  * Returns 0 on success, negative error code on failure.
2715  */
2716 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2717 {
2718 	struct amdgpu_gpu_instance *gpu_instance;
2719 	int i = 0, r;
2720 
2721 	for (i = 0; i < adev->num_ip_blocks; i++) {
2722 		if (!adev->ip_blocks[i].status.hw)
2723 			continue;
2724 		if (adev->ip_blocks[i].version->funcs->late_init) {
2725 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2726 			if (r) {
2727 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2728 					  adev->ip_blocks[i].version->funcs->name, r);
2729 				return r;
2730 			}
2731 		}
2732 		adev->ip_blocks[i].status.late_initialized = true;
2733 	}
2734 
2735 	amdgpu_ras_set_error_query_ready(adev, true);
2736 
2737 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2738 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2739 
2740 	amdgpu_device_fill_reset_magic(adev);
2741 
2742 	r = amdgpu_device_enable_mgpu_fan_boost();
2743 	if (r)
2744 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2745 
2746 	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
2747 	if (adev->asic_type == CHIP_ARCTURUS &&
2748 	    amdgpu_passthrough(adev) &&
2749 	    adev->gmc.xgmi.num_physical_nodes > 1)
2750 		smu_set_light_sbr(&adev->smu, true);
2751 
2752 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2753 		mutex_lock(&mgpu_info.mutex);
2754 
2755 		/*
2756 		 * Reset device p-state to low as this was booted with high.
2757 		 *
2758 		 * This should be performed only after all devices from the same
2759 		 * hive get initialized.
2760 		 *
2761 		 * However, the number of devices in the hive is not known in
2762 		 * advance; it is counted one by one as the devices initialize.
2763 		 *
2764 		 * So we wait until all XGMI interlinked devices are initialized.
2765 		 * This may add some delay as those devices may come from
2766 		 * different hives, but that should be OK.
2767 		 */
2768 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2769 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2770 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2771 				if (gpu_instance->adev->flags & AMD_IS_APU)
2772 					continue;
2773 
2774 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2775 						AMDGPU_XGMI_PSTATE_MIN);
2776 				if (r) {
2777 					DRM_ERROR("pstate setting failed (%d).\n", r);
2778 					break;
2779 				}
2780 			}
2781 		}
2782 
2783 		mutex_unlock(&mgpu_info.mutex);
2784 	}
2785 
2786 	return 0;
2787 }
2788 
2789 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2790 {
2791 	int i, r;
2792 
2793 	for (i = 0; i < adev->num_ip_blocks; i++) {
2794 		if (!adev->ip_blocks[i].version->funcs->early_fini)
2795 			continue;
2796 
2797 		r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2798 		if (r) {
2799 			DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2800 				  adev->ip_blocks[i].version->funcs->name, r);
2801 		}
2802 	}
2803 
2804 	amdgpu_amdkfd_suspend(adev, false);
2805 
2806 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2807 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2808 
2809 	/* need to disable SMC first */
2810 	for (i = 0; i < adev->num_ip_blocks; i++) {
2811 		if (!adev->ip_blocks[i].status.hw)
2812 			continue;
2813 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2814 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2815 			/* XXX handle errors */
2816 			if (r) {
2817 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2818 					  adev->ip_blocks[i].version->funcs->name, r);
2819 			}
2820 			adev->ip_blocks[i].status.hw = false;
2821 			break;
2822 		}
2823 	}
2824 
2825 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2826 		if (!adev->ip_blocks[i].status.hw)
2827 			continue;
2828 
2829 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2830 		/* XXX handle errors */
2831 		if (r) {
2832 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2833 				  adev->ip_blocks[i].version->funcs->name, r);
2834 		}
2835 
2836 		adev->ip_blocks[i].status.hw = false;
2837 	}
2838 
2839 	if (amdgpu_sriov_vf(adev)) {
2840 		if (amdgpu_virt_release_full_gpu(adev, false))
2841 			DRM_ERROR("failed to release exclusive mode on fini\n");
2842 	}
2843 
2844 	return 0;
2845 }
2846 
2847 /**
2848  * amdgpu_device_ip_fini - run fini for hardware IPs
2849  *
2850  * @adev: amdgpu_device pointer
2851  *
2852  * Main teardown pass for hardware IPs.  The list of all the hardware
2853  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2854  * are run.  hw_fini tears down the hardware associated with each IP
2855  * and sw_fini tears down any software state associated with each IP.
2856  * Returns 0 on success, negative error code on failure.
2857  */
2858 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2859 {
2860 	int i, r;
2861 
2862 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2863 		amdgpu_virt_release_ras_err_handler_data(adev);
2864 
2865 	amdgpu_ras_pre_fini(adev);
2866 
2867 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2868 		amdgpu_xgmi_remove_device(adev);
2869 
2870 	amdgpu_amdkfd_device_fini_sw(adev);
2871 
2872 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2873 		if (!adev->ip_blocks[i].status.sw)
2874 			continue;
2875 
2876 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2877 			amdgpu_ucode_free_bo(adev);
2878 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2879 			amdgpu_device_wb_fini(adev);
2880 			amdgpu_device_vram_scratch_fini(adev);
2881 			amdgpu_ib_pool_fini(adev);
2882 		}
2883 
2884 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2885 		/* XXX handle errors */
2886 		if (r) {
2887 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2888 				  adev->ip_blocks[i].version->funcs->name, r);
2889 		}
2890 		adev->ip_blocks[i].status.sw = false;
2891 		adev->ip_blocks[i].status.valid = false;
2892 	}
2893 
2894 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2895 		if (!adev->ip_blocks[i].status.late_initialized)
2896 			continue;
2897 		if (adev->ip_blocks[i].version->funcs->late_fini)
2898 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2899 		adev->ip_blocks[i].status.late_initialized = false;
2900 	}
2901 
2902 	amdgpu_ras_fini(adev);
2903 
2904 	return 0;
2905 }
2906 
2907 /**
2908  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2909  *
2910  * @work: work_struct.
2911  */
2912 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2913 {
2914 	struct amdgpu_device *adev =
2915 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2916 	int r;
2917 
2918 	r = amdgpu_ib_ring_tests(adev);
2919 	if (r)
2920 		DRM_ERROR("ib ring test failed (%d).\n", r);
2921 }
2922 
2923 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2924 {
2925 	struct amdgpu_device *adev =
2926 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2927 
2928 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2929 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2930 
2931 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2932 		adev->gfx.gfx_off_state = true;
2933 }
2934 
2935 /**
2936  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2937  *
2938  * @adev: amdgpu_device pointer
2939  *
2940  * Main suspend function for hardware IPs.  The list of all the hardware
2941  * IPs that make up the asic is walked, clockgating is disabled and the
2942  * suspend callbacks are run.  suspend puts the hardware and software state
2943  * in each IP into a state suitable for suspend.
2944  * Returns 0 on success, negative error code on failure.
2945  */
2946 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2947 {
2948 	int i, r;
2949 
2950 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2951 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2952 
2953 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2954 		if (!adev->ip_blocks[i].status.valid)
2955 			continue;
2956 
2957 		/* displays are handled separately */
2958 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2959 			continue;
2960 
2961 		/* XXX handle errors */
2962 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2963 		/* XXX handle errors */
2964 		if (r) {
2965 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2966 				  adev->ip_blocks[i].version->funcs->name, r);
2967 			return r;
2968 		}
2969 
2970 		adev->ip_blocks[i].status.hw = false;
2971 	}
2972 
2973 	return 0;
2974 }
2975 
2976 /**
2977  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2978  *
2979  * @adev: amdgpu_device pointer
2980  *
2981  * Main suspend function for hardware IPs.  The list of all the hardware
2982  * IPs that make up the asic is walked, clockgating is disabled and the
2983  * suspend callbacks are run.  suspend puts the hardware and software state
2984  * in each IP into a state suitable for suspend.
2985  * Returns 0 on success, negative error code on failure.
2986  */
2987 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2988 {
2989 	int i, r;
2990 
2991 	if (adev->in_s0ix)
2992 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
2993 
2994 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2995 		if (!adev->ip_blocks[i].status.valid)
2996 			continue;
2997 		/* displays are handled in phase1 */
2998 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2999 			continue;
3000 		/* PSP lost connection when err_event_athub occurs */
3001 		if (amdgpu_ras_intr_triggered() &&
3002 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3003 			adev->ip_blocks[i].status.hw = false;
3004 			continue;
3005 		}
3006 
3007 		/* skip unnecessary suspend if we have not initialized them yet */
3008 		if (adev->gmc.xgmi.pending_reset &&
3009 		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3010 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3011 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3012 		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3013 			adev->ip_blocks[i].status.hw = false;
3014 			continue;
3015 		}
3016 
3017 		/* skip suspend of gfx and psp for S0ix
3018 		 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3019 		 * like at runtime. PSP is also part of the always on hardware
3020 		 * so no need to suspend it.
3021 		 */
3022 		if (adev->in_s0ix &&
3023 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3024 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
3025 			continue;
3026 
3027 		/* XXX handle errors */
3028 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
3029 		/* XXX handle errors */
3030 		if (r) {
3031 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
3032 				  adev->ip_blocks[i].version->funcs->name, r);
3033 		}
3034 		adev->ip_blocks[i].status.hw = false;
3035 		/* handle putting the SMC in the appropriate state */
3036 		if (!amdgpu_sriov_vf(adev)) {
3037 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3038 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3039 				if (r) {
3040 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3041 							adev->mp1_state, r);
3042 					return r;
3043 				}
3044 			}
3045 		}
3046 	}
3047 
3048 	return 0;
3049 }
3050 
3051 /**
3052  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3053  *
3054  * @adev: amdgpu_device pointer
3055  *
3056  * Main suspend function for hardware IPs.  The list of all the hardware
3057  * IPs that make up the asic is walked, clockgating is disabled and the
3058  * suspend callbacks are run.  suspend puts the hardware and software state
3059  * in each IP into a state suitable for suspend.
3060  * Returns 0 on success, negative error code on failure.
3061  */
3062 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3063 {
3064 	int r;
3065 
3066 	if (amdgpu_sriov_vf(adev)) {
3067 		amdgpu_virt_fini_data_exchange(adev);
3068 		amdgpu_virt_request_full_gpu(adev, false);
3069 	}
3070 
3071 	r = amdgpu_device_ip_suspend_phase1(adev);
3072 	if (r)
3073 		return r;
3074 	r = amdgpu_device_ip_suspend_phase2(adev);
3075 
3076 	if (amdgpu_sriov_vf(adev))
3077 		amdgpu_virt_release_full_gpu(adev, false);
3078 
3079 	return r;
3080 }
3081 
3082 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3083 {
3084 	int i, r;
3085 
3086 	static enum amd_ip_block_type ip_order[] = {
3087 		AMD_IP_BLOCK_TYPE_COMMON,
3088 		AMD_IP_BLOCK_TYPE_GMC,
3089 		AMD_IP_BLOCK_TYPE_PSP,
3090 		AMD_IP_BLOCK_TYPE_IH,
3091 	};
3092 
3093 	for (i = 0; i < adev->num_ip_blocks; i++) {
3094 		int j;
3095 		struct amdgpu_ip_block *block;
3096 
3097 		block = &adev->ip_blocks[i];
3098 		block->status.hw = false;
3099 
3100 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3101 
3102 			if (block->version->type != ip_order[j] ||
3103 				!block->status.valid)
3104 				continue;
3105 
3106 			r = block->version->funcs->hw_init(adev);
3107 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3108 			if (r)
3109 				return r;
3110 			block->status.hw = true;
3111 		}
3112 	}
3113 
3114 	return 0;
3115 }
3116 
3117 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3118 {
3119 	int i, r;
3120 
3121 	static enum amd_ip_block_type ip_order[] = {
3122 		AMD_IP_BLOCK_TYPE_SMC,
3123 		AMD_IP_BLOCK_TYPE_DCE,
3124 		AMD_IP_BLOCK_TYPE_GFX,
3125 		AMD_IP_BLOCK_TYPE_SDMA,
3126 		AMD_IP_BLOCK_TYPE_UVD,
3127 		AMD_IP_BLOCK_TYPE_VCE,
3128 		AMD_IP_BLOCK_TYPE_VCN
3129 	};
3130 
3131 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3132 		int j;
3133 		struct amdgpu_ip_block *block;
3134 
3135 		for (j = 0; j < adev->num_ip_blocks; j++) {
3136 			block = &adev->ip_blocks[j];
3137 
3138 			if (block->version->type != ip_order[i] ||
3139 				!block->status.valid ||
3140 				block->status.hw)
3141 				continue;
3142 
3143 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3144 				r = block->version->funcs->resume(adev);
3145 			else
3146 				r = block->version->funcs->hw_init(adev);
3147 
3148 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3149 			if (r)
3150 				return r;
3151 			block->status.hw = true;
3152 		}
3153 	}
3154 
3155 	return 0;
3156 }
3157 
3158 /**
3159  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3160  *
3161  * @adev: amdgpu_device pointer
3162  *
3163  * First resume function for hardware IPs.  The list of all the hardware
3164  * IPs that make up the asic is walked and the resume callbacks are run for
3165  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3166  * after a suspend and updates the software state as necessary.  This
3167  * function is also used for restoring the GPU after a GPU reset.
3168  * Returns 0 on success, negative error code on failure.
3169  */
3170 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3171 {
3172 	int i, r;
3173 
3174 	for (i = 0; i < adev->num_ip_blocks; i++) {
3175 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3176 			continue;
3177 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3178 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3179 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3180 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3181 
3182 			r = adev->ip_blocks[i].version->funcs->resume(adev);
3183 			if (r) {
3184 				DRM_ERROR("resume of IP block <%s> failed %d\n",
3185 					  adev->ip_blocks[i].version->funcs->name, r);
3186 				return r;
3187 			}
3188 			adev->ip_blocks[i].status.hw = true;
3189 		}
3190 	}
3191 
3192 	return 0;
3193 }
3194 
3195 /**
3196  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3197  *
3198  * @adev: amdgpu_device pointer
3199  *
3200  * Second resume function for hardware IPs.  The list of all the hardware
3201  * IPs that make up the asic is walked and the resume callbacks are run for
3202  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3203  * functional state after a suspend and updates the software state as
3204  * necessary.  This function is also used for restoring the GPU after a GPU
3205  * reset.
3206  * Returns 0 on success, negative error code on failure.
3207  */
3208 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3209 {
3210 	int i, r;
3211 
3212 	for (i = 0; i < adev->num_ip_blocks; i++) {
3213 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3214 			continue;
3215 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3216 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3217 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3218 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3219 			continue;
3220 		r = adev->ip_blocks[i].version->funcs->resume(adev);
3221 		if (r) {
3222 			DRM_ERROR("resume of IP block <%s> failed %d\n",
3223 				  adev->ip_blocks[i].version->funcs->name, r);
3224 			return r;
3225 		}
3226 		adev->ip_blocks[i].status.hw = true;
3227 
3228 		if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3229 			/* disable gfxoff for IP resume. The gfxoff will be re-enabled in
3230 			 * amdgpu_device_resume() after IP resume.
3231 			 */
3232 			amdgpu_gfx_off_ctrl(adev, false);
3233 			DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
3234 		}
3235 
3236 	}
3237 
3238 	return 0;
3239 }
3240 
3241 /**
3242  * amdgpu_device_ip_resume - run resume for hardware IPs
3243  *
3244  * @adev: amdgpu_device pointer
3245  *
3246  * Main resume function for hardware IPs.  The hardware IPs
3247  * are split into two resume functions because they are
3248  * also used in recovering from a GPU reset and some additional
3249  * steps need to be taken between them.  In this case (S3/S4) they are
3250  * run sequentially.
3251  * Returns 0 on success, negative error code on failure.
3252  */
3253 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3254 {
3255 	int r;
3256 
3257 	r = amdgpu_amdkfd_resume_iommu(adev);
3258 	if (r)
3259 		return r;
3260 
3261 	r = amdgpu_device_ip_resume_phase1(adev);
3262 	if (r)
3263 		return r;
3264 
3265 	r = amdgpu_device_fw_loading(adev);
3266 	if (r)
3267 		return r;
3268 
3269 	r = amdgpu_device_ip_resume_phase2(adev);
3270 
3271 	return r;
3272 }
3273 
3274 /**
3275  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3276  *
3277  * @adev: amdgpu_device pointer
3278  *
3279  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3280  */
3281 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3282 {
3283 	if (amdgpu_sriov_vf(adev)) {
3284 		if (adev->is_atom_fw) {
3285 			if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3286 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3287 		} else {
3288 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3289 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3290 		}
3291 
3292 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3293 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3294 	}
3295 }
3296 
3297 /**
3298  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3299  *
3300  * @asic_type: AMD asic type
3301  *
3302  * Check if there is DC (new modesetting infrastructure) support for an asic.
3303  * returns true if DC has support, false if not.
3304  */
3305 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3306 {
3307 	switch (asic_type) {
3308 #if defined(CONFIG_DRM_AMD_DC)
3309 #if defined(CONFIG_DRM_AMD_DC_SI)
3310 	case CHIP_TAHITI:
3311 	case CHIP_PITCAIRN:
3312 	case CHIP_VERDE:
3313 	case CHIP_OLAND:
3314 #endif
3315 	case CHIP_BONAIRE:
3316 	case CHIP_KAVERI:
3317 	case CHIP_KABINI:
3318 	case CHIP_MULLINS:
3319 		/*
3320 		 * We have systems in the wild with these ASICs that require
3321 		 * LVDS and VGA support which is not supported with DC.
3322 		 *
3323 		 * Fallback to the non-DC driver here by default so as not to
3324 		 * cause regressions.
3325 		 */
3326 		return amdgpu_dc > 0;
3327 	case CHIP_HAWAII:
3328 	case CHIP_CARRIZO:
3329 	case CHIP_STONEY:
3330 	case CHIP_POLARIS10:
3331 	case CHIP_POLARIS11:
3332 	case CHIP_POLARIS12:
3333 	case CHIP_VEGAM:
3334 	case CHIP_TONGA:
3335 	case CHIP_FIJI:
3336 	case CHIP_VEGA10:
3337 	case CHIP_VEGA12:
3338 	case CHIP_VEGA20:
3339 #if defined(CONFIG_DRM_AMD_DC_DCN)
3340 	case CHIP_RAVEN:
3341 	case CHIP_NAVI10:
3342 	case CHIP_NAVI14:
3343 	case CHIP_NAVI12:
3344 	case CHIP_RENOIR:
3345 	case CHIP_SIENNA_CICHLID:
3346 	case CHIP_NAVY_FLOUNDER:
3347 	case CHIP_DIMGREY_CAVEFISH:
3348 	case CHIP_BEIGE_GOBY:
3349 	case CHIP_VANGOGH:
3350 	case CHIP_YELLOW_CARP:
3351 #endif
3352 		return amdgpu_dc != 0;
3353 #endif
3354 	default:
3355 		if (amdgpu_dc > 0)
3356 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3357 					 "but isn't supported by ASIC, ignoring\n");
3358 		return false;
3359 	}
3360 }
3361 
3362 /**
3363  * amdgpu_device_has_dc_support - check if dc is supported
3364  *
3365  * @adev: amdgpu_device pointer
3366  *
3367  * Returns true for supported, false for not supported
3368  */
3369 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3370 {
3371 	if (amdgpu_sriov_vf(adev) ||
3372 	    adev->enable_virtual_display ||
3373 	    (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3374 		return false;
3375 
3376 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3377 }
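
/*
 * Usage sketch (illustrative): display setup code typically branches on this
 * helper when choosing between the DC and the legacy modesetting paths, and
 * the amdgpu.dc module parameter (amdgpu_dc, used above) can override the
 * per-ASIC default.  The helpers called below are hypothetical:
 *
 *	if (amdgpu_device_has_dc_support(adev))
 *		init_display_with_dc(adev);
 *	else
 *		init_display_legacy(adev);
 */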
3378 
3379 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3380 {
3381 	struct amdgpu_device *adev =
3382 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3383 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3384 
3385 	/* It's a bug to not have a hive within this function */
3386 	if (WARN_ON(!hive))
3387 		return;
3388 
3389 	/*
3390 	 * Use task barrier to synchronize all xgmi reset works across the
3391 	 * hive. task_barrier_enter and task_barrier_exit will block
3392 	 * until all the threads running the xgmi reset works reach
3393 	 * those points. task_barrier_full will do both blocks.
3394 	 */
3395 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3396 
3397 		task_barrier_enter(&hive->tb);
3398 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3399 
3400 		if (adev->asic_reset_res)
3401 			goto fail;
3402 
3403 		task_barrier_exit(&hive->tb);
3404 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3405 
3406 		if (adev->asic_reset_res)
3407 			goto fail;
3408 
3409 		if (adev->mmhub.ras_funcs &&
3410 		    adev->mmhub.ras_funcs->reset_ras_error_count)
3411 			adev->mmhub.ras_funcs->reset_ras_error_count(adev);
3412 	} else {
3413 
3414 		task_barrier_full(&hive->tb);
3415 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3416 	}
3417 
3418 fail:
3419 	if (adev->asic_reset_res)
3420 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3421 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3422 	amdgpu_put_xgmi_hive(hive);
3423 }
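/*
 * Note: for the BACO case above, splitting enter/exit across the two barrier
 * phases means every node in the hive has entered BACO before any node begins
 * to exit it, which keeps the per-node resets in the hive synchronized.
 */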
3424 
3425 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3426 {
3427 	char *input = amdgpu_lockup_timeout;
3428 	char *timeout_setting = NULL;
3429 	int index = 0;
3430 	long timeout;
3431 	int ret = 0;
3432 
3433 	/*
3434 	 * By default the timeout for non-compute jobs is 10000 ms
3435 	 * and 60000 ms for compute jobs.
3436 	 * In SR-IOV or passthrough mode, the timeout for compute
3437 	 * jobs is 60000 ms by default.
3438 	 */
3439 	adev->gfx_timeout = msecs_to_jiffies(10000);
3440 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3441 	if (amdgpu_sriov_vf(adev))
3442 		adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3443 					msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3444 	else
3445 		adev->compute_timeout =  msecs_to_jiffies(60000);
3446 
3447 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3448 		while ((timeout_setting = strsep(&input, ",")) &&
3449 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3450 			ret = kstrtol(timeout_setting, 0, &timeout);
3451 			if (ret)
3452 				return ret;
3453 
3454 			if (timeout == 0) {
3455 				index++;
3456 				continue;
3457 			} else if (timeout < 0) {
3458 				timeout = MAX_SCHEDULE_TIMEOUT;
3459 			} else {
3460 				timeout = msecs_to_jiffies(timeout);
3461 			}
3462 
3463 			switch (index++) {
3464 			case 0:
3465 				adev->gfx_timeout = timeout;
3466 				break;
3467 			case 1:
3468 				adev->compute_timeout = timeout;
3469 				break;
3470 			case 2:
3471 				adev->sdma_timeout = timeout;
3472 				break;
3473 			case 3:
3474 				adev->video_timeout = timeout;
3475 				break;
3476 			default:
3477 				break;
3478 			}
3479 		}
3480 		/*
3481 		 * There is only one value specified and
3482 		 * it should apply to all non-compute jobs.
3483 		 */
3484 		if (index == 1) {
3485 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3486 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3487 				adev->compute_timeout = adev->gfx_timeout;
3488 		}
3489 	}
3490 
3491 	return ret;
3492 }
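/*
 * Note: the parsing above corresponds to a comma-separated amdgpu.lockup_timeout
 * value in the order gfx,compute,sdma,video (milliseconds; 0 keeps the default,
 * a negative value means never time out). For example,
 * "amdgpu.lockup_timeout=10000,60000,10000,10000" sets all four explicitly,
 * while a single value applies to all non-compute queues (and also to compute
 * under SR-IOV or passthrough).
 */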
3493 
3494 static const struct attribute *amdgpu_dev_attributes[] = {
3495 	&dev_attr_product_name.attr,
3496 	&dev_attr_product_number.attr,
3497 	&dev_attr_serial_number.attr,
3498 	&dev_attr_pcie_replay_count.attr,
3499 	NULL
3500 };
3501 
3502 /**
3503  * amdgpu_device_init - initialize the driver
3504  *
3505  * @adev: amdgpu_device pointer
3506  * @flags: driver flags
3507  *
3508  * Initializes the driver info and hw (all asics).
3509  * Returns 0 for success or an error on failure.
3510  * Called at driver startup.
3511  */
3512 int amdgpu_device_init(struct amdgpu_device *adev,
3513 		       uint32_t flags)
3514 {
3515 	struct drm_device *ddev = adev_to_drm(adev);
3516 	struct pci_dev *pdev = adev->pdev;
3517 	int r, i;
3518 	bool px = false;
3519 	u32 max_MBps;
3520 	int tmp;
3521 
3522 	adev->shutdown = false;
3523 	adev->flags = flags;
3524 
3525 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3526 		adev->asic_type = amdgpu_force_asic_type;
3527 	else
3528 		adev->asic_type = flags & AMD_ASIC_MASK;
3529 
3530 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3531 	if (amdgpu_emu_mode == 1)
3532 		adev->usec_timeout *= 10;
3533 	adev->gmc.gart_size = 512 * 1024 * 1024;
3534 	adev->accel_working = false;
3535 	adev->num_rings = 0;
3536 	adev->mman.buffer_funcs = NULL;
3537 	adev->mman.buffer_funcs_ring = NULL;
3538 	adev->vm_manager.vm_pte_funcs = NULL;
3539 	adev->vm_manager.vm_pte_num_scheds = 0;
3540 	adev->gmc.gmc_funcs = NULL;
3541 	adev->harvest_ip_mask = 0x0;
3542 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3543 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3544 
3545 	adev->smc_rreg = &amdgpu_invalid_rreg;
3546 	adev->smc_wreg = &amdgpu_invalid_wreg;
3547 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3548 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3549 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3550 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3551 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3552 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3553 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3554 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3555 	adev->didt_rreg = &amdgpu_invalid_rreg;
3556 	adev->didt_wreg = &amdgpu_invalid_wreg;
3557 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3558 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3559 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3560 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3561 
3562 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3563 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3564 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3565 
3566 	/* mutex initialization is all done here so we
3567 	 * can call these functions again without locking issues */
3568 	mutex_init(&adev->firmware.mutex);
3569 	mutex_init(&adev->pm.mutex);
3570 	mutex_init(&adev->gfx.gpu_clock_mutex);
3571 	mutex_init(&adev->srbm_mutex);
3572 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3573 	mutex_init(&adev->gfx.gfx_off_mutex);
3574 	mutex_init(&adev->grbm_idx_mutex);
3575 	mutex_init(&adev->mn_lock);
3576 	mutex_init(&adev->virt.vf_errors.lock);
3577 	hash_init(adev->mn_hash);
3578 	atomic_set(&adev->in_gpu_reset, 0);
3579 	init_rwsem(&adev->reset_sem);
3580 	mutex_init(&adev->psp.mutex);
3581 	mutex_init(&adev->notifier_lock);
3582 
3583 	r = amdgpu_device_init_apu_flags(adev);
3584 	if (r)
3585 		return r;
3586 
3587 	r = amdgpu_device_check_arguments(adev);
3588 	if (r)
3589 		return r;
3590 
3591 	spin_lock_init(&adev->mmio_idx_lock);
3592 	spin_lock_init(&adev->smc_idx_lock);
3593 	spin_lock_init(&adev->pcie_idx_lock);
3594 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3595 	spin_lock_init(&adev->didt_idx_lock);
3596 	spin_lock_init(&adev->gc_cac_idx_lock);
3597 	spin_lock_init(&adev->se_cac_idx_lock);
3598 	spin_lock_init(&adev->audio_endpt_idx_lock);
3599 	spin_lock_init(&adev->mm_stats.lock);
3600 
3601 	INIT_LIST_HEAD(&adev->shadow_list);
3602 	mutex_init(&adev->shadow_list_lock);
3603 
3604 	INIT_LIST_HEAD(&adev->reset_list);
3605 
3606 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3607 			  amdgpu_device_delayed_init_work_handler);
3608 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3609 			  amdgpu_device_delay_enable_gfx_off);
3610 
3611 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3612 
3613 	adev->gfx.gfx_off_req_count = 1;
3614 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3615 
3616 	atomic_set(&adev->throttling_logging_enabled, 1);
3617 	/*
3618 	 * If throttling continues, logging will be performed every minute
3619 	 * to avoid log flooding. "-1" is subtracted since the thermal
3620 	 * throttling interrupt comes every second. Thus, the total logging
3621 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3622 	 * for the throttling interrupt) = 60 seconds.
3623 	 */
3624 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3625 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3626 
3627 	/* Registers mapping */
3628 	/* TODO: block userspace mapping of io register */
3629 	if (adev->asic_type >= CHIP_BONAIRE) {
3630 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3631 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3632 	} else {
3633 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3634 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3635 	}
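	/*
	 * On CHIP_BONAIRE and newer the register aperture is exposed in
	 * PCI BAR 5, while older ASICs use BAR 2; rmmio_base/rmmio_size
	 * are simply that BAR's start address and length.
	 */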
3636 
3637 	for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3638 		atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3639 
3640 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3641 	if (adev->rmmio == NULL) {
3642 		return -ENOMEM;
3643 	}
3644 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3645 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3646 
3647 	/* enable PCIE atomic ops */
3648 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3649 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3650 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3651 	if (r) {
3652 		adev->have_atomics_support = false;
3653 		DRM_INFO("PCIE atomic ops is not supported\n");
3654 	} else {
3655 		adev->have_atomics_support = true;
3656 	}
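	/*
	 * have_atomics_support records whether 32- and 64-bit PCIe atomic
	 * completer ops can reach the root port; consumers such as amdkfd
	 * are expected to check this flag before relying on atomics.
	 */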
3657 
3658 	amdgpu_device_get_pcie_info(adev);
3659 
3660 	if (amdgpu_mcbp)
3661 		DRM_INFO("MCBP is enabled\n");
3662 
3663 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3664 		adev->enable_mes = true;
3665 
3666 	/* detect hw virtualization here */
3667 	amdgpu_detect_virtualization(adev);
3668 
3669 	r = amdgpu_device_get_job_timeout_settings(adev);
3670 	if (r) {
3671 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3672 		return r;
3673 	}
3674 
3675 	/* early init functions */
3676 	r = amdgpu_device_ip_early_init(adev);
3677 	if (r)
3678 		return r;
3679 
3680 	/* Get rid of things like offb */
3681 	r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3682 	if (r)
3683 		return r;
3684 
3685 	/* doorbell bar mapping and doorbell index init */
3686 	amdgpu_device_doorbell_init(adev);
3687 
3688 	if (amdgpu_emu_mode == 1) {
3689 		/* post the asic on emulation mode */
3690 		emu_soc_asic_init(adev);
3691 		goto fence_driver_init;
3692 	}
3693 
3694 	amdgpu_reset_init(adev);
3695 
3696 	/* detect if we are with an SRIOV vbios */
3697 	amdgpu_device_detect_sriov_bios(adev);
3698 
3699 	/* check if we need to reset the asic
3700 	 *  E.g., driver was not cleanly unloaded previously, etc.
3701 	 */
3702 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3703 		if (adev->gmc.xgmi.num_physical_nodes) {
3704 			dev_info(adev->dev, "Pending hive reset.\n");
3705 			adev->gmc.xgmi.pending_reset = true;
3706 			/* Only need to init the necessary blocks for SMU to handle the reset */
3707 			for (i = 0; i < adev->num_ip_blocks; i++) {
3708 				if (!adev->ip_blocks[i].status.valid)
3709 					continue;
3710 				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3711 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3712 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3713 				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3714 					DRM_DEBUG("IP %s disabled for hw_init.\n",
3715 						adev->ip_blocks[i].version->funcs->name);
3716 					adev->ip_blocks[i].status.hw = true;
3717 				}
3718 			}
3719 		} else {
3720 			tmp = amdgpu_reset_method;
3721 			/* It should do a default reset when loading or reloading the driver,
3722 			 * regardless of the module parameter reset_method.
3723 			 */
3724 			amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3725 			r = amdgpu_asic_reset(adev);
3726 			amdgpu_reset_method = tmp;
3727 			if (r) {
3728 				dev_err(adev->dev, "asic reset on init failed\n");
3729 				goto failed;
3730 			}
3731 		}
3732 	}
3733 
3734 	pci_enable_pcie_error_reporting(adev->pdev);
3735 
3736 	/* Post card if necessary */
3737 	if (amdgpu_device_need_post(adev)) {
3738 		if (!adev->bios) {
3739 			dev_err(adev->dev, "no vBIOS found\n");
3740 			r = -EINVAL;
3741 			goto failed;
3742 		}
3743 		DRM_INFO("GPU posting now...\n");
3744 		r = amdgpu_device_asic_init(adev);
3745 		if (r) {
3746 			dev_err(adev->dev, "gpu post error!\n");
3747 			goto failed;
3748 		}
3749 	}
3750 
3751 	if (adev->is_atom_fw) {
3752 		/* Initialize clocks */
3753 		r = amdgpu_atomfirmware_get_clock_info(adev);
3754 		if (r) {
3755 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3756 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3757 			goto failed;
3758 		}
3759 	} else {
3760 		/* Initialize clocks */
3761 		r = amdgpu_atombios_get_clock_info(adev);
3762 		if (r) {
3763 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3764 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3765 			goto failed;
3766 		}
3767 		/* init i2c buses */
3768 		if (!amdgpu_device_has_dc_support(adev))
3769 			amdgpu_atombios_i2c_init(adev);
3770 	}
3771 
3772 fence_driver_init:
3773 	/* Fence driver */
3774 	r = amdgpu_fence_driver_sw_init(adev);
3775 	if (r) {
3776 		dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3777 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3778 		goto failed;
3779 	}
3780 
3781 	/* init the mode config */
3782 	drm_mode_config_init(adev_to_drm(adev));
3783 
3784 	r = amdgpu_device_ip_init(adev);
3785 	if (r) {
3786 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3787 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3788 		goto release_ras_con;
3789 	}
3790 
3791 	amdgpu_fence_driver_hw_init(adev);
3792 
3793 	dev_info(adev->dev,
3794 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3795 			adev->gfx.config.max_shader_engines,
3796 			adev->gfx.config.max_sh_per_se,
3797 			adev->gfx.config.max_cu_per_sh,
3798 			adev->gfx.cu_info.number);
3799 
3800 	adev->accel_working = true;
3801 
3802 	amdgpu_vm_check_compute_bug(adev);
3803 
3804 	/* Initialize the buffer migration limit. */
3805 	if (amdgpu_moverate >= 0)
3806 		max_MBps = amdgpu_moverate;
3807 	else
3808 		max_MBps = 8; /* Allow 8 MB/s. */
3809 	/* Get a log2 for easy divisions. */
3810 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3811 
3812 	amdgpu_fbdev_init(adev);
3813 
3814 	r = amdgpu_pm_sysfs_init(adev);
3815 	if (r) {
3816 		adev->pm_sysfs_en = false;
3817 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3818 	} else
3819 		adev->pm_sysfs_en = true;
3820 
3821 	r = amdgpu_ucode_sysfs_init(adev);
3822 	if (r) {
3823 		adev->ucode_sysfs_en = false;
3824 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3825 	} else
3826 		adev->ucode_sysfs_en = true;
3827 
3828 	if ((amdgpu_testing & 1)) {
3829 		if (adev->accel_working)
3830 			amdgpu_test_moves(adev);
3831 		else
3832 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3833 	}
3834 	if (amdgpu_benchmarking) {
3835 		if (adev->accel_working)
3836 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3837 		else
3838 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3839 	}
3840 
3841 	/*
3842 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3843 	 * Otherwise the mgpu fan boost feature will be skipped because the
3844 	 * gpu instance count would be too low.
3845 	 */
3846 	amdgpu_register_gpu_instance(adev);
3847 
3848 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3849 	 * explicit gating rather than handling it automatically.
3850 	 */
3851 	if (!adev->gmc.xgmi.pending_reset) {
3852 		r = amdgpu_device_ip_late_init(adev);
3853 		if (r) {
3854 			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3855 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3856 			goto release_ras_con;
3857 		}
3858 		/* must succeed. */
3859 		amdgpu_ras_resume(adev);
3860 		queue_delayed_work(system_wq, &adev->delayed_init_work,
3861 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3862 	}
3863 
3864 	if (amdgpu_sriov_vf(adev)) {
3865 		amdgpu_virt_release_full_gpu(adev, true);
3866 		flush_delayed_work(&adev->delayed_init_work);
3867 	}
3868 
3869 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3870 	if (r)
3871 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3872 
3873 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3874 		r = amdgpu_pmu_init(adev);
3875 	if (r)
3876 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3877 
3878 	/* Keep the stored PCI config space at hand to restore after a sudden PCI error */
3879 	if (amdgpu_device_cache_pci_state(adev->pdev))
3880 		pci_restore_state(pdev);
3881 
3882 	/* if we have more than one VGA card, disable the amdgpu VGA resources */
3883 	/* this will fail for cards that aren't VGA class devices, just
3884 	 * ignore it */
3885 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3886 		vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3887 
3888 	if (amdgpu_device_supports_px(ddev)) {
3889 		px = true;
3890 		vga_switcheroo_register_client(adev->pdev,
3891 					       &amdgpu_switcheroo_ops, px);
3892 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3893 	}
3894 
3895 	if (adev->gmc.xgmi.pending_reset)
3896 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3897 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
3898 
3899 	return 0;
3900 
3901 release_ras_con:
3902 	if (amdgpu_sriov_vf(adev))
3903 		amdgpu_virt_release_full_gpu(adev, true);
3904 
3905 	/* failed in exclusive mode due to timeout */
3906 	if (amdgpu_sriov_vf(adev) &&
3907 		!amdgpu_sriov_runtime(adev) &&
3908 		amdgpu_virt_mmio_blocked(adev) &&
3909 		!amdgpu_virt_wait_reset(adev)) {
3910 		dev_err(adev->dev, "VF exclusive mode timeout\n");
3911 		/* Don't send request since VF is inactive. */
3912 		adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3913 		adev->virt.ops = NULL;
3914 		r = -EAGAIN;
3915 	}
3916 	amdgpu_release_ras_context(adev);
3917 
3918 failed:
3919 	amdgpu_vf_error_trans_all(adev);
3920 
3921 	return r;
3922 }
3923 
3924 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3925 {
3926 	/* Clear all CPU mappings pointing to this device */
3927 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3928 
3929 	/* Unmap all mapped bars - Doorbell, registers and VRAM */
3930 	amdgpu_device_doorbell_fini(adev);
3931 
3932 	iounmap(adev->rmmio);
3933 	adev->rmmio = NULL;
3934 	if (adev->mman.aper_base_kaddr)
3935 		iounmap(adev->mman.aper_base_kaddr);
3936 	adev->mman.aper_base_kaddr = NULL;
3937 
3938 	/* Memory manager related */
3939 	if (!adev->gmc.xgmi.connected_to_cpu) {
3940 		arch_phys_wc_del(adev->gmc.vram_mtrr);
3941 		arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3942 	}
3943 }
3944 
3945 /**
3946  * amdgpu_device_fini_hw - tear down the driver
3947  *
3948  * @adev: amdgpu_device pointer
3949  *
3950  * Tear down the driver info (all asics).
3951  * Called at driver shutdown.
3952  */
3953 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3954 {
3955 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3956 	flush_delayed_work(&adev->delayed_init_work);
3957 	if (adev->mman.initialized) {
3958 		flush_delayed_work(&adev->mman.bdev.wq);
3959 		ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3960 	}
3961 	adev->shutdown = true;
3962 
3963 	/* make sure the IB tests have finished before entering exclusive mode
3964 	 * to avoid preemption on the IB tests
3965 	 */
3966 	if (amdgpu_sriov_vf(adev)) {
3967 		amdgpu_virt_request_full_gpu(adev, false);
3968 		amdgpu_virt_fini_data_exchange(adev);
3969 	}
3970 
3971 	/* disable all interrupts */
3972 	amdgpu_irq_disable_all(adev);
3973 	if (adev->mode_info.mode_config_initialized) {
3974 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3975 			drm_helper_force_disable_all(adev_to_drm(adev));
3976 		else
3977 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3978 	}
3979 	amdgpu_fence_driver_hw_fini(adev);
3980 
3981 	if (adev->pm_sysfs_en)
3982 		amdgpu_pm_sysfs_fini(adev);
3983 	if (adev->ucode_sysfs_en)
3984 		amdgpu_ucode_sysfs_fini(adev);
3985 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3986 
3987 	amdgpu_fbdev_fini(adev);
3988 
3989 	amdgpu_irq_fini_hw(adev);
3990 
3991 	amdgpu_device_ip_fini_early(adev);
3992 
3993 	amdgpu_gart_dummy_page_fini(adev);
3994 
3995 	amdgpu_device_unmap_mmio(adev);
3996 }
3997 
3998 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3999 {
4000 	amdgpu_fence_driver_sw_fini(adev);
4001 	amdgpu_device_ip_fini(adev);
4002 	release_firmware(adev->firmware.gpu_info_fw);
4003 	adev->firmware.gpu_info_fw = NULL;
4004 	adev->accel_working = false;
4005 
4006 	amdgpu_reset_fini(adev);
4007 
4008 	/* free i2c buses */
4009 	if (!amdgpu_device_has_dc_support(adev))
4010 		amdgpu_i2c_fini(adev);
4011 
4012 	if (amdgpu_emu_mode != 1)
4013 		amdgpu_atombios_fini(adev);
4014 
4015 	kfree(adev->bios);
4016 	adev->bios = NULL;
4017 	if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4018 		vga_switcheroo_unregister_client(adev->pdev);
4019 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
4020 	}
4021 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4022 		vga_client_unregister(adev->pdev);
4023 
4024 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
4025 		amdgpu_pmu_fini(adev);
4026 	if (adev->mman.discovery_bin)
4027 		amdgpu_discovery_fini(adev);
4028 
4029 	kfree(adev->pci_state);
4030 
4031 }
4032 
4033 /**
4034  * amdgpu_device_evict_resources - evict device resources
4035  * @adev: amdgpu device object
4036  *
4037  * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4038  * of the vram memory type. Mainly used for evicting device resources
4039  * at suspend time.
4040  *
4041  */
4042 static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
4043 {
4044 	/* No need to evict vram on APUs for suspend to ram or s2idle */
4045 	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4046 		return;
4047 
4048 	if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
4049 		DRM_WARN("evicting device resources failed\n");
4050 
4051 }
4052 
4053 /*
4054  * Suspend & resume.
4055  */
4056 /**
4057  * amdgpu_device_suspend - initiate device suspend
4058  *
4059  * @dev: drm dev pointer
4060  * @fbcon: notify the fbdev of suspend
4061  *
4062  * Puts the hw in the suspend state (all asics).
4063  * Returns 0 for success or an error on failure.
4064  * Called at driver suspend.
4065  */
4066 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4067 {
4068 	struct amdgpu_device *adev = drm_to_adev(dev);
4069 	int r = 0;
4070 
4071 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4072 		return 0;
4073 
4074 	adev->in_suspend = true;
4075 
4076 	if (amdgpu_sriov_vf(adev)) {
4077 		amdgpu_virt_fini_data_exchange(adev);
4078 		r = amdgpu_virt_request_full_gpu(adev, false);
4079 		if (r)
4080 			return r;
4081 	}
4082 
4083 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4084 		DRM_WARN("smart shift update failed\n");
4085 
4086 	drm_kms_helper_poll_disable(dev);
4087 
4088 	if (fbcon)
4089 		amdgpu_fbdev_set_suspend(adev, 1);
4090 
4091 	cancel_delayed_work_sync(&adev->delayed_init_work);
4092 
4093 	amdgpu_ras_suspend(adev);
4094 
4095 	amdgpu_device_ip_suspend_phase1(adev);
4096 
4097 	if (!adev->in_s0ix)
4098 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4099 
4100 	/* First evict vram memory */
4101 	amdgpu_device_evict_resources(adev);
4102 
4103 	amdgpu_fence_driver_hw_fini(adev);
4104 
4105 	amdgpu_device_ip_suspend_phase2(adev);
4106 	/* This second call to evict device resources is to evict
4107 	 * the gart page table using the CPU.
4108 	 */
4109 	amdgpu_device_evict_resources(adev);
4110 
4111 	if (amdgpu_sriov_vf(adev))
4112 		amdgpu_virt_release_full_gpu(adev, false);
4113 
4114 	return 0;
4115 }
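/*
 * Note: resources are evicted twice in amdgpu_device_suspend() on purpose -
 * the first pass moves VRAM BOs out while the DMA engines are still usable,
 * and the second pass (after phase2 suspend) falls back to the CPU and mainly
 * picks up the GART page table.
 */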
4116 
4117 /**
4118  * amdgpu_device_resume - initiate device resume
4119  *
4120  * @dev: drm dev pointer
4121  * @fbcon: notify the fbdev of resume
4122  *
4123  * Bring the hw back to operating state (all asics).
4124  * Returns 0 for success or an error on failure.
4125  * Called at driver resume.
4126  */
4127 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4128 {
4129 	struct amdgpu_device *adev = drm_to_adev(dev);
4130 	int r = 0;
4131 
4132 	if (amdgpu_sriov_vf(adev)) {
4133 		r = amdgpu_virt_request_full_gpu(adev, true);
4134 		if (r)
4135 			return r;
4136 	}
4137 
4138 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4139 		return 0;
4140 
4141 	if (adev->in_s0ix)
4142 		amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
4143 
4144 	/* post card */
4145 	if (amdgpu_device_need_post(adev)) {
4146 		r = amdgpu_device_asic_init(adev);
4147 		if (r)
4148 			dev_err(adev->dev, "amdgpu asic init failed\n");
4149 	}
4150 
4151 	r = amdgpu_device_ip_resume(adev);
4152 
4153 	/* no matter what r is, always need to properly release full GPU */
4154 	if (amdgpu_sriov_vf(adev)) {
4155 		amdgpu_virt_init_data_exchange(adev);
4156 		amdgpu_virt_release_full_gpu(adev, true);
4157 	}
4158 
4159 	if (r) {
4160 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4161 		return r;
4162 	}
4163 	amdgpu_fence_driver_hw_init(adev);
4164 
4165 	r = amdgpu_device_ip_late_init(adev);
4166 	if (r)
4167 		return r;
4168 
4169 	queue_delayed_work(system_wq, &adev->delayed_init_work,
4170 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
4171 
4172 	if (!adev->in_s0ix) {
4173 		r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4174 		if (r)
4175 			return r;
4176 	}
4177 
4178 	/* Make sure IB tests flushed */
4179 	flush_delayed_work(&adev->delayed_init_work);
4180 
4181 	if (adev->in_s0ix) {
4182 		/* Re-enable gfxoff here, since it was disabled for IP resume
4183 		 * in amdgpu_device_ip_resume_phase2().
4184 		 */
4185 		amdgpu_gfx_off_ctrl(adev, true);
4186 		DRM_DEBUG("will enable gfxoff for the mission mode\n");
4187 	}
4188 	if (fbcon)
4189 		amdgpu_fbdev_set_suspend(adev, 0);
4190 
4191 	drm_kms_helper_poll_enable(dev);
4192 
4193 	amdgpu_ras_resume(adev);
4194 
4195 	/*
4196 	 * Most of the connector probing functions try to acquire runtime pm
4197 	 * refs to ensure that the GPU is powered on when connector polling is
4198 	 * performed. Since we're calling this from a runtime PM callback,
4199 	 * trying to acquire rpm refs will cause us to deadlock.
4200 	 *
4201 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
4202 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
4203 	 */
4204 #ifdef CONFIG_PM
4205 	dev->dev->power.disable_depth++;
4206 #endif
4207 	if (!amdgpu_device_has_dc_support(adev))
4208 		drm_helper_hpd_irq_event(dev);
4209 	else
4210 		drm_kms_helper_hotplug_event(dev);
4211 #ifdef CONFIG_PM
4212 	dev->dev->power.disable_depth--;
4213 #endif
4214 	adev->in_suspend = false;
4215 
4216 	if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4217 		DRM_WARN("smart shift update failed\n");
4218 
4219 	return 0;
4220 }
4221 
4222 /**
4223  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4224  *
4225  * @adev: amdgpu_device pointer
4226  *
4227  * The list of all the hardware IPs that make up the asic is walked and
4228  * the check_soft_reset callbacks are run.  check_soft_reset determines
4229  * if the asic is still hung or not.
4230  * Returns true if any of the IPs are still in a hung state, false if not.
4231  */
4232 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4233 {
4234 	int i;
4235 	bool asic_hang = false;
4236 
4237 	if (amdgpu_sriov_vf(adev))
4238 		return true;
4239 
4240 	if (amdgpu_asic_need_full_reset(adev))
4241 		return true;
4242 
4243 	for (i = 0; i < adev->num_ip_blocks; i++) {
4244 		if (!adev->ip_blocks[i].status.valid)
4245 			continue;
4246 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4247 			adev->ip_blocks[i].status.hang =
4248 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4249 		if (adev->ip_blocks[i].status.hang) {
4250 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4251 			asic_hang = true;
4252 		}
4253 	}
4254 	return asic_hang;
4255 }
4256 
4257 /**
4258  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4259  *
4260  * @adev: amdgpu_device pointer
4261  *
4262  * The list of all the hardware IPs that make up the asic is walked and the
4263  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4264  * handles any IP specific hardware or software state changes that are
4265  * necessary for a soft reset to succeed.
4266  * Returns 0 on success, negative error code on failure.
4267  */
4268 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4269 {
4270 	int i, r = 0;
4271 
4272 	for (i = 0; i < adev->num_ip_blocks; i++) {
4273 		if (!adev->ip_blocks[i].status.valid)
4274 			continue;
4275 		if (adev->ip_blocks[i].status.hang &&
4276 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4277 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4278 			if (r)
4279 				return r;
4280 		}
4281 	}
4282 
4283 	return 0;
4284 }
4285 
4286 /**
4287  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4288  *
4289  * @adev: amdgpu_device pointer
4290  *
4291  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4292  * reset is necessary to recover.
4293  * Returns true if a full asic reset is required, false if not.
4294  */
4295 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4296 {
4297 	int i;
4298 
4299 	if (amdgpu_asic_need_full_reset(adev))
4300 		return true;
4301 
4302 	for (i = 0; i < adev->num_ip_blocks; i++) {
4303 		if (!adev->ip_blocks[i].status.valid)
4304 			continue;
4305 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4306 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4307 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4308 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4309 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4310 			if (adev->ip_blocks[i].status.hang) {
4311 				dev_info(adev->dev, "Some block need full reset!\n");
4312 				return true;
4313 			}
4314 		}
4315 	}
4316 	return false;
4317 }
4318 
4319 /**
4320  * amdgpu_device_ip_soft_reset - do a soft reset
4321  *
4322  * @adev: amdgpu_device pointer
4323  *
4324  * The list of all the hardware IPs that make up the asic is walked and the
4325  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4326  * IP specific hardware or software state changes that are necessary to soft
4327  * reset the IP.
4328  * Returns 0 on success, negative error code on failure.
4329  */
4330 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4331 {
4332 	int i, r = 0;
4333 
4334 	for (i = 0; i < adev->num_ip_blocks; i++) {
4335 		if (!adev->ip_blocks[i].status.valid)
4336 			continue;
4337 		if (adev->ip_blocks[i].status.hang &&
4338 		    adev->ip_blocks[i].version->funcs->soft_reset) {
4339 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4340 			if (r)
4341 				return r;
4342 		}
4343 	}
4344 
4345 	return 0;
4346 }
4347 
4348 /**
4349  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4350  *
4351  * @adev: amdgpu_device pointer
4352  *
4353  * The list of all the hardware IPs that make up the asic is walked and the
4354  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4355  * handles any IP specific hardware or software state changes that are
4356  * necessary after the IP has been soft reset.
4357  * Returns 0 on success, negative error code on failure.
4358  */
4359 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4360 {
4361 	int i, r = 0;
4362 
4363 	for (i = 0; i < adev->num_ip_blocks; i++) {
4364 		if (!adev->ip_blocks[i].status.valid)
4365 			continue;
4366 		if (adev->ip_blocks[i].status.hang &&
4367 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
4368 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4369 		if (r)
4370 			return r;
4371 	}
4372 
4373 	return 0;
4374 }
4375 
4376 /**
4377  * amdgpu_device_recover_vram - Recover some VRAM contents
4378  *
4379  * @adev: amdgpu_device pointer
4380  *
4381  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4382  * restore things like GPUVM page tables after a GPU reset where
4383  * the contents of VRAM might be lost.
4384  *
4385  * Returns:
4386  * 0 on success, negative error code on failure.
4387  */
4388 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4389 {
4390 	struct dma_fence *fence = NULL, *next = NULL;
4391 	struct amdgpu_bo *shadow;
4392 	struct amdgpu_bo_vm *vmbo;
4393 	long r = 1, tmo;
4394 
4395 	if (amdgpu_sriov_runtime(adev))
4396 		tmo = msecs_to_jiffies(8000);
4397 	else
4398 		tmo = msecs_to_jiffies(100);
4399 
4400 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4401 	mutex_lock(&adev->shadow_list_lock);
4402 	list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4403 		/* If vm is compute context or adev is APU, shadow will be NULL */
4404 		if (!vmbo->shadow)
4405 			continue;
4406 		shadow = vmbo->shadow;
4407 
4408 		/* No need to recover an evicted BO */
4409 		if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4410 		    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4411 		    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4412 			continue;
4413 
4414 		r = amdgpu_bo_restore_shadow(shadow, &next);
4415 		if (r)
4416 			break;
4417 
4418 		if (fence) {
4419 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4420 			dma_fence_put(fence);
4421 			fence = next;
4422 			if (tmo == 0) {
4423 				r = -ETIMEDOUT;
4424 				break;
4425 			} else if (tmo < 0) {
4426 				r = tmo;
4427 				break;
4428 			}
4429 		} else {
4430 			fence = next;
4431 		}
4432 	}
4433 	mutex_unlock(&adev->shadow_list_lock);
4434 
4435 	if (fence)
4436 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4437 	dma_fence_put(fence);
4438 
4439 	if (r < 0 || tmo <= 0) {
4440 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4441 		return -EIO;
4442 	}
4443 
4444 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4445 	return 0;
4446 }
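/*
 * Note: the shadow walk above is pipelined - the restore for the current
 * shadow BO is issued first, and only then do we wait on the previous
 * restore's fence, so one copy can be in flight while the prior one
 * completes, with the remaining timeout budget shared across the list.
 */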
4447 
4448 
4449 /**
4450  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4451  *
4452  * @adev: amdgpu_device pointer
4453  * @from_hypervisor: request from hypervisor
4454  *
4455  * Do a VF FLR and reinitialize the ASIC.
4456  * Returns 0 on success, non-zero on failure.
4457  */
4458 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4459 				     bool from_hypervisor)
4460 {
4461 	int r;
4462 
4463 	if (from_hypervisor)
4464 		r = amdgpu_virt_request_full_gpu(adev, true);
4465 	else
4466 		r = amdgpu_virt_reset_gpu(adev);
4467 	if (r)
4468 		return r;
4469 
4470 	amdgpu_amdkfd_pre_reset(adev);
4471 
4472 	/* Resume IP prior to SMC */
4473 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4474 	if (r)
4475 		goto error;
4476 
4477 	amdgpu_virt_init_data_exchange(adev);
4478 	/* we need to recover the gart prior to running SMC/CP/SDMA resume */
4479 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4480 
4481 	r = amdgpu_device_fw_loading(adev);
4482 	if (r)
4483 		return r;
4484 
4485 	/* now we are okay to resume SMC/CP/SDMA */
4486 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4487 	if (r)
4488 		goto error;
4489 
4490 	amdgpu_irq_gpu_reset_resume_helper(adev);
4491 	r = amdgpu_ib_ring_tests(adev);
4492 	amdgpu_amdkfd_post_reset(adev);
4493 
4494 error:
4495 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4496 		amdgpu_inc_vram_lost(adev);
4497 		r = amdgpu_device_recover_vram(adev);
4498 	}
4499 	amdgpu_virt_release_full_gpu(adev, true);
4500 
4501 	return r;
4502 }
4503 
4504 /**
4505  * amdgpu_device_has_job_running - check if there is any job in mirror list
4506  *
4507  * @adev: amdgpu_device pointer
4508  *
4509  * check if there is any job in mirror list
4510  */
4511 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4512 {
4513 	int i;
4514 	struct drm_sched_job *job;
4515 
4516 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4517 		struct amdgpu_ring *ring = adev->rings[i];
4518 
4519 		if (!ring || !ring->sched.thread)
4520 			continue;
4521 
4522 		spin_lock(&ring->sched.job_list_lock);
4523 		job = list_first_entry_or_null(&ring->sched.pending_list,
4524 					       struct drm_sched_job, list);
4525 		spin_unlock(&ring->sched.job_list_lock);
4526 		if (job)
4527 			return true;
4528 	}
4529 	return false;
4530 }
4531 
4532 /**
4533  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4534  *
4535  * @adev: amdgpu_device pointer
4536  *
4537  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4538  * a hung GPU.
4539  */
4540 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4541 {
4542 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4543 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4544 		return false;
4545 	}
4546 
4547 	if (amdgpu_gpu_recovery == 0)
4548 		goto disabled;
4549 
4550 	if (amdgpu_sriov_vf(adev))
4551 		return true;
4552 
4553 	if (amdgpu_gpu_recovery == -1) {
4554 		switch (adev->asic_type) {
4555 		case CHIP_BONAIRE:
4556 		case CHIP_HAWAII:
4557 		case CHIP_TOPAZ:
4558 		case CHIP_TONGA:
4559 		case CHIP_FIJI:
4560 		case CHIP_POLARIS10:
4561 		case CHIP_POLARIS11:
4562 		case CHIP_POLARIS12:
4563 		case CHIP_VEGAM:
4564 		case CHIP_VEGA20:
4565 		case CHIP_VEGA10:
4566 		case CHIP_VEGA12:
4567 		case CHIP_RAVEN:
4568 		case CHIP_ARCTURUS:
4569 		case CHIP_RENOIR:
4570 		case CHIP_NAVI10:
4571 		case CHIP_NAVI14:
4572 		case CHIP_NAVI12:
4573 		case CHIP_SIENNA_CICHLID:
4574 		case CHIP_NAVY_FLOUNDER:
4575 		case CHIP_DIMGREY_CAVEFISH:
4576 		case CHIP_BEIGE_GOBY:
4577 		case CHIP_VANGOGH:
4578 		case CHIP_ALDEBARAN:
4579 			break;
4580 		default:
4581 			goto disabled;
4582 		}
4583 	}
4584 
4585 	return true;
4586 
4587 disabled:
4588 		dev_info(adev->dev, "GPU recovery disabled.\n");
4589 		return false;
4590 }
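/*
 * Note: amdgpu_gpu_recovery mirrors the amdgpu.gpu_recovery module parameter:
 * 0 disables recovery outright, -1 (auto) enables it only for the ASICs listed
 * in the switch above (and always for SR-IOV VFs), and any other value enables
 * it unconditionally once a real hang has been detected.
 */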
4591 
4592 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4593 {
4594 	u32 i;
4595 	int ret = 0;
4596 
4597 	amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4598 
4599 	dev_info(adev->dev, "GPU mode1 reset\n");
4600 
4601 	/* disable BM */
4602 	pci_clear_master(adev->pdev);
4603 
4604 	amdgpu_device_cache_pci_state(adev->pdev);
4605 
4606 	if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4607 		dev_info(adev->dev, "GPU smu mode1 reset\n");
4608 		ret = amdgpu_dpm_mode1_reset(adev);
4609 	} else {
4610 		dev_info(adev->dev, "GPU psp mode1 reset\n");
4611 		ret = psp_gpu_reset(adev);
4612 	}
4613 
4614 	if (ret)
4615 		dev_err(adev->dev, "GPU mode1 reset failed\n");
4616 
4617 	amdgpu_device_load_pci_state(adev->pdev);
4618 
4619 	/* wait for asic to come out of reset */
4620 	for (i = 0; i < adev->usec_timeout; i++) {
4621 		u32 memsize = adev->nbio.funcs->get_memsize(adev);
4622 
4623 		if (memsize != 0xffffffff)
4624 			break;
4625 		udelay(1);
4626 	}
4627 
4628 	amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4629 	return ret;
4630 }
4631 
4632 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4633 				 struct amdgpu_reset_context *reset_context)
4634 {
4635 	int i, j, r = 0;
4636 	struct amdgpu_job *job = NULL;
4637 	bool need_full_reset =
4638 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4639 
4640 	if (reset_context->reset_req_dev == adev)
4641 		job = reset_context->job;
4642 
4643 	if (amdgpu_sriov_vf(adev)) {
4644 		/* stop the data exchange thread */
4645 		amdgpu_virt_fini_data_exchange(adev);
4646 	}
4647 
4648 	/* block all schedulers and reset given job's ring */
4649 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4650 		struct amdgpu_ring *ring = adev->rings[i];
4651 
4652 		if (!ring || !ring->sched.thread)
4653 			continue;
4654 
4655 		/* clear job fences from fence drv to avoid force_completion
4656 		 * leaving NULL and vm flush fences in fence drv */
4657 		for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) {
4658 			struct dma_fence *old, **ptr;
4659 
4660 			ptr = &ring->fence_drv.fences[j];
4661 			old = rcu_dereference_protected(*ptr, 1);
4662 			if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
4663 				RCU_INIT_POINTER(*ptr, NULL);
4664 			}
4665 		}
4666 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4667 		amdgpu_fence_driver_force_completion(ring);
4668 	}
4669 
4670 	if (job && job->vm)
4671 		drm_sched_increase_karma(&job->base);
4672 
4673 	r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4674 	/* If reset handler not implemented, continue; otherwise return */
4675 	if (r == -ENOSYS)
4676 		r = 0;
4677 	else
4678 		return r;
4679 
4680 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4681 	if (!amdgpu_sriov_vf(adev)) {
4682 
4683 		if (!need_full_reset)
4684 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4685 
4686 		if (!need_full_reset) {
4687 			amdgpu_device_ip_pre_soft_reset(adev);
4688 			r = amdgpu_device_ip_soft_reset(adev);
4689 			amdgpu_device_ip_post_soft_reset(adev);
4690 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4691 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4692 				need_full_reset = true;
4693 			}
4694 		}
4695 
4696 		if (need_full_reset)
4697 			r = amdgpu_device_ip_suspend(adev);
4698 		if (need_full_reset)
4699 			set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4700 		else
4701 			clear_bit(AMDGPU_NEED_FULL_RESET,
4702 				  &reset_context->flags);
4703 	}
4704 
4705 	return r;
4706 }
4707 
4708 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4709 			 struct amdgpu_reset_context *reset_context)
4710 {
4711 	struct amdgpu_device *tmp_adev = NULL;
4712 	bool need_full_reset, skip_hw_reset, vram_lost = false;
4713 	int r = 0;
4714 
4715 	/* Try reset handler method first */
4716 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4717 				    reset_list);
4718 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4719 	/* If reset handler not implemented, continue; otherwise return */
4720 	if (r == -ENOSYS)
4721 		r = 0;
4722 	else
4723 		return r;
4724 
4725 	/* Reset handler not implemented, use the default method */
4726 	need_full_reset =
4727 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4728 	skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4729 
4730 	/*
4731 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4732 	 * to allow proper link negotiation in FW (within 1 sec)
4733 	 */
4734 	if (!skip_hw_reset && need_full_reset) {
4735 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4736 			/* For XGMI run all resets in parallel to speed up the process */
4737 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4738 				tmp_adev->gmc.xgmi.pending_reset = false;
4739 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4740 					r = -EALREADY;
4741 			} else
4742 				r = amdgpu_asic_reset(tmp_adev);
4743 
4744 			if (r) {
4745 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4746 					 r, adev_to_drm(tmp_adev)->unique);
4747 				break;
4748 			}
4749 		}
4750 
4751 		/* For XGMI wait for all resets to complete before proceed */
4752 		if (!r) {
4753 			list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4754 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4755 					flush_work(&tmp_adev->xgmi_reset_work);
4756 					r = tmp_adev->asic_reset_res;
4757 					if (r)
4758 						break;
4759 				}
4760 			}
4761 		}
4762 	}
4763 
4764 	if (!r && amdgpu_ras_intr_triggered()) {
4765 		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4766 			if (tmp_adev->mmhub.ras_funcs &&
4767 			    tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4768 				tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
4769 		}
4770 
4771 		amdgpu_ras_intr_cleared();
4772 	}
4773 
4774 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4775 		if (need_full_reset) {
4776 			/* post card */
4777 			r = amdgpu_device_asic_init(tmp_adev);
4778 			if (r) {
4779 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4780 			} else {
4781 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4782 				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4783 				if (r)
4784 					goto out;
4785 
4786 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4787 				if (r)
4788 					goto out;
4789 
4790 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4791 				if (vram_lost) {
4792 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4793 					amdgpu_inc_vram_lost(tmp_adev);
4794 				}
4795 
4796 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4797 				if (r)
4798 					goto out;
4799 
4800 				r = amdgpu_device_fw_loading(tmp_adev);
4801 				if (r)
4802 					return r;
4803 
4804 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4805 				if (r)
4806 					goto out;
4807 
4808 				if (vram_lost)
4809 					amdgpu_device_fill_reset_magic(tmp_adev);
4810 
4811 				/*
4812 				 * Add this ASIC as tracked, as the reset has
4813 				 * already completed successfully.
4814 				 */
4815 				amdgpu_register_gpu_instance(tmp_adev);
4816 
4817 				if (!reset_context->hive &&
4818 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4819 					amdgpu_xgmi_add_device(tmp_adev);
4820 
4821 				r = amdgpu_device_ip_late_init(tmp_adev);
4822 				if (r)
4823 					goto out;
4824 
4825 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4826 
4827 				/*
4828 				 * The GPU enters a bad state once the number of
4829 				 * faulty pages flagged by ECC reaches the threshold,
4830 				 * and ras recovery is scheduled next. So add a check
4831 				 * here to break recovery if it indeed exceeds the
4832 				 * bad page threshold, and remind the user to either
4833 				 * retire this GPU or set a bigger
4834 				 * bad_page_threshold value to fix this when
4835 				 * probing the driver again.
4836 				 */
4837 				if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4838 					/* must succeed. */
4839 					amdgpu_ras_resume(tmp_adev);
4840 				} else {
4841 					r = -EINVAL;
4842 					goto out;
4843 				}
4844 
4845 				/* Update PSP FW topology after reset */
4846 				if (reset_context->hive &&
4847 				    tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4848 					r = amdgpu_xgmi_update_topology(
4849 						reset_context->hive, tmp_adev);
4850 			}
4851 		}
4852 
4853 out:
4854 		if (!r) {
4855 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4856 			r = amdgpu_ib_ring_tests(tmp_adev);
4857 			if (r) {
4858 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4859 				need_full_reset = true;
4860 				r = -EAGAIN;
4861 				goto end;
4862 			}
4863 		}
4864 
4865 		if (!r)
4866 			r = amdgpu_device_recover_vram(tmp_adev);
4867 		else
4868 			tmp_adev->asic_reset_res = r;
4869 	}
4870 
4871 end:
4872 	if (need_full_reset)
4873 		set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4874 	else
4875 		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4876 	return r;
4877 }
4878 
4879 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4880 				struct amdgpu_hive_info *hive)
4881 {
4882 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4883 		return false;
4884 
4885 	if (hive) {
4886 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4887 	} else {
4888 		down_write(&adev->reset_sem);
4889 	}
4890 
4891 	switch (amdgpu_asic_reset_method(adev)) {
4892 	case AMD_RESET_METHOD_MODE1:
4893 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4894 		break;
4895 	case AMD_RESET_METHOD_MODE2:
4896 		adev->mp1_state = PP_MP1_STATE_RESET;
4897 		break;
4898 	default:
4899 		adev->mp1_state = PP_MP1_STATE_NONE;
4900 		break;
4901 	}
4902 
4903 	return true;
4904 }
4905 
4906 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4907 {
4908 	amdgpu_vf_error_trans_all(adev);
4909 	adev->mp1_state = PP_MP1_STATE_NONE;
4910 	atomic_set(&adev->in_gpu_reset, 0);
4911 	up_write(&adev->reset_sem);
4912 }
4913 
4914 /*
4915  * Lock a list of amdgpu devices in a hive safely; if this is not a hive
4916  * with multiple nodes, it behaves like amdgpu_device_lock_adev.
4917  *
4918  * Unlocking won't require a roll back.
4919  */
4920 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4921 {
4922 	struct amdgpu_device *tmp_adev = NULL;
4923 
4924 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4925 		if (!hive) {
4926 			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4927 			return -ENODEV;
4928 		}
4929 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4930 			if (!amdgpu_device_lock_adev(tmp_adev, hive))
4931 				goto roll_back;
4932 		}
4933 	} else if (!amdgpu_device_lock_adev(adev, hive))
4934 		return -EAGAIN;
4935 
4936 	return 0;
4937 roll_back:
4938 	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4939 		/*
4940 		 * if the locking iteration breaks in the middle of a hive,
4941 		 * it may mean there is a race issue,
4942 		 * or that a hive device locked up independently.
4943 		 * we may or may not be in trouble, so try to roll back
4944 		 * the lock and give out a warning.
4945 		 */
4946 		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4947 		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4948 			amdgpu_device_unlock_adev(tmp_adev);
4949 		}
4950 	}
4951 	return -EAGAIN;
4952 }
4953 
4954 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4955 {
4956 	struct pci_dev *p = NULL;
4957 
4958 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4959 			adev->pdev->bus->number, 1);
4960 	if (p) {
4961 		pm_runtime_enable(&(p->dev));
4962 		pm_runtime_resume(&(p->dev));
4963 	}
4964 
4965 	pci_dev_put(p);
4966 }
4967 
4968 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4969 {
4970 	enum amd_reset_method reset_method;
4971 	struct pci_dev *p = NULL;
4972 	u64 expires;
4973 
4974 	/*
4975 	 * For now, only BACO and mode1 reset are confirmed
4976 	 * to suffer from the audio issue if not properly suspended.
4977 	 */
4978 	reset_method = amdgpu_asic_reset_method(adev);
4979 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4980 	     (reset_method != AMD_RESET_METHOD_MODE1))
4981 		return -EINVAL;
4982 
4983 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4984 			adev->pdev->bus->number, 1);
4985 	if (!p)
4986 		return -ENODEV;
4987 
4988 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4989 	if (!expires)
4990 		/*
4991 		 * If we cannot get the audio device autosuspend delay,
4992 		 * a fixed 4S interval will be used. Since 3S is the
4993 		 * audio controller's default autosuspend delay setting,
4994 		 * the 4S used here is guaranteed to cover it.
4995 		 */
4996 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4997 
4998 	while (!pm_runtime_status_suspended(&(p->dev))) {
4999 		if (!pm_runtime_suspend(&(p->dev)))
5000 			break;
5001 
5002 		if (expires < ktime_get_mono_fast_ns()) {
5003 			dev_warn(adev->dev, "failed to suspend display audio\n");
5004 			pci_dev_put(p);
5005 			/* TODO: abort the succeeding gpu reset? */
5006 			return -ETIMEDOUT;
5007 		}
5008 	}
5009 
5010 	pm_runtime_disable(&(p->dev));
5011 
5012 	pci_dev_put(p);
5013 	return 0;
5014 }
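/*
 * Note: the "display audio" device handled above is PCI function 1 of the GPU
 * (the HDMI/DP audio controller); it is forced into runtime suspend before a
 * BACO or mode1 reset and re-enabled afterwards via
 * amdgpu_device_resume_display_audio().
 */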
5015 
5016 static void amdgpu_device_recheck_guilty_jobs(
5017 	struct amdgpu_device *adev, struct list_head *device_list_handle,
5018 	struct amdgpu_reset_context *reset_context)
5019 {
5020 	int i, r = 0;
5021 
5022 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5023 		struct amdgpu_ring *ring = adev->rings[i];
5024 		int ret = 0;
5025 		struct drm_sched_job *s_job;
5026 
5027 		if (!ring || !ring->sched.thread)
5028 			continue;
5029 
5030 		s_job = list_first_entry_or_null(&ring->sched.pending_list,
5031 				struct drm_sched_job, list);
5032 		if (s_job == NULL)
5033 			continue;
5034 
5035 		/* clear the job's guilty flag and let the following step decide the real offender */
5036 		drm_sched_reset_karma(s_job);
5037 		drm_sched_resubmit_jobs_ext(&ring->sched, 1);
5038 
5039 		ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
5040 		if (ret == 0) { /* timeout */
5041 			DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
5042 						ring->sched.name, s_job->id);
5043 
5044 			/* set guilty */
5045 			drm_sched_increase_karma(s_job);
5046 retry:
5047 			/* do hw reset */
5048 			if (amdgpu_sriov_vf(adev)) {
5049 				amdgpu_virt_fini_data_exchange(adev);
5050 				r = amdgpu_device_reset_sriov(adev, false);
5051 				if (r)
5052 					adev->asic_reset_res = r;
5053 			} else {
5054 				clear_bit(AMDGPU_SKIP_HW_RESET,
5055 					  &reset_context->flags);
5056 				r = amdgpu_do_asic_reset(device_list_handle,
5057 							 reset_context);
5058 				if (r && r == -EAGAIN)
5059 					goto retry;
5060 			}
5061 
5062 			/*
5063 			 * bump the reset counter so that the following
5064 			 * resubmitted job can flush the vmid
5065 			 */
5066 			atomic_inc(&adev->gpu_reset_counter);
5067 			continue;
5068 		}
5069 
5070 		/* got the hw fence, signal finished fence */
5071 		atomic_dec(ring->sched.score);
5072 		dma_fence_get(&s_job->s_fence->finished);
5073 		dma_fence_signal(&s_job->s_fence->finished);
5074 		dma_fence_put(&s_job->s_fence->finished);
5075 
5076 		/* remove node from list and free the job */
5077 		spin_lock(&ring->sched.job_list_lock);
5078 		list_del_init(&s_job->list);
5079 		spin_unlock(&ring->sched.job_list_lock);
5080 		ring->sched.ops->free_job(s_job);
5081 	}
5082 }
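/*
 * Note: the recheck above clears the karma of the first pending job on each
 * ring and resubmits it alone; if it then fails to signal within the ring's
 * scheduler timeout it is treated as the real offender, its karma is raised
 * again and a hardware reset is performed before moving on to the next ring.
 */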
5083 
5084 /**
5085  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5086  *
5087  * @adev: amdgpu_device pointer
5088  * @job: the job which triggered the hang
5089  *
5090  * Attempt to reset the GPU if it has hung (all asics).
5091  * Attempt to do a soft-reset or full-reset and reinitialize the ASIC.
5092  * Returns 0 for success or an error on failure.
5093  */
5094 
5095 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5096 			      struct amdgpu_job *job)
5097 {
5098 	struct list_head device_list, *device_list_handle =  NULL;
5099 	bool job_signaled = false;
5100 	struct amdgpu_hive_info *hive = NULL;
5101 	struct amdgpu_device *tmp_adev = NULL;
5102 	int i, r = 0;
5103 	bool need_emergency_restart = false;
5104 	bool audio_suspended = false;
5105 	int tmp_vram_lost_counter;
5106 	struct amdgpu_reset_context reset_context;
5107 
5108 	memset(&reset_context, 0, sizeof(reset_context));
5109 
5110 	/*
5111 	 * Special case: RAS triggered and full reset isn't supported
5112 	 */
5113 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5114 
5115 	/*
5116 	 * Flush RAM to disk so that after reboot
5117 	 * the user can read the log and see why the system rebooted.
5118 	 */
5119 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5120 		amdgpu_ras_get_context(adev)->reboot) {
5121 		DRM_WARN("Emergency reboot.");
5122 
5123 		ksys_sync_helper();
5124 		emergency_restart();
5125 	}
5126 
5127 	dev_info(adev->dev, "GPU %s begin!\n",
5128 		need_emergency_restart ? "jobs stop":"reset");
5129 
5130 	/*
5131 	 * Here we trylock to avoid a chain of resets executing while this
5132 	 * TO handler is running, triggered either by jobs on different adevs
5133 	 * in an XGMI hive or by jobs on different schedulers of the same
5134 	 * device. We always reset all schedulers of a device and all devices
5135 	 * in an XGMI hive, so that should take care of them too.
5136 	 */
5137 	hive = amdgpu_get_xgmi_hive(adev);
5138 	if (hive) {
5139 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
5140 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
5141 				job ? job->base.id : -1, hive->hive_id);
5142 			amdgpu_put_xgmi_hive(hive);
5143 			if (job && job->vm)
5144 				drm_sched_increase_karma(&job->base);
5145 			return 0;
5146 		}
5147 		mutex_lock(&hive->hive_lock);
5148 	}
5149 
5150 	reset_context.method = AMD_RESET_METHOD_NONE;
5151 	reset_context.reset_req_dev = adev;
5152 	reset_context.job = job;
5153 	reset_context.hive = hive;
5154 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5155 
5156 	/*
5157 	 * Lock the device before we try to operate on the linked list.
5158 	 * If we didn't get the device lock, don't touch the linked list
5159 	 * since others may be iterating over it.
5160 	 */
5161 	r = amdgpu_device_lock_hive_adev(adev, hive);
5162 	if (r) {
5163 		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
5164 					job ? job->base.id : -1);
5165 
5166 		/* even though we skipped this reset, we still need to mark the job as guilty */
5167 		if (job && job->vm)
5168 			drm_sched_increase_karma(&job->base);
5169 		goto skip_recovery;
5170 	}
5171 
5172 	/*
5173 	 * Build the list of devices to reset.
5174 	 * In XGMI hive mode, reorder the device list so that
5175 	 * adev is in the first position.
5176 	 */
5177 	INIT_LIST_HEAD(&device_list);
5178 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5179 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5180 			list_add_tail(&tmp_adev->reset_list, &device_list);
5181 		if (!list_is_first(&adev->reset_list, &device_list))
5182 			list_rotate_to_front(&adev->reset_list, &device_list);
5183 		device_list_handle = &device_list;
5184 	} else {
5185 		list_add_tail(&adev->reset_list, &device_list);
5186 		device_list_handle = &device_list;
5187 	}
5188 
5189 	/* block all schedulers and reset given job's ring */
5190 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5191 		/*
5192 		 * Try to put the audio codec into suspend state
5193 		 * before the GPU reset starts.
5194 		 *
5195 		 * Because the graphics device shares its power
5196 		 * domain with the AZ power domain, skipping this
5197 		 * step may change the audio hardware from behind
5198 		 * the audio driver's back, which can trigger
5199 		 * audio codec errors.
5200 		 */
5201 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
5202 			audio_suspended = true;
5203 
5204 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
5205 
5206 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5207 
5208 		if (!amdgpu_sriov_vf(tmp_adev))
5209 			amdgpu_amdkfd_pre_reset(tmp_adev);
5210 
5211 		/*
5212 		 * Mark the ASICs to be reset as untracked first,
5213 		 * and add them back after the reset completes.
5214 		 */
5215 		amdgpu_unregister_gpu_instance(tmp_adev);
5216 
5217 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
5218 
5219 		/* disable ras on ALL IPs */
5220 		if (!need_emergency_restart &&
5221 		      amdgpu_device_ip_need_full_reset(tmp_adev))
5222 			amdgpu_ras_suspend(tmp_adev);
5223 
5224 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5225 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5226 
5227 			if (!ring || !ring->sched.thread)
5228 				continue;
5229 
5230 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5231 
5232 			if (need_emergency_restart)
5233 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5234 		}
5235 		atomic_inc(&tmp_adev->gpu_reset_counter);
5236 	}
5237 
5238 	if (need_emergency_restart)
5239 		goto skip_sched_resume;
5240 
5241 	/*
5242 	 * Must check whether the guilty job has already signaled here,
5243 	 * since after this point all old HW fences are force signaled.
5244 	 *
5245 	 * job->base holds a reference to the parent fence.
5246 	 */
5247 	if (job && job->base.s_fence->parent &&
5248 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
5249 		job_signaled = true;
5250 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5251 		goto skip_hw_reset;
5252 	}
5253 
5254 retry:	/* Rest of adevs pre-ASIC reset from the XGMI hive. */
5255 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5256 		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
5257 		/* TODO: Should we stop? */
5258 		if (r) {
5259 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5260 				  r, adev_to_drm(tmp_adev)->unique);
5261 			tmp_adev->asic_reset_res = r;
5262 		}
5263 	}
5264 
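	/*
	 * Snapshot the VRAM lost counter before the actual ASIC reset; it is
	 * compared with the current value below to skip the guilty-job
	 * recheck when VRAM contents were lost during the reset.
	 */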
5265 	tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
5266 	/* Actual ASIC resets if needed. */
5267 	/* TODO Implement XGMI hive reset logic for SRIOV */
5268 	if (amdgpu_sriov_vf(adev)) {
5269 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
5270 		if (r)
5271 			adev->asic_reset_res = r;
5272 	} else {
5273 		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
5274 		if (r && r == -EAGAIN)
5275 			goto retry;
5276 	}
5277 
5278 skip_hw_reset:
5279 
5280 	/* Post ASIC reset for all devs. */
5281 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5282 
5283 		/*
5284 		 * Sometimes a later bad compute job can block a good gfx job
5285 		 * because the gfx and compute rings share internal GC hardware.
5286 		 * We add an extra guilty-job recheck step to find the real
5287 		 * culprit: each job is resubmitted synchronously and we wait for
5288 		 * it to signal; if the wait times out, that job is the guilty one.
5289 		 */
5290 		if (amdgpu_gpu_recovery == 2 &&
5291 			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5292 			amdgpu_device_recheck_guilty_jobs(
5293 				tmp_adev, device_list_handle, &reset_context);
5294 
5295 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5296 			struct amdgpu_ring *ring = tmp_adev->rings[i];
5297 
5298 			if (!ring || !ring->sched.thread)
5299 				continue;
5300 
5301 			/* No point in resubmitting jobs if we didn't do a HW reset */
5302 			if (!tmp_adev->asic_reset_res && !job_signaled)
5303 				drm_sched_resubmit_jobs(&ring->sched);
5304 
5305 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5306 		}
5307 
5308 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5309 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5310 		}
5311 
5312 		tmp_adev->asic_reset_res = 0;
5313 
5314 		if (r) {
5315 			/* bad news, how to tell it to userspace ? */
5316 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5317 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5318 		} else {
5319 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5320 			if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5321 				DRM_WARN("smart shift update failed\n");
5322 		}
5323 	}
5324 
5325 skip_sched_resume:
5326 	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5327 		/* unlock kfd: SRIOV would do it separately */
5328 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5329 			amdgpu_amdkfd_post_reset(tmp_adev);
5330 
5331 		/* kfd_post_reset will do nothing if the kfd device is not
5332 		 * initialized, so bring up kfd here if it wasn't initialized before
5333 		 */
5334 		if (!adev->kfd.init_complete)
5335 			amdgpu_amdkfd_device_init(adev);
5336 
5337 		if (audio_suspended)
5338 			amdgpu_device_resume_display_audio(tmp_adev);
5339 		amdgpu_device_unlock_adev(tmp_adev);
5340 	}
5341 
5342 skip_recovery:
5343 	if (hive) {
5344 		atomic_set(&hive->in_reset, 0);
5345 		mutex_unlock(&hive->hive_lock);
5346 		amdgpu_put_xgmi_hive(hive);
5347 	}
5348 
5349 	if (r && r != -EAGAIN)
5350 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5351 	return r;
5352 }
5353 
5354 /**
5355  * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
5356  *
5357  * @adev: amdgpu_device pointer
5358  *
5359  * Fetches and stores in the driver the PCIe capabilities (gen speed
5360  * and lanes) of the slot the device is in. Handles APUs and
5361  * virtualized environments where PCIe config space may not be available.
5362  */
5363 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5364 {
5365 	struct pci_dev *pdev;
5366 	enum pci_bus_speed speed_cap, platform_speed_cap;
5367 	enum pcie_link_width platform_link_width;
5368 
5369 	if (amdgpu_pcie_gen_cap)
5370 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5371 
5372 	if (amdgpu_pcie_lane_cap)
5373 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5374 
5375 	/* covers APUs as well */
5376 	if (pci_is_root_bus(adev->pdev->bus)) {
5377 		if (adev->pm.pcie_gen_mask == 0)
5378 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5379 		if (adev->pm.pcie_mlw_mask == 0)
5380 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5381 		return;
5382 	}
5383 
5384 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5385 		return;
5386 
5387 	pcie_bandwidth_available(adev->pdev, NULL,
5388 				 &platform_speed_cap, &platform_link_width);
5389 
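	/*
	 * Build cumulative masks of the PCIe generations supported by the
	 * ASIC and by the platform from the reported link speed caps,
	 * falling back to conservative defaults when a cap is unknown.
	 */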
5390 	if (adev->pm.pcie_gen_mask == 0) {
5391 		/* asic caps */
5392 		pdev = adev->pdev;
5393 		speed_cap = pcie_get_speed_cap(pdev);
5394 		if (speed_cap == PCI_SPEED_UNKNOWN) {
5395 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5396 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5397 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5398 		} else {
5399 			if (speed_cap == PCIE_SPEED_32_0GT)
5400 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5401 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5402 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5403 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5404 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5405 			else if (speed_cap == PCIE_SPEED_16_0GT)
5406 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5407 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5408 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5409 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5410 			else if (speed_cap == PCIE_SPEED_8_0GT)
5411 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5412 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5413 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5414 			else if (speed_cap == PCIE_SPEED_5_0GT)
5415 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5416 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5417 			else
5418 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5419 		}
5420 		/* platform caps */
5421 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5422 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5423 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5424 		} else {
5425 			if (platform_speed_cap == PCIE_SPEED_32_0GT)
5426 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5427 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5428 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5429 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5430 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5431 			else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5432 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5433 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5434 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5435 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5436 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5437 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5438 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5439 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5440 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5441 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5442 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5443 			else
5444 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5445 
5446 		}
5447 	}
5448 	if (adev->pm.pcie_mlw_mask == 0) {
5449 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5450 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5451 		} else {
5452 			switch (platform_link_width) {
5453 			case PCIE_LNK_X32:
5454 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5455 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5456 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5457 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5458 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5459 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5460 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5461 				break;
5462 			case PCIE_LNK_X16:
5463 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5464 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5465 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5466 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5467 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5468 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5469 				break;
5470 			case PCIE_LNK_X12:
5471 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5472 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5473 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5474 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5475 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5476 				break;
5477 			case PCIE_LNK_X8:
5478 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5479 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5480 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5481 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5482 				break;
5483 			case PCIE_LNK_X4:
5484 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5485 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5486 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5487 				break;
5488 			case PCIE_LNK_X2:
5489 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5490 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5491 				break;
5492 			case PCIE_LNK_X1:
5493 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5494 				break;
5495 			default:
5496 				break;
5497 			}
5498 		}
5499 	}
5500 }
5501 
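/**
 * amdgpu_device_baco_enter - put the device into the BACO state
 *
 * @dev: drm_device pointer
 *
 * Disables the RAS doorbell interrupt when RAS is enabled and asks the
 * DPM code to enter BACO (bus active, chip off). Returns 0 on success,
 * -ENOTSUPP if the device does not support BACO, or a negative error
 * code on failure.
 */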
5502 int amdgpu_device_baco_enter(struct drm_device *dev)
5503 {
5504 	struct amdgpu_device *adev = drm_to_adev(dev);
5505 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5506 
5507 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5508 		return -ENOTSUPP;
5509 
5510 	if (ras && adev->ras_enabled &&
5511 	    adev->nbio.funcs->enable_doorbell_interrupt)
5512 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5513 
5514 	return amdgpu_dpm_baco_enter(adev);
5515 }
5516 
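/**
 * amdgpu_device_baco_exit - bring the device out of the BACO state
 *
 * @dev: drm_device pointer
 *
 * Asks the DPM code to exit BACO, then re-enables the RAS doorbell
 * interrupt when RAS is enabled and clears any pending doorbell
 * interrupt in passthrough mode. Returns 0 on success, -ENOTSUPP if
 * the device does not support BACO, or a negative error code on failure.
 */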
5517 int amdgpu_device_baco_exit(struct drm_device *dev)
5518 {
5519 	struct amdgpu_device *adev = drm_to_adev(dev);
5520 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5521 	int ret = 0;
5522 
5523 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5524 		return -ENOTSUPP;
5525 
5526 	ret = amdgpu_dpm_baco_exit(adev);
5527 	if (ret)
5528 		return ret;
5529 
5530 	if (ras && adev->ras_enabled &&
5531 	    adev->nbio.funcs->enable_doorbell_interrupt)
5532 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5533 
5534 	if (amdgpu_passthrough(adev) &&
5535 	    adev->nbio.funcs->clear_doorbell_interrupt)
5536 		adev->nbio.funcs->clear_doorbell_interrupt(adev);
5537 
5538 	return 0;
5539 }
5540 
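/*
 * Cancel any scheduler timeout (TDR) work still pending on the rings of
 * @adev and wait for it to finish.
 */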
5541 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5542 {
5543 	int i;
5544 
5545 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5546 		struct amdgpu_ring *ring = adev->rings[i];
5547 
5548 		if (!ring || !ring->sched.thread)
5549 			continue;
5550 
5551 		cancel_delayed_work_sync(&ring->sched.work_tdr);
5552 	}
5553 }
5554 
5555 /**
5556  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5557  * @pdev: PCI device struct
5558  * @state: PCI channel state
5559  *
5560  * Description: Called when a PCI error is detected.
5561  *
5562  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5563  */
5564 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5565 {
5566 	struct drm_device *dev = pci_get_drvdata(pdev);
5567 	struct amdgpu_device *adev = drm_to_adev(dev);
5568 	int i;
5569 
5570 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5571 
5572 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
5573 		DRM_WARN("No support for XGMI hive yet...");
5574 		return PCI_ERS_RESULT_DISCONNECT;
5575 	}
5576 
5577 	adev->pci_channel_state = state;
5578 
5579 	switch (state) {
5580 	case pci_channel_io_normal:
5581 		return PCI_ERS_RESULT_CAN_RECOVER;
5582 	/* Fatal error, prepare for slot reset */
5583 	case pci_channel_io_frozen:
5584 		/*
5585 		 * Cancel and wait for all TDRs in progress if we fail to
5586 		 * set adev->in_gpu_reset in amdgpu_device_lock_adev.
5587 		 *
5588 		 * Locking adev->reset_sem will prevent any external access
5589 		 * to the GPU during PCI error recovery.
5590 		 */
5591 		while (!amdgpu_device_lock_adev(adev, NULL))
5592 			amdgpu_cancel_all_tdr(adev);
5593 
5594 		/*
5595 		 * Block any work scheduling as we do for regular GPU reset
5596 		 * for the duration of the recovery
5597 		 */
5598 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5599 			struct amdgpu_ring *ring = adev->rings[i];
5600 
5601 			if (!ring || !ring->sched.thread)
5602 				continue;
5603 
5604 			drm_sched_stop(&ring->sched, NULL);
5605 		}
5606 		atomic_inc(&adev->gpu_reset_counter);
5607 		return PCI_ERS_RESULT_NEED_RESET;
5608 	case pci_channel_io_perm_failure:
5609 		/* Permanent error, prepare for device removal */
5610 		return PCI_ERS_RESULT_DISCONNECT;
5611 	}
5612 
5613 	return PCI_ERS_RESULT_NEED_RESET;
5614 }
5615 
5616 /**
5617  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5618  * @pdev: pointer to PCI device
5619  */
5620 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5621 {
5622 
5623 	DRM_INFO("PCI error: mmio enabled callback!!\n");
5624 
5625 	/* TODO - dump whatever for debugging purposes */
5626 
5627 	/* This is called only if amdgpu_pci_error_detected returns
5628 	 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
5629 	 * work, so there is no need to reset the slot.
5630 	 */
5631 
5632 	return PCI_ERS_RESULT_RECOVERED;
5633 }
5634 
5635 /**
5636  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5637  * @pdev: PCI device struct
5638  *
5639  * Description: This routine is called by the pci error recovery
5640  * code after the PCI slot has been reset, just before we
5641  * should resume normal operations.
5642  */
5643 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5644 {
5645 	struct drm_device *dev = pci_get_drvdata(pdev);
5646 	struct amdgpu_device *adev = drm_to_adev(dev);
5647 	int r, i;
5648 	struct amdgpu_reset_context reset_context;
5649 	u32 memsize;
5650 	struct list_head device_list;
5651 
5652 	DRM_INFO("PCI error: slot reset callback!!\n");
5653 
5654 	memset(&reset_context, 0, sizeof(reset_context));
5655 
5656 	INIT_LIST_HEAD(&device_list);
5657 	list_add_tail(&adev->reset_list, &device_list);
5658 
5659 	/* wait for asic to come out of reset */
5660 	msleep(500);
5661 
5662 	/* Restore PCI config space */
5663 	amdgpu_device_load_pci_state(pdev);
5664 
5665 	/* confirm the ASIC came out of reset */
5666 	for (i = 0; i < adev->usec_timeout; i++) {
5667 		memsize = amdgpu_asic_get_config_memsize(adev);
5668 
5669 		if (memsize != 0xffffffff)
5670 			break;
5671 		udelay(1);
5672 	}
5673 	if (memsize == 0xffffffff) {
5674 		r = -ETIME;
5675 		goto out;
5676 	}
5677 
5678 	reset_context.method = AMD_RESET_METHOD_NONE;
5679 	reset_context.reset_req_dev = adev;
5680 	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5681 	set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5682 
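	/*
	 * Suppress hardware access while the pre-reset handling runs;
	 * access is re-enabled before the actual ASIC reset below.
	 */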
5683 	adev->no_hw_access = true;
5684 	r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5685 	adev->no_hw_access = false;
5686 	if (r)
5687 		goto out;
5688 
5689 	r = amdgpu_do_asic_reset(&device_list, &reset_context);
5690 
5691 out:
5692 	if (!r) {
5693 		if (amdgpu_device_cache_pci_state(adev->pdev))
5694 			pci_restore_state(adev->pdev);
5695 
5696 		DRM_INFO("PCIe error recovery succeeded\n");
5697 	} else {
5698 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5699 		amdgpu_device_unlock_adev(adev);
5700 	}
5701 
5702 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5703 }
5704 
5705 /**
5706  * amdgpu_pci_resume() - resume normal ops after PCI reset
5707  * @pdev: pointer to PCI device
5708  *
5709  * Called when the error recovery driver tells us that it's
5710  * OK to resume normal operation.
5711  */
5712 void amdgpu_pci_resume(struct pci_dev *pdev)
5713 {
5714 	struct drm_device *dev = pci_get_drvdata(pdev);
5715 	struct amdgpu_device *adev = drm_to_adev(dev);
5716 	int i;
5717 
5718 
5719 	DRM_INFO("PCI error: resume callback!!\n");
5720 
5721 	/* Only continue execution for the case of pci_channel_io_frozen */
5722 	if (adev->pci_channel_state != pci_channel_io_frozen)
5723 		return;
5724 
5725 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5726 		struct amdgpu_ring *ring = adev->rings[i];
5727 
5728 		if (!ring || !ring->sched.thread)
5729 			continue;
5730 
5731 
5732 		drm_sched_resubmit_jobs(&ring->sched);
5733 		drm_sched_start(&ring->sched, true);
5734 	}
5735 
5736 	amdgpu_device_unlock_adev(adev);
5737 }
5738 
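/**
 * amdgpu_device_cache_pci_state - cache the PCI config space of the device
 *
 * @pdev: PCI device struct
 *
 * Saves the current PCI config space and keeps a copy in adev->pci_state
 * so it can be restored after a reset. Returns true on success, false
 * otherwise.
 */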
5739 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5740 {
5741 	struct drm_device *dev = pci_get_drvdata(pdev);
5742 	struct amdgpu_device *adev = drm_to_adev(dev);
5743 	int r;
5744 
5745 	r = pci_save_state(pdev);
5746 	if (!r) {
5747 		kfree(adev->pci_state);
5748 
5749 		adev->pci_state = pci_store_saved_state(pdev);
5750 
5751 		if (!adev->pci_state) {
5752 			DRM_ERROR("Failed to store PCI saved state");
5753 			return false;
5754 		}
5755 	} else {
5756 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5757 		return false;
5758 	}
5759 
5760 	return true;
5761 }
5762 
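/**
 * amdgpu_device_load_pci_state - restore the cached PCI config space
 *
 * @pdev: PCI device struct
 *
 * Loads the PCI state previously saved by amdgpu_device_cache_pci_state()
 * and restores it to the device. Returns true on success, false otherwise.
 */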
5763 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5764 {
5765 	struct drm_device *dev = pci_get_drvdata(pdev);
5766 	struct amdgpu_device *adev = drm_to_adev(dev);
5767 	int r;
5768 
5769 	if (!adev->pci_state)
5770 		return false;
5771 
5772 	r = pci_load_saved_state(pdev, adev->pci_state);
5773 
5774 	if (!r) {
5775 		pci_restore_state(pdev);
5776 	} else {
5777 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5778 		return false;
5779 	}
5780 
5781 	return true;
5782 }
5783 
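/**
 * amdgpu_device_flush_hdp - flush the HDP (host data path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring; if it can emit an HDP flush packet, that is used
 *
 * The flush is skipped on x86-64 APUs that are not in passthrough mode
 * and on GPUs connected to the CPU over XGMI. Otherwise the flush is
 * emitted on @ring when possible, or done through the ASIC callback.
 */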
5784 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5785 		struct amdgpu_ring *ring)
5786 {
5787 #ifdef CONFIG_X86_64
5788 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5789 		return;
5790 #endif
5791 	if (adev->gmc.xgmi.connected_to_cpu)
5792 		return;
5793 
5794 	if (ring && ring->funcs->emit_hdp_flush)
5795 		amdgpu_ring_emit_hdp_flush(ring);
5796 	else
5797 		amdgpu_asic_flush_hdp(adev, ring);
5798 }
5799 
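/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (host data path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: ring passed through to the ASIC invalidate callback
 *
 * Like the flush above, this is skipped on x86-64 APUs that are not in
 * passthrough mode and on GPUs connected to the CPU over XGMI.
 */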
5800 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5801 		struct amdgpu_ring *ring)
5802 {
5803 #ifdef CONFIG_X86_64
5804 	if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5805 		return;
5806 #endif
5807 	if (adev->gmc.xgmi.connected_to_cpu)
5808 		return;
5809 
5810 	amdgpu_asic_invalidate_hdp(adev, ring);
5811 }
5812