1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72 
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 
84 #define AMDGPU_RESUME_MS		2000
85 
86 const char *amdgpu_asic_name[] = {
87 	"TAHITI",
88 	"PITCAIRN",
89 	"VERDE",
90 	"OLAND",
91 	"HAINAN",
92 	"BONAIRE",
93 	"KAVERI",
94 	"KABINI",
95 	"HAWAII",
96 	"MULLINS",
97 	"TOPAZ",
98 	"TONGA",
99 	"FIJI",
100 	"CARRIZO",
101 	"STONEY",
102 	"POLARIS10",
103 	"POLARIS11",
104 	"POLARIS12",
105 	"VEGAM",
106 	"VEGA10",
107 	"VEGA12",
108 	"VEGA20",
109 	"RAVEN",
110 	"ARCTURUS",
111 	"RENOIR",
112 	"NAVI10",
113 	"NAVI14",
114 	"NAVI12",
115 	"SIENNA_CICHLID",
116 	"NAVY_FLOUNDER",
117 	"LAST",
118 };
119 
120 /**
121  * DOC: pcie_replay_count
122  *
123  * The amdgpu driver provides a sysfs API for reporting the total number
124  * of PCIe replays (NAKs).
125  * The file pcie_replay_count is used for this and returns the total
126  * number of replays as a sum of the NAKs generated and NAKs received
127  */
128 
129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130 		struct device_attribute *attr, char *buf)
131 {
132 	struct drm_device *ddev = dev_get_drvdata(dev);
133 	struct amdgpu_device *adev = drm_to_adev(ddev);
134 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135 
136 	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138 
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140 		amdgpu_device_get_pcie_replay_count, NULL);
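/*
 * Illustrative sketch (not part of the driver): reading the attribute above
 * from userspace.  The card index in the path is an assumption and depends
 * on how the device enumerates on a given system.
 *
 *	#include <stdio.h>
 *
 *	unsigned long long count = 0;
 *	FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *	if (f) {
 *		if (fscanf(f, "%llu", &count) != 1)
 *			count = 0;
 *		fclose(f);
 *	}
 */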
141 
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143 
144 /**
145  * DOC: product_name
146  *
147  * The amdgpu driver provides a sysfs API for reporting the product name
148  * for the device.
149  * The file product_name is used for this and returns the product name
150  * as returned from the FRU.
151  * NOTE: This is only available for certain server cards
152  */
153 
154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155 		struct device_attribute *attr, char *buf)
156 {
157 	struct drm_device *ddev = dev_get_drvdata(dev);
158 	struct amdgpu_device *adev = drm_to_adev(ddev);
159 
160 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162 
163 static DEVICE_ATTR(product_name, S_IRUGO,
164 		amdgpu_device_get_product_name, NULL);
165 
166 /**
167  * DOC: product_number
168  *
169  * The amdgpu driver provides a sysfs API for reporting the part number
170  * for the device.
171  * The file product_number is used for this and returns the part number
172  * as returned from the FRU.
173  * NOTE: This is only available for certain server cards
174  */
175 
176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177 		struct device_attribute *attr, char *buf)
178 {
179 	struct drm_device *ddev = dev_get_drvdata(dev);
180 	struct amdgpu_device *adev = drm_to_adev(ddev);
181 
182 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184 
185 static DEVICE_ATTR(product_number, S_IRUGO,
186 		amdgpu_device_get_product_number, NULL);
187 
188 /**
189  * DOC: serial_number
190  *
191  * The amdgpu driver provides a sysfs API for reporting the serial number
192  * for the device.
193  * The file serial_number is used for this and returns the serial number
194  * as returned from the FRU.
195  * NOTE: This is only available for certain server cards
196  */
197 
198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199 		struct device_attribute *attr, char *buf)
200 {
201 	struct drm_device *ddev = dev_get_drvdata(dev);
202 	struct amdgpu_device *adev = drm_to_adev(ddev);
203 
204 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206 
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208 		amdgpu_device_get_serial_number, NULL);
209 
210 /**
211  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212  *
213  * @dev: drm_device pointer
214  *
215  * Returns true if the device is a dGPU with HG/PX power control,
216  * otherwise return false.
217  */
218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220 	struct amdgpu_device *adev = drm_to_adev(dev);
221 
222 	if (adev->flags & AMD_IS_PX)
223 		return true;
224 	return false;
225 }
226 
227 /**
228  * amdgpu_device_supports_baco - Does the device support BACO
229  *
230  * @dev: drm_device pointer
231  *
232  * Returns true if the device supports BACO,
233  * otherwise return false.
234  */
235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237 	struct amdgpu_device *adev = drm_to_adev(dev);
238 
239 	return amdgpu_asic_supports_baco(adev);
240 }
241 
242 /*
243  * VRAM access helper functions
244  */
245 
246 /**
247  * amdgpu_device_vram_access - read/write a buffer in vram
248  *
249  * @adev: amdgpu_device pointer
250  * @pos: offset of the buffer in vram
251  * @buf: virtual address of the buffer in system memory
252  * @size: read/write size; the buffer pointed to by @buf must be at least @size bytes
253  * @write: true - write to vram, otherwise - read from vram
254  */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256 			       uint32_t *buf, size_t size, bool write)
257 {
258 	unsigned long flags;
259 	uint32_t hi = ~0;
260 	uint64_t last;
261 
262 
263 #ifdef CONFIG_64BIT
264 	last = min(pos + size, adev->gmc.visible_vram_size);
265 	if (last > pos) {
266 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267 		size_t count = last - pos;
268 
269 		if (write) {
270 			memcpy_toio(addr, buf, count);
271 			mb();
272 			amdgpu_asic_flush_hdp(adev, NULL);
273 		} else {
274 			amdgpu_asic_invalidate_hdp(adev, NULL);
275 			mb();
276 			memcpy_fromio(buf, addr, count);
277 		}
278 
279 		if (count == size)
280 			return;
281 
282 		pos += count;
283 		buf += count / 4;
284 		size -= count;
285 	}
286 #endif
287 
288 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289 	for (last = pos + size; pos < last; pos += 4) {
290 		uint32_t tmp = pos >> 31;
291 
292 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293 		if (tmp != hi) {
294 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295 			hi = tmp;
296 		}
297 		if (write)
298 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
299 		else
300 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
301 	}
302 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
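/*
 * Illustrative sketch (not part of the driver): a minimal round trip through
 * the helper above.  @buf must be dword based and hold at least @size bytes;
 * @pos is a byte offset into VRAM.
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);	// read 16 bytes at offset 0
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), true);	// write them back
 */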
304 
305 /*
306  * register access helper functions.
307  */
308 /**
309  * amdgpu_device_rreg - read a memory mapped IO or indirect register
310  *
311  * @adev: amdgpu_device pointer
312  * @reg: dword aligned register offset
313  * @acc_flags: access flags which require special behavior
314  *
315  * Returns the 32 bit value from the offset specified.
316  */
317 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
318 			    uint32_t reg, uint32_t acc_flags)
319 {
320 	uint32_t ret;
321 
322 	if (adev->in_pci_err_recovery)
323 		return 0;
324 
325 	if ((reg * 4) < adev->rmmio_size) {
326 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
327 		    amdgpu_sriov_runtime(adev) &&
328 		    down_read_trylock(&adev->reset_sem)) {
329 			ret = amdgpu_kiq_rreg(adev, reg);
330 			up_read(&adev->reset_sem);
331 		} else {
332 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
333 		}
334 	} else {
335 		ret = adev->pcie_rreg(adev, reg * 4);
336 	}
337 
338 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
339 
340 	return ret;
341 }
342 
343 /*
344  * MMIO register read helper functions (byte access)
345  * @offset: byte offset from MMIO start
346  *
347  */
348 
349 /**
350  * amdgpu_mm_rreg8 - read a memory mapped IO register
351  *
352  * @adev: amdgpu_device pointer
353  * @offset: byte aligned register offset
354  *
355  * Returns the 8 bit value from the offset specified.
356  */
357 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
358 {
359 	if (adev->in_pci_err_recovery)
360 		return 0;
361 
362 	if (offset < adev->rmmio_size)
363 		return (readb(adev->rmmio + offset));
364 	BUG();
365 }
366 
367 /*
368  * MMIO register write helper functions (byte access)
369  * @offset: byte offset from MMIO start
370  * @value: the value to be written to the register
371  *
372  */
373 /**
374  * amdgpu_mm_wreg8 - write a memory mapped IO register
375  *
376  * @adev: amdgpu_device pointer
377  * @offset: byte aligned register offset
378  * @value: 8 bit value to write
379  *
380  * Writes the value specified to the offset specified.
381  */
382 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
383 {
384 	if (adev->in_pci_err_recovery)
385 		return;
386 
387 	if (offset < adev->rmmio_size)
388 		writeb(value, adev->rmmio + offset);
389 	else
390 		BUG();
391 }
392 
393 /**
394  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
395  *
396  * @adev: amdgpu_device pointer
397  * @reg: dword aligned register offset
398  * @v: 32 bit value to write to the register
399  * @acc_flags: access flags which require special behavior
400  *
401  * Writes the value specified to the offset specified.
402  */
403 void amdgpu_device_wreg(struct amdgpu_device *adev,
404 			uint32_t reg, uint32_t v,
405 			uint32_t acc_flags)
406 {
407 	if (adev->in_pci_err_recovery)
408 		return;
409 
410 	if ((reg * 4) < adev->rmmio_size) {
411 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
412 		    amdgpu_sriov_runtime(adev) &&
413 		    down_read_trylock(&adev->reset_sem)) {
414 			amdgpu_kiq_wreg(adev, reg, v);
415 			up_read(&adev->reset_sem);
416 		} else {
417 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
418 		}
419 	} else {
420 		adev->pcie_wreg(adev, reg * 4, v);
421 	}
422 
423 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
424 }
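/*
 * Illustrative note (assumption about typical usage): most driver code does
 * not call amdgpu_device_rreg()/amdgpu_device_wreg() directly but goes
 * through RREG32()/WREG32() style macros that supply the dword offset and
 * access flags, e.g.:
 *
 *	uint32_t val;
 *
 *	val = RREG32(mmSOME_REG);	// read-modify-write sketch
 *	val |= SOME_FIELD_MASK;		// hypothetical field mask
 *	WREG32(mmSOME_REG, val);
 *
 * mmSOME_REG and SOME_FIELD_MASK are placeholders, not real register names.
 */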
425 
426 /*
427  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if in range
428  *
429  * This function is invoked only for debugfs register access.
430  */
431 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
432 			     uint32_t reg, uint32_t v)
433 {
434 	if (adev->in_pci_err_recovery)
435 		return;
436 
437 	if (amdgpu_sriov_fullaccess(adev) &&
438 	    adev->gfx.rlc.funcs &&
439 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
440 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
441 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
442 	} else {
443 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
444 	}
445 }
446 
447 /**
448  * amdgpu_io_rreg - read an IO register
449  *
450  * @adev: amdgpu_device pointer
451  * @reg: dword aligned register offset
452  *
453  * Returns the 32 bit value from the offset specified.
454  */
455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
456 {
457 	if (adev->in_pci_err_recovery)
458 		return 0;
459 
460 	if ((reg * 4) < adev->rio_mem_size)
461 		return ioread32(adev->rio_mem + (reg * 4));
462 	else {
463 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
464 		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
465 	}
466 }
467 
468 /**
469  * amdgpu_io_wreg - write to an IO register
470  *
471  * @adev: amdgpu_device pointer
472  * @reg: dword aligned register offset
473  * @v: 32 bit value to write to the register
474  *
475  * Writes the value specified to the offset specified.
476  */
477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
478 {
479 	if (adev->in_pci_err_recovery)
480 		return;
481 
482 	if ((reg * 4) < adev->rio_mem_size)
483 		iowrite32(v, adev->rio_mem + (reg * 4));
484 	else {
485 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
486 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
487 	}
488 }
489 
490 /**
491  * amdgpu_mm_rdoorbell - read a doorbell dword
492  *
493  * @adev: amdgpu_device pointer
494  * @index: doorbell index
495  *
496  * Returns the value in the doorbell aperture at the
497  * requested doorbell index (CIK).
498  */
499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
500 {
501 	if (adev->in_pci_err_recovery)
502 		return 0;
503 
504 	if (index < adev->doorbell.num_doorbells) {
505 		return readl(adev->doorbell.ptr + index);
506 	} else {
507 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
508 		return 0;
509 	}
510 }
511 
512 /**
513  * amdgpu_mm_wdoorbell - write a doorbell dword
514  *
515  * @adev: amdgpu_device pointer
516  * @index: doorbell index
517  * @v: value to write
518  *
519  * Writes @v to the doorbell aperture at the
520  * requested doorbell index (CIK).
521  */
522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
523 {
524 	if (adev->in_pci_err_recovery)
525 		return;
526 
527 	if (index < adev->doorbell.num_doorbells) {
528 		writel(v, adev->doorbell.ptr + index);
529 	} else {
530 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
531 	}
532 }
533 
534 /**
535  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
536  *
537  * @adev: amdgpu_device pointer
538  * @index: doorbell index
539  *
540  * Returns the value in the doorbell aperture at the
541  * requested doorbell index (VEGA10+).
542  */
543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
544 {
545 	if (adev->in_pci_err_recovery)
546 		return 0;
547 
548 	if (index < adev->doorbell.num_doorbells) {
549 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
550 	} else {
551 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
552 		return 0;
553 	}
554 }
555 
556 /**
557  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
558  *
559  * @adev: amdgpu_device pointer
560  * @index: doorbell index
561  * @v: value to write
562  *
563  * Writes @v to the doorbell aperture at the
564  * requested doorbell index (VEGA10+).
565  */
566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
567 {
568 	if (adev->in_pci_err_recovery)
569 		return;
570 
571 	if (index < adev->doorbell.num_doorbells) {
572 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
573 	} else {
574 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
575 	}
576 }
577 
578 /**
579  * amdgpu_device_indirect_rreg - read an indirect register
580  *
581  * @adev: amdgpu_device pointer
582  * @pcie_index: mmio register offset
583  * @pcie_data: mmio register offset
584  *
585  * Returns the value of indirect register @reg_addr
586  */
587 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
588 				u32 pcie_index, u32 pcie_data,
589 				u32 reg_addr)
590 {
591 	unsigned long flags;
592 	u32 r;
593 	void __iomem *pcie_index_offset;
594 	void __iomem *pcie_data_offset;
595 
596 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
597 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
598 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
599 
600 	writel(reg_addr, pcie_index_offset);
601 	readl(pcie_index_offset);
602 	r = readl(pcie_data_offset);
603 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
604 
605 	return r;
606 }
607 
608 /**
609  * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
610  *
611  * @adev: amdgpu_device pointer
612  * @pcie_index: mmio register offset
613  * @pcie_data: mmio register offset
614  *
615  * Returns the value of indirect register @reg_addr
616  */
617 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
618 				  u32 pcie_index, u32 pcie_data,
619 				  u32 reg_addr)
620 {
621 	unsigned long flags;
622 	u64 r;
623 	void __iomem *pcie_index_offset;
624 	void __iomem *pcie_data_offset;
625 
626 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
627 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
628 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
629 
630 	/* read low 32 bits */
631 	writel(reg_addr, pcie_index_offset);
632 	readl(pcie_index_offset);
633 	r = readl(pcie_data_offset);
634 	/* read high 32 bits */
635 	writel(reg_addr + 4, pcie_index_offset);
636 	readl(pcie_index_offset);
637 	r |= ((u64)readl(pcie_data_offset) << 32);
638 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
639 
640 	return r;
641 }
642 
643 /**
644  * amdgpu_device_indirect_wreg - write to an indirect register
645  *
646  * @adev: amdgpu_device pointer
647  * @pcie_index: mmio register offset
648  * @pcie_data: mmio register offset
649  * @reg_addr: indirect register offset
650  * @reg_data: indirect register data
651  *
652  */
653 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
654 				 u32 pcie_index, u32 pcie_data,
655 				 u32 reg_addr, u32 reg_data)
656 {
657 	unsigned long flags;
658 	void __iomem *pcie_index_offset;
659 	void __iomem *pcie_data_offset;
660 
661 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
662 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
663 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
664 
665 	writel(reg_addr, pcie_index_offset);
666 	readl(pcie_index_offset);
667 	writel(reg_data, pcie_data_offset);
668 	readl(pcie_data_offset);
669 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
670 }
671 
672 /**
673  * amdgpu_device_indirect_wreg64 - write to a 64 bit indirect register
674  *
675  * @adev: amdgpu_device pointer
676  * @pcie_index: mmio register offset
677  * @pcie_data: mmio register offset
678  * @reg_addr: indirect register offset
679  * @reg_data: indirect register data
680  *
681  */
682 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
683 				   u32 pcie_index, u32 pcie_data,
684 				   u32 reg_addr, u64 reg_data)
685 {
686 	unsigned long flags;
687 	void __iomem *pcie_index_offset;
688 	void __iomem *pcie_data_offset;
689 
690 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693 
694 	/* write low 32 bits */
695 	writel(reg_addr, pcie_index_offset);
696 	readl(pcie_index_offset);
697 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
698 	readl(pcie_data_offset);
699 	/* write high 32 bits */
700 	writel(reg_addr + 4, pcie_index_offset);
701 	readl(pcie_index_offset);
702 	writel((u32)(reg_data >> 32), pcie_data_offset);
703 	readl(pcie_data_offset);
704 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705 }
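/*
 * Illustrative sketch (assumption about usage): ASIC code typically wraps the
 * indirect helpers above in its pcie_rreg/pcie_wreg callbacks, supplying the
 * chip specific index/data register pair, roughly:
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		u32 index = adev->nbio.funcs->get_pcie_index_offset(adev);
 *		u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *		return amdgpu_device_indirect_rreg(adev, index, data, reg);
 *	}
 *
 * example_pcie_rreg is a hypothetical name; the nbio callbacks are only an
 * example of where the index/data offsets can come from.
 */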
706 
707 /**
708  * amdgpu_invalid_rreg - dummy reg read function
709  *
710  * @adev: amdgpu_device pointer
711  * @reg: offset of register
712  *
713  * Dummy register read function.  Used for register blocks
714  * that certain asics don't have (all asics).
715  * Returns the value in the register.
716  */
717 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
718 {
719 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
720 	BUG();
721 	return 0;
722 }
723 
724 /**
725  * amdgpu_invalid_wreg - dummy reg write function
726  *
727  * @adev: amdgpu_device pointer
728  * @reg: offset of register
729  * @v: value to write to the register
730  *
731  * Dummy register write function.  Used for register blocks
732  * that certain asics don't have (all asics).
733  */
734 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
735 {
736 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
737 		  reg, v);
738 	BUG();
739 }
740 
741 /**
742  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
743  *
744  * @adev: amdgpu_device pointer
745  * @reg: offset of register
746  *
747  * Dummy register read function.  Used for register blocks
748  * that certain asics don't have (all asics).
749  * Returns the value in the register.
750  */
751 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
752 {
753 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
754 	BUG();
755 	return 0;
756 }
757 
758 /**
759  * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
760  *
761  * @adev: amdgpu_device pointer
762  * @reg: offset of register
763  * @v: value to write to the register
764  *
765  * Dummy register write function.  Used for register blocks
766  * that certain asics don't have (all asics).
767  */
768 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
769 {
770 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
771 		  reg, v);
772 	BUG();
773 }
774 
775 /**
776  * amdgpu_block_invalid_rreg - dummy reg read function
777  *
778  * @adev: amdgpu_device pointer
779  * @block: offset of instance
780  * @reg: offset of register
781  *
782  * Dummy register read function.  Used for register blocks
783  * that certain asics don't have (all asics).
784  * Returns the value in the register.
785  */
786 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
787 					  uint32_t block, uint32_t reg)
788 {
789 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
790 		  reg, block);
791 	BUG();
792 	return 0;
793 }
794 
795 /**
796  * amdgpu_block_invalid_wreg - dummy reg write function
797  *
798  * @adev: amdgpu_device pointer
799  * @block: offset of instance
800  * @reg: offset of register
801  * @v: value to write to the register
802  *
803  * Dummy register write function.  Used for register blocks
804  * that certain asics don't have (all asics).
805  */
806 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
807 				      uint32_t block,
808 				      uint32_t reg, uint32_t v)
809 {
810 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
811 		  reg, block, v);
812 	BUG();
813 }
814 
815 /**
816  * amdgpu_device_asic_init - Wrapper for atom asic_init
817  *
818  * @adev: amdgpu_device pointer
819  *
820  * Does any asic specific work and then calls atom asic init.
821  */
822 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
823 {
824 	amdgpu_asic_pre_asic_init(adev);
825 
826 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
827 }
828 
829 /**
830  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
831  *
832  * @adev: amdgpu_device pointer
833  *
834  * Allocates a scratch page of VRAM for use by various things in the
835  * driver.
836  */
837 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
838 {
839 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
840 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
841 				       &adev->vram_scratch.robj,
842 				       &adev->vram_scratch.gpu_addr,
843 				       (void **)&adev->vram_scratch.ptr);
844 }
845 
846 /**
847  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
848  *
849  * @adev: amdgpu_device pointer
850  *
851  * Frees the VRAM scratch page.
852  */
853 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
854 {
855 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
856 }
857 
858 /**
859  * amdgpu_device_program_register_sequence - program an array of registers.
860  *
861  * @adev: amdgpu_device pointer
862  * @registers: pointer to the register array
863  * @array_size: size of the register array
864  *
865  * Programs an array of registers with AND and OR masks.
866  * This is a helper for setting golden registers.
867  */
868 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
869 					     const u32 *registers,
870 					     const u32 array_size)
871 {
872 	u32 tmp, reg, and_mask, or_mask;
873 	int i;
874 
875 	if (array_size % 3)
876 		return;
877 
878 	for (i = 0; i < array_size; i +=3) {
879 		reg = registers[i + 0];
880 		and_mask = registers[i + 1];
881 		or_mask = registers[i + 2];
882 
883 		if (and_mask == 0xffffffff) {
884 			tmp = or_mask;
885 		} else {
886 			tmp = RREG32(reg);
887 			tmp &= ~and_mask;
888 			if (adev->family >= AMDGPU_FAMILY_AI)
889 				tmp |= (or_mask & and_mask);
890 			else
891 				tmp |= or_mask;
892 		}
893 		WREG32(reg, tmp);
894 	}
895 }
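/*
 * Illustrative sketch (not a real golden register list): the @registers array
 * is consumed as {offset, and_mask, or_mask} triples, so a caller would pass
 * something shaped like:
 *
 *	static const u32 example_golden_settings[] = {
 *		// offset   and_mask     or_mask
 *		0x1234,     0xffffffff,  0x00000001,	// full overwrite of the register
 *		0x5678,     0x0000000f,  0x00000002,	// clear the low nibble, then OR in 0x2
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 *
 * The offsets above are placeholders; real lists use the register defines of
 * the IP block being initialized.
 */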
896 
897 /**
898  * amdgpu_device_pci_config_reset - reset the GPU
899  *
900  * @adev: amdgpu_device pointer
901  *
902  * Resets the GPU using the pci config reset sequence.
903  * Only applicable to asics prior to vega10.
904  */
905 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
906 {
907 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
908 }
909 
910 /*
911  * GPU doorbell aperture helper functions.
912  */
913 /**
914  * amdgpu_device_doorbell_init - Init doorbell driver information.
915  *
916  * @adev: amdgpu_device pointer
917  *
918  * Init doorbell driver information (CIK)
919  * Returns 0 on success, error on failure.
920  */
921 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
922 {
923 
924 	/* No doorbell on SI hardware generation */
925 	if (adev->asic_type < CHIP_BONAIRE) {
926 		adev->doorbell.base = 0;
927 		adev->doorbell.size = 0;
928 		adev->doorbell.num_doorbells = 0;
929 		adev->doorbell.ptr = NULL;
930 		return 0;
931 	}
932 
933 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
934 		return -EINVAL;
935 
936 	amdgpu_asic_init_doorbell_index(adev);
937 
938 	/* doorbell bar mapping */
939 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
940 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
941 
942 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
943 					     adev->doorbell_index.max_assignment+1);
944 	if (adev->doorbell.num_doorbells == 0)
945 		return -EINVAL;
946 
947 	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
948 	 * paging queue doorbells use the second page. The
949 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
950 	 * doorbells are in the first page. So with the paging queue enabled,
951 	 * the max num_doorbells should be increased by one page (0x400 dwords).
952 	 */
953 	if (adev->asic_type >= CHIP_VEGA10)
954 		adev->doorbell.num_doorbells += 0x400;
955 
956 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
957 				     adev->doorbell.num_doorbells *
958 				     sizeof(u32));
959 	if (adev->doorbell.ptr == NULL)
960 		return -ENOMEM;
961 
962 	return 0;
963 }
964 
965 /**
966  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
967  *
968  * @adev: amdgpu_device pointer
969  *
970  * Tear down doorbell driver information (CIK)
971  */
972 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
973 {
974 	iounmap(adev->doorbell.ptr);
975 	adev->doorbell.ptr = NULL;
976 }
977 
978 
979 
980 /*
981  * amdgpu_device_wb_*()
982  * Writeback is the method by which the GPU updates special pages in memory
983  * with the status of certain GPU events (fences, ring pointers, etc.).
984  */
985 
986 /**
987  * amdgpu_device_wb_fini - Disable Writeback and free memory
988  *
989  * @adev: amdgpu_device pointer
990  *
991  * Disables Writeback and frees the Writeback memory (all asics).
992  * Used at driver shutdown.
993  */
994 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
995 {
996 	if (adev->wb.wb_obj) {
997 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
998 				      &adev->wb.gpu_addr,
999 				      (void **)&adev->wb.wb);
1000 		adev->wb.wb_obj = NULL;
1001 	}
1002 }
1003 
1004 /**
1005  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1006  *
1007  * @adev: amdgpu_device pointer
1008  *
1009  * Initializes writeback and allocates writeback memory (all asics).
1010  * Used at driver startup.
1011  * Returns 0 on success or an -error on failure.
1012  */
1013 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1014 {
1015 	int r;
1016 
1017 	if (adev->wb.wb_obj == NULL) {
1018 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1019 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1020 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1021 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1022 					    (void **)&adev->wb.wb);
1023 		if (r) {
1024 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1025 			return r;
1026 		}
1027 
1028 		adev->wb.num_wb = AMDGPU_MAX_WB;
1029 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1030 
1031 		/* clear wb memory */
1032 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1033 	}
1034 
1035 	return 0;
1036 }
1037 
1038 /**
1039  * amdgpu_device_wb_get - Allocate a wb entry
1040  *
1041  * @adev: amdgpu_device pointer
1042  * @wb: wb index
1043  *
1044  * Allocate a wb slot for use by the driver (all asics).
1045  * Returns 0 on success or -EINVAL on failure.
1046  */
1047 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1048 {
1049 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1050 
1051 	if (offset < adev->wb.num_wb) {
1052 		__set_bit(offset, adev->wb.used);
1053 		*wb = offset << 3; /* convert to dw offset */
1054 		return 0;
1055 	} else {
1056 		return -EINVAL;
1057 	}
1058 }
1059 
1060 /**
1061  * amdgpu_device_wb_free - Free a wb entry
1062  *
1063  * @adev: amdgpu_device pointer
1064  * @wb: wb index
1065  *
1066  * Free a wb slot allocated for use by the driver (all asics)
1067  */
1068 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1069 {
1070 	wb >>= 3;
1071 	if (wb < adev->wb.num_wb)
1072 		__clear_bit(wb, adev->wb.used);
1073 }
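/*
 * Illustrative sketch (not part of the driver): typical lifecycle of a
 * writeback slot.  The value returned by amdgpu_device_wb_get() is a dword
 * offset into adev->wb.wb and into the GPU visible buffer at adev->wb.gpu_addr.
 *
 *	u32 wb;
 *
 *	if (!amdgpu_device_wb_get(adev, &wb)) {
 *		u64 gpu_addr = adev->wb.gpu_addr + wb * 4;	// address the GPU writes to
 *		u32 value = adev->wb.wb[wb];			// value read back by the CPU
 *
 *		amdgpu_device_wb_free(adev, wb);
 *	}
 */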
1074 
1075 /**
1076  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1077  *
1078  * @adev: amdgpu_device pointer
1079  *
1080  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1081  * to fail, but if any of the BARs is not accessible after the resize we abort
1082  * driver loading by returning -ENODEV.
1083  */
1084 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1085 {
1086 	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1087 	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1088 	struct pci_bus *root;
1089 	struct resource *res;
1090 	unsigned i;
1091 	u16 cmd;
1092 	int r;
1093 
1094 	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1095 		return 0;
1096 
1097 	/* Bypass for VF */
1098 	if (amdgpu_sriov_vf(adev))
1099 		return 0;
1100 
1101 	/* skip if the bios has already enabled large BAR */
1102 	if (adev->gmc.real_vram_size &&
1103 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1104 		return 0;
1105 
1106 	/* Check if the root BUS has 64bit memory resources */
1107 	root = adev->pdev->bus;
1108 	while (root->parent)
1109 		root = root->parent;
1110 
1111 	pci_bus_for_each_resource(root, res, i) {
1112 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1113 		    res->start > 0x100000000ull)
1114 			break;
1115 	}
1116 
1117 	/* Trying to resize is pointless without a root hub window above 4GB */
1118 	if (!res)
1119 		return 0;
1120 
1121 	/* Disable memory decoding while we change the BAR addresses and size */
1122 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1123 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1124 			      cmd & ~PCI_COMMAND_MEMORY);
1125 
1126 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1127 	amdgpu_device_doorbell_fini(adev);
1128 	if (adev->asic_type >= CHIP_BONAIRE)
1129 		pci_release_resource(adev->pdev, 2);
1130 
1131 	pci_release_resource(adev->pdev, 0);
1132 
1133 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1134 	if (r == -ENOSPC)
1135 		DRM_INFO("Not enough PCI address space for a large BAR.");
1136 	else if (r && r != -ENOTSUPP)
1137 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1138 
1139 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1140 
1141 	/* When the doorbell or fb BAR isn't available we have no chance of
1142 	 * using the device.
1143 	 */
1144 	r = amdgpu_device_doorbell_init(adev);
1145 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1146 		return -ENODEV;
1147 
1148 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1149 
1150 	return 0;
1151 }
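/*
 * Worked example for the rbar_size computation above, assuming the PCI
 * resizable BAR encoding where size code n means 2^n MB: with 16 GB of VRAM,
 * real_vram_size rounds up to 2^34 bytes, space_needed >> 20 gives 16384 MB,
 * order_base_2(16384 | 1) rounds up to 15, and subtracting 1 yields size
 * code 14, i.e. a request for a 16 GB BAR0.
 */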
1152 
1153 /*
1154  * GPU helper functions.
1155  */
1156 /**
1157  * amdgpu_device_need_post - check if the hw need post or not
1158  *
1159  * @adev: amdgpu_device pointer
1160  *
1161  * Check whether the asic needs to be posted: either it has not been
1162  * initialized yet at driver startup, or a hw reset was performed (all asics).
1163  * Returns true if post is needed, false if not.
1164  */
1165 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1166 {
1167 	uint32_t reg;
1168 
1169 	if (amdgpu_sriov_vf(adev))
1170 		return false;
1171 
1172 	if (amdgpu_passthrough(adev)) {
1173 		/* for FIJI: In the whole-GPU pass-through virtualization case, after a VM reboot
1174 		 * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU
1175 		 * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force
1176 		 * a vPost only for SMC versions below 22.15.
1177 		 */
1178 		if (adev->asic_type == CHIP_FIJI) {
1179 			int err;
1180 			uint32_t fw_ver;
1181 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1182 			/* force vPost if an error occurred */
1183 			if (err)
1184 				return true;
1185 
1186 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1187 			if (fw_ver < 0x00160e00)
1188 				return true;
1189 		}
1190 	}
1191 
1192 	if (adev->has_hw_reset) {
1193 		adev->has_hw_reset = false;
1194 		return true;
1195 	}
1196 
1197 	/* bios scratch used on CIK+ */
1198 	if (adev->asic_type >= CHIP_BONAIRE)
1199 		return amdgpu_atombios_scratch_need_asic_init(adev);
1200 
1201 	/* check MEM_SIZE for older asics */
1202 	reg = amdgpu_asic_get_config_memsize(adev);
1203 
1204 	if ((reg != 0) && (reg != 0xffffffff))
1205 		return false;
1206 
1207 	return true;
1208 }
1209 
1210 /* if we get transitioned to only one device, take VGA back */
1211 /**
1212  * amdgpu_device_vga_set_decode - enable/disable vga decode
1213  *
1214  * @cookie: amdgpu_device pointer
1215  * @state: enable/disable vga decode
1216  *
1217  * Enable/disable vga decode (all asics).
1218  * Returns VGA resource flags.
1219  */
1220 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1221 {
1222 	struct amdgpu_device *adev = cookie;
1223 	amdgpu_asic_set_vga_state(adev, state);
1224 	if (state)
1225 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1226 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1227 	else
1228 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1229 }
1230 
1231 /**
1232  * amdgpu_device_check_block_size - validate the vm block size
1233  *
1234  * @adev: amdgpu_device pointer
1235  *
1236  * Validates the vm block size specified via module parameter.
1237  * The vm block size defines the number of bits in the page table versus the page directory;
1238  * a page is 4KB so we have 12 bits of offset, a minimum of 9 bits in the
1239  * page table and the remaining bits are in the page directory.
1240  */
1241 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1242 {
1243 	/* defines number of bits in page table versus page directory,
1244 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1245 	 * page table and the remaining bits are in the page directory */
1246 	if (amdgpu_vm_block_size == -1)
1247 		return;
1248 
1249 	if (amdgpu_vm_block_size < 9) {
1250 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1251 			 amdgpu_vm_block_size);
1252 		amdgpu_vm_block_size = -1;
1253 	}
1254 }
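/*
 * Worked example for the limit above: with the minimum of 9 bits per page
 * table level and a 4KB page (12 bit offset), one page table covers
 * 2^(9 + 12) bytes = 2 MB of GPU virtual address space; larger values of
 * amdgpu_vm_block_size grow that coverage at the cost of bigger page tables.
 */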
1255 
1256 /**
1257  * amdgpu_device_check_vm_size - validate the vm size
1258  *
1259  * @adev: amdgpu_device pointer
1260  *
1261  * Validates the vm size in GB specified via module parameter.
1262  * The VM size is the size of the GPU virtual memory space in GB.
1263  */
1264 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1265 {
1266 	/* no need to check the default value */
1267 	if (amdgpu_vm_size == -1)
1268 		return;
1269 
1270 	if (amdgpu_vm_size < 1) {
1271 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1272 			 amdgpu_vm_size);
1273 		amdgpu_vm_size = -1;
1274 	}
1275 }
1276 
1277 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1278 {
1279 	struct sysinfo si;
1280 	bool is_os_64 = (sizeof(void *) == 8);
1281 	uint64_t total_memory;
1282 	uint64_t dram_size_seven_GB = 0x1B8000000;
1283 	uint64_t dram_size_three_GB = 0xB8000000;
1284 
1285 	if (amdgpu_smu_memory_pool_size == 0)
1286 		return;
1287 
1288 	if (!is_os_64) {
1289 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1290 		goto def_value;
1291 	}
1292 	si_meminfo(&si);
1293 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1294 
1295 	if ((amdgpu_smu_memory_pool_size == 1) ||
1296 		(amdgpu_smu_memory_pool_size == 2)) {
1297 		if (total_memory < dram_size_three_GB)
1298 			goto def_value1;
1299 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1300 		(amdgpu_smu_memory_pool_size == 8)) {
1301 		if (total_memory < dram_size_seven_GB)
1302 			goto def_value1;
1303 	} else {
1304 		DRM_WARN("Smu memory pool size not supported\n");
1305 		goto def_value;
1306 	}
1307 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1308 
1309 	return;
1310 
1311 def_value1:
1312 	DRM_WARN("Not enough system memory\n");
1313 def_value:
1314 	adev->pm.smu_prv_buffer_size = 0;
1315 }
1316 
1317 /**
1318  * amdgpu_device_check_arguments - validate module params
1319  *
1320  * @adev: amdgpu_device pointer
1321  *
1322  * Validates certain module parameters and updates
1323  * the associated values used by the driver (all asics).
1324  */
1325 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1326 {
1327 	if (amdgpu_sched_jobs < 4) {
1328 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1329 			 amdgpu_sched_jobs);
1330 		amdgpu_sched_jobs = 4;
1331 	} else if (!is_power_of_2(amdgpu_sched_jobs)){
1332 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1333 			 amdgpu_sched_jobs);
1334 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1335 	}
1336 
1337 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1338 		/* gart size must be greater than or equal to 32M */
1339 		dev_warn(adev->dev, "gart size (%d) too small\n",
1340 			 amdgpu_gart_size);
1341 		amdgpu_gart_size = -1;
1342 	}
1343 
1344 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1345 		/* gtt size must be greater than or equal to 32M */
1346 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1347 				 amdgpu_gtt_size);
1348 		amdgpu_gtt_size = -1;
1349 	}
1350 
1351 	/* valid range is between 4 and 9 inclusive */
1352 	if (amdgpu_vm_fragment_size != -1 &&
1353 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1354 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1355 		amdgpu_vm_fragment_size = -1;
1356 	}
1357 
1358 	if (amdgpu_sched_hw_submission < 2) {
1359 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1360 			 amdgpu_sched_hw_submission);
1361 		amdgpu_sched_hw_submission = 2;
1362 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1363 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1364 			 amdgpu_sched_hw_submission);
1365 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1366 	}
1367 
1368 	amdgpu_device_check_smu_prv_buffer_size(adev);
1369 
1370 	amdgpu_device_check_vm_size(adev);
1371 
1372 	amdgpu_device_check_block_size(adev);
1373 
1374 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1375 
1376 	amdgpu_gmc_tmz_set(adev);
1377 
1378 	if (amdgpu_num_kcq == -1) {
1379 		amdgpu_num_kcq = 8;
1380 	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1381 		amdgpu_num_kcq = 8;
1382 		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1383 	}
1384 
1385 	amdgpu_gmc_noretry_set(adev);
1386 
1387 	return 0;
1388 }
1389 
1390 /**
1391  * amdgpu_switcheroo_set_state - set switcheroo state
1392  *
1393  * @pdev: pci dev pointer
1394  * @state: vga_switcheroo state
1395  *
1396  * Callback for the switcheroo driver.  Suspends or resumes
1397  * the asics before or after it is powered up using ACPI methods.
1398  */
1399 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1400 					enum vga_switcheroo_state state)
1401 {
1402 	struct drm_device *dev = pci_get_drvdata(pdev);
1403 	int r;
1404 
1405 	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1406 		return;
1407 
1408 	if (state == VGA_SWITCHEROO_ON) {
1409 		pr_info("switched on\n");
1410 		/* don't suspend or resume card normally */
1411 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1412 
1413 		pci_set_power_state(dev->pdev, PCI_D0);
1414 		amdgpu_device_load_pci_state(dev->pdev);
1415 		r = pci_enable_device(dev->pdev);
1416 		if (r)
1417 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1418 		amdgpu_device_resume(dev, true);
1419 
1420 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1421 		drm_kms_helper_poll_enable(dev);
1422 	} else {
1423 		pr_info("switched off\n");
1424 		drm_kms_helper_poll_disable(dev);
1425 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1426 		amdgpu_device_suspend(dev, true);
1427 		amdgpu_device_cache_pci_state(dev->pdev);
1428 		/* Shut down the device */
1429 		pci_disable_device(dev->pdev);
1430 		pci_set_power_state(dev->pdev, PCI_D3cold);
1431 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1432 	}
1433 }
1434 
1435 /**
1436  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1437  *
1438  * @pdev: pci dev pointer
1439  *
1440  * Callback for the switcheroo driver.  Check if the switcheroo
1441  * state can be changed.
1442  * Returns true if the state can be changed, false if not.
1443  */
1444 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1445 {
1446 	struct drm_device *dev = pci_get_drvdata(pdev);
1447 
1448 	/*
1449 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1450 	* locking inversion with the driver load path. And the access here is
1451 	* completely racy anyway. So don't bother with locking for now.
1452 	*/
1453 	return atomic_read(&dev->open_count) == 0;
1454 }
1455 
1456 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1457 	.set_gpu_state = amdgpu_switcheroo_set_state,
1458 	.reprobe = NULL,
1459 	.can_switch = amdgpu_switcheroo_can_switch,
1460 };
1461 
1462 /**
1463  * amdgpu_device_ip_set_clockgating_state - set the CG state
1464  *
1465  * @dev: amdgpu_device pointer
1466  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1467  * @state: clockgating state (gate or ungate)
1468  *
1469  * Sets the requested clockgating state for all instances of
1470  * the hardware IP specified.
1471  * Returns the error code from the last instance.
1472  */
1473 int amdgpu_device_ip_set_clockgating_state(void *dev,
1474 					   enum amd_ip_block_type block_type,
1475 					   enum amd_clockgating_state state)
1476 {
1477 	struct amdgpu_device *adev = dev;
1478 	int i, r = 0;
1479 
1480 	for (i = 0; i < adev->num_ip_blocks; i++) {
1481 		if (!adev->ip_blocks[i].status.valid)
1482 			continue;
1483 		if (adev->ip_blocks[i].version->type != block_type)
1484 			continue;
1485 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1486 			continue;
1487 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1488 			(void *)adev, state);
1489 		if (r)
1490 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1491 				  adev->ip_blocks[i].version->funcs->name, r);
1492 	}
1493 	return r;
1494 }
1495 
1496 /**
1497  * amdgpu_device_ip_set_powergating_state - set the PG state
1498  *
1499  * @dev: amdgpu_device pointer
1500  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1501  * @state: powergating state (gate or ungate)
1502  *
1503  * Sets the requested powergating state for all instances of
1504  * the hardware IP specified.
1505  * Returns the error code from the last instance.
1506  */
1507 int amdgpu_device_ip_set_powergating_state(void *dev,
1508 					   enum amd_ip_block_type block_type,
1509 					   enum amd_powergating_state state)
1510 {
1511 	struct amdgpu_device *adev = dev;
1512 	int i, r = 0;
1513 
1514 	for (i = 0; i < adev->num_ip_blocks; i++) {
1515 		if (!adev->ip_blocks[i].status.valid)
1516 			continue;
1517 		if (adev->ip_blocks[i].version->type != block_type)
1518 			continue;
1519 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1520 			continue;
1521 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1522 			(void *)adev, state);
1523 		if (r)
1524 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1525 				  adev->ip_blocks[i].version->funcs->name, r);
1526 	}
1527 	return r;
1528 }
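/*
 * Illustrative sketch (not part of the driver): how IP code typically asks
 * for gating changes through the two helpers above, using the enums from
 * amd_shared.h:
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_GATE);
 *	amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_PG_STATE_GATE);
 */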
1529 
1530 /**
1531  * amdgpu_device_ip_get_clockgating_state - get the CG state
1532  *
1533  * @adev: amdgpu_device pointer
1534  * @flags: clockgating feature flags
1535  *
1536  * Walks the list of IPs on the device and updates the clockgating
1537  * flags for each IP.
1538  * Updates @flags with the feature flags for each hardware IP where
1539  * clockgating is enabled.
1540  */
1541 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1542 					    u32 *flags)
1543 {
1544 	int i;
1545 
1546 	for (i = 0; i < adev->num_ip_blocks; i++) {
1547 		if (!adev->ip_blocks[i].status.valid)
1548 			continue;
1549 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1550 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1551 	}
1552 }
1553 
1554 /**
1555  * amdgpu_device_ip_wait_for_idle - wait for idle
1556  *
1557  * @adev: amdgpu_device pointer
1558  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1559  *
1560  * Waits for the requested hardware IP to be idle.
1561  * Returns 0 for success or a negative error code on failure.
1562  */
1563 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1564 				   enum amd_ip_block_type block_type)
1565 {
1566 	int i, r;
1567 
1568 	for (i = 0; i < adev->num_ip_blocks; i++) {
1569 		if (!adev->ip_blocks[i].status.valid)
1570 			continue;
1571 		if (adev->ip_blocks[i].version->type == block_type) {
1572 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1573 			if (r)
1574 				return r;
1575 			break;
1576 		}
1577 	}
1578 	return 0;
1579 
1580 }
1581 
1582 /**
1583  * amdgpu_device_ip_is_idle - is the hardware IP idle
1584  *
1585  * @adev: amdgpu_device pointer
1586  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1587  *
1588  * Check if the hardware IP is idle or not.
1589  * Returns true if the IP is idle, false if not.
1590  */
1591 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1592 			      enum amd_ip_block_type block_type)
1593 {
1594 	int i;
1595 
1596 	for (i = 0; i < adev->num_ip_blocks; i++) {
1597 		if (!adev->ip_blocks[i].status.valid)
1598 			continue;
1599 		if (adev->ip_blocks[i].version->type == block_type)
1600 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1601 	}
1602 	return true;
1603 
1604 }
1605 
1606 /**
1607  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1608  *
1609  * @adev: amdgpu_device pointer
1610  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1611  *
1612  * Returns a pointer to the hardware IP block structure
1613  * if it exists for the asic, otherwise NULL.
1614  */
1615 struct amdgpu_ip_block *
1616 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1617 			      enum amd_ip_block_type type)
1618 {
1619 	int i;
1620 
1621 	for (i = 0; i < adev->num_ip_blocks; i++)
1622 		if (adev->ip_blocks[i].version->type == type)
1623 			return &adev->ip_blocks[i];
1624 
1625 	return NULL;
1626 }
1627 
1628 /**
1629  * amdgpu_device_ip_block_version_cmp
1630  *
1631  * @adev: amdgpu_device pointer
1632  * @type: enum amd_ip_block_type
1633  * @major: major version
1634  * @minor: minor version
1635  *
1636  * Returns 0 if the IP block version is equal or greater,
1637  * 1 if it is smaller or the ip_block doesn't exist.
1638  */
1639 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1640 				       enum amd_ip_block_type type,
1641 				       u32 major, u32 minor)
1642 {
1643 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1644 
1645 	if (ip_block && ((ip_block->version->major > major) ||
1646 			((ip_block->version->major == major) &&
1647 			(ip_block->version->minor >= minor))))
1648 		return 0;
1649 
1650 	return 1;
1651 }
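/*
 * Illustrative sketch (hypothetical version numbers): gating a code path on a
 * minimum IP block version with the comparison helper above.
 *
 *	if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC,
 *					       7, 0) == 0) {
 *		// SMC IP block is present at version 7.0 or newer
 *	}
 */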
1652 
1653 /**
1654  * amdgpu_device_ip_block_add
1655  *
1656  * @adev: amdgpu_device pointer
1657  * @ip_block_version: pointer to the IP to add
1658  *
1659  * Adds the IP block driver information to the collection of IPs
1660  * on the asic.
1661  */
1662 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1663 			       const struct amdgpu_ip_block_version *ip_block_version)
1664 {
1665 	if (!ip_block_version)
1666 		return -EINVAL;
1667 
1668 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1669 		  ip_block_version->funcs->name);
1670 
1671 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1672 
1673 	return 0;
1674 }
1675 
1676 /**
1677  * amdgpu_device_enable_virtual_display - enable virtual display feature
1678  *
1679  * @adev: amdgpu_device pointer
1680  *
1681  * Enables the virtual display feature if the user has enabled it via
1682  * the module parameter virtual_display.  This feature provides a virtual
1683  * display hardware on headless boards or in virtualized environments.
1684  * This function parses and validates the configuration string specified by
1685  * the user and configures the virtual display configuration (number of
1686  * virtual connectors, crtcs, etc.) specified.
1687  */
1688 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1689 {
1690 	adev->enable_virtual_display = false;
1691 
1692 	if (amdgpu_virtual_display) {
1693 		struct drm_device *ddev = adev_to_drm(adev);
1694 		const char *pci_address_name = pci_name(ddev->pdev);
1695 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1696 
1697 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1698 		pciaddstr_tmp = pciaddstr;
1699 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1700 			pciaddname = strsep(&pciaddname_tmp, ",");
1701 			if (!strcmp("all", pciaddname)
1702 			    || !strcmp(pci_address_name, pciaddname)) {
1703 				long num_crtc;
1704 				int res = -1;
1705 
1706 				adev->enable_virtual_display = true;
1707 
1708 				if (pciaddname_tmp)
1709 					res = kstrtol(pciaddname_tmp, 10,
1710 						      &num_crtc);
1711 
1712 				if (!res) {
1713 					if (num_crtc < 1)
1714 						num_crtc = 1;
1715 					if (num_crtc > 6)
1716 						num_crtc = 6;
1717 					adev->mode_info.num_crtc = num_crtc;
1718 				} else {
1719 					adev->mode_info.num_crtc = 1;
1720 				}
1721 				break;
1722 			}
1723 		}
1724 
1725 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1726 			 amdgpu_virtual_display, pci_address_name,
1727 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1728 
1729 		kfree(pciaddstr);
1730 	}
1731 }
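/*
 * The virtual_display module parameter parsed above is a semicolon separated
 * list of "pci_address,crtc_count" entries (or the keyword "all"), so a
 * hypothetical invocation enabling two virtual crtcs on one device might be:
 *
 *	modprobe amdgpu virtual_display=0000:26:00.0,2
 *
 * The PCI address in the example is made up; crtc_count is clamped to 1..6.
 */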
1732 
1733 /**
1734  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1735  *
1736  * @adev: amdgpu_device pointer
1737  *
1738  * Parses the asic configuration parameters specified in the gpu info
1739  * firmware and makes them available to the driver for use in configuring
1740  * the asic.
1741  * Returns 0 on success, -EINVAL on failure.
1742  */
1743 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1744 {
1745 	const char *chip_name;
1746 	char fw_name[40];
1747 	int err;
1748 	const struct gpu_info_firmware_header_v1_0 *hdr;
1749 
1750 	adev->firmware.gpu_info_fw = NULL;
1751 
1752 	if (adev->mman.discovery_bin) {
1753 		amdgpu_discovery_get_gfx_info(adev);
1754 
1755 		/*
1756 		 * FIXME: The bounding box is still needed by Navi12, so
1757 		 * temporarily read it from gpu_info firmware. Should be dropped
1758 		 * when DAL no longer needs it.
1759 		 */
1760 		if (adev->asic_type != CHIP_NAVI12)
1761 			return 0;
1762 	}
1763 
1764 	switch (adev->asic_type) {
1765 #ifdef CONFIG_DRM_AMDGPU_SI
1766 	case CHIP_VERDE:
1767 	case CHIP_TAHITI:
1768 	case CHIP_PITCAIRN:
1769 	case CHIP_OLAND:
1770 	case CHIP_HAINAN:
1771 #endif
1772 #ifdef CONFIG_DRM_AMDGPU_CIK
1773 	case CHIP_BONAIRE:
1774 	case CHIP_HAWAII:
1775 	case CHIP_KAVERI:
1776 	case CHIP_KABINI:
1777 	case CHIP_MULLINS:
1778 #endif
1779 	case CHIP_TOPAZ:
1780 	case CHIP_TONGA:
1781 	case CHIP_FIJI:
1782 	case CHIP_POLARIS10:
1783 	case CHIP_POLARIS11:
1784 	case CHIP_POLARIS12:
1785 	case CHIP_VEGAM:
1786 	case CHIP_CARRIZO:
1787 	case CHIP_STONEY:
1788 	case CHIP_VEGA20:
1789 	case CHIP_SIENNA_CICHLID:
1790 	case CHIP_NAVY_FLOUNDER:
1791 	default:
1792 		return 0;
1793 	case CHIP_VEGA10:
1794 		chip_name = "vega10";
1795 		break;
1796 	case CHIP_VEGA12:
1797 		chip_name = "vega12";
1798 		break;
1799 	case CHIP_RAVEN:
1800 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1801 			chip_name = "raven2";
1802 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1803 			chip_name = "picasso";
1804 		else
1805 			chip_name = "raven";
1806 		break;
1807 	case CHIP_ARCTURUS:
1808 		chip_name = "arcturus";
1809 		break;
1810 	case CHIP_RENOIR:
1811 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1812 			chip_name = "renoir";
1813 		else
1814 			chip_name = "green_sardine";
1815 		break;
1816 	case CHIP_NAVI10:
1817 		chip_name = "navi10";
1818 		break;
1819 	case CHIP_NAVI14:
1820 		chip_name = "navi14";
1821 		break;
1822 	case CHIP_NAVI12:
1823 		chip_name = "navi12";
1824 		break;
1825 	}
1826 
1827 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1828 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1829 	if (err) {
1830 		dev_err(adev->dev,
1831 			"Failed to load gpu_info firmware \"%s\"\n",
1832 			fw_name);
1833 		goto out;
1834 	}
1835 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1836 	if (err) {
1837 		dev_err(adev->dev,
1838 			"Failed to validate gpu_info firmware \"%s\"\n",
1839 			fw_name);
1840 		goto out;
1841 	}
1842 
1843 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1844 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1845 
1846 	switch (hdr->version_major) {
1847 	case 1:
1848 	{
1849 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1850 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1851 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1852 
1853 		/*
1854 		 * Should be dropped when DAL no longer needs it.
1855 		 */
1856 		if (adev->asic_type == CHIP_NAVI12)
1857 			goto parse_soc_bounding_box;
1858 
1859 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1860 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1861 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1862 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1863 		adev->gfx.config.max_texture_channel_caches =
1864 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1865 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1866 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1867 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1868 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1869 		adev->gfx.config.double_offchip_lds_buf =
1870 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1871 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1872 		adev->gfx.cu_info.max_waves_per_simd =
1873 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1874 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1875 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1876 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1877 		if (hdr->version_minor >= 1) {
1878 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1879 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1880 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1881 			adev->gfx.config.num_sc_per_sh =
1882 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1883 			adev->gfx.config.num_packer_per_sc =
1884 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1885 		}
1886 
1887 parse_soc_bounding_box:
1888 		/*
1889 		 * soc bounding box info is not integrated in the discovery table,
1890 		 * so we always need to parse it from the gpu_info firmware when needed.
1891 		 */
1892 		if (hdr->version_minor == 2) {
1893 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1894 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1895 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1896 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1897 		}
1898 		break;
1899 	}
1900 	default:
1901 		dev_err(adev->dev,
1902 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1903 		err = -EINVAL;
1904 		goto out;
1905 	}
1906 out:
1907 	return err;
1908 }
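/*
 * Note: the gpu_info firmware requested above is named
 * "amdgpu/<chip>_gpu_info.bin"; it is normally shipped by linux-firmware and
 * is typically installed under /lib/firmware/amdgpu/ (the exact path may vary
 * by distribution).
 */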
1909 
1910 /**
1911  * amdgpu_device_ip_early_init - run early init for hardware IPs
1912  *
1913  * @adev: amdgpu_device pointer
1914  *
1915  * Early initialization pass for hardware IPs.  The hardware IPs that make
1916  * up each asic are discovered and each IP's early_init callback is run.  This
1917  * is the first stage in initializing the asic.
1918  * Returns 0 on success, negative error code on failure.
1919  */
1920 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1921 {
1922 	int i, r;
1923 
1924 	amdgpu_device_enable_virtual_display(adev);
1925 
1926 	if (amdgpu_sriov_vf(adev)) {
1927 		r = amdgpu_virt_request_full_gpu(adev, true);
1928 		if (r)
1929 			return r;
1930 	}
1931 
1932 	switch (adev->asic_type) {
1933 #ifdef CONFIG_DRM_AMDGPU_SI
1934 	case CHIP_VERDE:
1935 	case CHIP_TAHITI:
1936 	case CHIP_PITCAIRN:
1937 	case CHIP_OLAND:
1938 	case CHIP_HAINAN:
1939 		adev->family = AMDGPU_FAMILY_SI;
1940 		r = si_set_ip_blocks(adev);
1941 		if (r)
1942 			return r;
1943 		break;
1944 #endif
1945 #ifdef CONFIG_DRM_AMDGPU_CIK
1946 	case CHIP_BONAIRE:
1947 	case CHIP_HAWAII:
1948 	case CHIP_KAVERI:
1949 	case CHIP_KABINI:
1950 	case CHIP_MULLINS:
1951 		if (adev->flags & AMD_IS_APU)
1952 			adev->family = AMDGPU_FAMILY_KV;
1953 		else
1954 			adev->family = AMDGPU_FAMILY_CI;
1955 
1956 		r = cik_set_ip_blocks(adev);
1957 		if (r)
1958 			return r;
1959 		break;
1960 #endif
1961 	case CHIP_TOPAZ:
1962 	case CHIP_TONGA:
1963 	case CHIP_FIJI:
1964 	case CHIP_POLARIS10:
1965 	case CHIP_POLARIS11:
1966 	case CHIP_POLARIS12:
1967 	case CHIP_VEGAM:
1968 	case CHIP_CARRIZO:
1969 	case CHIP_STONEY:
1970 		if (adev->flags & AMD_IS_APU)
1971 			adev->family = AMDGPU_FAMILY_CZ;
1972 		else
1973 			adev->family = AMDGPU_FAMILY_VI;
1974 
1975 		r = vi_set_ip_blocks(adev);
1976 		if (r)
1977 			return r;
1978 		break;
1979 	case CHIP_VEGA10:
1980 	case CHIP_VEGA12:
1981 	case CHIP_VEGA20:
1982 	case CHIP_RAVEN:
1983 	case CHIP_ARCTURUS:
1984 	case CHIP_RENOIR:
1985 		if (adev->flags & AMD_IS_APU)
1986 			adev->family = AMDGPU_FAMILY_RV;
1987 		else
1988 			adev->family = AMDGPU_FAMILY_AI;
1989 
1990 		r = soc15_set_ip_blocks(adev);
1991 		if (r)
1992 			return r;
1993 		break;
1994 	case  CHIP_NAVI10:
1995 	case  CHIP_NAVI14:
1996 	case  CHIP_NAVI12:
1997 	case  CHIP_SIENNA_CICHLID:
1998 	case  CHIP_NAVY_FLOUNDER:
1999 		adev->family = AMDGPU_FAMILY_NV;
2000 
2001 		r = nv_set_ip_blocks(adev);
2002 		if (r)
2003 			return r;
2004 		break;
2005 	default:
2006 		/* FIXME: not supported yet */
2007 		return -EINVAL;
2008 	}
2009 
2010 	amdgpu_amdkfd_device_probe(adev);
2011 
2012 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2013 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2014 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2015 
2016 	for (i = 0; i < adev->num_ip_blocks; i++) {
2017 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2018 			DRM_ERROR("disabled ip block: %d <%s>\n",
2019 				  i, adev->ip_blocks[i].version->funcs->name);
2020 			adev->ip_blocks[i].status.valid = false;
2021 		} else {
2022 			if (adev->ip_blocks[i].version->funcs->early_init) {
2023 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2024 				if (r == -ENOENT) {
2025 					adev->ip_blocks[i].status.valid = false;
2026 				} else if (r) {
2027 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2028 						  adev->ip_blocks[i].version->funcs->name, r);
2029 					return r;
2030 				} else {
2031 					adev->ip_blocks[i].status.valid = true;
2032 				}
2033 			} else {
2034 				adev->ip_blocks[i].status.valid = true;
2035 			}
2036 		}
2037 		/* get the vbios after the asic_funcs are set up */
2038 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2039 			r = amdgpu_device_parse_gpu_info_fw(adev);
2040 			if (r)
2041 				return r;
2042 
2043 			/* Read BIOS */
2044 			if (!amdgpu_get_bios(adev))
2045 				return -EINVAL;
2046 
2047 			r = amdgpu_atombios_init(adev);
2048 			if (r) {
2049 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2050 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2051 				return r;
2052 			}
2053 
2054 			/* get pf2vf msg info at its earliest time */
2055 			if (amdgpu_sriov_vf(adev))
2056 				amdgpu_virt_init_data_exchange(adev);
2057 
2058 		}
2059 	}
2060 
2061 	adev->cg_flags &= amdgpu_cg_mask;
2062 	adev->pg_flags &= amdgpu_pg_mask;
2063 
2064 	return 0;
2065 }
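/*
 * The per-block enable test above uses the ip_block_mask module parameter as
 * a bitmask indexed by IP block position.  With the default mask (all bits
 * set) every block stays valid, while a hypothetical override such as
 *
 *	modprobe amdgpu ip_block_mask=0xfffffffd
 *
 * would clear bit 1 and therefore mark the second discovered IP block as
 * disabled (use with care; disabling required blocks will break init).
 */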
2066 
2067 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2068 {
2069 	int i, r;
2070 
2071 	for (i = 0; i < adev->num_ip_blocks; i++) {
2072 		if (!adev->ip_blocks[i].status.sw)
2073 			continue;
2074 		if (adev->ip_blocks[i].status.hw)
2075 			continue;
2076 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2077 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2078 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2079 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2080 			if (r) {
2081 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2082 					  adev->ip_blocks[i].version->funcs->name, r);
2083 				return r;
2084 			}
2085 			adev->ip_blocks[i].status.hw = true;
2086 		}
2087 	}
2088 
2089 	return 0;
2090 }
2091 
2092 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2093 {
2094 	int i, r;
2095 
2096 	for (i = 0; i < adev->num_ip_blocks; i++) {
2097 		if (!adev->ip_blocks[i].status.sw)
2098 			continue;
2099 		if (adev->ip_blocks[i].status.hw)
2100 			continue;
2101 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2102 		if (r) {
2103 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2104 				  adev->ip_blocks[i].version->funcs->name, r);
2105 			return r;
2106 		}
2107 		adev->ip_blocks[i].status.hw = true;
2108 	}
2109 
2110 	return 0;
2111 }
2112 
2113 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2114 {
2115 	int r = 0;
2116 	int i;
2117 	uint32_t smu_version;
2118 
2119 	if (adev->asic_type >= CHIP_VEGA10) {
2120 		for (i = 0; i < adev->num_ip_blocks; i++) {
2121 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2122 				continue;
2123 
2124 			/* no need to do the fw loading again if already done */
2125 			if (adev->ip_blocks[i].status.hw == true)
2126 				break;
2127 
2128 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2129 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2130 				if (r) {
2131 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2132 							  adev->ip_blocks[i].version->funcs->name, r);
2133 					return r;
2134 				}
2135 			} else {
2136 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2137 				if (r) {
2138 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2139 							  adev->ip_blocks[i].version->funcs->name, r);
2140 					return r;
2141 				}
2142 			}
2143 
2144 			adev->ip_blocks[i].status.hw = true;
2145 			break;
2146 		}
2147 	}
2148 
2149 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2150 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2151 
2152 	return r;
2153 }
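/*
 * Ordering note (derived from the code above and below): amdgpu_device_ip_init()
 * first runs sw_init for every valid block and brings up GMC early so GPU
 * memory can be allocated, then hw-inits COMMON/IH (phase 1), loads microcode
 * through PSP/SMU via amdgpu_device_fw_loading(), and finally hw-inits the
 * remaining blocks (phase 2), which expect their firmware to already be loaded.
 */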
2154 
2155 /**
2156  * amdgpu_device_ip_init - run init for hardware IPs
2157  *
2158  * @adev: amdgpu_device pointer
2159  *
2160  * Main initialization pass for hardware IPs.  The list of all the hardware
2161  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2162  * are run.  sw_init initializes the software state associated with each IP
2163  * and hw_init initializes the hardware associated with each IP.
2164  * Returns 0 on success, negative error code on failure.
2165  */
2166 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2167 {
2168 	int i, r;
2169 
2170 	r = amdgpu_ras_init(adev);
2171 	if (r)
2172 		return r;
2173 
2174 	for (i = 0; i < adev->num_ip_blocks; i++) {
2175 		if (!adev->ip_blocks[i].status.valid)
2176 			continue;
2177 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2178 		if (r) {
2179 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2180 				  adev->ip_blocks[i].version->funcs->name, r);
2181 			goto init_failed;
2182 		}
2183 		adev->ip_blocks[i].status.sw = true;
2184 
2185 		/* need to do gmc hw init early so we can allocate gpu mem */
2186 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2187 			/* Try to reserve bad pages early */
2188 			if (amdgpu_sriov_vf(adev))
2189 				amdgpu_virt_exchange_data(adev);
2190 
2191 			r = amdgpu_device_vram_scratch_init(adev);
2192 			if (r) {
2193 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2194 				goto init_failed;
2195 			}
2196 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2197 			if (r) {
2198 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2199 				goto init_failed;
2200 			}
2201 			r = amdgpu_device_wb_init(adev);
2202 			if (r) {
2203 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2204 				goto init_failed;
2205 			}
2206 			adev->ip_blocks[i].status.hw = true;
2207 
2208 			/* right after GMC hw init, we create CSA */
2209 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2210 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2211 								AMDGPU_GEM_DOMAIN_VRAM,
2212 								AMDGPU_CSA_SIZE);
2213 				if (r) {
2214 					DRM_ERROR("allocate CSA failed %d\n", r);
2215 					goto init_failed;
2216 				}
2217 			}
2218 		}
2219 	}
2220 
2221 	if (amdgpu_sriov_vf(adev))
2222 		amdgpu_virt_init_data_exchange(adev);
2223 
2224 	r = amdgpu_ib_pool_init(adev);
2225 	if (r) {
2226 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2227 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2228 		goto init_failed;
2229 	}
2230 
2231 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2232 	if (r)
2233 		goto init_failed;
2234 
2235 	r = amdgpu_device_ip_hw_init_phase1(adev);
2236 	if (r)
2237 		goto init_failed;
2238 
2239 	r = amdgpu_device_fw_loading(adev);
2240 	if (r)
2241 		goto init_failed;
2242 
2243 	r = amdgpu_device_ip_hw_init_phase2(adev);
2244 	if (r)
2245 		goto init_failed;
2246 
2247 	/*
2248 	 * Retired pages will be loaded from eeprom and reserved here;
2249 	 * this should be called after amdgpu_device_ip_hw_init_phase2 since,
2250 	 * for some ASICs, the RAS EEPROM code relies on the SMU being fully
2251 	 * functional for I2C communication, which is only true at this point.
2252 	 *
2253 	 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2254 	 * about failures caused by a bad gpu situation and stops the amdgpu
2255 	 * init process accordingly. For other failures it still releases all
2256 	 * the resources and prints an error message, rather than returning a
2257 	 * negative value to the upper level.
2258 	 *
2259 	 * Note: theoretically, this should be called before all vram allocations
2260 	 * to protect retired pages from being abused.
2261 	 */
2262 	r = amdgpu_ras_recovery_init(adev);
2263 	if (r)
2264 		goto init_failed;
2265 
2266 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2267 		amdgpu_xgmi_add_device(adev);
2268 	amdgpu_amdkfd_device_init(adev);
2269 
2270 	amdgpu_fru_get_product_info(adev);
2271 
2272 init_failed:
2273 	if (amdgpu_sriov_vf(adev))
2274 		amdgpu_virt_release_full_gpu(adev, true);
2275 
2276 	return r;
2277 }
2278 
2279 /**
2280  * amdgpu_device_fill_reset_magic - save reset magic from the gart pointer
2281  *
2282  * @adev: amdgpu_device pointer
2283  *
2284  * Saves a reset magic value read from the gart pointer in VRAM.  The driver
2285  * calls this function before a GPU reset.  If the value is retained after a
2286  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2287  */
2288 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2289 {
2290 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2291 }
2292 
2293 /**
2294  * amdgpu_device_check_vram_lost - check if vram is valid
2295  *
2296  * @adev: amdgpu_device pointer
2297  *
2298  * Checks the reset magic value written to the gart pointer in VRAM.
2299  * The driver calls this after a GPU reset to see if the contents of
2300  * The driver calls this after a GPU reset to see if the contents of
2301  * VRAM have been lost or not.
2302  */
2303 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2304 {
2305 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2306 			AMDGPU_RESET_MAGIC_NUM))
2307 		return true;
2308 
2309 	if (!amdgpu_in_reset(adev))
2310 		return false;
2311 
2312 	/*
2313 	 * For all ASICs with baco/mode1 reset, the VRAM is
2314 	 * always assumed to be lost.
2315 	 */
2316 	switch (amdgpu_asic_reset_method(adev)) {
2317 	case AMD_RESET_METHOD_BACO:
2318 	case AMD_RESET_METHOD_MODE1:
2319 		return true;
2320 	default:
2321 		return false;
2322 	}
2323 }
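/*
 * The reset-magic pair above works as a simple canary: before a reset the
 * first AMDGPU_RESET_MAGIC_NUM bytes of the CPU-visible GART table mapping
 * (adev->gart.ptr) are saved into adev->reset_magic, and after the reset a
 * mismatch (or a BACO/mode1 reset, which is assumed destructive) is treated
 * as VRAM content loss.
 */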
2324 
2325 /**
2326  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2327  *
2328  * @adev: amdgpu_device pointer
2329  * @state: clockgating state (gate or ungate)
2330  *
2331  * The list of all the hardware IPs that make up the asic is walked and the
2332  * set_clockgating_state callbacks are run.
2333  * During late init this pass enables clockgating for the hardware IPs;
2334  * during fini or suspend it disables clockgating for the hardware IPs.
2335  * Returns 0 on success, negative error code on failure.
2336  */
2337 
2338 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2339 						enum amd_clockgating_state state)
2340 {
2341 	int i, j, r;
2342 
2343 	if (amdgpu_emu_mode == 1)
2344 		return 0;
2345 
2346 	for (j = 0; j < adev->num_ip_blocks; j++) {
2347 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2348 		if (!adev->ip_blocks[i].status.late_initialized)
2349 			continue;
2350 		/* skip CG for VCE/UVD, it's handled specially */
2351 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2352 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2353 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2354 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2355 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2356 			/* enable clockgating to save power */
2357 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2358 										     state);
2359 			if (r) {
2360 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2361 					  adev->ip_blocks[i].version->funcs->name, r);
2362 				return r;
2363 			}
2364 		}
2365 	}
2366 
2367 	return 0;
2368 }
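/*
 * Worked example for the index calculation above (also used by the
 * powergating variant below): with num_ip_blocks == 4, gating walks the
 * blocks in order 0,1,2,3 while ungating walks them in reverse order
 * 3,2,1,0, so gating is torn down in the opposite order from which it was
 * applied.
 */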
2369 
2370 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2371 {
2372 	int i, j, r;
2373 
2374 	if (amdgpu_emu_mode == 1)
2375 		return 0;
2376 
2377 	for (j = 0; j < adev->num_ip_blocks; j++) {
2378 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2379 		if (!adev->ip_blocks[i].status.late_initialized)
2380 			continue;
2381 		/* skip PG for VCE/UVD, it's handled specially */
2382 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2383 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2384 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2385 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2386 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2387 			/* enable powergating to save power */
2388 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2389 											state);
2390 			if (r) {
2391 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2392 					  adev->ip_blocks[i].version->funcs->name, r);
2393 				return r;
2394 			}
2395 		}
2396 	}
2397 	return 0;
2398 }
2399 
2400 static int amdgpu_device_enable_mgpu_fan_boost(void)
2401 {
2402 	struct amdgpu_gpu_instance *gpu_ins;
2403 	struct amdgpu_device *adev;
2404 	int i, ret = 0;
2405 
2406 	mutex_lock(&mgpu_info.mutex);
2407 
2408 	/*
2409 	 * MGPU fan boost feature should be enabled
2410 	 * only when there are two or more dGPUs in
2411 	 * the system
2412 	 */
2413 	if (mgpu_info.num_dgpu < 2)
2414 		goto out;
2415 
2416 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2417 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2418 		adev = gpu_ins->adev;
2419 		if (!(adev->flags & AMD_IS_APU) &&
2420 		    !gpu_ins->mgpu_fan_enabled) {
2421 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2422 			if (ret)
2423 				break;
2424 
2425 			gpu_ins->mgpu_fan_enabled = 1;
2426 		}
2427 	}
2428 
2429 out:
2430 	mutex_unlock(&mgpu_info.mutex);
2431 
2432 	return ret;
2433 }
2434 
2435 /**
2436  * amdgpu_device_ip_late_init - run late init for hardware IPs
2437  *
2438  * @adev: amdgpu_device pointer
2439  *
2440  * Late initialization pass for hardware IPs.  The list of all the hardware
2441  * IPs that make up the asic is walked and the late_init callbacks are run.
2442  * late_init covers any special initialization that an IP requires
2443  * after all of the IPs have been initialized or something that needs to happen
2444  * late in the init process.
2445  * Returns 0 on success, negative error code on failure.
2446  */
2447 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2448 {
2449 	struct amdgpu_gpu_instance *gpu_instance;
2450 	int i = 0, r;
2451 
2452 	for (i = 0; i < adev->num_ip_blocks; i++) {
2453 		if (!adev->ip_blocks[i].status.hw)
2454 			continue;
2455 		if (adev->ip_blocks[i].version->funcs->late_init) {
2456 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2457 			if (r) {
2458 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2459 					  adev->ip_blocks[i].version->funcs->name, r);
2460 				return r;
2461 			}
2462 		}
2463 		adev->ip_blocks[i].status.late_initialized = true;
2464 	}
2465 
2466 	amdgpu_ras_set_error_query_ready(adev, true);
2467 
2468 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2469 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2470 
2471 	amdgpu_device_fill_reset_magic(adev);
2472 
2473 	r = amdgpu_device_enable_mgpu_fan_boost();
2474 	if (r)
2475 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2476 
2477 
2478 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2479 		mutex_lock(&mgpu_info.mutex);
2480 
2481 		/*
2482 		 * Reset the device p-state to low, as it booted with the high p-state.
2483 		 *
2484 		 * This should be performed only after all devices from the same
2485 		 * hive have been initialized.
2486 		 *
2487 		 * However, the number of devices in the hive is not known in
2488 		 * advance; it is counted one by one as the devices initialize.
2489 		 *
2490 		 * So, we wait for all XGMI interlinked devices to be initialized.
2491 		 * This may bring some delay as those devices may come from
2492 		 * different hives. But that should be OK.
2493 		 */
2494 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2495 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2496 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2497 				if (gpu_instance->adev->flags & AMD_IS_APU)
2498 					continue;
2499 
2500 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2501 						AMDGPU_XGMI_PSTATE_MIN);
2502 				if (r) {
2503 					DRM_ERROR("pstate setting failed (%d).\n", r);
2504 					break;
2505 				}
2506 			}
2507 		}
2508 
2509 		mutex_unlock(&mgpu_info.mutex);
2510 	}
2511 
2512 	return 0;
2513 }
2514 
2515 /**
2516  * amdgpu_device_ip_fini - run fini for hardware IPs
2517  *
2518  * @adev: amdgpu_device pointer
2519  *
2520  * Main teardown pass for hardware IPs.  The list of all the hardware
2521  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2522  * are run.  hw_fini tears down the hardware associated with each IP
2523  * and sw_fini tears down any software state associated with each IP.
2524  * Returns 0 on success, negative error code on failure.
2525  */
2526 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2527 {
2528 	int i, r;
2529 
2530 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2531 		amdgpu_virt_release_ras_err_handler_data(adev);
2532 
2533 	amdgpu_ras_pre_fini(adev);
2534 
2535 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2536 		amdgpu_xgmi_remove_device(adev);
2537 
2538 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2539 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2540 
2541 	amdgpu_amdkfd_device_fini(adev);
2542 
2543 	/* need to disable SMC first */
2544 	for (i = 0; i < adev->num_ip_blocks; i++) {
2545 		if (!adev->ip_blocks[i].status.hw)
2546 			continue;
2547 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2548 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2549 			/* XXX handle errors */
2550 			if (r) {
2551 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2552 					  adev->ip_blocks[i].version->funcs->name, r);
2553 			}
2554 			adev->ip_blocks[i].status.hw = false;
2555 			break;
2556 		}
2557 	}
2558 
2559 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2560 		if (!adev->ip_blocks[i].status.hw)
2561 			continue;
2562 
2563 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2564 		/* XXX handle errors */
2565 		if (r) {
2566 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2567 				  adev->ip_blocks[i].version->funcs->name, r);
2568 		}
2569 
2570 		adev->ip_blocks[i].status.hw = false;
2571 	}
2572 
2573 
2574 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2575 		if (!adev->ip_blocks[i].status.sw)
2576 			continue;
2577 
2578 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2579 			amdgpu_ucode_free_bo(adev);
2580 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2581 			amdgpu_device_wb_fini(adev);
2582 			amdgpu_device_vram_scratch_fini(adev);
2583 			amdgpu_ib_pool_fini(adev);
2584 		}
2585 
2586 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2587 		/* XXX handle errors */
2588 		if (r) {
2589 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2590 				  adev->ip_blocks[i].version->funcs->name, r);
2591 		}
2592 		adev->ip_blocks[i].status.sw = false;
2593 		adev->ip_blocks[i].status.valid = false;
2594 	}
2595 
2596 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2597 		if (!adev->ip_blocks[i].status.late_initialized)
2598 			continue;
2599 		if (adev->ip_blocks[i].version->funcs->late_fini)
2600 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2601 		adev->ip_blocks[i].status.late_initialized = false;
2602 	}
2603 
2604 	amdgpu_ras_fini(adev);
2605 
2606 	if (amdgpu_sriov_vf(adev))
2607 		if (amdgpu_virt_release_full_gpu(adev, false))
2608 			DRM_ERROR("failed to release exclusive mode on fini\n");
2609 
2610 	return 0;
2611 }
2612 
2613 /**
2614  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2615  *
2616  * @work: work_struct.
2617  */
2618 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2619 {
2620 	struct amdgpu_device *adev =
2621 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2622 	int r;
2623 
2624 	r = amdgpu_ib_ring_tests(adev);
2625 	if (r)
2626 		DRM_ERROR("ib ring test failed (%d).\n", r);
2627 }
2628 
2629 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2630 {
2631 	struct amdgpu_device *adev =
2632 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2633 
2634 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2635 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2636 
2637 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2638 		adev->gfx.gfx_off_state = true;
2639 }
2640 
2641 /**
2642  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2643  *
2644  * @adev: amdgpu_device pointer
2645  *
2646  * Main suspend function for hardware IPs.  The list of all the hardware
2647  * IPs that make up the asic is walked, clockgating is disabled and the
2648  * suspend callbacks are run.  suspend puts the hardware and software state
2649  * in each IP into a state suitable for suspend.
2650  * Returns 0 on success, negative error code on failure.
2651  */
2652 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2653 {
2654 	int i, r;
2655 
2656 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2657 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2658 
2659 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2660 		if (!adev->ip_blocks[i].status.valid)
2661 			continue;
2662 
2663 		/* displays are handled separately */
2664 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2665 			continue;
2666 
2667 		/* XXX handle errors */
2668 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2669 		/* XXX handle errors */
2670 		if (r) {
2671 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2672 				  adev->ip_blocks[i].version->funcs->name, r);
2673 			return r;
2674 		}
2675 
2676 		adev->ip_blocks[i].status.hw = false;
2677 	}
2678 
2679 	return 0;
2680 }
2681 
2682 /**
2683  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2684  *
2685  * @adev: amdgpu_device pointer
2686  *
2687  * Main suspend function for hardware IPs.  The list of all the hardware
2688  * IPs that make up the asic is walked, clockgating is disabled and the
2689  * suspend callbacks are run.  suspend puts the hardware and software state
2690  * in each IP into a state suitable for suspend.
2691  * Returns 0 on success, negative error code on failure.
2692  */
2693 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2694 {
2695 	int i, r;
2696 
2697 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2698 		if (!adev->ip_blocks[i].status.valid)
2699 			continue;
2700 		/* displays are handled in phase1 */
2701 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2702 			continue;
2703 		/* PSP lost connection when err_event_athub occurs */
2704 		if (amdgpu_ras_intr_triggered() &&
2705 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2706 			adev->ip_blocks[i].status.hw = false;
2707 			continue;
2708 		}
2709 		/* XXX handle errors */
2710 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2711 		/* XXX handle errors */
2712 		if (r) {
2713 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2714 				  adev->ip_blocks[i].version->funcs->name, r);
2715 		}
2716 		adev->ip_blocks[i].status.hw = false;
2717 		/* handle putting the SMC in the appropriate state */
2718 		if (!amdgpu_sriov_vf(adev)) {
2719 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2720 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2721 				if (r) {
2722 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2723 							adev->mp1_state, r);
2724 					return r;
2725 				}
2726 			}
2727 		}
2728 		adev->ip_blocks[i].status.hw = false;
2729 	}
2730 
2731 	return 0;
2732 }
2733 
2734 /**
2735  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2736  *
2737  * @adev: amdgpu_device pointer
2738  *
2739  * Main suspend function for hardware IPs.  The list of all the hardware
2740  * IPs that make up the asic is walked, clockgating is disabled and the
2741  * suspend callbacks are run.  suspend puts the hardware and software state
2742  * in each IP into a state suitable for suspend.
2743  * Returns 0 on success, negative error code on failure.
2744  */
2745 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2746 {
2747 	int r;
2748 
2749 	if (amdgpu_sriov_vf(adev))
2750 		amdgpu_virt_request_full_gpu(adev, false);
2751 
2752 	r = amdgpu_device_ip_suspend_phase1(adev);
2753 	if (r)
2754 		return r;
2755 	r = amdgpu_device_ip_suspend_phase2(adev);
2756 
2757 	if (amdgpu_sriov_vf(adev))
2758 		amdgpu_virt_release_full_gpu(adev, false);
2759 
2760 	return r;
2761 }
2762 
2763 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2764 {
2765 	int i, r;
2766 
2767 	static enum amd_ip_block_type ip_order[] = {
2768 		AMD_IP_BLOCK_TYPE_GMC,
2769 		AMD_IP_BLOCK_TYPE_COMMON,
2770 		AMD_IP_BLOCK_TYPE_PSP,
2771 		AMD_IP_BLOCK_TYPE_IH,
2772 	};
2773 
2774 	for (i = 0; i < adev->num_ip_blocks; i++) {
2775 		int j;
2776 		struct amdgpu_ip_block *block;
2777 
2778 		block = &adev->ip_blocks[i];
2779 		block->status.hw = false;
2780 
2781 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2782 
2783 			if (block->version->type != ip_order[j] ||
2784 				!block->status.valid)
2785 				continue;
2786 
2787 			r = block->version->funcs->hw_init(adev);
2788 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2789 			if (r)
2790 				return r;
2791 			block->status.hw = true;
2792 		}
2793 	}
2794 
2795 	return 0;
2796 }
2797 
2798 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2799 {
2800 	int i, r;
2801 
2802 	static enum amd_ip_block_type ip_order[] = {
2803 		AMD_IP_BLOCK_TYPE_SMC,
2804 		AMD_IP_BLOCK_TYPE_DCE,
2805 		AMD_IP_BLOCK_TYPE_GFX,
2806 		AMD_IP_BLOCK_TYPE_SDMA,
2807 		AMD_IP_BLOCK_TYPE_UVD,
2808 		AMD_IP_BLOCK_TYPE_VCE,
2809 		AMD_IP_BLOCK_TYPE_VCN
2810 	};
2811 
2812 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2813 		int j;
2814 		struct amdgpu_ip_block *block;
2815 
2816 		for (j = 0; j < adev->num_ip_blocks; j++) {
2817 			block = &adev->ip_blocks[j];
2818 
2819 			if (block->version->type != ip_order[i] ||
2820 				!block->status.valid ||
2821 				block->status.hw)
2822 				continue;
2823 
2824 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2825 				r = block->version->funcs->resume(adev);
2826 			else
2827 				r = block->version->funcs->hw_init(adev);
2828 
2829 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2830 			if (r)
2831 				return r;
2832 			block->status.hw = true;
2833 		}
2834 	}
2835 
2836 	return 0;
2837 }
2838 
2839 /**
2840  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2841  *
2842  * @adev: amdgpu_device pointer
2843  *
2844  * First resume function for hardware IPs.  The list of all the hardware
2845  * IPs that make up the asic is walked and the resume callbacks are run for
2846  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2847  * after a suspend and updates the software state as necessary.  This
2848  * function is also used for restoring the GPU after a GPU reset.
2849  * Returns 0 on success, negative error code on failure.
2850  */
2851 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2852 {
2853 	int i, r;
2854 
2855 	for (i = 0; i < adev->num_ip_blocks; i++) {
2856 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2857 			continue;
2858 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2859 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2860 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2861 
2862 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2863 			if (r) {
2864 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2865 					  adev->ip_blocks[i].version->funcs->name, r);
2866 				return r;
2867 			}
2868 			adev->ip_blocks[i].status.hw = true;
2869 		}
2870 	}
2871 
2872 	return 0;
2873 }
2874 
2875 /**
2876  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2877  *
2878  * @adev: amdgpu_device pointer
2879  *
2880  * Second resume function for hardware IPs.  The list of all the hardware
2881  * IPs that make up the asic is walked and the resume callbacks are run for
2882  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2883  * functional state after a suspend and updates the software state as
2884  * necessary.  This function is also used for restoring the GPU after a GPU
2885  * reset.
2886  * Returns 0 on success, negative error code on failure.
2887  */
2888 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2889 {
2890 	int i, r;
2891 
2892 	for (i = 0; i < adev->num_ip_blocks; i++) {
2893 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2894 			continue;
2895 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2896 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2897 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2898 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2899 			continue;
2900 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2901 		if (r) {
2902 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2903 				  adev->ip_blocks[i].version->funcs->name, r);
2904 			return r;
2905 		}
2906 		adev->ip_blocks[i].status.hw = true;
2907 	}
2908 
2909 	return 0;
2910 }
2911 
2912 /**
2913  * amdgpu_device_ip_resume - run resume for hardware IPs
2914  *
2915  * @adev: amdgpu_device pointer
2916  *
2917  * Main resume function for hardware IPs.  The hardware IPs
2918  * are split into two resume functions because they are
2919  * also used in recovering from a GPU reset and some additional
2920  * steps need to be taken between them.  In this case (S3/S4) they are
2921  * run sequentially.
2922  * Returns 0 on success, negative error code on failure.
2923  */
2924 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2925 {
2926 	int r;
2927 
2928 	r = amdgpu_amdkfd_resume_iommu(adev);
2929 	if (r)
2930 		return r;
2931 
2932 	r = amdgpu_device_ip_resume_phase1(adev);
2933 	if (r)
2934 		return r;
2935 
2936 	r = amdgpu_device_fw_loading(adev);
2937 	if (r)
2938 		return r;
2939 
2940 	r = amdgpu_device_ip_resume_phase2(adev);
2941 
2942 	return r;
2943 }
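/*
 * Resume ordering mirrors init: phase 1 restores COMMON, GMC and IH so MMIO,
 * the memory controller and interrupts work again, amdgpu_device_fw_loading()
 * re-loads microcode through PSP/SMU, and phase 2 resumes the remaining
 * blocks.  GPU-reset recovery reuses the same phases with extra steps in
 * between (see the reset paths elsewhere in this file).
 */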
2944 
2945 /**
2946  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2947  *
2948  * @adev: amdgpu_device pointer
2949  *
2950  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2951  */
2952 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2953 {
2954 	if (amdgpu_sriov_vf(adev)) {
2955 		if (adev->is_atom_fw) {
2956 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2957 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2958 		} else {
2959 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2960 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2961 		}
2962 
2963 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2964 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2965 	}
2966 }
2967 
2968 /**
2969  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2970  *
2971  * @asic_type: AMD asic type
2972  *
2973  * Check if there is DC (new modesetting infrastructure) support for an asic.
2974  * returns true if DC has support, false if not.
2975  */
2976 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2977 {
2978 	switch (asic_type) {
2979 #if defined(CONFIG_DRM_AMD_DC)
2980 #if defined(CONFIG_DRM_AMD_DC_SI)
2981 	case CHIP_TAHITI:
2982 	case CHIP_PITCAIRN:
2983 	case CHIP_VERDE:
2984 	case CHIP_OLAND:
2985 #endif
2986 	case CHIP_BONAIRE:
2987 	case CHIP_KAVERI:
2988 	case CHIP_KABINI:
2989 	case CHIP_MULLINS:
2990 		/*
2991 		 * We have systems in the wild with these ASICs that require
2992 		 * LVDS and VGA support which is not supported with DC.
2993 		 *
2994 		 * Fallback to the non-DC driver here by default so as not to
2995 		 * cause regressions.
2996 		 */
2997 		return amdgpu_dc > 0;
2998 	case CHIP_HAWAII:
2999 	case CHIP_CARRIZO:
3000 	case CHIP_STONEY:
3001 	case CHIP_POLARIS10:
3002 	case CHIP_POLARIS11:
3003 	case CHIP_POLARIS12:
3004 	case CHIP_VEGAM:
3005 	case CHIP_TONGA:
3006 	case CHIP_FIJI:
3007 	case CHIP_VEGA10:
3008 	case CHIP_VEGA12:
3009 	case CHIP_VEGA20:
3010 #if defined(CONFIG_DRM_AMD_DC_DCN)
3011 	case CHIP_RAVEN:
3012 	case CHIP_NAVI10:
3013 	case CHIP_NAVI14:
3014 	case CHIP_NAVI12:
3015 	case CHIP_RENOIR:
3016 #endif
3017 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3018 	case CHIP_SIENNA_CICHLID:
3019 	case CHIP_NAVY_FLOUNDER:
3020 #endif
3021 		return amdgpu_dc != 0;
3022 #endif
3023 	default:
3024 		if (amdgpu_dc > 0)
3025 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3026 					 "but isn't supported by ASIC, ignoring\n");
3027 		return false;
3028 	}
3029 }
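/*
 * The amdgpu.dc module parameter feeds the checks above: with its usual
 * default of -1 (auto), the DCE-era ASICs in the first group only get DC when
 * the user explicitly asks for it, e.g. with something like
 *
 *	modprobe amdgpu dc=1
 *
 * while the newer ASICs use DC unless dc=0 is given.  The exact default may
 * vary between kernel versions.
 */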
3030 
3031 /**
3032  * amdgpu_device_has_dc_support - check if dc is supported
3033  *
3034  * @adev: amdgpu_device pointer
3035  *
3036  * Returns true for supported, false for not supported
3037  */
3038 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3039 {
3040 	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3041 		return false;
3042 
3043 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3044 }
3045 
3046 
3047 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3048 {
3049 	struct amdgpu_device *adev =
3050 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3051 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3052 
3053 	/* It's a bug to not have a hive within this function */
3054 	if (WARN_ON(!hive))
3055 		return;
3056 
3057 	/*
3058 	 * Use task barrier to synchronize all xgmi reset works across the
3059 	 * hive. task_barrier_enter and task_barrier_exit will block
3060 	 * until all the threads running the xgmi reset works reach
3061 	 * those points. task_barrier_full will do both blocks.
3062 	 */
3063 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3064 
3065 		task_barrier_enter(&hive->tb);
3066 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3067 
3068 		if (adev->asic_reset_res)
3069 			goto fail;
3070 
3071 		task_barrier_exit(&hive->tb);
3072 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3073 
3074 		if (adev->asic_reset_res)
3075 			goto fail;
3076 
3077 		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3078 			adev->mmhub.funcs->reset_ras_error_count(adev);
3079 	} else {
3080 
3081 		task_barrier_full(&hive->tb);
3082 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3083 	}
3084 
3085 fail:
3086 	if (adev->asic_reset_res)
3087 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3088 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3089 	amdgpu_put_xgmi_hive(hive);
3090 }
3091 
3092 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3093 {
3094 	char *input = amdgpu_lockup_timeout;
3095 	char *timeout_setting = NULL;
3096 	int index = 0;
3097 	long timeout;
3098 	int ret = 0;
3099 
3100 	/*
3101 	 * By default the timeout for non-compute jobs is 10000 ms
3102 	 * and there is no timeout enforced on compute jobs.
3103 	 * In SR-IOV or passthrough mode, the timeout for compute
3104 	 * jobs is 60000 ms by default.
3105 	 */
3106 	adev->gfx_timeout = msecs_to_jiffies(10000);
3107 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3108 	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3109 		adev->compute_timeout =  msecs_to_jiffies(60000);
3110 	else
3111 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3112 
3113 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3114 		while ((timeout_setting = strsep(&input, ",")) &&
3115 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3116 			ret = kstrtol(timeout_setting, 0, &timeout);
3117 			if (ret)
3118 				return ret;
3119 
3120 			if (timeout == 0) {
3121 				index++;
3122 				continue;
3123 			} else if (timeout < 0) {
3124 				timeout = MAX_SCHEDULE_TIMEOUT;
3125 			} else {
3126 				timeout = msecs_to_jiffies(timeout);
3127 			}
3128 
3129 			switch (index++) {
3130 			case 0:
3131 				adev->gfx_timeout = timeout;
3132 				break;
3133 			case 1:
3134 				adev->compute_timeout = timeout;
3135 				break;
3136 			case 2:
3137 				adev->sdma_timeout = timeout;
3138 				break;
3139 			case 3:
3140 				adev->video_timeout = timeout;
3141 				break;
3142 			default:
3143 				break;
3144 			}
3145 		}
3146 		/*
3147 		 * There is only one value specified and
3148 		 * it should apply to all non-compute jobs.
3149 		 */
3150 		if (index == 1) {
3151 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3152 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3153 				adev->compute_timeout = adev->gfx_timeout;
3154 		}
3155 	}
3156 
3157 	return ret;
3158 }
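/*
 * Based on the switch above, lockup_timeout takes up to four comma separated
 * values in milliseconds, applied in the order gfx, compute, sdma, video;
 * 0 keeps the default for that slot and a negative value disables the
 * timeout.  A hypothetical override of every queue type:
 *
 *	modprobe amdgpu lockup_timeout=10000,60000,10000,10000
 *
 * A single value applies to all non-compute jobs (and to compute as well
 * under SR-IOV or passthrough), as handled at the end of the function.
 */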
3159 
3160 static const struct attribute *amdgpu_dev_attributes[] = {
3161 	&dev_attr_product_name.attr,
3162 	&dev_attr_product_number.attr,
3163 	&dev_attr_serial_number.attr,
3164 	&dev_attr_pcie_replay_count.attr,
3165 	NULL
3166 };
3167 
3168 
3169 /**
3170  * amdgpu_device_init - initialize the driver
3171  *
3172  * @adev: amdgpu_device pointer
3173  * @flags: driver flags
3174  *
3175  * Initializes the driver info and hw (all asics).
3176  * Returns 0 for success or an error on failure.
3177  * Called at driver startup.
3178  */
3179 int amdgpu_device_init(struct amdgpu_device *adev,
3180 		       uint32_t flags)
3181 {
3182 	struct drm_device *ddev = adev_to_drm(adev);
3183 	struct pci_dev *pdev = adev->pdev;
3184 	int r, i;
3185 	bool boco = false;
3186 	u32 max_MBps;
3187 
3188 	adev->shutdown = false;
3189 	adev->flags = flags;
3190 
3191 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3192 		adev->asic_type = amdgpu_force_asic_type;
3193 	else
3194 		adev->asic_type = flags & AMD_ASIC_MASK;
3195 
3196 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3197 	if (amdgpu_emu_mode == 1)
3198 		adev->usec_timeout *= 10;
3199 	adev->gmc.gart_size = 512 * 1024 * 1024;
3200 	adev->accel_working = false;
3201 	adev->num_rings = 0;
3202 	adev->mman.buffer_funcs = NULL;
3203 	adev->mman.buffer_funcs_ring = NULL;
3204 	adev->vm_manager.vm_pte_funcs = NULL;
3205 	adev->vm_manager.vm_pte_num_scheds = 0;
3206 	adev->gmc.gmc_funcs = NULL;
3207 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3208 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3209 
3210 	adev->smc_rreg = &amdgpu_invalid_rreg;
3211 	adev->smc_wreg = &amdgpu_invalid_wreg;
3212 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3213 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3214 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3215 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3216 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3217 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3218 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3219 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3220 	adev->didt_rreg = &amdgpu_invalid_rreg;
3221 	adev->didt_wreg = &amdgpu_invalid_wreg;
3222 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3223 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3224 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3225 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3226 
3227 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3228 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3229 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3230 
3231 	/* mutex initializations are all done here so we
3232 	 * can recall functions without having locking issues */
3233 	atomic_set(&adev->irq.ih.lock, 0);
3234 	mutex_init(&adev->firmware.mutex);
3235 	mutex_init(&adev->pm.mutex);
3236 	mutex_init(&adev->gfx.gpu_clock_mutex);
3237 	mutex_init(&adev->srbm_mutex);
3238 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3239 	mutex_init(&adev->gfx.gfx_off_mutex);
3240 	mutex_init(&adev->grbm_idx_mutex);
3241 	mutex_init(&adev->mn_lock);
3242 	mutex_init(&adev->virt.vf_errors.lock);
3243 	hash_init(adev->mn_hash);
3244 	atomic_set(&adev->in_gpu_reset, 0);
3245 	init_rwsem(&adev->reset_sem);
3246 	mutex_init(&adev->psp.mutex);
3247 	mutex_init(&adev->notifier_lock);
3248 
3249 	r = amdgpu_device_check_arguments(adev);
3250 	if (r)
3251 		return r;
3252 
3253 	spin_lock_init(&adev->mmio_idx_lock);
3254 	spin_lock_init(&adev->smc_idx_lock);
3255 	spin_lock_init(&adev->pcie_idx_lock);
3256 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3257 	spin_lock_init(&adev->didt_idx_lock);
3258 	spin_lock_init(&adev->gc_cac_idx_lock);
3259 	spin_lock_init(&adev->se_cac_idx_lock);
3260 	spin_lock_init(&adev->audio_endpt_idx_lock);
3261 	spin_lock_init(&adev->mm_stats.lock);
3262 
3263 	INIT_LIST_HEAD(&adev->shadow_list);
3264 	mutex_init(&adev->shadow_list_lock);
3265 
3266 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3267 			  amdgpu_device_delayed_init_work_handler);
3268 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3269 			  amdgpu_device_delay_enable_gfx_off);
3270 
3271 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3272 
3273 	adev->gfx.gfx_off_req_count = 1;
3274 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3275 
3276 	atomic_set(&adev->throttling_logging_enabled, 1);
3277 	/*
3278 	 * If throttling continues, logging will be performed every minute
3279 	 * to avoid log flooding. "-1" is subtracted since the thermal
3280 	 * throttling interrupt comes every second. Thus, the total logging
3281 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3282 	 * for the throttling interrupt) = 60 seconds.
3283 	 */
3284 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3285 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3286 
3287 	/* Registers mapping */
3288 	/* TODO: block userspace mapping of io register */
3289 	if (adev->asic_type >= CHIP_BONAIRE) {
3290 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3291 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3292 	} else {
3293 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3294 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3295 	}
3296 
3297 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3298 	if (adev->rmmio == NULL) {
3299 		return -ENOMEM;
3300 	}
3301 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3302 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3303 
3304 	/* io port mapping */
3305 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3306 		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3307 			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3308 			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3309 			break;
3310 		}
3311 	}
3312 	if (adev->rio_mem == NULL)
3313 		DRM_INFO("PCI I/O BAR is not found.\n");
3314 
3315 	/* enable PCIE atomic ops */
3316 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3317 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3318 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3319 	if (r) {
3320 		adev->have_atomics_support = false;
3321 		DRM_INFO("PCIE atomic ops is not supported\n");
3322 	} else {
3323 		adev->have_atomics_support = true;
3324 	}
3325 
3326 	amdgpu_device_get_pcie_info(adev);
3327 
3328 	if (amdgpu_mcbp)
3329 		DRM_INFO("MCBP is enabled\n");
3330 
3331 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3332 		adev->enable_mes = true;
3333 
3334 	/* detect hw virtualization here */
3335 	amdgpu_detect_virtualization(adev);
3336 
3337 	r = amdgpu_device_get_job_timeout_settings(adev);
3338 	if (r) {
3339 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3340 		return r;
3341 	}
3342 
3343 	/* early init functions */
3344 	r = amdgpu_device_ip_early_init(adev);
3345 	if (r)
3346 		return r;
3347 
3348 	/* doorbell bar mapping and doorbell index init*/
3349 	amdgpu_device_doorbell_init(adev);
3350 
3351 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3352 	/* this will fail for cards that aren't VGA class devices, just
3353 	 * ignore it */
3354 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3355 
3356 	if (amdgpu_device_supports_boco(ddev))
3357 		boco = true;
3358 	if (amdgpu_has_atpx() &&
3359 	    (amdgpu_is_atpx_hybrid() ||
3360 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3361 	    !pci_is_thunderbolt_attached(adev->pdev))
3362 		vga_switcheroo_register_client(adev->pdev,
3363 					       &amdgpu_switcheroo_ops, boco);
3364 	if (boco)
3365 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3366 
3367 	if (amdgpu_emu_mode == 1) {
3368 		/* post the asic in emulation mode */
3369 		emu_soc_asic_init(adev);
3370 		goto fence_driver_init;
3371 	}
3372 
3373 	/* detect if we are with an SRIOV vbios */
3374 	amdgpu_device_detect_sriov_bios(adev);
3375 
3376 	/* check if we need to reset the asic
3377 	 *  E.g., driver was not cleanly unloaded previously, etc.
3378 	 */
3379 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3380 		r = amdgpu_asic_reset(adev);
3381 		if (r) {
3382 			dev_err(adev->dev, "asic reset on init failed\n");
3383 			goto failed;
3384 		}
3385 	}
3386 
3387 	pci_enable_pcie_error_reporting(adev->ddev.pdev);
3388 
3389 	/* Post card if necessary */
3390 	if (amdgpu_device_need_post(adev)) {
3391 		if (!adev->bios) {
3392 			dev_err(adev->dev, "no vBIOS found\n");
3393 			r = -EINVAL;
3394 			goto failed;
3395 		}
3396 		DRM_INFO("GPU posting now...\n");
3397 		r = amdgpu_device_asic_init(adev);
3398 		if (r) {
3399 			dev_err(adev->dev, "gpu post error!\n");
3400 			goto failed;
3401 		}
3402 	}
3403 
3404 	if (adev->is_atom_fw) {
3405 		/* Initialize clocks */
3406 		r = amdgpu_atomfirmware_get_clock_info(adev);
3407 		if (r) {
3408 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3409 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3410 			goto failed;
3411 		}
3412 	} else {
3413 		/* Initialize clocks */
3414 		r = amdgpu_atombios_get_clock_info(adev);
3415 		if (r) {
3416 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3417 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3418 			goto failed;
3419 		}
3420 		/* init i2c buses */
3421 		if (!amdgpu_device_has_dc_support(adev))
3422 			amdgpu_atombios_i2c_init(adev);
3423 	}
3424 
3425 fence_driver_init:
3426 	/* Fence driver */
3427 	r = amdgpu_fence_driver_init(adev);
3428 	if (r) {
3429 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3430 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3431 		goto failed;
3432 	}
3433 
3434 	/* init the mode config */
3435 	drm_mode_config_init(adev_to_drm(adev));
3436 
3437 	r = amdgpu_device_ip_init(adev);
3438 	if (r) {
3439 		/* failed in exclusive mode due to timeout */
3440 		if (amdgpu_sriov_vf(adev) &&
3441 		    !amdgpu_sriov_runtime(adev) &&
3442 		    amdgpu_virt_mmio_blocked(adev) &&
3443 		    !amdgpu_virt_wait_reset(adev)) {
3444 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3445 			/* Don't send request since VF is inactive. */
3446 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3447 			adev->virt.ops = NULL;
3448 			r = -EAGAIN;
3449 			goto failed;
3450 		}
3451 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3452 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3453 		goto failed;
3454 	}
3455 
3456 	dev_info(adev->dev,
3457 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3458 			adev->gfx.config.max_shader_engines,
3459 			adev->gfx.config.max_sh_per_se,
3460 			adev->gfx.config.max_cu_per_sh,
3461 			adev->gfx.cu_info.number);
3462 
3463 	adev->accel_working = true;
3464 
3465 	amdgpu_vm_check_compute_bug(adev);
3466 
3467 	/* Initialize the buffer migration limit. */
3468 	if (amdgpu_moverate >= 0)
3469 		max_MBps = amdgpu_moverate;
3470 	else
3471 		max_MBps = 8; /* Allow 8 MB/s. */
3472 	/* Get a log2 for easy divisions. */
3473 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3474 
3475 	amdgpu_fbdev_init(adev);
3476 
3477 	r = amdgpu_pm_sysfs_init(adev);
3478 	if (r) {
3479 		adev->pm_sysfs_en = false;
3480 		DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3481 	} else
3482 		adev->pm_sysfs_en = true;
3483 
3484 	r = amdgpu_ucode_sysfs_init(adev);
3485 	if (r) {
3486 		adev->ucode_sysfs_en = false;
3487 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3488 	} else
3489 		adev->ucode_sysfs_en = true;
3490 
3491 	if ((amdgpu_testing & 1)) {
3492 		if (adev->accel_working)
3493 			amdgpu_test_moves(adev);
3494 		else
3495 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3496 	}
3497 	if (amdgpu_benchmarking) {
3498 		if (adev->accel_working)
3499 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3500 		else
3501 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3502 	}
3503 
3504 	/*
3505 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3506 	 * Otherwise the mgpu fan boost feature will be skipped because the
3507 	 * gpu instance count will be too low.
3508 	 */
3509 	amdgpu_register_gpu_instance(adev);
3510 
3511 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3512 	 * explicit gating rather than handling it automatically.
3513 	 */
3514 	r = amdgpu_device_ip_late_init(adev);
3515 	if (r) {
3516 		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3517 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3518 		goto failed;
3519 	}
3520 
3521 	/* must succeed. */
3522 	amdgpu_ras_resume(adev);
3523 
3524 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3525 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3526 
3527 	if (amdgpu_sriov_vf(adev))
3528 		flush_delayed_work(&adev->delayed_init_work);
3529 
3530 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3531 	if (r)
3532 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3533 
3534 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3535 		r = amdgpu_pmu_init(adev);
3536 	if (r)
3537 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3538 
3539 	/* Keep the stored PCI config space at hand for restore on a sudden PCI error */
3540 	if (amdgpu_device_cache_pci_state(adev->pdev))
3541 		pci_restore_state(pdev);
3542 
3543 	return 0;
3544 
3545 failed:
3546 	amdgpu_vf_error_trans_all(adev);
3547 	if (boco)
3548 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3549 
3550 	return r;
3551 }
3552 
3553 /**
3554  * amdgpu_device_fini - tear down the driver
3555  *
3556  * @adev: amdgpu_device pointer
3557  *
3558  * Tear down the driver info (all asics).
3559  * Called at driver shutdown.
3560  */
3561 void amdgpu_device_fini(struct amdgpu_device *adev)
3562 {
3563 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3564 	flush_delayed_work(&adev->delayed_init_work);
3565 	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3566 	adev->shutdown = true;
3567 
3568 	kfree(adev->pci_state);
3569 
3570 	/* make sure the IB test has finished before entering exclusive mode
3571 	 * to avoid preemption during the IB test
3572 	 */
3573 	if (amdgpu_sriov_vf(adev)) {
3574 		amdgpu_virt_request_full_gpu(adev, false);
3575 		amdgpu_virt_fini_data_exchange(adev);
3576 	}
3577 
3578 	/* disable all interrupts */
3579 	amdgpu_irq_disable_all(adev);
3580 	if (adev->mode_info.mode_config_initialized) {
3581 		if (!amdgpu_device_has_dc_support(adev))
3582 			drm_helper_force_disable_all(adev_to_drm(adev));
3583 		else
3584 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3585 	}
3586 	amdgpu_fence_driver_fini(adev);
3587 	if (adev->pm_sysfs_en)
3588 		amdgpu_pm_sysfs_fini(adev);
3589 	amdgpu_fbdev_fini(adev);
3590 	amdgpu_device_ip_fini(adev);
3591 	release_firmware(adev->firmware.gpu_info_fw);
3592 	adev->firmware.gpu_info_fw = NULL;
3593 	adev->accel_working = false;
3594 	/* free i2c buses */
3595 	if (!amdgpu_device_has_dc_support(adev))
3596 		amdgpu_i2c_fini(adev);
3597 
3598 	if (amdgpu_emu_mode != 1)
3599 		amdgpu_atombios_fini(adev);
3600 
3601 	kfree(adev->bios);
3602 	adev->bios = NULL;
3603 	if (amdgpu_has_atpx() &&
3604 	    (amdgpu_is_atpx_hybrid() ||
3605 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3606 	    !pci_is_thunderbolt_attached(adev->pdev))
3607 		vga_switcheroo_unregister_client(adev->pdev);
3608 	if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3609 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3610 	vga_client_register(adev->pdev, NULL, NULL, NULL);
3611 	if (adev->rio_mem)
3612 		pci_iounmap(adev->pdev, adev->rio_mem);
3613 	adev->rio_mem = NULL;
3614 	iounmap(adev->rmmio);
3615 	adev->rmmio = NULL;
3616 	amdgpu_device_doorbell_fini(adev);
3617 
3618 	if (adev->ucode_sysfs_en)
3619 		amdgpu_ucode_sysfs_fini(adev);
3620 
3621 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3622 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3623 		amdgpu_pmu_fini(adev);
3624 	if (adev->mman.discovery_bin)
3625 		amdgpu_discovery_fini(adev);
3626 }
3627 
3628 
3629 /*
3630  * Suspend & resume.
3631  */
3632 /**
3633  * amdgpu_device_suspend - initiate device suspend
3634  *
3635  * @dev: drm dev pointer
3636  * @fbcon: notify the fbdev of suspend
3637  *
3638  * Puts the hw in the suspend state (all asics).
3639  * Returns 0 for success or an error on failure.
3640  * Called at driver suspend.
3641  */
3642 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3643 {
3644 	struct amdgpu_device *adev;
3645 	struct drm_crtc *crtc;
3646 	struct drm_connector *connector;
3647 	struct drm_connector_list_iter iter;
3648 	int r;
3649 
3650 	adev = drm_to_adev(dev);
3651 
3652 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3653 		return 0;
3654 
3655 	adev->in_suspend = true;
3656 	drm_kms_helper_poll_disable(dev);
3657 
3658 	if (fbcon)
3659 		amdgpu_fbdev_set_suspend(adev, 1);
3660 
3661 	cancel_delayed_work_sync(&adev->delayed_init_work);
3662 
3663 	if (!amdgpu_device_has_dc_support(adev)) {
3664 		/* turn off display hw */
3665 		drm_modeset_lock_all(dev);
3666 		drm_connector_list_iter_begin(dev, &iter);
3667 		drm_for_each_connector_iter(connector, &iter)
3668 			drm_helper_connector_dpms(connector,
3669 						  DRM_MODE_DPMS_OFF);
3670 		drm_connector_list_iter_end(&iter);
3671 		drm_modeset_unlock_all(dev);
3672 		/* unpin the front buffers and cursors */
3673 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3674 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3675 			struct drm_framebuffer *fb = crtc->primary->fb;
3676 			struct amdgpu_bo *robj;
3677 
3678 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3679 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3680 				r = amdgpu_bo_reserve(aobj, true);
3681 				if (r == 0) {
3682 					amdgpu_bo_unpin(aobj);
3683 					amdgpu_bo_unreserve(aobj);
3684 				}
3685 			}
3686 
3687 			if (fb == NULL || fb->obj[0] == NULL) {
3688 				continue;
3689 			}
3690 			robj = gem_to_amdgpu_bo(fb->obj[0]);
3691 			/* don't unpin kernel fb objects */
3692 			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3693 				r = amdgpu_bo_reserve(robj, true);
3694 				if (r == 0) {
3695 					amdgpu_bo_unpin(robj);
3696 					amdgpu_bo_unreserve(robj);
3697 				}
3698 			}
3699 		}
3700 	}
3701 
3702 	amdgpu_ras_suspend(adev);
3703 
3704 	r = amdgpu_device_ip_suspend_phase1(adev);
3705 
3706 	amdgpu_amdkfd_suspend(adev, !fbcon);
3707 
3708 	/* evict vram memory */
3709 	amdgpu_bo_evict_vram(adev);
3710 
3711 	amdgpu_fence_driver_suspend(adev);
3712 
3713 	r = amdgpu_device_ip_suspend_phase2(adev);
3714 
3715 	/* evict remaining vram memory
3716 	 * This second call to evict vram is to evict the gart page table
3717 	 * using the CPU.
3718 	 */
3719 	amdgpu_bo_evict_vram(adev);
3720 
3721 	return 0;
3722 }
3723 
3724 /**
3725  * amdgpu_device_resume - initiate device resume
3726  *
3727  * @dev: drm dev pointer
3728  * @fbcon: notify the fbdev of resume
3729  *
3730  * Bring the hw back to operating state (all asics).
3731  * Returns 0 for success or an error on failure.
3732  * Called at driver resume.
3733  */
3734 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3735 {
3736 	struct drm_connector *connector;
3737 	struct drm_connector_list_iter iter;
3738 	struct amdgpu_device *adev = drm_to_adev(dev);
3739 	struct drm_crtc *crtc;
3740 	int r = 0;
3741 
3742 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3743 		return 0;
3744 
3745 	/* post card */
3746 	if (amdgpu_device_need_post(adev)) {
3747 		r = amdgpu_device_asic_init(adev);
3748 		if (r)
3749 			dev_err(adev->dev, "amdgpu asic init failed\n");
3750 	}
3751 
3752 	r = amdgpu_device_ip_resume(adev);
3753 	if (r) {
3754 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3755 		return r;
3756 	}
3757 	amdgpu_fence_driver_resume(adev);
3758 
3759 
3760 	r = amdgpu_device_ip_late_init(adev);
3761 	if (r)
3762 		return r;
3763 
3764 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3765 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3766 
3767 	if (!amdgpu_device_has_dc_support(adev)) {
3768 		/* pin cursors */
3769 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3770 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3771 
3772 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3773 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3774 				r = amdgpu_bo_reserve(aobj, true);
3775 				if (r == 0) {
3776 					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3777 					if (r != 0)
3778 						dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3779 					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3780 					amdgpu_bo_unreserve(aobj);
3781 				}
3782 			}
3783 		}
3784 	}
3785 	r = amdgpu_amdkfd_resume(adev, !fbcon);
3786 	if (r)
3787 		return r;
3788 
3789 	/* Make sure IB tests flushed */
3790 	flush_delayed_work(&adev->delayed_init_work);
3791 
3792 	/* blat the mode back in */
3793 	if (fbcon) {
3794 		if (!amdgpu_device_has_dc_support(adev)) {
3795 			/* pre DCE11 */
3796 			drm_helper_resume_force_mode(dev);
3797 
3798 			/* turn on display hw */
3799 			drm_modeset_lock_all(dev);
3800 
3801 			drm_connector_list_iter_begin(dev, &iter);
3802 			drm_for_each_connector_iter(connector, &iter)
3803 				drm_helper_connector_dpms(connector,
3804 							  DRM_MODE_DPMS_ON);
3805 			drm_connector_list_iter_end(&iter);
3806 
3807 			drm_modeset_unlock_all(dev);
3808 		}
3809 		amdgpu_fbdev_set_suspend(adev, 0);
3810 	}
3811 
3812 	drm_kms_helper_poll_enable(dev);
3813 
3814 	amdgpu_ras_resume(adev);
3815 
3816 	/*
3817 	 * Most of the connector probing functions try to acquire runtime pm
3818 	 * refs to ensure that the GPU is powered on when connector polling is
3819 	 * performed. Since we're calling this from a runtime PM callback,
3820 	 * trying to acquire rpm refs will cause us to deadlock.
3821 	 *
3822 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3823 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3824 	 */
3825 #ifdef CONFIG_PM
3826 	dev->dev->power.disable_depth++;
3827 #endif
3828 	if (!amdgpu_device_has_dc_support(adev))
3829 		drm_helper_hpd_irq_event(dev);
3830 	else
3831 		drm_kms_helper_hotplug_event(dev);
3832 #ifdef CONFIG_PM
3833 	dev->dev->power.disable_depth--;
3834 #endif
3835 	adev->in_suspend = false;
3836 
3837 	return 0;
3838 }
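/*
 * Editor's sketch (illustrative, not part of the original file): the
 * suspend/resume pair above is normally driven from the driver's PM
 * callbacks, roughly along these lines. The callback names below are
 * hypothetical; only amdgpu_device_suspend()/amdgpu_device_resume() are
 * taken from this file.
 *
 *	static int example_pmops_suspend(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_suspend(drm_dev, true);
 *	}
 *
 *	static int example_pmops_resume(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_resume(drm_dev, true);
 *	}
 */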
3839 
3840 /**
3841  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3842  *
3843  * @adev: amdgpu_device pointer
3844  *
3845  * The list of all the hardware IPs that make up the asic is walked and
3846  * the check_soft_reset callbacks are run.  check_soft_reset determines
3847  * if the asic is still hung or not.
3848  * Returns true if any of the IPs are still in a hung state, false if not.
3849  */
3850 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3851 {
3852 	int i;
3853 	bool asic_hang = false;
3854 
3855 	if (amdgpu_sriov_vf(adev))
3856 		return true;
3857 
3858 	if (amdgpu_asic_need_full_reset(adev))
3859 		return true;
3860 
3861 	for (i = 0; i < adev->num_ip_blocks; i++) {
3862 		if (!adev->ip_blocks[i].status.valid)
3863 			continue;
3864 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3865 			adev->ip_blocks[i].status.hang =
3866 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3867 		if (adev->ip_blocks[i].status.hang) {
3868 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3869 			asic_hang = true;
3870 		}
3871 	}
3872 	return asic_hang;
3873 }
3874 
3875 /**
3876  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3877  *
3878  * @adev: amdgpu_device pointer
3879  *
3880  * The list of all the hardware IPs that make up the asic is walked and the
3881  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3882  * handles any IP specific hardware or software state changes that are
3883  * necessary for a soft reset to succeed.
3884  * Returns 0 on success, negative error code on failure.
3885  */
3886 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3887 {
3888 	int i, r = 0;
3889 
3890 	for (i = 0; i < adev->num_ip_blocks; i++) {
3891 		if (!adev->ip_blocks[i].status.valid)
3892 			continue;
3893 		if (adev->ip_blocks[i].status.hang &&
3894 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3895 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3896 			if (r)
3897 				return r;
3898 		}
3899 	}
3900 
3901 	return 0;
3902 }
3903 
3904 /**
3905  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3906  *
3907  * @adev: amdgpu_device pointer
3908  *
3909  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3910  * reset is necessary to recover.
3911  * Returns true if a full asic reset is required, false if not.
3912  */
3913 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3914 {
3915 	int i;
3916 
3917 	if (amdgpu_asic_need_full_reset(adev))
3918 		return true;
3919 
3920 	for (i = 0; i < adev->num_ip_blocks; i++) {
3921 		if (!adev->ip_blocks[i].status.valid)
3922 			continue;
3923 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3924 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3925 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3926 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3927 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3928 			if (adev->ip_blocks[i].status.hang) {
3929 				dev_info(adev->dev, "Some block needs a full reset!\n");
3930 				return true;
3931 			}
3932 		}
3933 	}
3934 	return false;
3935 }
3936 
3937 /**
3938  * amdgpu_device_ip_soft_reset - do a soft reset
3939  *
3940  * @adev: amdgpu_device pointer
3941  *
3942  * The list of all the hardware IPs that make up the asic is walked and the
3943  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3944  * IP specific hardware or software state changes that are necessary to soft
3945  * reset the IP.
3946  * Returns 0 on success, negative error code on failure.
3947  */
3948 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3949 {
3950 	int i, r = 0;
3951 
3952 	for (i = 0; i < adev->num_ip_blocks; i++) {
3953 		if (!adev->ip_blocks[i].status.valid)
3954 			continue;
3955 		if (adev->ip_blocks[i].status.hang &&
3956 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3957 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3958 			if (r)
3959 				return r;
3960 		}
3961 	}
3962 
3963 	return 0;
3964 }
3965 
3966 /**
3967  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3968  *
3969  * @adev: amdgpu_device pointer
3970  *
3971  * The list of all the hardware IPs that make up the asic is walked and the
3972  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3973  * handles any IP specific hardware or software state changes that are
3974  * necessary after the IP has been soft reset.
3975  * Returns 0 on success, negative error code on failure.
3976  */
3977 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3978 {
3979 	int i, r = 0;
3980 
3981 	for (i = 0; i < adev->num_ip_blocks; i++) {
3982 		if (!adev->ip_blocks[i].status.valid)
3983 			continue;
3984 		if (adev->ip_blocks[i].status.hang &&
3985 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
3986 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3987 		if (r)
3988 			return r;
3989 	}
3990 
3991 	return 0;
3992 }
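/*
 * Editor's note: the helpers above form the per-IP soft reset pipeline used
 * by amdgpu_device_pre_asic_reset() later in this file: check_soft_reset
 * marks hung blocks, pre_soft_reset/soft_reset/post_soft_reset are then run
 * only for those blocks, and if a block still reports a hang afterwards (or
 * cannot be soft reset at all, per amdgpu_device_ip_need_full_reset) the
 * code falls back to a full ASIC reset.
 */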
3993 
3994 /**
3995  * amdgpu_device_recover_vram - Recover some VRAM contents
3996  *
3997  * @adev: amdgpu_device pointer
3998  *
3999  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4000  * restore things like GPUVM page tables after a GPU reset where
4001  * the contents of VRAM might be lost.
4002  *
4003  * Returns:
4004  * 0 on success, negative error code on failure.
4005  */
4006 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4007 {
4008 	struct dma_fence *fence = NULL, *next = NULL;
4009 	struct amdgpu_bo *shadow;
4010 	long r = 1, tmo;
4011 
4012 	if (amdgpu_sriov_runtime(adev))
4013 		tmo = msecs_to_jiffies(8000);
4014 	else
4015 		tmo = msecs_to_jiffies(100);
4016 
4017 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4018 	mutex_lock(&adev->shadow_list_lock);
4019 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4020 
4021 		/* No need to recover an evicted BO */
4022 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4023 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4024 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4025 			continue;
4026 
4027 		r = amdgpu_bo_restore_shadow(shadow, &next);
4028 		if (r)
4029 			break;
4030 
4031 		if (fence) {
4032 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4033 			dma_fence_put(fence);
4034 			fence = next;
4035 			if (tmo == 0) {
4036 				r = -ETIMEDOUT;
4037 				break;
4038 			} else if (tmo < 0) {
4039 				r = tmo;
4040 				break;
4041 			}
4042 		} else {
4043 			fence = next;
4044 		}
4045 	}
4046 	mutex_unlock(&adev->shadow_list_lock);
4047 
4048 	if (fence)
4049 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4050 	dma_fence_put(fence);
4051 
4052 	if (r < 0 || tmo <= 0) {
4053 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4054 		return -EIO;
4055 	}
4056 
4057 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4058 	return 0;
4059 }
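/*
 * Editor's note: the restore loop above is pipelined by one buffer: the
 * restore for the next shadow BO is issued before waiting on the previous
 * restore's fence, so copy-engine work overlaps with the CPU-side waits.
 * A single timeout budget (8 s when running as an SR-IOV VF in runtime
 * mode, 100 ms otherwise) is shared across all the waits, since
 * dma_fence_wait_timeout() returns the remaining jiffies.
 */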
4060 
4061 
4062 /**
4063  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4064  *
4065  * @adev: amdgpu_device pointer
4066  * @from_hypervisor: request from hypervisor
4067  *
4068  * Do a VF FLR and reinitialize the ASIC.
4069  * Returns 0 on success, or a negative error code on failure.
4070  */
4071 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4072 				     bool from_hypervisor)
4073 {
4074 	int r;
4075 
4076 	if (from_hypervisor)
4077 		r = amdgpu_virt_request_full_gpu(adev, true);
4078 	else
4079 		r = amdgpu_virt_reset_gpu(adev);
4080 	if (r)
4081 		return r;
4082 
4083 	amdgpu_amdkfd_pre_reset(adev);
4084 
4085 	/* Resume IP prior to SMC */
4086 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4087 	if (r)
4088 		goto error;
4089 
4090 	amdgpu_virt_init_data_exchange(adev);
4091 	/* we need to recover the gart before running SMC/CP/SDMA resume */
4092 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4093 
4094 	r = amdgpu_device_fw_loading(adev);
4095 	if (r)
4096 		return r;
4097 
4098 	/* now we are okay to resume SMC/CP/SDMA */
4099 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4100 	if (r)
4101 		goto error;
4102 
4103 	amdgpu_irq_gpu_reset_resume_helper(adev);
4104 	r = amdgpu_ib_ring_tests(adev);
4105 	amdgpu_amdkfd_post_reset(adev);
4106 
4107 error:
4108 	amdgpu_virt_release_full_gpu(adev, true);
4109 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4110 		amdgpu_inc_vram_lost(adev);
4111 		r = amdgpu_device_recover_vram(adev);
4112 	}
4113 
4114 	return r;
4115 }
4116 
4117 /**
4118  * amdgpu_device_has_job_running - check if there is any job in mirror list
4119  *
4120  * @adev: amdgpu_device pointer
4121  *
4122  * check if there is any job in mirror list
4123  */
4124 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4125 {
4126 	int i;
4127 	struct drm_sched_job *job;
4128 
4129 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4130 		struct amdgpu_ring *ring = adev->rings[i];
4131 
4132 		if (!ring || !ring->sched.thread)
4133 			continue;
4134 
4135 		spin_lock(&ring->sched.job_list_lock);
4136 		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4137 				struct drm_sched_job, node);
4138 		spin_unlock(&ring->sched.job_list_lock);
4139 		if (job)
4140 			return true;
4141 	}
4142 	return false;
4143 }
4144 
4145 /**
4146  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4147  *
4148  * @adev: amdgpu_device pointer
4149  *
4150  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4151  * a hung GPU.
4152  */
4153 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4154 {
4155 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4156 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4157 		return false;
4158 	}
4159 
4160 	if (amdgpu_gpu_recovery == 0)
4161 		goto disabled;
4162 
4163 	if (amdgpu_sriov_vf(adev))
4164 		return true;
4165 
4166 	if (amdgpu_gpu_recovery == -1) {
4167 		switch (adev->asic_type) {
4168 		case CHIP_BONAIRE:
4169 		case CHIP_HAWAII:
4170 		case CHIP_TOPAZ:
4171 		case CHIP_TONGA:
4172 		case CHIP_FIJI:
4173 		case CHIP_POLARIS10:
4174 		case CHIP_POLARIS11:
4175 		case CHIP_POLARIS12:
4176 		case CHIP_VEGAM:
4177 		case CHIP_VEGA20:
4178 		case CHIP_VEGA10:
4179 		case CHIP_VEGA12:
4180 		case CHIP_RAVEN:
4181 		case CHIP_ARCTURUS:
4182 		case CHIP_RENOIR:
4183 		case CHIP_NAVI10:
4184 		case CHIP_NAVI14:
4185 		case CHIP_NAVI12:
4186 		case CHIP_SIENNA_CICHLID:
4187 			break;
4188 		default:
4189 			goto disabled;
4190 		}
4191 	}
4192 
4193 	return true;
4194 
4195 disabled:
4196 	dev_info(adev->dev, "GPU recovery disabled.\n");
4197 	return false;
4198 }
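/*
 * Editor's note: the amdgpu_gpu_recovery global checked above is exposed as
 * the driver's gpu_recovery module parameter (e.g. amdgpu.gpu_recovery=1 on
 * the kernel command line). Per the logic above: 0 disables recovery, -1
 * (auto) enables it only on the whitelisted ASICs, any other value
 * (typically 1) forces it on, and SR-IOV VFs always attempt recovery.
 */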
4199 
4200 
4201 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4202 					struct amdgpu_job *job,
4203 					bool *need_full_reset_arg)
4204 {
4205 	int i, r = 0;
4206 	bool need_full_reset  = *need_full_reset_arg;
4207 
4208 	amdgpu_debugfs_wait_dump(adev);
4209 
4210 	if (amdgpu_sriov_vf(adev)) {
4211 		/* stop the data exchange thread */
4212 		amdgpu_virt_fini_data_exchange(adev);
4213 	}
4214 
4215 	/* block all schedulers and reset given job's ring */
4216 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4217 		struct amdgpu_ring *ring = adev->rings[i];
4218 
4219 		if (!ring || !ring->sched.thread)
4220 			continue;
4221 
4222 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4223 		amdgpu_fence_driver_force_completion(ring);
4224 	}
4225 
4226 	if (job)
4227 		drm_sched_increase_karma(&job->base);
4228 
4229 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4230 	if (!amdgpu_sriov_vf(adev)) {
4231 
4232 		if (!need_full_reset)
4233 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4234 
4235 		if (!need_full_reset) {
4236 			amdgpu_device_ip_pre_soft_reset(adev);
4237 			r = amdgpu_device_ip_soft_reset(adev);
4238 			amdgpu_device_ip_post_soft_reset(adev);
4239 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4240 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4241 				need_full_reset = true;
4242 			}
4243 		}
4244 
4245 		if (need_full_reset)
4246 			r = amdgpu_device_ip_suspend(adev);
4247 
4248 		*need_full_reset_arg = need_full_reset;
4249 	}
4250 
4251 	return r;
4252 }
4253 
4254 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4255 			       struct list_head *device_list_handle,
4256 			       bool *need_full_reset_arg,
4257 			       bool skip_hw_reset)
4258 {
4259 	struct amdgpu_device *tmp_adev = NULL;
4260 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4261 	int r = 0;
4262 
4263 	/*
4264 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4265 	 * to allow proper link negotiation in FW (within 1 sec)
4266 	 */
4267 	if (!skip_hw_reset && need_full_reset) {
4268 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4269 			/* For XGMI run all resets in parallel to speed up the process */
4270 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4271 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4272 					r = -EALREADY;
4273 			} else
4274 				r = amdgpu_asic_reset(tmp_adev);
4275 
4276 			if (r) {
4277 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4278 					 r, adev_to_drm(tmp_adev)->unique);
4279 				break;
4280 			}
4281 		}
4282 
4283 		/* For XGMI wait for all resets to complete before proceeding */
4284 		if (!r) {
4285 			list_for_each_entry(tmp_adev, device_list_handle,
4286 					    gmc.xgmi.head) {
4287 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4288 					flush_work(&tmp_adev->xgmi_reset_work);
4289 					r = tmp_adev->asic_reset_res;
4290 					if (r)
4291 						break;
4292 				}
4293 			}
4294 		}
4295 	}
4296 
4297 	if (!r && amdgpu_ras_intr_triggered()) {
4298 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4299 			if (tmp_adev->mmhub.funcs &&
4300 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
4301 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4302 		}
4303 
4304 		amdgpu_ras_intr_cleared();
4305 	}
4306 
4307 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4308 		if (need_full_reset) {
4309 			/* post card */
4310 			if (amdgpu_device_asic_init(tmp_adev))
4311 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4312 
4313 			if (!r) {
4314 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4315 				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4316 				if (r)
4317 					goto out;
4318 
4319 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4320 				if (r)
4321 					goto out;
4322 
4323 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4324 				if (vram_lost) {
4325 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4326 					amdgpu_inc_vram_lost(tmp_adev);
4327 				}
4328 
4329 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4330 				if (r)
4331 					goto out;
4332 
4333 				r = amdgpu_device_fw_loading(tmp_adev);
4334 				if (r)
4335 					return r;
4336 
4337 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4338 				if (r)
4339 					goto out;
4340 
4341 				if (vram_lost)
4342 					amdgpu_device_fill_reset_magic(tmp_adev);
4343 
4344 				/*
4345 				 * Add this ASIC back as tracked since the reset
4346 				 * has already completed successfully.
4347 				 */
4348 				amdgpu_register_gpu_instance(tmp_adev);
4349 
4350 				r = amdgpu_device_ip_late_init(tmp_adev);
4351 				if (r)
4352 					goto out;
4353 
4354 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4355 
4356 				/*
4357 				 * The GPU enters a bad state once the number of
4358 				 * faulty pages flagged by ECC reaches the
4359 				 * threshold, and RAS recovery is scheduled next.
4360 				 * Check here and abort recovery if the bad page
4361 				 * threshold has indeed been exceeded, reminding
4362 				 * the user to either retire this GPU or set a
4363 				 * bigger bad_page_threshold value the next time
4364 				 * the driver is probed.
4365 				 */
4366 				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4367 					/* must succeed. */
4368 					amdgpu_ras_resume(tmp_adev);
4369 				} else {
4370 					r = -EINVAL;
4371 					goto out;
4372 				}
4373 
4374 				/* Update PSP FW topology after reset */
4375 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4376 					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4377 			}
4378 		}
4379 
4380 out:
4381 		if (!r) {
4382 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4383 			r = amdgpu_ib_ring_tests(tmp_adev);
4384 			if (r) {
4385 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4386 				need_full_reset = true;
4387 				r = -EAGAIN;
4388 				goto end;
4389 			}
4390 		}
4391 
4392 		if (!r)
4393 			r = amdgpu_device_recover_vram(tmp_adev);
4394 		else
4395 			tmp_adev->asic_reset_res = r;
4396 	}
4397 
4398 end:
4399 	*need_full_reset_arg = need_full_reset;
4400 	return r;
4401 }
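/*
 * Editor's note: after a full reset the function above brings each device
 * back in a fixed order: re-post the ASIC, resume the phase-1 IP blocks,
 * check whether VRAM contents were lost, recover the GTT/GART, reload
 * firmware, resume the phase-2 IP blocks, run late init and RAS resume
 * (bailing out if the bad-page threshold was exceeded), update the XGMI
 * topology if needed, and finally run the IB ring tests; a failed IB test
 * escalates back to another full reset attempt via -EAGAIN.
 */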
4402 
4403 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4404 				struct amdgpu_hive_info *hive)
4405 {
4406 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4407 		return false;
4408 
4409 	if (hive) {
4410 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4411 	} else {
4412 		down_write(&adev->reset_sem);
4413 	}
4414 
4415 	atomic_inc(&adev->gpu_reset_counter);
4416 	switch (amdgpu_asic_reset_method(adev)) {
4417 	case AMD_RESET_METHOD_MODE1:
4418 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4419 		break;
4420 	case AMD_RESET_METHOD_MODE2:
4421 		adev->mp1_state = PP_MP1_STATE_RESET;
4422 		break;
4423 	default:
4424 		adev->mp1_state = PP_MP1_STATE_NONE;
4425 		break;
4426 	}
4427 
4428 	return true;
4429 }
4430 
4431 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4432 {
4433 	amdgpu_vf_error_trans_all(adev);
4434 	adev->mp1_state = PP_MP1_STATE_NONE;
4435 	atomic_set(&adev->in_gpu_reset, 0);
4436 	up_write(&adev->reset_sem);
4437 }
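/*
 * Editor's note: amdgpu_device_lock_adev()/amdgpu_device_unlock_adev() pair
 * an atomic claim of adev->in_gpu_reset with a write lock on adev->reset_sem
 * (nested against the hive lock when the device is part of an XGMI hive)
 * and set the MP1 state expected by the chosen reset method. The same pair
 * is reused by the PCI error handlers near the end of this file to fence
 * off external access during recovery.
 */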
4438 
4439 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4440 {
4441 	struct pci_dev *p = NULL;
4442 
4443 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4444 			adev->pdev->bus->number, 1);
4445 	if (p) {
4446 		pm_runtime_enable(&(p->dev));
4447 		pm_runtime_resume(&(p->dev));
4448 	}
4449 
4450 	pci_dev_put(p);
4451 }
4452 
4453 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4454 {
4455 	enum amd_reset_method reset_method;
4456 	struct pci_dev *p = NULL;
4457 	u64 expires;
4458 
4459 	/*
4460 	 * For now, only BACO and mode1 reset are confirmed to
4461 	 * suffer the audio issue if the audio device is not suspended properly.
4462 	 */
4463 	reset_method = amdgpu_asic_reset_method(adev);
4464 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4465 	     (reset_method != AMD_RESET_METHOD_MODE1))
4466 		return -EINVAL;
4467 
4468 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4469 			adev->pdev->bus->number, 1);
4470 	if (!p)
4471 		return -ENODEV;
4472 
4473 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4474 	if (!expires)
4475 		/*
4476 		 * If we cannot get the audio device autosuspend delay,
4477 		 * a fixed 4s interval will be used. Since 3s is the
4478 		 * audio controller's default autosuspend delay setting,
4479 		 * the 4s used here is guaranteed to cover it.
4480 		 */
4481 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4482 
4483 	while (!pm_runtime_status_suspended(&(p->dev))) {
4484 		if (!pm_runtime_suspend(&(p->dev)))
4485 			break;
4486 
4487 		if (expires < ktime_get_mono_fast_ns()) {
4488 			dev_warn(adev->dev, "failed to suspend display audio\n");
4489 			pci_dev_put(p);
4490 			/* TODO: abort the succeeding gpu reset? */
4491 			return -ETIMEDOUT;
4492 		}
4493 	}
4494 
4495 	pm_runtime_disable(&(p->dev));
4496 
4497 	pci_dev_put(p);
4498 	return 0;
4499 }
4500 
4501 /**
4502  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4503  *
4504  * @adev: amdgpu_device pointer
4505  * @job: which job trigger hang
4506  *
4507  * Attempt to reset the GPU if it has hung (all asics).
4508  * Attempt to do soft-reset or full-reset and reinitialize Asic
4509  * Returns 0 for success or an error on failure.
4510  */
4511 
4512 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4513 			      struct amdgpu_job *job)
4514 {
4515 	struct list_head device_list, *device_list_handle =  NULL;
4516 	bool need_full_reset = false;
4517 	bool job_signaled = false;
4518 	struct amdgpu_hive_info *hive = NULL;
4519 	struct amdgpu_device *tmp_adev = NULL;
4520 	int i, r = 0;
4521 	bool need_emergency_restart = false;
4522 	bool audio_suspended = false;
4523 
4524 	/*
4525 	 * Special case: RAS triggered and full reset isn't supported
4526 	 */
4527 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4528 
4529 	/*
4530 	 * Flush RAM to disk so that after reboot
4531 	 * the user can read log and see why the system rebooted.
4532 	 */
4533 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
4534 		amdgpu_ras_get_context(adev)->reboot) {
4535 		DRM_WARN("Emergency reboot.");
4536 
4537 		ksys_sync_helper();
4538 		emergency_restart();
4539 	}
4540 
4541 	dev_info(adev->dev, "GPU %s begin!\n",
4542 		need_emergency_restart ? "jobs stop":"reset");
4543 
4544 	/*
4545 	 * Here we trylock to avoid a chain of resets executing, triggered
4546 	 * either by jobs on different adevs in an XGMI hive or by jobs on
4547 	 * different schedulers for the same device while this TO handler is running.
4548 	 * We always reset all schedulers for the device and all devices in the
4549 	 * XGMI hive, so that should take care of them too.
4550 	 */
4551 	hive = amdgpu_get_xgmi_hive(adev);
4552 	if (hive) {
4553 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4554 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4555 				job ? job->base.id : -1, hive->hive_id);
4556 			amdgpu_put_xgmi_hive(hive);
4557 			return 0;
4558 		}
4559 		mutex_lock(&hive->hive_lock);
4560 	}
4561 
4562 	/*
4563 	 * Build list of devices to reset.
4564 	 * In case we are in XGMI hive mode, resort the device list
4565 	 * to put adev in the 1st position.
4566 	 */
4567 	INIT_LIST_HEAD(&device_list);
4568 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4569 		if (!hive)
4570 			return -ENODEV;
4571 		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4572 			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4573 		device_list_handle = &hive->device_list;
4574 	} else {
4575 		list_add_tail(&adev->gmc.xgmi.head, &device_list);
4576 		device_list_handle = &device_list;
4577 	}
4578 
4579 	/* block all schedulers and reset given job's ring */
4580 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4581 		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4582 			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4583 				  job ? job->base.id : -1);
4584 			r = 0;
4585 			goto skip_recovery;
4586 		}
4587 
4588 		/*
4589 		 * Try to put the audio codec into the suspend state
4590 		 * before the gpu reset starts.
4591 		 *
4592 		 * The power domain of the graphics device is shared
4593 		 * with the AZ power domain. Without this, we may
4594 		 * change the audio hardware from behind the audio
4595 		 * driver's back and trigger audio codec errors.
4597 		 */
4598 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4599 			audio_suspended = true;
4600 
4601 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4602 
4603 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4604 
4605 		if (!amdgpu_sriov_vf(tmp_adev))
4606 			amdgpu_amdkfd_pre_reset(tmp_adev);
4607 
4608 		/*
4609 		 * Mark these ASICs to be reset as untracked first,
4610 		 * and add them back after the reset has completed.
4611 		 */
4612 		amdgpu_unregister_gpu_instance(tmp_adev);
4613 
4614 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4615 
4616 		/* disable ras on ALL IPs */
4617 		if (!need_emergency_restart &&
4618 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4619 			amdgpu_ras_suspend(tmp_adev);
4620 
4621 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4622 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4623 
4624 			if (!ring || !ring->sched.thread)
4625 				continue;
4626 
4627 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4628 
4629 			if (need_emergency_restart)
4630 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4631 		}
4632 	}
4633 
4634 	if (need_emergency_restart)
4635 		goto skip_sched_resume;
4636 
4637 	/*
4638 	 * Must check guilty signal here since after this point all old
4639 	 * HW fences are force signaled.
4640 	 *
4641 	 * job->base holds a reference to parent fence
4642 	 */
4643 	if (job && job->base.s_fence->parent &&
4644 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4645 		job_signaled = true;
4646 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4647 		goto skip_hw_reset;
4648 	}
4649 
4650 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4651 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4652 		r = amdgpu_device_pre_asic_reset(tmp_adev,
4653 						 (tmp_adev == adev) ? job : NULL,
4654 						 &need_full_reset);
4655 		/* TODO: should we stop? */
4656 		if (r) {
4657 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4658 				  r, adev_to_drm(tmp_adev)->unique);
4659 			tmp_adev->asic_reset_res = r;
4660 		}
4661 	}
4662 
4663 	/* Actual ASIC resets if needed.*/
4664 	/* TODO Implement XGMI hive reset logic for SRIOV */
4665 	if (amdgpu_sriov_vf(adev)) {
4666 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4667 		if (r)
4668 			adev->asic_reset_res = r;
4669 	} else {
4670 		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4671 		if (r && r == -EAGAIN)
4672 			goto retry;
4673 	}
4674 
4675 skip_hw_reset:
4676 
4677 	/* Post ASIC reset for all devs. */
4678 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4679 
4680 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4681 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4682 
4683 			if (!ring || !ring->sched.thread)
4684 				continue;
4685 
4686 			/* No point in resubmitting jobs if we didn't HW reset */
4687 			if (!tmp_adev->asic_reset_res && !job_signaled)
4688 				drm_sched_resubmit_jobs(&ring->sched);
4689 
4690 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4691 		}
4692 
4693 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4694 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4695 		}
4696 
4697 		tmp_adev->asic_reset_res = 0;
4698 
4699 		if (r) {
4700 			/* bad news, how to tell it to userspace ? */
4701 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4702 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4703 		} else {
4704 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4705 		}
4706 	}
4707 
4708 skip_sched_resume:
4709 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4710 		/* unlock kfd: SRIOV would do it separately */
4711 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4712 			amdgpu_amdkfd_post_reset(tmp_adev);
4713 		if (audio_suspended)
4714 			amdgpu_device_resume_display_audio(tmp_adev);
4715 		amdgpu_device_unlock_adev(tmp_adev);
4716 	}
4717 
4718 skip_recovery:
4719 	if (hive) {
4720 		atomic_set(&hive->in_reset, 0);
4721 		mutex_unlock(&hive->hive_lock);
4722 		amdgpu_put_xgmi_hive(hive);
4723 	}
4724 
4725 	if (r)
4726 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4727 	return r;
4728 }
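/*
 * Editor's note: at a high level the recovery path above is: lock every
 * affected device (the whole hive for XGMI), suspend display audio, stop
 * the schedulers and mark the guilty job, run the per-device pre-reset
 * step, then either an SR-IOV VF reset or amdgpu_do_asic_reset() for bare
 * metal, and finally resubmit jobs, restart the schedulers, resume audio
 * and unlock. If the guilty job's hardware fence already signaled, the
 * hardware reset itself is skipped.
 */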
4729 
4730 /**
4731  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4732  *
4733  * @adev: amdgpu_device pointer
4734  *
4735  * Fetches and stores in the driver the PCIE capabilities (gen speed
4736  * and lanes) of the slot the device is in. Handles APUs and
4737  * virtualized environments where PCIE config space may not be available.
4738  */
4739 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4740 {
4741 	struct pci_dev *pdev;
4742 	enum pci_bus_speed speed_cap, platform_speed_cap;
4743 	enum pcie_link_width platform_link_width;
4744 
4745 	if (amdgpu_pcie_gen_cap)
4746 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4747 
4748 	if (amdgpu_pcie_lane_cap)
4749 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4750 
4751 	/* covers APUs as well */
4752 	if (pci_is_root_bus(adev->pdev->bus)) {
4753 		if (adev->pm.pcie_gen_mask == 0)
4754 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4755 		if (adev->pm.pcie_mlw_mask == 0)
4756 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4757 		return;
4758 	}
4759 
4760 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4761 		return;
4762 
4763 	pcie_bandwidth_available(adev->pdev, NULL,
4764 				 &platform_speed_cap, &platform_link_width);
4765 
4766 	if (adev->pm.pcie_gen_mask == 0) {
4767 		/* asic caps */
4768 		pdev = adev->pdev;
4769 		speed_cap = pcie_get_speed_cap(pdev);
4770 		if (speed_cap == PCI_SPEED_UNKNOWN) {
4771 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4772 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4773 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4774 		} else {
4775 			if (speed_cap == PCIE_SPEED_16_0GT)
4776 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4777 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4778 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4779 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4780 			else if (speed_cap == PCIE_SPEED_8_0GT)
4781 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4782 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4783 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4784 			else if (speed_cap == PCIE_SPEED_5_0GT)
4785 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4786 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4787 			else
4788 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4789 		}
4790 		/* platform caps */
4791 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4792 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4793 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4794 		} else {
4795 			if (platform_speed_cap == PCIE_SPEED_16_0GT)
4796 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4797 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4798 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4799 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4800 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4801 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4802 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4803 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4804 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4805 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4806 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4807 			else
4808 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4809 
4810 		}
4811 	}
4812 	if (adev->pm.pcie_mlw_mask == 0) {
4813 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4814 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4815 		} else {
4816 			switch (platform_link_width) {
4817 			case PCIE_LNK_X32:
4818 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4819 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4820 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4821 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4822 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4823 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4824 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4825 				break;
4826 			case PCIE_LNK_X16:
4827 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4828 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4829 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4830 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4831 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4832 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4833 				break;
4834 			case PCIE_LNK_X12:
4835 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4836 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4837 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4838 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4839 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4840 				break;
4841 			case PCIE_LNK_X8:
4842 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4843 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4844 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4845 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4846 				break;
4847 			case PCIE_LNK_X4:
4848 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4849 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4850 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4851 				break;
4852 			case PCIE_LNK_X2:
4853 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4854 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4855 				break;
4856 			case PCIE_LNK_X1:
4857 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4858 				break;
4859 			default:
4860 				break;
4861 			}
4862 		}
4863 	}
4864 }
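/*
 * Editor's note: the amdgpu_pcie_gen_cap and amdgpu_pcie_lane_cap globals
 * honored at the top of this function let the computed masks be overridden
 * from the corresponding module parameters (e.g. amdgpu.pcie_gen_cap=0x4 on
 * the kernel command line; the example value here is only illustrative).
 * The resulting CAIL_* masks are consumed elsewhere by the power-management
 * code when selecting PCIe DPM levels.
 */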
4865 
4866 int amdgpu_device_baco_enter(struct drm_device *dev)
4867 {
4868 	struct amdgpu_device *adev = drm_to_adev(dev);
4869 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4870 
4871 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4872 		return -ENOTSUPP;
4873 
4874 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4875 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4876 
4877 	return amdgpu_dpm_baco_enter(adev);
4878 }
4879 
4880 int amdgpu_device_baco_exit(struct drm_device *dev)
4881 {
4882 	struct amdgpu_device *adev = drm_to_adev(dev);
4883 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4884 	int ret = 0;
4885 
4886 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4887 		return -ENOTSUPP;
4888 
4889 	ret = amdgpu_dpm_baco_exit(adev);
4890 	if (ret)
4891 		return ret;
4892 
4893 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4894 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4895 
4896 	return 0;
4897 }
4898 
4899 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4900 {
4901 	int i;
4902 
4903 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4904 		struct amdgpu_ring *ring = adev->rings[i];
4905 
4906 		if (!ring || !ring->sched.thread)
4907 			continue;
4908 
4909 		cancel_delayed_work_sync(&ring->sched.work_tdr);
4910 	}
4911 }
4912 
4913 /**
4914  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4915  * @pdev: PCI device struct
4916  * @state: PCI channel state
4917  *
4918  * Description: Called when a PCI error is detected.
4919  *
4920  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4921  */
4922 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4923 {
4924 	struct drm_device *dev = pci_get_drvdata(pdev);
4925 	struct amdgpu_device *adev = drm_to_adev(dev);
4926 	int i;
4927 
4928 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4929 
4930 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4931 		DRM_WARN("No support for XGMI hive yet...");
4932 		return PCI_ERS_RESULT_DISCONNECT;
4933 	}
4934 
4935 	switch (state) {
4936 	case pci_channel_io_normal:
4937 		return PCI_ERS_RESULT_CAN_RECOVER;
4938 	/* Fatal error, prepare for slot reset */
4939 	case pci_channel_io_frozen:
4940 		/*
4941 		 * Cancel and wait for all TDRs in progress if we fail to
4942 		 * set adev->in_gpu_reset in amdgpu_device_lock_adev.
4943 		 *
4944 		 * Locking adev->reset_sem will prevent any external access
4945 		 * to GPU during PCI error recovery
4946 		 */
4947 		while (!amdgpu_device_lock_adev(adev, NULL))
4948 			amdgpu_cancel_all_tdr(adev);
4949 
4950 		/*
4951 		 * Block any work scheduling as we do for regular GPU reset
4952 		 * for the duration of the recovery
4953 		 */
4954 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4955 			struct amdgpu_ring *ring = adev->rings[i];
4956 
4957 			if (!ring || !ring->sched.thread)
4958 				continue;
4959 
4960 			drm_sched_stop(&ring->sched, NULL);
4961 		}
4962 		return PCI_ERS_RESULT_NEED_RESET;
4963 	case pci_channel_io_perm_failure:
4964 		/* Permanent error, prepare for device removal */
4965 		return PCI_ERS_RESULT_DISCONNECT;
4966 	}
4967 
4968 	return PCI_ERS_RESULT_NEED_RESET;
4969 }
4970 
4971 /**
4972  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4973  * @pdev: pointer to PCI device
4974  */
4975 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4976 {
4977 
4978 	DRM_INFO("PCI error: mmio enabled callback!!\n");
4979 
4980 	/* TODO - dump whatever for debugging purposes */
4981 
4982 	/* This is called only if amdgpu_pci_error_detected returns
4983 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4984 	 * works, no need to reset slot.
4985 	 */
4986 
4987 	return PCI_ERS_RESULT_RECOVERED;
4988 }
4989 
4990 /**
4991  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4992  * @pdev: PCI device struct
4993  *
4994  * Description: This routine is called by the pci error recovery
4995  * code after the PCI slot has been reset, just before we
4996  * should resume normal operations.
4997  */
4998 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4999 {
5000 	struct drm_device *dev = pci_get_drvdata(pdev);
5001 	struct amdgpu_device *adev = drm_to_adev(dev);
5002 	int r, i;
5003 	bool need_full_reset = true;
5004 	u32 memsize;
5005 	struct list_head device_list;
5006 
5007 	DRM_INFO("PCI error: slot reset callback!!\n");
5008 
5009 	INIT_LIST_HEAD(&device_list);
5010 	list_add_tail(&adev->gmc.xgmi.head, &device_list);
5011 
5012 	/* wait for asic to come out of reset */
5013 	msleep(500);
5014 
5015 	/* Restore PCI confspace */
5016 	amdgpu_device_load_pci_state(pdev);
5017 
5018 	/* confirm ASIC came out of reset */
5019 	for (i = 0; i < adev->usec_timeout; i++) {
5020 		memsize = amdgpu_asic_get_config_memsize(adev);
5021 
5022 		if (memsize != 0xffffffff)
5023 			break;
5024 		udelay(1);
5025 	}
5026 	if (memsize == 0xffffffff) {
5027 		r = -ETIME;
5028 		goto out;
5029 	}
5030 
5031 	adev->in_pci_err_recovery = true;
5032 	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5033 	adev->in_pci_err_recovery = false;
5034 	if (r)
5035 		goto out;
5036 
5037 	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5038 
5039 out:
5040 	if (!r) {
5041 		if (amdgpu_device_cache_pci_state(adev->pdev))
5042 			pci_restore_state(adev->pdev);
5043 
5044 		DRM_INFO("PCIe error recovery succeeded\n");
5045 	} else {
5046 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5047 		amdgpu_device_unlock_adev(adev);
5048 	}
5049 
5050 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5051 }
5052 
5053 /**
5054  * amdgpu_pci_resume() - resume normal ops after PCI reset
5055  * @pdev: pointer to PCI device
5056  *
5057  * Called when the error recovery driver tells us that it's
5058  * OK to resume normal operation. Use completion to allow
5059  * halted operations to resume.
5060  */
5061 void amdgpu_pci_resume(struct pci_dev *pdev)
5062 {
5063 	struct drm_device *dev = pci_get_drvdata(pdev);
5064 	struct amdgpu_device *adev = drm_to_adev(dev);
5065 	int i;
5066 
5067 
5068 	DRM_INFO("PCI error: resume callback!!\n");
5069 
5070 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5071 		struct amdgpu_ring *ring = adev->rings[i];
5072 
5073 		if (!ring || !ring->sched.thread)
5074 			continue;
5075 
5076 
5077 		drm_sched_resubmit_jobs(&ring->sched);
5078 		drm_sched_start(&ring->sched, true);
5079 	}
5080 
5081 	amdgpu_device_unlock_adev(adev);
5082 }
5083 
5084 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5085 {
5086 	struct drm_device *dev = pci_get_drvdata(pdev);
5087 	struct amdgpu_device *adev = drm_to_adev(dev);
5088 	int r;
5089 
5090 	r = pci_save_state(pdev);
5091 	if (!r) {
5092 		kfree(adev->pci_state);
5093 
5094 		adev->pci_state = pci_store_saved_state(pdev);
5095 
5096 		if (!adev->pci_state) {
5097 			DRM_ERROR("Failed to store PCI saved state");
5098 			return false;
5099 		}
5100 	} else {
5101 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5102 		return false;
5103 	}
5104 
5105 	return true;
5106 }
5107 
5108 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5109 {
5110 	struct drm_device *dev = pci_get_drvdata(pdev);
5111 	struct amdgpu_device *adev = drm_to_adev(dev);
5112 	int r;
5113 
5114 	if (!adev->pci_state)
5115 		return false;
5116 
5117 	r = pci_load_saved_state(pdev, adev->pci_state);
5118 
5119 	if (!r) {
5120 		pci_restore_state(pdev);
5121 	} else {
5122 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5123 		return false;
5124 	}
5125 
5126 	return true;
5127 }
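/*
 * Editor's note: amdgpu_device_cache_pci_state() and
 * amdgpu_device_load_pci_state() are the two halves of the PCI error
 * recovery handling in this file: the config space is cached at the end of
 * device init (and again after a successful slot reset), and
 * amdgpu_pci_slot_reset() reloads it before attempting the ASIC reset, so
 * the device comes back with a sane config space even after a fatal bus
 * error.
 */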
5128 
5129 
5130