/*
 * Copyright 2008 Advanced Micro Devices, Inc.
 * Copyright 2008 Red Hat Inc.
 * Copyright 2009 Jerome Glisse.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors: Dave Airlie
 *          Alex Deucher
 *          Jerome Glisse
 */
#include <linux/power_supply.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/console.h>
#include <linux/slab.h>

#include <drm/drm_atomic_helper.h>
#include <drm/drm_probe_helper.h>
#include <drm/amdgpu_drm.h>
#include <linux/vgaarb.h>
#include <linux/vga_switcheroo.h>
#include <linux/efi.h>
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_i2c.h"
#include "atom.h"
#include "amdgpu_atombios.h"
#include "amdgpu_atomfirmware.h"
#include "amd_pcie.h"
#ifdef CONFIG_DRM_AMDGPU_SI
#include "si.h"
#endif
#ifdef CONFIG_DRM_AMDGPU_CIK
#include "cik.h"
#endif
#include "vi.h"
#include "soc15.h"
#include "nv.h"
#include "bif/bif_4_1_d.h"
#include <linux/pci.h>
#include <linux/firmware.h>
#include "amdgpu_vf_error.h"

#include "amdgpu_amdkfd.h"
#include "amdgpu_pm.h"

#include "amdgpu_xgmi.h"
#include "amdgpu_ras.h"
#include "amdgpu_pmu.h"
#include "amdgpu_fru_eeprom.h"

#include <linux/suspend.h>
#include <drm/task_barrier.h>
#include <linux/pm_runtime.h>

MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");

#define AMDGPU_RESUME_MS		2000
const char *amdgpu_asic_name[] = {
	"TAHITI",
	"PITCAIRN",
	"VERDE",
	"OLAND",
	"HAINAN",
	"BONAIRE",
	"KAVERI",
	"KABINI",
	"HAWAII",
	"MULLINS",
	"TOPAZ",
	"TONGA",
	"FIJI",
	"CARRIZO",
	"STONEY",
	"POLARIS10",
	"POLARIS11",
	"POLARIS12",
	"VEGAM",
	"VEGA10",
	"VEGA12",
	"VEGA20",
	"RAVEN",
	"ARCTURUS",
	"RENOIR",
	"NAVI10",
	"NAVI14",
	"NAVI12",
	"SIENNA_CICHLID",
	"NAVY_FLOUNDER",
	"LAST",
};

/**
 * DOC: pcie_replay_count
 *
 * The amdgpu driver provides a sysfs API for reporting the total number
 * of PCIe replays (NAKs).
 * The file pcie_replay_count is used for this and returns the total
 * number of replays as a sum of the NAKs generated and NAKs received.
 */

static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);

	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
}

static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
		amdgpu_device_get_pcie_replay_count, NULL);
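
/*
 * Example (illustrative, assuming the usual sysfs layout for a PCI DRM
 * device): the counter can be read from userspace with e.g.
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 *   0
 */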

static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);

/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_name(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
}

static DEVICE_ATTR(product_name, S_IRUGO,
		amdgpu_device_get_product_name, NULL);

/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_product_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
}

static DEVICE_ATTR(product_number, S_IRUGO,
		amdgpu_device_get_product_number, NULL);

/**
 * DOC: serial_number
 *
 * The amdgpu driver provides a sysfs API for reporting the serial number
 * for the device.
 * The file serial_number is used for this and returns the serial number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards
 */

static ssize_t amdgpu_device_get_serial_number(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct drm_device *ddev = dev_get_drvdata(dev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
}

static DEVICE_ATTR(serial_number, S_IRUGO,
		amdgpu_device_get_serial_number, NULL);
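
/*
 * Example (illustrative, assuming the usual sysfs layout; these files are
 * only populated on FRU-capable server cards):
 *
 *   $ cat /sys/class/drm/card0/device/product_name
 *   $ cat /sys/class/drm/card0/device/product_number
 *   $ cat /sys/class/drm/card0/device/serial_number
 */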

/**
 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device is a dGPU with HG/PX power control,
 * otherwise return false.
 */
bool amdgpu_device_supports_boco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	if (adev->flags & AMD_IS_PX)
		return true;
	return false;
}

/**
 * amdgpu_device_supports_baco - Does the device support BACO
 *
 * @dev: drm_device pointer
 *
 * Returns true if the device supports BACO,
 * otherwise return false.
 */
bool amdgpu_device_supports_baco(struct drm_device *dev)
{
	struct amdgpu_device *adev = drm_to_adev(dev);

	return amdgpu_asic_supports_baco(adev);
}

/*
 * VRAM access helper functions
 */

/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must be at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
			       uint32_t *buf, size_t size, bool write)
{
	unsigned long flags;
	uint32_t hi = ~0;
	uint64_t last;

#ifdef CONFIG_64BIT
	last = min(pos + size, adev->gmc.visible_vram_size);
	if (last > pos) {
		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
		size_t count = last - pos;

		if (write) {
			memcpy_toio(addr, buf, count);
			mb();
			amdgpu_asic_flush_hdp(adev, NULL);
		} else {
			amdgpu_asic_invalidate_hdp(adev, NULL);
			mb();
			memcpy_fromio(buf, addr, count);
		}

		if (count == size)
			return;

		pos += count;
		buf += count / 4;
		size -= count;
	}
#endif

	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
	for (last = pos + size; pos < last; pos += 4) {
		uint32_t tmp = pos >> 31;

		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
		if (tmp != hi) {
			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
			hi = tmp;
		}
		if (write)
			WREG32_NO_KIQ(mmMM_DATA, *buf++);
		else
			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
	}
	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}
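
/*
 * Example (illustrative sketch): read the first 256 bytes of VRAM into a
 * dword buffer. @buf must hold at least @size bytes, and @size is in bytes:
 *
 *   uint32_t data[64];
 *
 *   amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */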

/*
 * register access helper functions.
 */
/**
 * amdgpu_device_rreg - read a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @acc_flags: access flags which require special behavior
 *
 * Returns the 32 bit value from the offset specified.
 */
uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
			    uint32_t reg, uint32_t acc_flags)
{
	uint32_t ret;

	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			ret = amdgpu_kiq_rreg(adev, reg);
			up_read(&adev->reset_sem);
		} else {
			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		ret = adev->pcie_rreg(adev, reg * 4);
	}

	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);

	return ret;
}
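
/*
 * Example (illustrative): most callers go through the RREG32()/
 * RREG32_NO_KIQ() wrappers rather than calling this directly; passing
 * AMDGPU_REGS_NO_KIQ skips the KIQ path under SR-IOV:
 *
 *   val = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);
 */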

/*
 * MMIO register read helper with a byte offset
 * @offset: byte offset from the start of MMIO space
 */

/**
 * amdgpu_mm_rreg8 - read a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 *
 * Returns the 8 bit value from the offset specified.
 */
uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (offset < adev->rmmio_size)
		return (readb(adev->rmmio + offset));
	BUG();
}

/*
 * MMIO register write helper with a byte offset
 * @offset: byte offset from the start of MMIO space
 * @value: the value to be written to the register
 */
/**
 * amdgpu_mm_wreg8 - write a memory mapped IO register
 *
 * @adev: amdgpu_device pointer
 * @offset: byte aligned register offset
 * @value: 8 bit value to write
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
{
	if (adev->in_pci_err_recovery)
		return;

	if (offset < adev->rmmio_size)
		writeb(value, adev->rmmio + offset);
	else
		BUG();
}

/**
 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 * @acc_flags: access flags which require special behavior
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_device_wreg(struct amdgpu_device *adev,
			uint32_t reg, uint32_t v,
			uint32_t acc_flags)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rmmio_size) {
		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
		    amdgpu_sriov_runtime(adev) &&
		    down_read_trylock(&adev->reset_sem)) {
			amdgpu_kiq_wreg(adev, reg, v);
			up_read(&adev->reset_sem);
		} else {
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
		}
	} else {
		adev->pcie_wreg(adev, reg * 4, v);
	}

	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
}

/*
 * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the
 * RLC path if the register is in the RLC access range
 *
 * This function is invoked only for debugfs register access.
 */
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
			     uint32_t reg, uint32_t v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (amdgpu_sriov_fullaccess(adev) &&
	    adev->gfx.rlc.funcs &&
	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
	} else {
		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}
}

/**
 * amdgpu_io_rreg - read an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 *
 * Returns the 32 bit value from the offset specified.
 */
u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if ((reg * 4) < adev->rio_mem_size) {
		return ioread32(adev->rio_mem + (reg * 4));
	} else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_io_wreg - write to an IO register
 *
 * @adev: amdgpu_device pointer
 * @reg: dword aligned register offset
 * @v: 32 bit value to write to the register
 *
 * Writes the value specified to the offset specified.
 */
void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if ((reg * 4) < adev->rio_mem_size) {
		iowrite32(v, adev->rio_mem + (reg * 4));
	} else {
		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
	}
}

/**
 * amdgpu_mm_rdoorbell - read a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (CIK).
 */
u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return readl(adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell - write a doorbell dword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (CIK).
 */
void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		writel(v, adev->doorbell.ptr + index);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 *
 * Returns the value in the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
{
	if (adev->in_pci_err_recovery)
		return 0;

	if (index < adev->doorbell.num_doorbells) {
		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
	} else {
		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
		return 0;
	}
}

/**
 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 *
 * @adev: amdgpu_device pointer
 * @index: doorbell index
 * @v: value to write
 *
 * Writes @v to the doorbell aperture at the
 * requested doorbell index (VEGA10+).
 */
void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
{
	if (adev->in_pci_err_recovery)
		return;

	if (index < adev->doorbell.num_doorbells) {
		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
	} else {
		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
	}
}

/**
 * amdgpu_device_indirect_rreg - read an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset for the index register
 * @pcie_data: mmio register offset for the data register
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
				u32 pcie_index, u32 pcie_data,
				u32 reg_addr)
{
	unsigned long flags;
	u32 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_rreg64 - read a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset for the index register
 * @pcie_data: mmio register offset for the data register
 * @reg_addr: indirect register address to read from
 *
 * Returns the value of indirect register @reg_addr
 */
u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
				  u32 pcie_index, u32 pcie_data,
				  u32 reg_addr)
{
	unsigned long flags;
	u64 r;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* read low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	r = readl(pcie_data_offset);
	/* read high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	r |= ((u64)readl(pcie_data_offset) << 32);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);

	return r;
}

/**
 * amdgpu_device_indirect_wreg - write an indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset for the index register
 * @pcie_data: mmio register offset for the data register
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 */
void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
				 u32 pcie_index, u32 pcie_data,
				 u32 reg_addr, u32 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel(reg_data, pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}

/**
 * amdgpu_device_indirect_wreg64 - write a 64 bit indirect register
 *
 * @adev: amdgpu_device pointer
 * @pcie_index: mmio register offset for the index register
 * @pcie_data: mmio register offset for the data register
 * @reg_addr: indirect register offset
 * @reg_data: indirect register data
 */
void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
				   u32 pcie_index, u32 pcie_data,
				   u32 reg_addr, u64 reg_data)
{
	unsigned long flags;
	void __iomem *pcie_index_offset;
	void __iomem *pcie_data_offset;

	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;

	/* write low 32 bits */
	writel(reg_addr, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
	readl(pcie_data_offset);
	/* write high 32 bits */
	writel(reg_addr + 4, pcie_index_offset);
	readl(pcie_index_offset);
	writel((u32)(reg_data >> 32), pcie_data_offset);
	readl(pcie_data_offset);
	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
}
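
/*
 * Example (illustrative sketch): an asic's pcie_rreg callback can be built
 * on these helpers. The index/data offsets below are placeholders; real
 * values come from the asic's register headers or nbio callbacks:
 *
 *   static u32 soc_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           return amdgpu_device_indirect_rreg(adev, pcie_index_offs,
 *                                              pcie_data_offs, reg);
 *   }
 */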

/**
 * amdgpu_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
	BUG();
	return 0;
}

/**
 * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
 *
 * @adev: amdgpu_device pointer
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
{
	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
		  reg, v);
	BUG();
}

/**
 * amdgpu_block_invalid_rreg - dummy reg read function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 *
 * Dummy register read function.  Used for register blocks
 * that certain asics don't have (all asics).
 * Returns the value in the register.
 */
static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
					  uint32_t block, uint32_t reg)
{
	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
		  reg, block);
	BUG();
	return 0;
}

/**
 * amdgpu_block_invalid_wreg - dummy reg write function
 *
 * @adev: amdgpu_device pointer
 * @block: offset of instance
 * @reg: offset of register
 * @v: value to write to the register
 *
 * Dummy register write function.  Used for register blocks
 * that certain asics don't have (all asics).
 */
static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
				      uint32_t block,
				      uint32_t reg, uint32_t v)
{
	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
		  reg, block, v);
	BUG();
}

/**
 * amdgpu_device_asic_init - Wrapper for atom asic_init
 *
 * @adev: amdgpu_device pointer
 *
 * Does any asic specific work and then calls atom asic init.
 */
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
{
	amdgpu_asic_pre_asic_init(adev);

	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
}

/**
 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Allocates a scratch page of VRAM for use by various things in the
 * driver.
 */
static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
{
	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
				       &adev->vram_scratch.robj,
				       &adev->vram_scratch.gpu_addr,
				       (void **)&adev->vram_scratch.ptr);
}

/**
 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 *
 * @adev: amdgpu_device pointer
 *
 * Frees the VRAM scratch page.
 */
static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
{
	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
}

/**
 * amdgpu_device_program_register_sequence - program an array of registers.
 *
 * @adev: amdgpu_device pointer
 * @registers: pointer to the register array
 * @array_size: size of the register array
 *
 * Programs an array of registers with AND and OR masks.
 * This is a helper for setting golden registers.
 */
void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
					     const u32 *registers,
					     const u32 array_size)
{
	u32 tmp, reg, and_mask, or_mask;
	int i;

	if (array_size % 3)
		return;

	for (i = 0; i < array_size; i += 3) {
		reg = registers[i + 0];
		and_mask = registers[i + 1];
		or_mask = registers[i + 2];

		if (and_mask == 0xffffffff) {
			tmp = or_mask;
		} else {
			tmp = RREG32(reg);
			tmp &= ~and_mask;
			if (adev->family >= AMDGPU_FAMILY_AI)
				tmp |= (or_mask & and_mask);
			else
				tmp |= or_mask;
		}
		WREG32(reg, tmp);
	}
}
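
/*
 * Example (illustrative sketch): golden settings come in {reg, and_mask,
 * or_mask} triples; the register names here are placeholders:
 *
 *   static const u32 golden_settings_example[] = {
 *           mmREG_A, 0xffffffff, 0x00000100,   (full overwrite)
 *           mmREG_B, 0x0000000f, 0x00000002,   (RMW of the low nibble)
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *                                           ARRAY_SIZE(golden_settings_example));
 */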

/**
 * amdgpu_device_pci_config_reset - reset the GPU
 *
 * @adev: amdgpu_device pointer
 *
 * Resets the GPU using the pci config reset sequence.
 * Only applicable to asics prior to vega10.
 */
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
{
	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
}

/*
 * GPU doorbell aperture helper functions.
 */
/**
 * amdgpu_device_doorbell_init - Init doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Init doorbell driver information (CIK)
 * Returns 0 on success, error on failure.
 */
static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
{
	/* No doorbell on SI hardware generation */
	if (adev->asic_type < CHIP_BONAIRE) {
		adev->doorbell.base = 0;
		adev->doorbell.size = 0;
		adev->doorbell.num_doorbells = 0;
		adev->doorbell.ptr = NULL;
		return 0;
	}

	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
		return -EINVAL;

	amdgpu_asic_init_doorbell_index(adev);

	/* doorbell bar mapping */
	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
	adev->doorbell.size = pci_resource_len(adev->pdev, 2);

	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
					     adev->doorbell_index.max_assignment + 1);
	if (adev->doorbell.num_doorbells == 0)
		return -EINVAL;

	/* For Vega, reserve and map two pages on the doorbell BAR since the
	 * SDMA paging queue doorbells use the second page. The
	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
	 * doorbells are in the first page, so with the paging queue enabled
	 * num_doorbells needs one additional page (0x400 dwords).
	 */
	if (adev->asic_type >= CHIP_VEGA10)
		adev->doorbell.num_doorbells += 0x400;

	adev->doorbell.ptr = ioremap(adev->doorbell.base,
				     adev->doorbell.num_doorbells *
				     sizeof(u32));
	if (adev->doorbell.ptr == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK)
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
	iounmap(adev->doorbell.ptr);
	adev->doorbell.ptr = NULL;
}

/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */

/**
 * amdgpu_device_wb_fini - Disable Writeback and free memory
 *
 * @adev: amdgpu_device pointer
 *
 * Disables Writeback and frees the Writeback memory (all asics).
 * Used at driver shutdown.
 */
static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
{
	if (adev->wb.wb_obj) {
		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
				      &adev->wb.gpu_addr,
				      (void **)&adev->wb.wb);
		adev->wb.wb_obj = NULL;
	}
}

/**
 * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
 *
 * @adev: amdgpu_device pointer
 *
 * Initializes writeback and allocates writeback memory (all asics).
 * Used at driver startup.
 * Returns 0 on success or a negative error code on failure.
 */
static int amdgpu_device_wb_init(struct amdgpu_device *adev)
{
	int r;

	if (adev->wb.wb_obj == NULL) {
		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
					    (void **)&adev->wb.wb);
		if (r) {
			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
			return r;
		}

		adev->wb.num_wb = AMDGPU_MAX_WB;
		memset(&adev->wb.used, 0, sizeof(adev->wb.used));

		/* clear wb memory */
		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
	}

	return 0;
}

/**
 * amdgpu_device_wb_get - Allocate a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Allocate a wb slot for use by the driver (all asics).
 * Returns 0 on success or -EINVAL on failure.
 */
int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
{
	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);

	if (offset < adev->wb.num_wb) {
		__set_bit(offset, adev->wb.used);
		*wb = offset << 3; /* convert to dw offset */
		return 0;
	} else {
		return -EINVAL;
	}
}

/**
 * amdgpu_device_wb_free - Free a wb entry
 *
 * @adev: amdgpu_device pointer
 * @wb: wb index
 *
 * Free a wb slot allocated for use by the driver (all asics)
 */
void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
{
	wb >>= 3;
	if (wb < adev->wb.num_wb)
		__clear_bit(wb, adev->wb.used);
}
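
/*
 * Example (illustrative sketch): a ring allocates a writeback slot, derives
 * the GPU and CPU addresses from the dword offset, and frees the slot on
 * teardown:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *           volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *           ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */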

/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
	struct pci_bus *root;
	struct resource *res;
	unsigned i;
	u16 cmd;
	int r;

	if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
		return 0;

	/* Bypass for VF */
	if (amdgpu_sriov_vf(adev))
		return 0;

	/* skip if the bios has already enabled large BAR */
	if (adev->gmc.real_vram_size &&
	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
		return 0;

	/* Check if the root BUS has 64bit memory resources */
	root = adev->pdev->bus;
	while (root->parent)
		root = root->parent;

	pci_bus_for_each_resource(root, res, i) {
		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
		    res->start > 0x100000000ull)
			break;
	}

	/* Trying to resize is pointless without a root hub window above 4GB */
	if (!res)
		return 0;

	/* Disable memory decoding while we change the BAR addresses and size */
	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
	pci_write_config_word(adev->pdev, PCI_COMMAND,
			      cmd & ~PCI_COMMAND_MEMORY);

	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
	amdgpu_device_doorbell_fini(adev);
	if (adev->asic_type >= CHIP_BONAIRE)
		pci_release_resource(adev->pdev, 2);

	pci_release_resource(adev->pdev, 0);

	r = pci_resize_resource(adev->pdev, 0, rbar_size);
	if (r == -ENOSPC)
		DRM_INFO("Not enough PCI address space for a large BAR.");
	else if (r && r != -ENOTSUPP)
		DRM_ERROR("Problem resizing BAR0 (%d).", r);

	pci_assign_unassigned_bus_resources(adev->pdev->bus);

	/* When the doorbell or fb BAR isn't available we have no chance of
	 * using the device.
	 */
	r = amdgpu_device_doorbell_init(adev);
	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
		return -ENODEV;

	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

	return 0;
}

/*
 * GPU helper functions.
 */
/**
 * amdgpu_device_need_post - check if the hw need post or not
 *
 * @adev: amdgpu_device pointer
 *
 * Check if the asic has been initialized (all asics) at driver startup
 * or if post is needed because a hw reset was performed.
 * Returns true if post is needed, false if not.
 */
bool amdgpu_device_need_post(struct amdgpu_device *adev)
{
	uint32_t reg;

	if (amdgpu_sriov_vf(adev))
		return false;

	if (amdgpu_passthrough(adev)) {
		/* For FIJI: in the whole-GPU pass-through virtualization case,
		 * after a VM reboot some old SMC firmware still needs the
		 * driver to do a vPost, otherwise the GPU hangs. SMC firmware
		 * versions above 22.15 don't have this flaw, so force vPost
		 * for SMC versions below 22.15.
		 */
		if (adev->asic_type == CHIP_FIJI) {
			int err;
			uint32_t fw_ver;

			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
			/* force vPost if an error occurred */
			if (err)
				return true;

			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
			release_firmware(adev->pm.fw);
			if (fw_ver < 0x00160e00)
				return true;
		}
	}

	if (adev->has_hw_reset) {
		adev->has_hw_reset = false;
		return true;
	}

	/* bios scratch used on CIK+ */
	if (adev->asic_type >= CHIP_BONAIRE)
		return amdgpu_atombios_scratch_need_asic_init(adev);

	/* check MEM_SIZE for older asics */
	reg = amdgpu_asic_get_config_memsize(adev);

	if ((reg != 0) && (reg != 0xffffffff))
		return false;

	return true;
}

/* if we get transitioned to only one device, take VGA back */
/**
 * amdgpu_device_vga_set_decode - enable/disable vga decode
 *
 * @cookie: amdgpu_device pointer
 * @state: enable/disable vga decode
 *
 * Enable/disable vga decode (all asics).
 * Returns VGA resource flags.
 */
static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
{
	struct amdgpu_device *adev = cookie;

	amdgpu_asic_set_vga_state(adev, state);
	if (state)
		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
	else
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}

/**
 * amdgpu_device_check_block_size - validate the vm block size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm block size specified via module parameter.
 * The vm block size defines the number of bits in the page table versus the
 * page directory; a page is 4KB, so we have 12 bits of offset and a minimum
 * of 9 bits in the page table, with the remaining bits in the page directory.
 */
static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
{
	/* defines number of bits in page table versus page directory,
	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
	 * page table and the remaining bits are in the page directory */
	if (amdgpu_vm_block_size == -1)
		return;

	if (amdgpu_vm_block_size < 9) {
		dev_warn(adev->dev, "VM page table size (%d) too small\n",
			 amdgpu_vm_block_size);
		amdgpu_vm_block_size = -1;
	}
}

/**
 * amdgpu_device_check_vm_size - validate the vm size
 *
 * @adev: amdgpu_device pointer
 *
 * Validates the vm size in GB specified via module parameter.
 * The VM size is the size of the GPU virtual memory space in GB.
 */
static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
{
	/* no need to check the default value */
	if (amdgpu_vm_size == -1)
		return;

	if (amdgpu_vm_size < 1) {
		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
			 amdgpu_vm_size);
		amdgpu_vm_size = -1;
	}
}

static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
	struct sysinfo si;
	bool is_os_64 = (sizeof(void *) == 8);
	uint64_t total_memory;
	uint64_t dram_size_seven_GB = 0x1B8000000;
	uint64_t dram_size_three_GB = 0xB8000000;

	if (amdgpu_smu_memory_pool_size == 0)
		return;

	if (!is_os_64) {
		DRM_WARN("Not 64-bit OS, feature not supported\n");
		goto def_value;
	}
	si_meminfo(&si);
	total_memory = (uint64_t)si.totalram * si.mem_unit;

	if ((amdgpu_smu_memory_pool_size == 1) ||
		(amdgpu_smu_memory_pool_size == 2)) {
		if (total_memory < dram_size_three_GB)
			goto def_value1;
	} else if ((amdgpu_smu_memory_pool_size == 4) ||
		(amdgpu_smu_memory_pool_size == 8)) {
		if (total_memory < dram_size_seven_GB)
			goto def_value1;
	} else {
		DRM_WARN("Smu memory pool size not supported\n");
		goto def_value;
	}
	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;

	return;

def_value1:
	DRM_WARN("Not enough system memory\n");
def_value:
	adev->pm.smu_prv_buffer_size = 0;
}

/**
 * amdgpu_device_check_arguments - validate module params
 *
 * @adev: amdgpu_device pointer
 *
 * Validates certain module parameters and updates
 * the associated values used by the driver (all asics).
 */
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
	if (amdgpu_sched_jobs < 4) {
		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = 4;
	} else if (!is_power_of_2(amdgpu_sched_jobs)) {
		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
			 amdgpu_sched_jobs);
		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
	}

	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
		/* gart size must be greater or equal to 32M */
		dev_warn(adev->dev, "gart size (%d) too small\n",
			 amdgpu_gart_size);
		amdgpu_gart_size = -1;
	}

	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
		/* gtt size must be greater or equal to 32M */
		dev_warn(adev->dev, "gtt size (%d) too small\n",
				 amdgpu_gtt_size);
		amdgpu_gtt_size = -1;
	}

	/* valid range is between 4 and 9 inclusive */
	if (amdgpu_vm_fragment_size != -1 &&
	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
		dev_warn(adev->dev, "valid range is between 4 and 9\n");
		amdgpu_vm_fragment_size = -1;
	}

	if (amdgpu_sched_hw_submission < 2) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = 2;
	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
			 amdgpu_sched_hw_submission);
		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
	}

	amdgpu_device_check_smu_prv_buffer_size(adev);

	amdgpu_device_check_vm_size(adev);

	amdgpu_device_check_block_size(adev);

	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);

	amdgpu_gmc_tmz_set(adev);

	if (amdgpu_num_kcq == -1) {
		amdgpu_num_kcq = 8;
	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
		amdgpu_num_kcq = 8;
		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
	}

	amdgpu_gmc_noretry_set(adev);

	return 0;
}
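
/*
 * Example (illustrative): the parameters validated above are set at module
 * load time, e.g.
 *
 *   $ modprobe amdgpu sched_jobs=64 vm_size=256 num_kcq=4
 *
 * Out-of-range values are clamped or reset to defaults here with a warning
 * in the kernel log.
 */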

/**
 * amdgpu_switcheroo_set_state - set switcheroo state
 *
 * @pdev: pci dev pointer
 * @state: vga_switcheroo state
 *
 * Callback for the switcheroo driver.  Suspends or resumes the
 * asics before or after it is powered up using ACPI methods.
 */
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
					enum vga_switcheroo_state state)
{
	struct drm_device *dev = pci_get_drvdata(pdev);
	int r;

	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
		return;

	if (state == VGA_SWITCHEROO_ON) {
		pr_info("switched on\n");
		/* don't suspend or resume card normally */
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

		pci_set_power_state(dev->pdev, PCI_D0);
		amdgpu_device_load_pci_state(dev->pdev);
		r = pci_enable_device(dev->pdev);
		if (r)
			DRM_WARN("pci_enable_device failed (%d)\n", r);
		amdgpu_device_resume(dev, true);

		dev->switch_power_state = DRM_SWITCH_POWER_ON;
		drm_kms_helper_poll_enable(dev);
	} else {
		pr_info("switched off\n");
		drm_kms_helper_poll_disable(dev);
		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
		amdgpu_device_suspend(dev, true);
		amdgpu_device_cache_pci_state(dev->pdev);
		/* Shut down the device */
		pci_disable_device(dev->pdev);
		pci_set_power_state(dev->pdev, PCI_D3cold);
		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
	}
}

/**
 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
 *
 * @pdev: pci dev pointer
 *
 * Callback for the switcheroo driver.  Checks if the switcheroo
 * state can be changed.
 * Returns true if the state can be changed, false if not.
 */
static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
{
	struct drm_device *dev = pci_get_drvdata(pdev);

	/*
	 * FIXME: open_count is protected by drm_global_mutex but that would lead to
	 * locking inversion with the driver load path. And the access here is
	 * completely racy anyway. So don't bother with locking for now.
	 */
	return atomic_read(&dev->open_count) == 0;
}

static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
	.set_gpu_state = amdgpu_switcheroo_set_state,
	.reprobe = NULL,
	.can_switch = amdgpu_switcheroo_can_switch,
};

/**
 * amdgpu_device_ip_set_clockgating_state - set the CG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: clockgating state (gate or ungate)
 *
 * Sets the requested clockgating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_clockgating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_clockgating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}
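
/*
 * Example (illustrative sketch): gate the clocks of all GFX IP instances:
 *
 *   r = amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                              AMD_CG_STATE_GATE);
 */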

/**
 * amdgpu_device_ip_set_powergating_state - set the PG state
 *
 * @dev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 * @state: powergating state (gate or ungate)
 *
 * Sets the requested powergating state for all instances of
 * the hardware IP specified.
 * Returns the error code from the last instance.
 */
int amdgpu_device_ip_set_powergating_state(void *dev,
					   enum amd_ip_block_type block_type,
					   enum amd_powergating_state state)
{
	struct amdgpu_device *adev = dev;
	int i, r = 0;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type != block_type)
			continue;
		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
			continue;
		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
			(void *)adev, state);
		if (r)
			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
				  adev->ip_blocks[i].version->funcs->name, r);
	}
	return r;
}

/**
 * amdgpu_device_ip_get_clockgating_state - get the CG state
 *
 * @adev: amdgpu_device pointer
 * @flags: clockgating feature flags
 *
 * Walks the list of IPs on the device and updates the clockgating
 * flags for each IP.
 * Updates @flags with the feature flags for each hardware IP where
 * clockgating is enabled.
 */
void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
					    u32 *flags)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
	}
}

/**
 * amdgpu_device_ip_wait_for_idle - wait for idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Waits for the requested hardware IP to be idle.
 * Returns 0 for success or a negative error code on failure.
 */
int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
				   enum amd_ip_block_type block_type)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type) {
			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}

/**
 * amdgpu_device_ip_is_idle - is the hardware IP idle
 *
 * @adev: amdgpu_device pointer
 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Check if the hardware IP is idle or not.
 * Returns true if the IP is idle, false if not.
 */
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
			      enum amd_ip_block_type block_type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		if (!adev->ip_blocks[i].status.valid)
			continue;
		if (adev->ip_blocks[i].version->type == block_type)
			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
	}
	return true;
}

/**
 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
 *
 * @adev: amdgpu_device pointer
 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
 *
 * Returns a pointer to the hardware IP block structure
 * if it exists for the asic, otherwise NULL.
 */
struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
			      enum amd_ip_block_type type)
{
	int i;

	for (i = 0; i < adev->num_ip_blocks; i++)
		if (adev->ip_blocks[i].version->type == type)
			return &adev->ip_blocks[i];

	return NULL;
}
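
/*
 * Example (illustrative sketch): check for a display block before touching
 * display state:
 *
 *   struct amdgpu_ip_block *ip_block;
 *
 *   ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_DCE);
 *   if (!ip_block)
 *           return;   (no display hardware on this asic)
 */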

/**
 * amdgpu_device_ip_block_version_cmp
 *
 * @adev: amdgpu_device pointer
 * @type: enum amd_ip_block_type
 * @major: major version
 * @minor: minor version
 *
 * Returns 0 if the IP block's version is equal to or greater than
 * @major.@minor, or 1 if it is smaller or the IP block doesn't exist.
 */
int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
				       enum amd_ip_block_type type,
				       u32 major, u32 minor)
{
	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);

	if (ip_block && ((ip_block->version->major > major) ||
			((ip_block->version->major == major) &&
			(ip_block->version->minor >= minor))))
		return 0;

	return 1;
}

/**
 * amdgpu_device_ip_block_add
 *
 * @adev: amdgpu_device pointer
 * @ip_block_version: pointer to the IP to add
 *
 * Adds the IP block driver information to the collection of IPs
 * on the asic.
 */
int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
			       const struct amdgpu_ip_block_version *ip_block_version)
{
	if (!ip_block_version)
		return -EINVAL;

	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
		  ip_block_version->funcs->name);

	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;

	return 0;
}

/**
 * amdgpu_device_enable_virtual_display - enable virtual display feature
 *
 * @adev: amdgpu_device pointer
 *
 * Enables the virtual display feature if the user has enabled it via
 * the module parameter virtual_display.  This feature provides a virtual
 * display hardware on headless boards or in virtualized environments.
 * This function parses and validates the configuration string specified by
 * the user and configures the virtual display configuration (number of
 * virtual connectors, crtcs, etc.) specified.
 */
amdgpu_device_enable_virtual_display(struct amdgpu_device * adev)1689 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1690 {
1691 	adev->enable_virtual_display = false;
1692 
1693 	if (amdgpu_virtual_display) {
1694 		struct drm_device *ddev = adev_to_drm(adev);
1695 		const char *pci_address_name = pci_name(ddev->pdev);
1696 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1697 
1698 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1699 		pciaddstr_tmp = pciaddstr;
1700 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1701 			pciaddname = strsep(&pciaddname_tmp, ",");
1702 			if (!strcmp("all", pciaddname)
1703 			    || !strcmp(pci_address_name, pciaddname)) {
1704 				long num_crtc;
1705 				int res = -1;
1706 
1707 				adev->enable_virtual_display = true;
1708 
1709 				if (pciaddname_tmp)
1710 					res = kstrtol(pciaddname_tmp, 10,
1711 						      &num_crtc);
1712 
1713 				if (!res) {
1714 					if (num_crtc < 1)
1715 						num_crtc = 1;
1716 					if (num_crtc > 6)
1717 						num_crtc = 6;
1718 					adev->mode_info.num_crtc = num_crtc;
1719 				} else {
1720 					adev->mode_info.num_crtc = 1;
1721 				}
1722 				break;
1723 			}
1724 		}
1725 
1726 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1727 			 amdgpu_virtual_display, pci_address_name,
1728 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1729 
1730 		kfree(pciaddstr);
1731 	}
1732 }
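
/*
 * Example parameter strings accepted by the parser above (addresses and
 * counts are illustrative).  Entries are separated by ';'; each entry is
 * a PCI address (or "all") optionally followed by ",<num_crtc>":
 *
 *	amdgpu.virtual_display=0000:01:00.0,2	// 2 crtcs on that device
 *	amdgpu.virtual_display=all,4		// 4 crtcs on every device
 *	amdgpu.virtual_display=0000:01:00.0	// count omitted -> 1 crtc
 *
 * num_crtc is clamped to the range 1..6 by the code above.
 */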
1733 
1734 /**
1735  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1736  *
1737  * @adev: amdgpu_device pointer
1738  *
1739  * Parses the asic configuration parameters specified in the gpu info
1740  * firmware and makes them available to the driver for use in configuring
1741  * the asic.
1742  * Returns 0 on success, -EINVAL on failure.
1743  */
1744 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1745 {
1746 	const char *chip_name;
1747 	char fw_name[40];
1748 	int err;
1749 	const struct gpu_info_firmware_header_v1_0 *hdr;
1750 
1751 	adev->firmware.gpu_info_fw = NULL;
1752 
1753 	if (adev->mman.discovery_bin) {
1754 		amdgpu_discovery_get_gfx_info(adev);
1755 
1756 		/*
1757 		 * FIXME: The bounding box is still needed by Navi12, so
1758 		 * temporarily read it from gpu_info firmware. Should be dropped
1759 		 * when DAL no longer needs it.
1760 		 */
1761 		if (adev->asic_type != CHIP_NAVI12)
1762 			return 0;
1763 	}
1764 
1765 	switch (adev->asic_type) {
1766 #ifdef CONFIG_DRM_AMDGPU_SI
1767 	case CHIP_VERDE:
1768 	case CHIP_TAHITI:
1769 	case CHIP_PITCAIRN:
1770 	case CHIP_OLAND:
1771 	case CHIP_HAINAN:
1772 #endif
1773 #ifdef CONFIG_DRM_AMDGPU_CIK
1774 	case CHIP_BONAIRE:
1775 	case CHIP_HAWAII:
1776 	case CHIP_KAVERI:
1777 	case CHIP_KABINI:
1778 	case CHIP_MULLINS:
1779 #endif
1780 	case CHIP_TOPAZ:
1781 	case CHIP_TONGA:
1782 	case CHIP_FIJI:
1783 	case CHIP_POLARIS10:
1784 	case CHIP_POLARIS11:
1785 	case CHIP_POLARIS12:
1786 	case CHIP_VEGAM:
1787 	case CHIP_CARRIZO:
1788 	case CHIP_STONEY:
1789 	case CHIP_VEGA20:
1790 	case CHIP_SIENNA_CICHLID:
1791 	case CHIP_NAVY_FLOUNDER:
1792 	default:
1793 		return 0;
1794 	case CHIP_VEGA10:
1795 		chip_name = "vega10";
1796 		break;
1797 	case CHIP_VEGA12:
1798 		chip_name = "vega12";
1799 		break;
1800 	case CHIP_RAVEN:
1801 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1802 			chip_name = "raven2";
1803 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1804 			chip_name = "picasso";
1805 		else
1806 			chip_name = "raven";
1807 		break;
1808 	case CHIP_ARCTURUS:
1809 		chip_name = "arcturus";
1810 		break;
1811 	case CHIP_RENOIR:
1812 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1813 			chip_name = "renoir";
1814 		else
1815 			chip_name = "green_sardine";
1816 		break;
1817 	case CHIP_NAVI10:
1818 		chip_name = "navi10";
1819 		break;
1820 	case CHIP_NAVI14:
1821 		chip_name = "navi14";
1822 		break;
1823 	case CHIP_NAVI12:
1824 		chip_name = "navi12";
1825 		break;
1826 	}
1827 
1828 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1829 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1830 	if (err) {
1831 		dev_err(adev->dev,
1832 			"Failed to load gpu_info firmware \"%s\"\n",
1833 			fw_name);
1834 		goto out;
1835 	}
1836 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1837 	if (err) {
1838 		dev_err(adev->dev,
1839 			"Failed to validate gpu_info firmware \"%s\"\n",
1840 			fw_name);
1841 		goto out;
1842 	}
1843 
1844 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1845 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1846 
1847 	switch (hdr->version_major) {
1848 	case 1:
1849 	{
1850 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1851 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1852 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1853 
1854 		/*
1855 		 * Should be dropped when DAL no longer needs it.
1856 		 */
1857 		if (adev->asic_type == CHIP_NAVI12)
1858 			goto parse_soc_bounding_box;
1859 
1860 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1861 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1862 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1863 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1864 		adev->gfx.config.max_texture_channel_caches =
1865 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1866 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1867 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1868 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1869 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1870 		adev->gfx.config.double_offchip_lds_buf =
1871 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1872 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1873 		adev->gfx.cu_info.max_waves_per_simd =
1874 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1875 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1876 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1877 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1878 		if (hdr->version_minor >= 1) {
1879 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1880 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1881 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1882 			adev->gfx.config.num_sc_per_sh =
1883 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1884 			adev->gfx.config.num_packer_per_sc =
1885 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1886 		}
1887 
1888 parse_soc_bounding_box:
1889 		/*
1890 		 * soc bounding box info is not integrated in the discovery table,
1891 		 * so it always needs to be parsed from the gpu info firmware.
1892 		 */
1893 		if (hdr->version_minor == 2) {
1894 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1895 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1896 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1897 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1898 		}
1899 		break;
1900 	}
1901 	default:
1902 		dev_err(adev->dev,
1903 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1904 		err = -EINVAL;
1905 		goto out;
1906 	}
1907 out:
1908 	return err;
1909 }
1910 
1911 /**
1912  * amdgpu_device_ip_early_init - run early init for hardware IPs
1913  *
1914  * @adev: amdgpu_device pointer
1915  *
1916  * Early initialization pass for hardware IPs.  The hardware IPs that make
1917  * up each asic are discovered and each IP's early_init callback is run.  This
1918  * is the first stage in initializing the asic.
1919  * Returns 0 on success, negative error code on failure.
1920  */
1921 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1922 {
1923 	int i, r;
1924 
1925 	amdgpu_device_enable_virtual_display(adev);
1926 
1927 	if (amdgpu_sriov_vf(adev)) {
1928 		r = amdgpu_virt_request_full_gpu(adev, true);
1929 		if (r)
1930 			return r;
1931 	}
1932 
1933 	switch (adev->asic_type) {
1934 #ifdef CONFIG_DRM_AMDGPU_SI
1935 	case CHIP_VERDE:
1936 	case CHIP_TAHITI:
1937 	case CHIP_PITCAIRN:
1938 	case CHIP_OLAND:
1939 	case CHIP_HAINAN:
1940 		adev->family = AMDGPU_FAMILY_SI;
1941 		r = si_set_ip_blocks(adev);
1942 		if (r)
1943 			return r;
1944 		break;
1945 #endif
1946 #ifdef CONFIG_DRM_AMDGPU_CIK
1947 	case CHIP_BONAIRE:
1948 	case CHIP_HAWAII:
1949 	case CHIP_KAVERI:
1950 	case CHIP_KABINI:
1951 	case CHIP_MULLINS:
1952 		if (adev->flags & AMD_IS_APU)
1953 			adev->family = AMDGPU_FAMILY_KV;
1954 		else
1955 			adev->family = AMDGPU_FAMILY_CI;
1956 
1957 		r = cik_set_ip_blocks(adev);
1958 		if (r)
1959 			return r;
1960 		break;
1961 #endif
1962 	case CHIP_TOPAZ:
1963 	case CHIP_TONGA:
1964 	case CHIP_FIJI:
1965 	case CHIP_POLARIS10:
1966 	case CHIP_POLARIS11:
1967 	case CHIP_POLARIS12:
1968 	case CHIP_VEGAM:
1969 	case CHIP_CARRIZO:
1970 	case CHIP_STONEY:
1971 		if (adev->flags & AMD_IS_APU)
1972 			adev->family = AMDGPU_FAMILY_CZ;
1973 		else
1974 			adev->family = AMDGPU_FAMILY_VI;
1975 
1976 		r = vi_set_ip_blocks(adev);
1977 		if (r)
1978 			return r;
1979 		break;
1980 	case CHIP_VEGA10:
1981 	case CHIP_VEGA12:
1982 	case CHIP_VEGA20:
1983 	case CHIP_RAVEN:
1984 	case CHIP_ARCTURUS:
1985 	case CHIP_RENOIR:
1986 		if (adev->flags & AMD_IS_APU)
1987 			adev->family = AMDGPU_FAMILY_RV;
1988 		else
1989 			adev->family = AMDGPU_FAMILY_AI;
1990 
1991 		r = soc15_set_ip_blocks(adev);
1992 		if (r)
1993 			return r;
1994 		break;
1995 	case CHIP_NAVI10:
1996 	case CHIP_NAVI14:
1997 	case CHIP_NAVI12:
1998 	case CHIP_SIENNA_CICHLID:
1999 	case CHIP_NAVY_FLOUNDER:
2000 		adev->family = AMDGPU_FAMILY_NV;
2001 
2002 		r = nv_set_ip_blocks(adev);
2003 		if (r)
2004 			return r;
2005 		break;
2006 	default:
2007 		/* FIXME: not supported yet */
2008 		return -EINVAL;
2009 	}
2010 
2011 	amdgpu_amdkfd_device_probe(adev);
2012 
2013 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2014 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2015 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2016 
2017 	for (i = 0; i < adev->num_ip_blocks; i++) {
2018 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2019 			DRM_ERROR("disabled ip block: %d <%s>\n",
2020 				  i, adev->ip_blocks[i].version->funcs->name);
2021 			adev->ip_blocks[i].status.valid = false;
2022 		} else {
2023 			if (adev->ip_blocks[i].version->funcs->early_init) {
2024 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2025 				if (r == -ENOENT) {
2026 					adev->ip_blocks[i].status.valid = false;
2027 				} else if (r) {
2028 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2029 						  adev->ip_blocks[i].version->funcs->name, r);
2030 					return r;
2031 				} else {
2032 					adev->ip_blocks[i].status.valid = true;
2033 				}
2034 			} else {
2035 				adev->ip_blocks[i].status.valid = true;
2036 			}
2037 		}
2038 		/* get the vbios after the asic_funcs are set up */
2039 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2040 			r = amdgpu_device_parse_gpu_info_fw(adev);
2041 			if (r)
2042 				return r;
2043 
2044 			/* Read BIOS */
2045 			if (!amdgpu_get_bios(adev))
2046 				return -EINVAL;
2047 
2048 			r = amdgpu_atombios_init(adev);
2049 			if (r) {
2050 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2051 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2052 				return r;
2053 			}
2054 
2055 			/* get pf2vf msg info at its earliest time */
2056 			if (amdgpu_sriov_vf(adev))
2057 				amdgpu_virt_init_data_exchange(adev);
2058 
2059 		}
2060 	}
2061 
2062 	adev->cg_flags &= amdgpu_cg_mask;
2063 	adev->pg_flags &= amdgpu_pg_mask;
2064 
2065 	return 0;
2066 }
2067 
2068 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2069 {
2070 	int i, r;
2071 
2072 	for (i = 0; i < adev->num_ip_blocks; i++) {
2073 		if (!adev->ip_blocks[i].status.sw)
2074 			continue;
2075 		if (adev->ip_blocks[i].status.hw)
2076 			continue;
2077 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2078 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2079 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2080 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2081 			if (r) {
2082 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2083 					  adev->ip_blocks[i].version->funcs->name, r);
2084 				return r;
2085 			}
2086 			adev->ip_blocks[i].status.hw = true;
2087 		}
2088 	}
2089 
2090 	return 0;
2091 }
2092 
2093 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2094 {
2095 	int i, r;
2096 
2097 	for (i = 0; i < adev->num_ip_blocks; i++) {
2098 		if (!adev->ip_blocks[i].status.sw)
2099 			continue;
2100 		if (adev->ip_blocks[i].status.hw)
2101 			continue;
2102 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2103 		if (r) {
2104 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2105 				  adev->ip_blocks[i].version->funcs->name, r);
2106 			return r;
2107 		}
2108 		adev->ip_blocks[i].status.hw = true;
2109 	}
2110 
2111 	return 0;
2112 }
2113 
2114 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2115 {
2116 	int r = 0;
2117 	int i;
2118 	uint32_t smu_version;
2119 
2120 	if (adev->asic_type >= CHIP_VEGA10) {
2121 		for (i = 0; i < adev->num_ip_blocks; i++) {
2122 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2123 				continue;
2124 
2125 			/* no need to do the fw loading again if already done */
2126 			if (adev->ip_blocks[i].status.hw)
2127 				break;
2128 
2129 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2130 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2131 				if (r) {
2132 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2133 							  adev->ip_blocks[i].version->funcs->name, r);
2134 					return r;
2135 				}
2136 			} else {
2137 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2138 				if (r) {
2139 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2140 							  adev->ip_blocks[i].version->funcs->name, r);
2141 					return r;
2142 				}
2143 			}
2144 
2145 			adev->ip_blocks[i].status.hw = true;
2146 			break;
2147 		}
2148 	}
2149 
2150 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2151 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2152 
2153 	return r;
2154 }
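
/*
 * Ordering note: the three helpers above are invoked back to back from
 * amdgpu_device_ip_init() below: phase1 brings up COMMON and IH (plus
 * PSP under SR-IOV), then the required firmware is loaded, then phase2
 * initializes the remaining blocks that depend on that firmware.
 */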
2155 
2156 /**
2157  * amdgpu_device_ip_init - run init for hardware IPs
2158  *
2159  * @adev: amdgpu_device pointer
2160  *
2161  * Main initialization pass for hardware IPs.  The list of all the hardware
2162  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2163  * are run.  sw_init initializes the software state associated with each IP
2164  * and hw_init initializes the hardware associated with each IP.
2165  * Returns 0 on success, negative error code on failure.
2166  */
2167 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2168 {
2169 	int i, r;
2170 
2171 	r = amdgpu_ras_init(adev);
2172 	if (r)
2173 		return r;
2174 
2175 	for (i = 0; i < adev->num_ip_blocks; i++) {
2176 		if (!adev->ip_blocks[i].status.valid)
2177 			continue;
2178 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2179 		if (r) {
2180 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2181 				  adev->ip_blocks[i].version->funcs->name, r);
2182 			goto init_failed;
2183 		}
2184 		adev->ip_blocks[i].status.sw = true;
2185 
2186 		/* need to do gmc hw init early so we can allocate gpu mem */
2187 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2188 			/* Try to reserve bad pages early */
2189 			if (amdgpu_sriov_vf(adev))
2190 				amdgpu_virt_exchange_data(adev);
2191 
2192 			r = amdgpu_device_vram_scratch_init(adev);
2193 			if (r) {
2194 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2195 				goto init_failed;
2196 			}
2197 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2198 			if (r) {
2199 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2200 				goto init_failed;
2201 			}
2202 			r = amdgpu_device_wb_init(adev);
2203 			if (r) {
2204 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2205 				goto init_failed;
2206 			}
2207 			adev->ip_blocks[i].status.hw = true;
2208 
2209 			/* right after GMC hw init, we create CSA */
2210 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2211 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2212 								AMDGPU_GEM_DOMAIN_VRAM,
2213 								AMDGPU_CSA_SIZE);
2214 				if (r) {
2215 					DRM_ERROR("allocate CSA failed %d\n", r);
2216 					goto init_failed;
2217 				}
2218 			}
2219 		}
2220 	}
2221 
2222 	if (amdgpu_sriov_vf(adev))
2223 		amdgpu_virt_init_data_exchange(adev);
2224 
2225 	r = amdgpu_ib_pool_init(adev);
2226 	if (r) {
2227 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2228 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2229 		goto init_failed;
2230 	}
2231 
2232 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2233 	if (r)
2234 		goto init_failed;
2235 
2236 	r = amdgpu_device_ip_hw_init_phase1(adev);
2237 	if (r)
2238 		goto init_failed;
2239 
2240 	r = amdgpu_device_fw_loading(adev);
2241 	if (r)
2242 		goto init_failed;
2243 
2244 	r = amdgpu_device_ip_hw_init_phase2(adev);
2245 	if (r)
2246 		goto init_failed;
2247 
2248 	/*
2249 	 * Retired pages will be loaded from eeprom and reserved here;
2250 	 * this must be called after amdgpu_device_ip_hw_init_phase2 since,
2251 	 * for some ASICs, the RAS EEPROM code relies on the SMU being fully
2252 	 * functional for I2C communication, which is only true at this point.
2253 	 *
2254 	 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2255 	 * about failures caused by a bad gpu situation and stops the amdgpu
2256 	 * init process accordingly. For other failures, it still releases all
2257 	 * the resources and prints an error message, rather than returning a
2258 	 * negative value to the upper level.
2259 	 *
2260 	 * Note: theoretically, this should be called before all vram allocations
2261 	 * to protect retired pages from abuse.
2262 	 */
2263 	r = amdgpu_ras_recovery_init(adev);
2264 	if (r)
2265 		goto init_failed;
2266 
2267 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2268 		amdgpu_xgmi_add_device(adev);
2269 	amdgpu_amdkfd_device_init(adev);
2270 
2271 	amdgpu_fru_get_product_info(adev);
2272 
2273 init_failed:
2274 	if (amdgpu_sriov_vf(adev))
2275 		amdgpu_virt_release_full_gpu(adev, true);
2276 
2277 	return r;
2278 }
2279 
2280 /**
2281  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2282  *
2283  * @adev: amdgpu_device pointer
2284  *
2285  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2286  * this function before a GPU reset.  If the value is retained after a
2287  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2288  */
2289 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2290 {
2291 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2292 }
2293 
2294 /**
2295  * amdgpu_device_check_vram_lost - check if vram is valid
2296  *
2297  * @adev: amdgpu_device pointer
2298  *
2299  * Checks the reset magic value written to the gart pointer in VRAM.
2300  * The driver calls this after a GPU reset to see if the contents of
2301  * VRAM are lost or not.
2302  * returns true if vram is lost, false if not.
2303  */
2304 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2305 {
2306 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2307 			AMDGPU_RESET_MAGIC_NUM))
2308 		return true;
2309 
2310 	if (!amdgpu_in_reset(adev))
2311 		return false;
2312 
2313 	/*
2314 	 * For all ASICs with baco/mode1 reset, the VRAM is
2315 	 * always assumed to be lost.
2316 	 */
2317 	switch (amdgpu_asic_reset_method(adev)) {
2318 	case AMD_RESET_METHOD_BACO:
2319 	case AMD_RESET_METHOD_MODE1:
2320 		return true;
2321 	default:
2322 		return false;
2323 	}
2324 }
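
/*
 * Sketch of the reset-magic round trip implemented by the two helpers
 * above (simplified, hypothetical reset path; the recovery helper named
 * below is illustrative, not a real driver function):
 *
 *	amdgpu_device_fill_reset_magic(adev);	// before the reset
 *	...					// GPU reset happens here
 *	if (amdgpu_device_check_vram_lost(adev))
 *		restore_vram_contents(adev);	// hypothetical helper
 *
 * Note the memcmp() only proves that AMDGPU_RESET_MAGIC_NUM bytes were
 * retained; for BACO/mode1 resets the code above always assumes loss.
 */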
2325 
2326 /**
2327  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2328  *
2329  * @adev: amdgpu_device pointer
2330  * @state: clockgating state (gate or ungate)
2331  *
2332  * The list of all the hardware IPs that make up the asic is walked and the
2333  * set_clockgating_state callbacks are run.
2334  * During late initialization, this pass enables clockgating for hardware IPs;
2335  * during fini or suspend, it disables clockgating.
2336  * Returns 0 on success, negative error code on failure.
2337  */
2338 
2339 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2340 						enum amd_clockgating_state state)
2341 {
2342 	int i, j, r;
2343 
2344 	if (amdgpu_emu_mode == 1)
2345 		return 0;
2346 
2347 	for (j = 0; j < adev->num_ip_blocks; j++) {
2348 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2349 		if (!adev->ip_blocks[i].status.late_initialized)
2350 			continue;
2351 		/* skip CG for VCE/UVD, it's handled specially */
2352 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2353 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2354 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2355 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2356 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2357 			/* enable clockgating to save power */
2358 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2359 										     state);
2360 			if (r) {
2361 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2362 					  adev->ip_blocks[i].version->funcs->name, r);
2363 				return r;
2364 			}
2365 		}
2366 	}
2367 
2368 	return 0;
2369 }
2370 
2371 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2372 {
2373 	int i, j, r;
2374 
2375 	if (amdgpu_emu_mode == 1)
2376 		return 0;
2377 
2378 	for (j = 0; j < adev->num_ip_blocks; j++) {
2379 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2380 		if (!adev->ip_blocks[i].status.late_initialized)
2381 			continue;
2382 		/* skip PG for VCE/UVD, it's handled specially */
2383 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2384 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2385 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2386 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2387 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2388 			/* enable powergating to save power */
2389 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2390 											state);
2391 			if (r) {
2392 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2393 					  adev->ip_blocks[i].version->funcs->name, r);
2394 				return r;
2395 			}
2396 		}
2397 	}
2398 	return 0;
2399 }
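
/*
 * Ordering note for the two helpers above: gating walks the IP list
 * front to back (init order), while ungating walks it back to front,
 * mirroring how the blocks were brought up.  A worked example of the
 * index arithmetic with num_ip_blocks == 3:
 *
 *	j:		0  1  2
 *	GATE   -> i:	0  1  2
 *	UNGATE -> i:	2  1  0
 */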
2400 
2401 static int amdgpu_device_enable_mgpu_fan_boost(void)
2402 {
2403 	struct amdgpu_gpu_instance *gpu_ins;
2404 	struct amdgpu_device *adev;
2405 	int i, ret = 0;
2406 
2407 	mutex_lock(&mgpu_info.mutex);
2408 
2409 	/*
2410 	 * MGPU fan boost feature should be enabled
2411 	 * only when there are two or more dGPUs in
2412 	 * the system
2413 	 */
2414 	if (mgpu_info.num_dgpu < 2)
2415 		goto out;
2416 
2417 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2418 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2419 		adev = gpu_ins->adev;
2420 		if (!(adev->flags & AMD_IS_APU) &&
2421 		    !gpu_ins->mgpu_fan_enabled) {
2422 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2423 			if (ret)
2424 				break;
2425 
2426 			gpu_ins->mgpu_fan_enabled = 1;
2427 		}
2428 	}
2429 
2430 out:
2431 	mutex_unlock(&mgpu_info.mutex);
2432 
2433 	return ret;
2434 }
2435 
2436 /**
2437  * amdgpu_device_ip_late_init - run late init for hardware IPs
2438  *
2439  * @adev: amdgpu_device pointer
2440  *
2441  * Late initialization pass for hardware IPs.  The list of all the hardware
2442  * IPs that make up the asic is walked and the late_init callbacks are run.
2443  * late_init covers any special initialization that an IP requires
2444  * after all of them have been initialized or something that needs to happen
2445  * late in the init process.
2446  * Returns 0 on success, negative error code on failure.
2447  */
2448 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2449 {
2450 	struct amdgpu_gpu_instance *gpu_instance;
2451 	int i = 0, r;
2452 
2453 	for (i = 0; i < adev->num_ip_blocks; i++) {
2454 		if (!adev->ip_blocks[i].status.hw)
2455 			continue;
2456 		if (adev->ip_blocks[i].version->funcs->late_init) {
2457 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2458 			if (r) {
2459 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2460 					  adev->ip_blocks[i].version->funcs->name, r);
2461 				return r;
2462 			}
2463 		}
2464 		adev->ip_blocks[i].status.late_initialized = true;
2465 	}
2466 
2467 	amdgpu_ras_set_error_query_ready(adev, true);
2468 
2469 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2470 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2471 
2472 	amdgpu_device_fill_reset_magic(adev);
2473 
2474 	r = amdgpu_device_enable_mgpu_fan_boost();
2475 	if (r)
2476 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2477 
2478 
2479 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2480 		mutex_lock(&mgpu_info.mutex);
2481 
2482 		/*
2483 		 * Reset device p-state to low as this was booted with high.
2484 		 *
2485 		 * This should be performed only after all devices from the same
2486 		 * hive get initialized.
2487 		 *
2488 		 * However, it's unknown in advance how many devices are in the
2489 		 * hive, as they are counted one by one during device init.
2490 		 *
2491 		 * So we wait until all XGMI interlinked devices are initialized.
2492 		 * This may introduce some delay, as those devices may come from
2493 		 * different hives. But that should be OK.
2494 		 */
2495 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2496 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2497 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2498 				if (gpu_instance->adev->flags & AMD_IS_APU)
2499 					continue;
2500 
2501 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2502 						AMDGPU_XGMI_PSTATE_MIN);
2503 				if (r) {
2504 					DRM_ERROR("pstate setting failed (%d).\n", r);
2505 					break;
2506 				}
2507 			}
2508 		}
2509 
2510 		mutex_unlock(&mgpu_info.mutex);
2511 	}
2512 
2513 	return 0;
2514 }
2515 
2516 /**
2517  * amdgpu_device_ip_fini - run fini for hardware IPs
2518  *
2519  * @adev: amdgpu_device pointer
2520  *
2521  * Main teardown pass for hardware IPs.  The list of all the hardware
2522  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2523  * are run.  hw_fini tears down the hardware associated with each IP
2524  * and sw_fini tears down any software state associated with each IP.
2525  * Returns 0 on success, negative error code on failure.
2526  */
2527 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2528 {
2529 	int i, r;
2530 
2531 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2532 		amdgpu_virt_release_ras_err_handler_data(adev);
2533 
2534 	amdgpu_ras_pre_fini(adev);
2535 
2536 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2537 		amdgpu_xgmi_remove_device(adev);
2538 
2539 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2540 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2541 
2542 	amdgpu_amdkfd_device_fini(adev);
2543 
2544 	/* need to disable SMC first */
2545 	for (i = 0; i < adev->num_ip_blocks; i++) {
2546 		if (!adev->ip_blocks[i].status.hw)
2547 			continue;
2548 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2549 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2550 			/* XXX handle errors */
2551 			if (r) {
2552 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2553 					  adev->ip_blocks[i].version->funcs->name, r);
2554 			}
2555 			adev->ip_blocks[i].status.hw = false;
2556 			break;
2557 		}
2558 	}
2559 
2560 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2561 		if (!adev->ip_blocks[i].status.hw)
2562 			continue;
2563 
2564 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2565 		/* XXX handle errors */
2566 		if (r) {
2567 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2568 				  adev->ip_blocks[i].version->funcs->name, r);
2569 		}
2570 
2571 		adev->ip_blocks[i].status.hw = false;
2572 	}
2573 
2574 
2575 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2576 		if (!adev->ip_blocks[i].status.sw)
2577 			continue;
2578 
2579 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2580 			amdgpu_ucode_free_bo(adev);
2581 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2582 			amdgpu_device_wb_fini(adev);
2583 			amdgpu_device_vram_scratch_fini(adev);
2584 			amdgpu_ib_pool_fini(adev);
2585 		}
2586 
2587 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2588 		/* XXX handle errors */
2589 		if (r) {
2590 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2591 				  adev->ip_blocks[i].version->funcs->name, r);
2592 		}
2593 		adev->ip_blocks[i].status.sw = false;
2594 		adev->ip_blocks[i].status.valid = false;
2595 	}
2596 
2597 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2598 		if (!adev->ip_blocks[i].status.late_initialized)
2599 			continue;
2600 		if (adev->ip_blocks[i].version->funcs->late_fini)
2601 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2602 		adev->ip_blocks[i].status.late_initialized = false;
2603 	}
2604 
2605 	amdgpu_ras_fini(adev);
2606 
2607 	if (amdgpu_sriov_vf(adev))
2608 		if (amdgpu_virt_release_full_gpu(adev, false))
2609 			DRM_ERROR("failed to release exclusive mode on fini\n");
2610 
2611 	return 0;
2612 }
2613 
2614 /**
2615  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2616  *
2617  * @work: work_struct.
2618  */
2619 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2620 {
2621 	struct amdgpu_device *adev =
2622 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2623 	int r;
2624 
2625 	r = amdgpu_ib_ring_tests(adev);
2626 	if (r)
2627 		DRM_ERROR("ib ring test failed (%d).\n", r);
2628 }
2629 
2630 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2631 {
2632 	struct amdgpu_device *adev =
2633 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2634 
2635 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2636 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2637 
2638 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2639 		adev->gfx.gfx_off_state = true;
2640 }
2641 
2642 /**
2643  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2644  *
2645  * @adev: amdgpu_device pointer
2646  *
2647  * Main suspend function for hardware IPs.  The list of all the hardware
2648  * IPs that make up the asic is walked, clockgating is disabled and the
2649  * suspend callbacks are run.  suspend puts the hardware and software state
2650  * in each IP into a state suitable for suspend.
2651  * Returns 0 on success, negative error code on failure.
2652  */
2653 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2654 {
2655 	int i, r;
2656 
2657 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2658 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2659 
2660 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2661 		if (!adev->ip_blocks[i].status.valid)
2662 			continue;
2663 
2664 		/* displays are handled separately */
2665 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2666 			continue;
2667 
2668 		/* XXX handle errors */
2669 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2670 		/* XXX handle errors */
2671 		if (r) {
2672 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2673 				  adev->ip_blocks[i].version->funcs->name, r);
2674 			return r;
2675 		}
2676 
2677 		adev->ip_blocks[i].status.hw = false;
2678 	}
2679 
2680 	return 0;
2681 }
2682 
2683 /**
2684  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2685  *
2686  * @adev: amdgpu_device pointer
2687  *
2688  * Main suspend function for hardware IPs.  The list of all the hardware
2689  * IPs that make up the asic is walked, clockgating is disabled and the
2690  * suspend callbacks are run.  suspend puts the hardware and software state
2691  * in each IP into a state suitable for suspend.
2692  * Returns 0 on success, negative error code on failure.
2693  */
2694 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2695 {
2696 	int i, r;
2697 
2698 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2699 		if (!adev->ip_blocks[i].status.valid)
2700 			continue;
2701 		/* displays are handled in phase1 */
2702 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2703 			continue;
2704 		/* PSP lost connection when err_event_athub occurs */
2705 		if (amdgpu_ras_intr_triggered() &&
2706 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2707 			adev->ip_blocks[i].status.hw = false;
2708 			continue;
2709 		}
2710 		/* XXX handle errors */
2711 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2712 		/* XXX handle errors */
2713 		if (r) {
2714 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2715 				  adev->ip_blocks[i].version->funcs->name, r);
2716 		}
2717 		adev->ip_blocks[i].status.hw = false;
2718 		/* handle putting the SMC in the appropriate state */
2719 		if (!amdgpu_sriov_vf(adev)) {
2720 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2721 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2722 				if (r) {
2723 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2724 							adev->mp1_state, r);
2725 					return r;
2726 				}
2727 			}
2728 		}
2729 		adev->ip_blocks[i].status.hw = false;
2730 	}
2731 
2732 	return 0;
2733 }
2734 
2735 /**
2736  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2737  *
2738  * @adev: amdgpu_device pointer
2739  *
2740  * Main suspend function for hardware IPs.  The list of all the hardware
2741  * IPs that make up the asic is walked, clockgating is disabled and the
2742  * suspend callbacks are run.  suspend puts the hardware and software state
2743  * in each IP into a state suitable for suspend.
2744  * Returns 0 on success, negative error code on failure.
2745  */
2746 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2747 {
2748 	int r;
2749 
2750 	if (amdgpu_sriov_vf(adev))
2751 		amdgpu_virt_request_full_gpu(adev, false);
2752 
2753 	r = amdgpu_device_ip_suspend_phase1(adev);
2754 	if (r)
2755 		return r;
2756 	r = amdgpu_device_ip_suspend_phase2(adev);
2757 
2758 	if (amdgpu_sriov_vf(adev))
2759 		amdgpu_virt_release_full_gpu(adev, false);
2760 
2761 	return r;
2762 }
2763 
2764 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2765 {
2766 	int i, r;
2767 
2768 	static enum amd_ip_block_type ip_order[] = {
2769 		AMD_IP_BLOCK_TYPE_GMC,
2770 		AMD_IP_BLOCK_TYPE_COMMON,
2771 		AMD_IP_BLOCK_TYPE_PSP,
2772 		AMD_IP_BLOCK_TYPE_IH,
2773 	};
2774 
2775 	for (i = 0; i < adev->num_ip_blocks; i++) {
2776 		int j;
2777 		struct amdgpu_ip_block *block;
2778 
2779 		block = &adev->ip_blocks[i];
2780 		block->status.hw = false;
2781 
2782 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2783 
2784 			if (block->version->type != ip_order[j] ||
2785 				!block->status.valid)
2786 				continue;
2787 
2788 			r = block->version->funcs->hw_init(adev);
2789 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2790 			if (r)
2791 				return r;
2792 			block->status.hw = true;
2793 		}
2794 	}
2795 
2796 	return 0;
2797 }
2798 
2799 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2800 {
2801 	int i, r;
2802 
2803 	static enum amd_ip_block_type ip_order[] = {
2804 		AMD_IP_BLOCK_TYPE_SMC,
2805 		AMD_IP_BLOCK_TYPE_DCE,
2806 		AMD_IP_BLOCK_TYPE_GFX,
2807 		AMD_IP_BLOCK_TYPE_SDMA,
2808 		AMD_IP_BLOCK_TYPE_UVD,
2809 		AMD_IP_BLOCK_TYPE_VCE,
2810 		AMD_IP_BLOCK_TYPE_VCN
2811 	};
2812 
2813 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2814 		int j;
2815 		struct amdgpu_ip_block *block;
2816 
2817 		for (j = 0; j < adev->num_ip_blocks; j++) {
2818 			block = &adev->ip_blocks[j];
2819 
2820 			if (block->version->type != ip_order[i] ||
2821 				!block->status.valid ||
2822 				block->status.hw)
2823 				continue;
2824 
2825 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2826 				r = block->version->funcs->resume(adev);
2827 			else
2828 				r = block->version->funcs->hw_init(adev);
2829 
2830 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2831 			if (r)
2832 				return r;
2833 			block->status.hw = true;
2834 		}
2835 	}
2836 
2837 	return 0;
2838 }
2839 
2840 /**
2841  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2842  *
2843  * @adev: amdgpu_device pointer
2844  *
2845  * First resume function for hardware IPs.  The list of all the hardware
2846  * IPs that make up the asic is walked and the resume callbacks are run for
2847  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2848  * after a suspend and updates the software state as necessary.  This
2849  * function is also used for restoring the GPU after a GPU reset.
2850  * Returns 0 on success, negative error code on failure.
2851  */
2852 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2853 {
2854 	int i, r;
2855 
2856 	for (i = 0; i < adev->num_ip_blocks; i++) {
2857 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2858 			continue;
2859 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2860 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2861 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2862 
2863 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2864 			if (r) {
2865 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2866 					  adev->ip_blocks[i].version->funcs->name, r);
2867 				return r;
2868 			}
2869 			adev->ip_blocks[i].status.hw = true;
2870 		}
2871 	}
2872 
2873 	return 0;
2874 }
2875 
2876 /**
2877  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2878  *
2879  * @adev: amdgpu_device pointer
2880  *
2881  * Second resume function for hardware IPs.  The list of all the hardware
2882  * IPs that make up the asic is walked and the resume callbacks are run for
2883  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2884  * functional state after a suspend and updates the software state as
2885  * necessary.  This function is also used for restoring the GPU after a GPU
2886  * reset.
2887  * Returns 0 on success, negative error code on failure.
2888  */
2889 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2890 {
2891 	int i, r;
2892 
2893 	for (i = 0; i < adev->num_ip_blocks; i++) {
2894 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2895 			continue;
2896 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2897 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2898 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2899 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2900 			continue;
2901 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2902 		if (r) {
2903 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2904 				  adev->ip_blocks[i].version->funcs->name, r);
2905 			return r;
2906 		}
2907 		adev->ip_blocks[i].status.hw = true;
2908 	}
2909 
2910 	return 0;
2911 }
2912 
2913 /**
2914  * amdgpu_device_ip_resume - run resume for hardware IPs
2915  *
2916  * @adev: amdgpu_device pointer
2917  *
2918  * Main resume function for hardware IPs.  The hardware IPs
2919  * are split into two resume functions because they are
2920  * are also used in in recovering from a GPU reset and some additional
2921  * steps need to be take between them.  In this case (S3/S4) they are
2922  * run sequentially.
2923  * Returns 0 on success, negative error code on failure.
2924  */
2925 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2926 {
2927 	int r;
2928 
2929 	r = amdgpu_amdkfd_resume_iommu(adev);
2930 	if (r)
2931 		return r;
2932 
2933 	r = amdgpu_device_ip_resume_phase1(adev);
2934 	if (r)
2935 		return r;
2936 
2937 	r = amdgpu_device_fw_loading(adev);
2938 	if (r)
2939 		return r;
2940 
2941 	r = amdgpu_device_ip_resume_phase2(adev);
2942 
2943 	return r;
2944 }
2945 
2946 /**
2947  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2948  *
2949  * @adev: amdgpu_device pointer
2950  *
2951  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2952  */
2953 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2954 {
2955 	if (amdgpu_sriov_vf(adev)) {
2956 		if (adev->is_atom_fw) {
2957 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2958 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2959 		} else {
2960 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2961 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2962 		}
2963 
2964 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2965 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2966 	}
2967 }
2968 
2969 /**
2970  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2971  *
2972  * @asic_type: AMD asic type
2973  *
2974  * Check if there is DC (new modesetting infrastructure) support for an asic.
2975  * returns true if DC has support, false if not.
2976  */
2977 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2978 {
2979 	switch (asic_type) {
2980 #if defined(CONFIG_DRM_AMD_DC)
2981 #if defined(CONFIG_DRM_AMD_DC_SI)
2982 	case CHIP_TAHITI:
2983 	case CHIP_PITCAIRN:
2984 	case CHIP_VERDE:
2985 	case CHIP_OLAND:
2986 #endif
2987 	case CHIP_BONAIRE:
2988 	case CHIP_KAVERI:
2989 	case CHIP_KABINI:
2990 	case CHIP_MULLINS:
2991 		/*
2992 		 * We have systems in the wild with these ASICs that require
2993 		 * LVDS and VGA support which is not supported with DC.
2994 		 *
2995 		 * Fallback to the non-DC driver here by default so as not to
2996 		 * cause regressions.
2997 		 */
2998 		return amdgpu_dc > 0;
2999 	case CHIP_HAWAII:
3000 	case CHIP_CARRIZO:
3001 	case CHIP_STONEY:
3002 	case CHIP_POLARIS10:
3003 	case CHIP_POLARIS11:
3004 	case CHIP_POLARIS12:
3005 	case CHIP_VEGAM:
3006 	case CHIP_TONGA:
3007 	case CHIP_FIJI:
3008 	case CHIP_VEGA10:
3009 	case CHIP_VEGA12:
3010 	case CHIP_VEGA20:
3011 #if defined(CONFIG_DRM_AMD_DC_DCN)
3012 	case CHIP_RAVEN:
3013 	case CHIP_NAVI10:
3014 	case CHIP_NAVI14:
3015 	case CHIP_NAVI12:
3016 	case CHIP_RENOIR:
3017 #endif
3018 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3019 	case CHIP_SIENNA_CICHLID:
3020 	case CHIP_NAVY_FLOUNDER:
3021 #endif
3022 		return amdgpu_dc != 0;
3023 #endif
3024 	default:
3025 		if (amdgpu_dc > 0)
3026 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3027 					 "but isn't supported by ASIC, ignoring\n");
3028 		return false;
3029 	}
3030 }
3031 
3032 /**
3033  * amdgpu_device_has_dc_support - check if dc is supported
3034  *
3035  * @adev: amdgpu_device pointer
3036  *
3037  * Returns true for supported, false for not supported
3038  */
3039 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3040 {
3041 	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3042 		return false;
3043 
3044 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3045 }
3046 
3047 
3048 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3049 {
3050 	struct amdgpu_device *adev =
3051 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3052 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3053 
3054 	/* It's a bug to not have a hive within this function */
3055 	if (WARN_ON(!hive))
3056 		return;
3057 
3058 	/*
3059 	 * Use task barrier to synchronize all xgmi reset works across the
3060 	 * hive. task_barrier_enter and task_barrier_exit will block
3061 	 * until all the threads running the xgmi reset works reach
3062 	 * those points. task_barrier_full will do both blocks.
3063 	 */
3064 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3065 
3066 		task_barrier_enter(&hive->tb);
3067 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3068 
3069 		if (adev->asic_reset_res)
3070 			goto fail;
3071 
3072 		task_barrier_exit(&hive->tb);
3073 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3074 
3075 		if (adev->asic_reset_res)
3076 			goto fail;
3077 
3078 		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3079 			adev->mmhub.funcs->reset_ras_error_count(adev);
3080 	} else {
3081 
3082 		task_barrier_full(&hive->tb);
3083 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3084 	}
3085 
3086 fail:
3087 	if (adev->asic_reset_res)
3088 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3089 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3090 	amdgpu_put_xgmi_hive(hive);
3091 }
3092 
3093 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3094 {
3095 	char *input = amdgpu_lockup_timeout;
3096 	char *timeout_setting = NULL;
3097 	int index = 0;
3098 	long timeout;
3099 	int ret = 0;
3100 
3101 	/*
3102 	 * By default, the timeout for non-compute jobs is 10000 ms,
3103 	 * and there is no timeout enforced on compute jobs.
3104 	 * In SR-IOV or passthrough mode, the default timeout for
3105 	 * compute jobs is 60000 ms.
3106 	 */
3107 	adev->gfx_timeout = msecs_to_jiffies(10000);
3108 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3109 	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3110 		adev->compute_timeout =  msecs_to_jiffies(60000);
3111 	else
3112 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3113 
3114 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3115 		while ((timeout_setting = strsep(&input, ",")) &&
3116 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3117 			ret = kstrtol(timeout_setting, 0, &timeout);
3118 			if (ret)
3119 				return ret;
3120 
3121 			if (timeout == 0) {
3122 				index++;
3123 				continue;
3124 			} else if (timeout < 0) {
3125 				timeout = MAX_SCHEDULE_TIMEOUT;
3126 			} else {
3127 				timeout = msecs_to_jiffies(timeout);
3128 			}
3129 
3130 			switch (index++) {
3131 			case 0:
3132 				adev->gfx_timeout = timeout;
3133 				break;
3134 			case 1:
3135 				adev->compute_timeout = timeout;
3136 				break;
3137 			case 2:
3138 				adev->sdma_timeout = timeout;
3139 				break;
3140 			case 3:
3141 				adev->video_timeout = timeout;
3142 				break;
3143 			default:
3144 				break;
3145 			}
3146 		}
3147 		/*
3148 		 * There is only one value specified and
3149 		 * it should apply to all non-compute jobs.
3150 		 */
3151 		if (index == 1) {
3152 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3153 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3154 				adev->compute_timeout = adev->gfx_timeout;
3155 		}
3156 	}
3157 
3158 	return ret;
3159 }
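
/*
 * Example lockup_timeout strings handled by the parser above (values in
 * milliseconds; 0 keeps the default, a negative value means no timeout).
 * The per-engine order is gfx,compute,sdma,video:
 *
 *	amdgpu.lockup_timeout=10000
 *		// single value: gfx, sdma and video all get 10000
 *		// (and compute too under SR-IOV/passthrough)
 *	amdgpu.lockup_timeout=10000,60000,0,-1
 *		// gfx=10000, compute=60000, sdma keeps its
 *		// default, video never times out
 */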
3160 
3161 static const struct attribute *amdgpu_dev_attributes[] = {
3162 	&dev_attr_product_name.attr,
3163 	&dev_attr_product_number.attr,
3164 	&dev_attr_serial_number.attr,
3165 	&dev_attr_pcie_replay_count.attr,
3166 	NULL
3167 };
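
/*
 * These attributes appear under the device's sysfs directory, e.g.
 * (paths shown for a typical setup, assuming card0 is the amdgpu device):
 *
 *	/sys/class/drm/card0/device/product_name
 *	/sys/class/drm/card0/device/serial_number
 *	/sys/class/drm/card0/device/pcie_replay_count
 */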
3168 
3169 
3170 /**
3171  * amdgpu_device_init - initialize the driver
3172  *
3173  * @adev: amdgpu_device pointer
3174  * @flags: driver flags
3175  *
3176  * Initializes the driver info and hw (all asics).
3177  * Returns 0 for success or an error on failure.
3178  * Called at driver startup.
3179  */
3180 int amdgpu_device_init(struct amdgpu_device *adev,
3181 		       uint32_t flags)
3182 {
3183 	struct drm_device *ddev = adev_to_drm(adev);
3184 	struct pci_dev *pdev = adev->pdev;
3185 	int r, i;
3186 	bool boco = false;
3187 	u32 max_MBps;
3188 
3189 	adev->shutdown = false;
3190 	adev->flags = flags;
3191 
3192 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3193 		adev->asic_type = amdgpu_force_asic_type;
3194 	else
3195 		adev->asic_type = flags & AMD_ASIC_MASK;
3196 
3197 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3198 	if (amdgpu_emu_mode == 1)
3199 		adev->usec_timeout *= 10;
3200 	adev->gmc.gart_size = 512 * 1024 * 1024;
3201 	adev->accel_working = false;
3202 	adev->num_rings = 0;
3203 	adev->mman.buffer_funcs = NULL;
3204 	adev->mman.buffer_funcs_ring = NULL;
3205 	adev->vm_manager.vm_pte_funcs = NULL;
3206 	adev->vm_manager.vm_pte_num_scheds = 0;
3207 	adev->gmc.gmc_funcs = NULL;
3208 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3209 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3210 
3211 	adev->smc_rreg = &amdgpu_invalid_rreg;
3212 	adev->smc_wreg = &amdgpu_invalid_wreg;
3213 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3214 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3215 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3216 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3217 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3218 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3219 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3220 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3221 	adev->didt_rreg = &amdgpu_invalid_rreg;
3222 	adev->didt_wreg = &amdgpu_invalid_wreg;
3223 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3224 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3225 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3226 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3227 
3228 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3229 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3230 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3231 
3232 	/* mutex initializations are all done here so we
3233 	 * can call these functions without locking issues */
3234 	atomic_set(&adev->irq.ih.lock, 0);
3235 	mutex_init(&adev->firmware.mutex);
3236 	mutex_init(&adev->pm.mutex);
3237 	mutex_init(&adev->gfx.gpu_clock_mutex);
3238 	mutex_init(&adev->srbm_mutex);
3239 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3240 	mutex_init(&adev->gfx.gfx_off_mutex);
3241 	mutex_init(&adev->grbm_idx_mutex);
3242 	mutex_init(&adev->mn_lock);
3243 	mutex_init(&adev->virt.vf_errors.lock);
3244 	hash_init(adev->mn_hash);
3245 	atomic_set(&adev->in_gpu_reset, 0);
3246 	init_rwsem(&adev->reset_sem);
3247 	mutex_init(&adev->psp.mutex);
3248 	mutex_init(&adev->notifier_lock);
3249 
3250 	r = amdgpu_device_check_arguments(adev);
3251 	if (r)
3252 		return r;
3253 
3254 	spin_lock_init(&adev->mmio_idx_lock);
3255 	spin_lock_init(&adev->smc_idx_lock);
3256 	spin_lock_init(&adev->pcie_idx_lock);
3257 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3258 	spin_lock_init(&adev->didt_idx_lock);
3259 	spin_lock_init(&adev->gc_cac_idx_lock);
3260 	spin_lock_init(&adev->se_cac_idx_lock);
3261 	spin_lock_init(&adev->audio_endpt_idx_lock);
3262 	spin_lock_init(&adev->mm_stats.lock);
3263 
3264 	INIT_LIST_HEAD(&adev->shadow_list);
3265 	mutex_init(&adev->shadow_list_lock);
3266 
3267 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3268 			  amdgpu_device_delayed_init_work_handler);
3269 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3270 			  amdgpu_device_delay_enable_gfx_off);
3271 
3272 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3273 
3274 	adev->gfx.gfx_off_req_count = 1;
3275 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3276 
3277 	atomic_set(&adev->throttling_logging_enabled, 1);
3278 	/*
3279 	 * If throttling continues, logging will be performed every minute
3280 	 * to avoid log flooding. "-1" is subtracted since the thermal
3281 	 * throttling interrupt comes every second. Thus, the total logging
3282 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3283 	 * for the throttling interrupt) = 60 seconds.
3284 	 */
3285 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3286 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3287 
3288 	/* Registers mapping */
3289 	/* TODO: block userspace mapping of io register */
3290 	if (adev->asic_type >= CHIP_BONAIRE) {
3291 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3292 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3293 	} else {
3294 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3295 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3296 	}
3297 
3298 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3299 	if (adev->rmmio == NULL) {
3300 		return -ENOMEM;
3301 	}
3302 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3303 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3304 
3305 	/* io port mapping */
3306 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3307 		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3308 			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3309 			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3310 			break;
3311 		}
3312 	}
3313 	if (adev->rio_mem == NULL)
3314 		DRM_INFO("PCI I/O BAR is not found.\n");
3315 
3316 	/* enable PCIE atomic ops */
3317 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3318 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3319 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3320 	if (r) {
3321 		adev->have_atomics_support = false;
3322 		DRM_INFO("PCIE atomic ops is not supported\n");
3323 	} else {
3324 		adev->have_atomics_support = true;
3325 	}
3326 
3327 	amdgpu_device_get_pcie_info(adev);
3328 
3329 	if (amdgpu_mcbp)
3330 		DRM_INFO("MCBP is enabled\n");
3331 
3332 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3333 		adev->enable_mes = true;
3334 
3335 	/* detect hw virtualization here */
3336 	amdgpu_detect_virtualization(adev);
3337 
3338 	r = amdgpu_device_get_job_timeout_settings(adev);
3339 	if (r) {
3340 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3341 		return r;
3342 	}
3343 
3344 	/* early init functions */
3345 	r = amdgpu_device_ip_early_init(adev);
3346 	if (r)
3347 		return r;
3348 
3349 	/* doorbell bar mapping and doorbell index init*/
3350 	amdgpu_device_doorbell_init(adev);
3351 
3352 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3353 	/* this will fail for cards that aren't VGA class devices, just
3354 	 * ignore it */
3355 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3356 
3357 	if (amdgpu_device_supports_boco(ddev))
3358 		boco = true;
3359 	if (amdgpu_has_atpx() &&
3360 	    (amdgpu_is_atpx_hybrid() ||
3361 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3362 	    !pci_is_thunderbolt_attached(adev->pdev))
3363 		vga_switcheroo_register_client(adev->pdev,
3364 					       &amdgpu_switcheroo_ops, boco);
3365 	if (boco)
3366 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3367 
3368 	if (amdgpu_emu_mode == 1) {
3369 		/* post the asic in emulation mode */
3370 		emu_soc_asic_init(adev);
3371 		goto fence_driver_init;
3372 	}
3373 
3374 	/* detect whether we are running with an SR-IOV vBIOS */
3375 	amdgpu_device_detect_sriov_bios(adev);
3376 
3377 	/* check if we need to reset the asic
3378 	 *  E.g., driver was not cleanly unloaded previously, etc.
3379 	 */
3380 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3381 		r = amdgpu_asic_reset(adev);
3382 		if (r) {
3383 			dev_err(adev->dev, "asic reset on init failed\n");
3384 			goto failed;
3385 		}
3386 	}
3387 
3388 	pci_enable_pcie_error_reporting(adev->ddev.pdev);
3389 
3390 	/* Post card if necessary */
3391 	if (amdgpu_device_need_post(adev)) {
3392 		if (!adev->bios) {
3393 			dev_err(adev->dev, "no vBIOS found\n");
3394 			r = -EINVAL;
3395 			goto failed;
3396 		}
3397 		DRM_INFO("GPU posting now...\n");
3398 		r = amdgpu_device_asic_init(adev);
3399 		if (r) {
3400 			dev_err(adev->dev, "gpu post error!\n");
3401 			goto failed;
3402 		}
3403 	}
3404 
3405 	if (adev->is_atom_fw) {
3406 		/* Initialize clocks */
3407 		r = amdgpu_atomfirmware_get_clock_info(adev);
3408 		if (r) {
3409 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3410 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3411 			goto failed;
3412 		}
3413 	} else {
3414 		/* Initialize clocks */
3415 		r = amdgpu_atombios_get_clock_info(adev);
3416 		if (r) {
3417 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3418 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3419 			goto failed;
3420 		}
3421 		/* init i2c buses */
3422 		if (!amdgpu_device_has_dc_support(adev))
3423 			amdgpu_atombios_i2c_init(adev);
3424 	}
3425 
3426 fence_driver_init:
3427 	/* Fence driver */
3428 	r = amdgpu_fence_driver_init(adev);
3429 	if (r) {
3430 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3431 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3432 		goto failed;
3433 	}
3434 
3435 	/* init the mode config */
3436 	drm_mode_config_init(adev_to_drm(adev));
3437 
3438 	r = amdgpu_device_ip_init(adev);
3439 	if (r) {
3440 		/* failed in exclusive mode due to timeout */
3441 		if (amdgpu_sriov_vf(adev) &&
3442 		    !amdgpu_sriov_runtime(adev) &&
3443 		    amdgpu_virt_mmio_blocked(adev) &&
3444 		    !amdgpu_virt_wait_reset(adev)) {
3445 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3446 			/* Don't send request since VF is inactive. */
3447 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3448 			adev->virt.ops = NULL;
3449 			r = -EAGAIN;
3450 			goto failed;
3451 		}
3452 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3453 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3454 		goto failed;
3455 	}
3456 
3457 	dev_info(adev->dev,
3458 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3459 			adev->gfx.config.max_shader_engines,
3460 			adev->gfx.config.max_sh_per_se,
3461 			adev->gfx.config.max_cu_per_sh,
3462 			adev->gfx.cu_info.number);
3463 
3464 	adev->accel_working = true;
3465 
3466 	amdgpu_vm_check_compute_bug(adev);
3467 
3468 	/* Initialize the buffer migration limit. */
3469 	if (amdgpu_moverate >= 0)
3470 		max_MBps = amdgpu_moverate;
3471 	else
3472 		max_MBps = 8; /* Allow 8 MB/s. */
3473 	/* Get a log2 for easy divisions. */
3474 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3475 
3476 	amdgpu_fbdev_init(adev);
3477 
3478 	r = amdgpu_pm_sysfs_init(adev);
3479 	if (r) {
3480 		adev->pm_sysfs_en = false;
3481 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3482 	} else
3483 		adev->pm_sysfs_en = true;
3484 
3485 	r = amdgpu_ucode_sysfs_init(adev);
3486 	if (r) {
3487 		adev->ucode_sysfs_en = false;
3488 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3489 	} else
3490 		adev->ucode_sysfs_en = true;
3491 
3492 	if (amdgpu_testing & 1) {
3493 		if (adev->accel_working)
3494 			amdgpu_test_moves(adev);
3495 		else
3496 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3497 	}
3498 	if (amdgpu_benchmarking) {
3499 		if (adev->accel_working)
3500 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3501 		else
3502 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3503 	}
3504 
3505 	/*
3506 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3507 	 * Otherwise the mgpu fan boost feature will be skipped because the
3508 	 * gpu instance count will be too low.
3509 	 */
3510 	amdgpu_register_gpu_instance(adev);
3511 
3512 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3513 	 * explicit gating rather than handling it automatically.
3514 	 */
3515 	r = amdgpu_device_ip_late_init(adev);
3516 	if (r) {
3517 		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3518 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3519 		goto failed;
3520 	}
3521 
3522 	/* must succeed. */
3523 	amdgpu_ras_resume(adev);
3524 
3525 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3526 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3527 
3528 	if (amdgpu_sriov_vf(adev))
3529 		flush_delayed_work(&adev->delayed_init_work);
3530 
3531 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3532 	if (r)
3533 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3534 
3535 	if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3536 		r = amdgpu_pmu_init(adev);
3537 		if (r)
3538 			dev_err(adev->dev, "amdgpu_pmu_init failed\n");
	}
3539 
3540 	/* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3541 	if (amdgpu_device_cache_pci_state(adev->pdev))
3542 		pci_restore_state(pdev);
3543 
3544 	return 0;
3545 
3546 failed:
3547 	amdgpu_vf_error_trans_all(adev);
3548 	if (boco)
3549 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3550 
3551 	return r;
3552 }
3553 
3554 /**
3555  * amdgpu_device_fini - tear down the driver
3556  *
3557  * @adev: amdgpu_device pointer
3558  *
3559  * Tear down the driver info (all asics).
3560  * Called at driver shutdown.
3561  */
3562 void amdgpu_device_fini(struct amdgpu_device *adev)
3563 {
3564 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3565 	flush_delayed_work(&adev->delayed_init_work);
3566 	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3567 	adev->shutdown = true;
3568 
3569 	kfree(adev->pci_state);
3570 
3571 	/* make sure the IB test has finished before entering exclusive mode
3572 	 * to avoid preemption during the IB test
3573 	 */
3574 	if (amdgpu_sriov_vf(adev)) {
3575 		amdgpu_virt_request_full_gpu(adev, false);
3576 		amdgpu_virt_fini_data_exchange(adev);
3577 	}
3578 
3579 	/* disable all interrupts */
3580 	amdgpu_irq_disable_all(adev);
3581 	if (adev->mode_info.mode_config_initialized) {
3582 		if (!amdgpu_device_has_dc_support(adev))
3583 			drm_helper_force_disable_all(adev_to_drm(adev));
3584 		else
3585 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3586 	}
3587 	amdgpu_fence_driver_fini(adev);
3588 	if (adev->pm_sysfs_en)
3589 		amdgpu_pm_sysfs_fini(adev);
3590 	amdgpu_fbdev_fini(adev);
3591 	amdgpu_device_ip_fini(adev);
3592 	release_firmware(adev->firmware.gpu_info_fw);
3593 	adev->firmware.gpu_info_fw = NULL;
3594 	adev->accel_working = false;
3595 	/* free i2c buses */
3596 	if (!amdgpu_device_has_dc_support(adev))
3597 		amdgpu_i2c_fini(adev);
3598 
3599 	if (amdgpu_emu_mode != 1)
3600 		amdgpu_atombios_fini(adev);
3601 
3602 	kfree(adev->bios);
3603 	adev->bios = NULL;
3604 	if (amdgpu_has_atpx() &&
3605 	    (amdgpu_is_atpx_hybrid() ||
3606 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3607 	    !pci_is_thunderbolt_attached(adev->pdev))
3608 		vga_switcheroo_unregister_client(adev->pdev);
3609 	if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3610 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3611 	vga_client_register(adev->pdev, NULL, NULL, NULL);
3612 	if (adev->rio_mem)
3613 		pci_iounmap(adev->pdev, adev->rio_mem);
3614 	adev->rio_mem = NULL;
3615 	iounmap(adev->rmmio);
3616 	adev->rmmio = NULL;
3617 	amdgpu_device_doorbell_fini(adev);
3618 
3619 	if (adev->ucode_sysfs_en)
3620 		amdgpu_ucode_sysfs_fini(adev);
3621 
3622 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3623 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3624 		amdgpu_pmu_fini(adev);
3625 	if (adev->mman.discovery_bin)
3626 		amdgpu_discovery_fini(adev);
3627 }
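
/*
 * Illustrative sketch (editor's note, hypothetical caller): the unload path
 * is expected to pair this teardown with the earlier device init, e.g.
 *
 *	static void my_driver_unload(struct drm_device *dev)
 *	{
 *		struct amdgpu_device *adev = drm_to_adev(dev);
 *
 *		amdgpu_device_fini(adev);
 *	}
 *
 * my_driver_unload() is a placeholder name, not a function in this file.
 */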
3628 
3629 
3630 /*
3631  * Suspend & resume.
3632  */
3633 /**
3634  * amdgpu_device_suspend - initiate device suspend
3635  *
3636  * @dev: drm dev pointer
3637  * @fbcon: notify the fbdev of suspend
3638  *
3639  * Puts the hw in the suspend state (all asics).
3640  * Returns 0 for success or an error on failure.
3641  * Called at driver suspend.
3642  */
3643 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3644 {
3645 	struct amdgpu_device *adev;
3646 	struct drm_crtc *crtc;
3647 	struct drm_connector *connector;
3648 	struct drm_connector_list_iter iter;
3649 	int r;
3650 
3651 	adev = drm_to_adev(dev);
3652 
3653 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3654 		return 0;
3655 
3656 	adev->in_suspend = true;
3657 	drm_kms_helper_poll_disable(dev);
3658 
3659 	if (fbcon)
3660 		amdgpu_fbdev_set_suspend(adev, 1);
3661 
3662 	cancel_delayed_work_sync(&adev->delayed_init_work);
3663 
3664 	if (!amdgpu_device_has_dc_support(adev)) {
3665 		/* turn off display hw */
3666 		drm_modeset_lock_all(dev);
3667 		drm_connector_list_iter_begin(dev, &iter);
3668 		drm_for_each_connector_iter(connector, &iter)
3669 			drm_helper_connector_dpms(connector,
3670 						  DRM_MODE_DPMS_OFF);
3671 		drm_connector_list_iter_end(&iter);
3672 		drm_modeset_unlock_all(dev);
3673 		/* unpin the front buffers and cursors */
3674 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3675 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3676 			struct drm_framebuffer *fb = crtc->primary->fb;
3677 			struct amdgpu_bo *robj;
3678 
3679 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3680 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3681 				r = amdgpu_bo_reserve(aobj, true);
3682 				if (r == 0) {
3683 					amdgpu_bo_unpin(aobj);
3684 					amdgpu_bo_unreserve(aobj);
3685 				}
3686 			}
3687 
3688 			if (fb == NULL || fb->obj[0] == NULL) {
3689 				continue;
3690 			}
3691 			robj = gem_to_amdgpu_bo(fb->obj[0]);
3692 			/* don't unpin kernel fb objects */
3693 			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3694 				r = amdgpu_bo_reserve(robj, true);
3695 				if (r == 0) {
3696 					amdgpu_bo_unpin(robj);
3697 					amdgpu_bo_unreserve(robj);
3698 				}
3699 			}
3700 		}
3701 	}
3702 
3703 	amdgpu_ras_suspend(adev);
3704 
3705 	r = amdgpu_device_ip_suspend_phase1(adev);
3706 
3707 	amdgpu_amdkfd_suspend(adev, !fbcon);
3708 
3709 	/* evict vram memory */
3710 	amdgpu_bo_evict_vram(adev);
3711 
3712 	amdgpu_fence_driver_suspend(adev);
3713 
3714 	r = amdgpu_device_ip_suspend_phase2(adev);
3715 
3716 	/* evict remaining vram memory
3717 	 * This second call to evict vram is to evict the gart page table
3718 	 * using the CPU.
3719 	 */
3720 	amdgpu_bo_evict_vram(adev);
3721 
3722 	return 0;
3723 }
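
/*
 * Illustrative sketch (editor's note, hypothetical callers): the suspend
 * helper above and the resume helper below are meant to be driven as a
 * pair from dev_pm_ops-style callbacks, e.g.
 *
 *	static int my_pmops_suspend(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_suspend(drm_dev, true);
 *	}
 *
 * with a matching my_pmops_resume() calling amdgpu_device_resume().
 * my_pmops_* are placeholder names; the real callbacks live elsewhere in
 * the driver.
 */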
3724 
3725 /**
3726  * amdgpu_device_resume - initiate device resume
3727  *
3728  * @dev: drm dev pointer
3729  * @fbcon: notify the fbdev of resume
3730  *
3731  * Bring the hw back to operating state (all asics).
3732  * Returns 0 for success or an error on failure.
3733  * Called at driver resume.
3734  */
3735 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3736 {
3737 	struct drm_connector *connector;
3738 	struct drm_connector_list_iter iter;
3739 	struct amdgpu_device *adev = drm_to_adev(dev);
3740 	struct drm_crtc *crtc;
3741 	int r = 0;
3742 
3743 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3744 		return 0;
3745 
3746 	/* post card */
3747 	if (amdgpu_device_need_post(adev)) {
3748 		r = amdgpu_device_asic_init(adev);
3749 		if (r)
3750 			dev_err(adev->dev, "amdgpu asic init failed\n");
3751 	}
3752 
3753 	r = amdgpu_device_ip_resume(adev);
3754 	if (r) {
3755 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3756 		return r;
3757 	}
3758 	amdgpu_fence_driver_resume(adev);
3759 
3760 
3761 	r = amdgpu_device_ip_late_init(adev);
3762 	if (r)
3763 		return r;
3764 
3765 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3766 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3767 
3768 	if (!amdgpu_device_has_dc_support(adev)) {
3769 		/* pin cursors */
3770 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3771 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3772 
3773 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3774 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3775 				r = amdgpu_bo_reserve(aobj, true);
3776 				if (r == 0) {
3777 					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3778 					if (r != 0)
3779 						dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3780 					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3781 					amdgpu_bo_unreserve(aobj);
3782 				}
3783 			}
3784 		}
3785 	}
3786 	r = amdgpu_amdkfd_resume(adev, !fbcon);
3787 	if (r)
3788 		return r;
3789 
3790 	/* Make sure IB tests are flushed */
3791 	flush_delayed_work(&adev->delayed_init_work);
3792 
3793 	/* blat the mode back in */
3794 	if (fbcon) {
3795 		if (!amdgpu_device_has_dc_support(adev)) {
3796 			/* pre DCE11 */
3797 			drm_helper_resume_force_mode(dev);
3798 
3799 			/* turn on display hw */
3800 			drm_modeset_lock_all(dev);
3801 
3802 			drm_connector_list_iter_begin(dev, &iter);
3803 			drm_for_each_connector_iter(connector, &iter)
3804 				drm_helper_connector_dpms(connector,
3805 							  DRM_MODE_DPMS_ON);
3806 			drm_connector_list_iter_end(&iter);
3807 
3808 			drm_modeset_unlock_all(dev);
3809 		}
3810 		amdgpu_fbdev_set_suspend(adev, 0);
3811 	}
3812 
3813 	drm_kms_helper_poll_enable(dev);
3814 
3815 	amdgpu_ras_resume(adev);
3816 
3817 	/*
3818 	 * Most of the connector probing functions try to acquire runtime pm
3819 	 * refs to ensure that the GPU is powered on when connector polling is
3820 	 * performed. Since we're calling this from a runtime PM callback,
3821 	 * trying to acquire rpm refs will cause us to deadlock.
3822 	 *
3823 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3824 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3825 	 */
3826 #ifdef CONFIG_PM
3827 	dev->dev->power.disable_depth++;
3828 #endif
3829 	if (!amdgpu_device_has_dc_support(adev))
3830 		drm_helper_hpd_irq_event(dev);
3831 	else
3832 		drm_kms_helper_hotplug_event(dev);
3833 #ifdef CONFIG_PM
3834 	dev->dev->power.disable_depth--;
3835 #endif
3836 	adev->in_suspend = false;
3837 
3838 	return 0;
3839 }
3840 
3841 /**
3842  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3843  *
3844  * @adev: amdgpu_device pointer
3845  *
3846  * The list of all the hardware IPs that make up the asic is walked and
3847  * the check_soft_reset callbacks are run.  check_soft_reset determines
3848  * if the asic is still hung or not.
3849  * Returns true if any of the IPs are still in a hung state, false if not.
3850  */
3851 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3852 {
3853 	int i;
3854 	bool asic_hang = false;
3855 
3856 	if (amdgpu_sriov_vf(adev))
3857 		return true;
3858 
3859 	if (amdgpu_asic_need_full_reset(adev))
3860 		return true;
3861 
3862 	for (i = 0; i < adev->num_ip_blocks; i++) {
3863 		if (!adev->ip_blocks[i].status.valid)
3864 			continue;
3865 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3866 			adev->ip_blocks[i].status.hang =
3867 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3868 		if (adev->ip_blocks[i].status.hang) {
3869 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3870 			asic_hang = true;
3871 		}
3872 	}
3873 	return asic_hang;
3874 }
3875 
3876 /**
3877  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3878  *
3879  * @adev: amdgpu_device pointer
3880  *
3881  * The list of all the hardware IPs that make up the asic is walked and the
3882  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3883  * handles any IP specific hardware or software state changes that are
3884  * necessary for a soft reset to succeed.
3885  * Returns 0 on success, negative error code on failure.
3886  */
3887 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3888 {
3889 	int i, r = 0;
3890 
3891 	for (i = 0; i < adev->num_ip_blocks; i++) {
3892 		if (!adev->ip_blocks[i].status.valid)
3893 			continue;
3894 		if (adev->ip_blocks[i].status.hang &&
3895 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3896 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3897 			if (r)
3898 				return r;
3899 		}
3900 	}
3901 
3902 	return 0;
3903 }
3904 
3905 /**
3906  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3907  *
3908  * @adev: amdgpu_device pointer
3909  *
3910  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3911  * reset is necessary to recover.
3912  * Returns true if a full asic reset is required, false if not.
3913  */
3914 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3915 {
3916 	int i;
3917 
3918 	if (amdgpu_asic_need_full_reset(adev))
3919 		return true;
3920 
3921 	for (i = 0; i < adev->num_ip_blocks; i++) {
3922 		if (!adev->ip_blocks[i].status.valid)
3923 			continue;
3924 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3925 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3926 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3927 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3928 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3929 			if (adev->ip_blocks[i].status.hang) {
3930 				dev_info(adev->dev, "Some blocks need a full reset!\n");
3931 				return true;
3932 			}
3933 		}
3934 	}
3935 	return false;
3936 }
3937 
3938 /**
3939  * amdgpu_device_ip_soft_reset - do a soft reset
3940  *
3941  * @adev: amdgpu_device pointer
3942  *
3943  * The list of all the hardware IPs that make up the asic is walked and the
3944  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3945  * IP specific hardware or software state changes that are necessary to soft
3946  * reset the IP.
3947  * Returns 0 on success, negative error code on failure.
3948  */
3949 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3950 {
3951 	int i, r = 0;
3952 
3953 	for (i = 0; i < adev->num_ip_blocks; i++) {
3954 		if (!adev->ip_blocks[i].status.valid)
3955 			continue;
3956 		if (adev->ip_blocks[i].status.hang &&
3957 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3958 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3959 			if (r)
3960 				return r;
3961 		}
3962 	}
3963 
3964 	return 0;
3965 }
3966 
3967 /**
3968  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3969  *
3970  * @adev: amdgpu_device pointer
3971  *
3972  * The list of all the hardware IPs that make up the asic is walked and the
3973  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3974  * handles any IP specific hardware or software state changes that are
3975  * necessary after the IP has been soft reset.
3976  * Returns 0 on success, negative error code on failure.
3977  */
3978 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3979 {
3980 	int i, r = 0;
3981 
3982 	for (i = 0; i < adev->num_ip_blocks; i++) {
3983 		if (!adev->ip_blocks[i].status.valid)
3984 			continue;
3985 		if (adev->ip_blocks[i].status.hang &&
3986 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
3987 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3988 		if (r)
3989 			return r;
3990 	}
3991 
3992 	return 0;
3993 }
3994 
3995 /**
3996  * amdgpu_device_recover_vram - Recover some VRAM contents
3997  *
3998  * @adev: amdgpu_device pointer
3999  *
4000  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4001  * restore things like GPUVM page tables after a GPU reset where
4002  * the contents of VRAM might be lost.
4003  *
4004  * Returns:
4005  * 0 on success, negative error code on failure.
4006  */
4007 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4008 {
4009 	struct dma_fence *fence = NULL, *next = NULL;
4010 	struct amdgpu_bo *shadow;
4011 	long r = 1, tmo;
4012 
4013 	if (amdgpu_sriov_runtime(adev))
4014 		tmo = msecs_to_jiffies(8000);
4015 	else
4016 		tmo = msecs_to_jiffies(100);
4017 
4018 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4019 	mutex_lock(&adev->shadow_list_lock);
4020 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4021 
4022 		/* No need to recover an evicted BO */
4023 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4024 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4025 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4026 			continue;
4027 
4028 		r = amdgpu_bo_restore_shadow(shadow, &next);
4029 		if (r)
4030 			break;
4031 
4032 		if (fence) {
4033 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4034 			dma_fence_put(fence);
4035 			fence = next;
4036 			if (tmo == 0) {
4037 				r = -ETIMEDOUT;
4038 				break;
4039 			} else if (tmo < 0) {
4040 				r = tmo;
4041 				break;
4042 			}
4043 		} else {
4044 			fence = next;
4045 		}
4046 	}
4047 	mutex_unlock(&adev->shadow_list_lock);
4048 
4049 	if (fence)
4050 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4051 	dma_fence_put(fence);
4052 
4053 	if (r < 0 || tmo <= 0) {
4054 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4055 		return -EIO;
4056 	}
4057 
4058 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4059 	return 0;
4060 }
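
/*
 * Editor's note (assumption for clarity): a "shadow" BO here is a GTT-side
 * copy paired with a parent BO in VRAM, so the restore direction after a
 * reset is
 *
 *	GTT shadow  --- amdgpu_bo_restore_shadow() --->  VRAM parent
 *
 * which is why the loop above skips BOs whose shadow is no longer resident
 * in GTT or whose parent no longer lives in VRAM.
 */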
4061 
4062 
4063 /**
4064  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4065  *
4066  * @adev: amdgpu_device pointer
4067  * @from_hypervisor: request from hypervisor
4068  *
4069  * Do VF FLR and reinitialize the ASIC.
4070  * Returns 0 on success, negative error code on failure.
4071  */
4072 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4073 				     bool from_hypervisor)
4074 {
4075 	int r;
4076 
4077 	if (from_hypervisor)
4078 		r = amdgpu_virt_request_full_gpu(adev, true);
4079 	else
4080 		r = amdgpu_virt_reset_gpu(adev);
4081 	if (r)
4082 		return r;
4083 
4084 	amdgpu_amdkfd_pre_reset(adev);
4085 
4086 	/* Resume IP prior to SMC */
4087 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4088 	if (r)
4089 		goto error;
4090 
4091 	amdgpu_virt_init_data_exchange(adev);
4092 	/* we need to recover the GART prior to running SMC/CP/SDMA resume */
4093 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4094 
4095 	r = amdgpu_device_fw_loading(adev);
4096 	if (r)
4097 		return r;
4098 
4099 	/* now we are okay to resume SMC/CP/SDMA */
4100 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4101 	if (r)
4102 		goto error;
4103 
4104 	amdgpu_irq_gpu_reset_resume_helper(adev);
4105 	r = amdgpu_ib_ring_tests(adev);
4106 	amdgpu_amdkfd_post_reset(adev);
4107 
4108 error:
4109 	amdgpu_virt_release_full_gpu(adev, true);
4110 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4111 		amdgpu_inc_vram_lost(adev);
4112 		r = amdgpu_device_recover_vram(adev);
4113 	}
4114 
4115 	return r;
4116 }
4117 
4118 /**
4119  * amdgpu_device_has_job_running - check if there is any job in the mirror list
4120  *
4121  * @adev: amdgpu_device pointer
4122  *
4123  * Check if there is any job in the mirror list.
4124  */
4125 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4126 {
4127 	int i;
4128 	struct drm_sched_job *job;
4129 
4130 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4131 		struct amdgpu_ring *ring = adev->rings[i];
4132 
4133 		if (!ring || !ring->sched.thread)
4134 			continue;
4135 
4136 		spin_lock(&ring->sched.job_list_lock);
4137 		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4138 				struct drm_sched_job, node);
4139 		spin_unlock(&ring->sched.job_list_lock);
4140 		if (job)
4141 			return true;
4142 	}
4143 	return false;
4144 }
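
/*
 * Illustrative sketch (editor's note, hypothetical caller): a teardown or
 * power-off path could poll this helper to drain outstanding work, e.g.
 *
 *	while (amdgpu_device_has_job_running(adev))
 *		msleep(10);
 *
 * with a timeout added in real code.
 */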
4145 
4146 /**
4147  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4148  *
4149  * @adev: amdgpu_device pointer
4150  *
4151  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4152  * a hung GPU.
4153  */
4154 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4155 {
4156 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4157 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4158 		return false;
4159 	}
4160 
4161 	if (amdgpu_gpu_recovery == 0)
4162 		goto disabled;
4163 
4164 	if (amdgpu_sriov_vf(adev))
4165 		return true;
4166 
4167 	if (amdgpu_gpu_recovery == -1) {
4168 		switch (adev->asic_type) {
4169 		case CHIP_BONAIRE:
4170 		case CHIP_HAWAII:
4171 		case CHIP_TOPAZ:
4172 		case CHIP_TONGA:
4173 		case CHIP_FIJI:
4174 		case CHIP_POLARIS10:
4175 		case CHIP_POLARIS11:
4176 		case CHIP_POLARIS12:
4177 		case CHIP_VEGAM:
4178 		case CHIP_VEGA20:
4179 		case CHIP_VEGA10:
4180 		case CHIP_VEGA12:
4181 		case CHIP_RAVEN:
4182 		case CHIP_ARCTURUS:
4183 		case CHIP_RENOIR:
4184 		case CHIP_NAVI10:
4185 		case CHIP_NAVI14:
4186 		case CHIP_NAVI12:
4187 		case CHIP_SIENNA_CICHLID:
4188 			break;
4189 		default:
4190 			goto disabled;
4191 		}
4192 	}
4193 
4194 	return true;
4195 
4196 disabled:
4197 		dev_info(adev->dev, "GPU recovery disabled.\n");
4198 		return false;
4199 }
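
/*
 * Editor's note, summarizing the checks above: the amdgpu_gpu_recovery
 * module parameter is interpreted as
 *
 *	 0 -> recovery disabled
 *	-1 -> auto: enabled for SR-IOV VFs and for the ASICs listed in the
 *	      switch statement, disabled otherwise
 *	 any other value (e.g. 1) -> recovery enabled
 */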
4200 
4201 
4202 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4203 					struct amdgpu_job *job,
4204 					bool *need_full_reset_arg)
4205 {
4206 	int i, r = 0;
4207 	bool need_full_reset  = *need_full_reset_arg;
4208 
4209 	amdgpu_debugfs_wait_dump(adev);
4210 
4211 	if (amdgpu_sriov_vf(adev)) {
4212 		/* stop the data exchange thread */
4213 		amdgpu_virt_fini_data_exchange(adev);
4214 	}
4215 
4216 	/* block all schedulers and reset given job's ring */
4217 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4218 		struct amdgpu_ring *ring = adev->rings[i];
4219 
4220 		if (!ring || !ring->sched.thread)
4221 			continue;
4222 
4223 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4224 		amdgpu_fence_driver_force_completion(ring);
4225 	}
4226 
4227 	if (job)
4228 		drm_sched_increase_karma(&job->base);
4229 
4230 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4231 	if (!amdgpu_sriov_vf(adev)) {
4232 
4233 		if (!need_full_reset)
4234 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4235 
4236 		if (!need_full_reset) {
4237 			amdgpu_device_ip_pre_soft_reset(adev);
4238 			r = amdgpu_device_ip_soft_reset(adev);
4239 			amdgpu_device_ip_post_soft_reset(adev);
4240 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4241 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4242 				need_full_reset = true;
4243 			}
4244 		}
4245 
4246 		if (need_full_reset)
4247 			r = amdgpu_device_ip_suspend(adev);
4248 
4249 		*need_full_reset_arg = need_full_reset;
4250 	}
4251 
4252 	return r;
4253 }
4254 
4255 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4256 			       struct list_head *device_list_handle,
4257 			       bool *need_full_reset_arg,
4258 			       bool skip_hw_reset)
4259 {
4260 	struct amdgpu_device *tmp_adev = NULL;
4261 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4262 	int r = 0;
4263 
4264 	/*
4265 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4266 	 * to allow proper link negotiation in FW (within 1 sec)
4267 	 */
4268 	if (!skip_hw_reset && need_full_reset) {
4269 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4270 			/* For XGMI run all resets in parallel to speed up the process */
4271 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4272 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4273 					r = -EALREADY;
4274 			} else
4275 				r = amdgpu_asic_reset(tmp_adev);
4276 
4277 			if (r) {
4278 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4279 					 r, adev_to_drm(tmp_adev)->unique);
4280 				break;
4281 			}
4282 		}
4283 
4284 		/* For XGMI wait for all resets to complete before proceed */
4285 		if (!r) {
4286 			list_for_each_entry(tmp_adev, device_list_handle,
4287 					    gmc.xgmi.head) {
4288 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4289 					flush_work(&tmp_adev->xgmi_reset_work);
4290 					r = tmp_adev->asic_reset_res;
4291 					if (r)
4292 						break;
4293 				}
4294 			}
4295 		}
4296 	}
4297 
4298 	if (!r && amdgpu_ras_intr_triggered()) {
4299 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4300 			if (tmp_adev->mmhub.funcs &&
4301 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
4302 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4303 		}
4304 
4305 		amdgpu_ras_intr_cleared();
4306 	}
4307 
4308 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4309 		if (need_full_reset) {
4310 			/* post card */
4311 			if (amdgpu_device_asic_init(tmp_adev))
4312 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4313 
4314 			if (!r) {
4315 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4316 				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4317 				if (r)
4318 					goto out;
4319 
4320 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4321 				if (r)
4322 					goto out;
4323 
4324 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4325 				if (vram_lost) {
4326 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4327 					amdgpu_inc_vram_lost(tmp_adev);
4328 				}
4329 
4330 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4331 				if (r)
4332 					goto out;
4333 
4334 				r = amdgpu_device_fw_loading(tmp_adev);
4335 				if (r)
4336 					return r;
4337 
4338 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4339 				if (r)
4340 					goto out;
4341 
4342 				if (vram_lost)
4343 					amdgpu_device_fill_reset_magic(tmp_adev);
4344 
4345 				/*
4346 				 * Re-add this ASIC as tracked, since the reset
4347 				 * already completed successfully.
4348 				 */
4349 				amdgpu_register_gpu_instance(tmp_adev);
4350 
4351 				r = amdgpu_device_ip_late_init(tmp_adev);
4352 				if (r)
4353 					goto out;
4354 
4355 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4356 
4357 				/*
4358 				 * The GPU enters a bad state once the number
4359 				 * of faulty pages flagged by ECC reaches the
4360 				 * threshold, and RAS recovery is scheduled
4361 				 * next. So add a check here to break recovery
4362 				 * if the bad page threshold has indeed been
4363 				 * exceeded, and remind the user to retire this
4364 				 * GPU or set a bigger bad_page_threshold value
4365 				 * the next time the driver is probed.
4366 				 */
4367 				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4368 					/* must succeed. */
4369 					amdgpu_ras_resume(tmp_adev);
4370 				} else {
4371 					r = -EINVAL;
4372 					goto out;
4373 				}
4374 
4375 				/* Update PSP FW topology after reset */
4376 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4377 					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4378 			}
4379 		}
4380 
4381 out:
4382 		if (!r) {
4383 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4384 			r = amdgpu_ib_ring_tests(tmp_adev);
4385 			if (r) {
4386 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4387 				need_full_reset = true;
4388 				r = -EAGAIN;
4389 				goto end;
4390 			}
4391 		}
4392 
4393 		if (!r)
4394 			r = amdgpu_device_recover_vram(tmp_adev);
4395 		else
4396 			tmp_adev->asic_reset_res = r;
4397 	}
4398 
4399 end:
4400 	*need_full_reset_arg = need_full_reset;
4401 	return r;
4402 }
4403 
4404 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4405 				struct amdgpu_hive_info *hive)
4406 {
4407 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4408 		return false;
4409 
4410 	if (hive) {
4411 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4412 	} else {
4413 		down_write(&adev->reset_sem);
4414 	}
4415 
4416 	atomic_inc(&adev->gpu_reset_counter);
4417 	switch (amdgpu_asic_reset_method(adev)) {
4418 	case AMD_RESET_METHOD_MODE1:
4419 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4420 		break;
4421 	case AMD_RESET_METHOD_MODE2:
4422 		adev->mp1_state = PP_MP1_STATE_RESET;
4423 		break;
4424 	default:
4425 		adev->mp1_state = PP_MP1_STATE_NONE;
4426 		break;
4427 	}
4428 
4429 	return true;
4430 }
4431 
4432 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4433 {
4434 	amdgpu_vf_error_trans_all(adev);
4435 	adev->mp1_state = PP_MP1_STATE_NONE;
4436 	atomic_set(&adev->in_gpu_reset, 0);
4437 	up_write(&adev->reset_sem);
4438 }
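
/*
 * Illustrative sketch (editor's note): the lock/unlock helpers above
 * bracket a reset attempt, e.g.
 *
 *	if (amdgpu_device_lock_adev(adev, hive)) {
 *		... perform the reset ...
 *		amdgpu_device_unlock_adev(adev);
 *	}
 *
 * A failed trylock means another reset is already in flight for this adev.
 */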
4439 
4440 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4441 {
4442 	struct pci_dev *p = NULL;
4443 
4444 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4445 			adev->pdev->bus->number, 1);
4446 	if (p) {
4447 		pm_runtime_enable(&(p->dev));
4448 		pm_runtime_resume(&(p->dev));
4449 	}
4450 
4451 	pci_dev_put(p);
4452 }
4453 
4454 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4455 {
4456 	enum amd_reset_method reset_method;
4457 	struct pci_dev *p = NULL;
4458 	u64 expires;
4459 
4460 	/*
4461 	 * For now, only BACO and mode1 reset are confirmed to
4462 	 * suffer from the audio issue if not properly suspended.
4463 	 */
4464 	reset_method = amdgpu_asic_reset_method(adev);
4465 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4466 	     (reset_method != AMD_RESET_METHOD_MODE1))
4467 		return -EINVAL;
4468 
4469 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4470 			adev->pdev->bus->number, 1);
4471 	if (!p)
4472 		return -ENODEV;
4473 
4474 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4475 	if (!expires)
4476 		/*
4477 		 * If we cannot get the audio device's autosuspend delay,
4478 		 * a fixed 4s interval is used. Since 3s is the audio
4479 		 * controller's default autosuspend delay setting, the 4s
4480 		 * used here is guaranteed to cover it.
4481 		 */
4482 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4483 
4484 	while (!pm_runtime_status_suspended(&(p->dev))) {
4485 		if (!pm_runtime_suspend(&(p->dev)))
4486 			break;
4487 
4488 		if (expires < ktime_get_mono_fast_ns()) {
4489 			dev_warn(adev->dev, "failed to suspend display audio\n");
4490 			pci_dev_put(p);
4491 			/* TODO: abort the succeeding gpu reset? */
4492 			return -ETIMEDOUT;
4493 		}
4494 	}
4495 
4496 	pm_runtime_disable(&(p->dev));
4497 
4498 	pci_dev_put(p);
4499 	return 0;
4500 }
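
/*
 * Editor's note: both audio helpers above look up the GPU's HDMI/DP audio
 * codec as PCI function 1 of the GPU's own slot. For a hypothetical GPU at
 * 0000:03:00.0 that amounts to
 *
 *	pci_get_domain_bus_and_slot(0, 0x03, 1);	(0000:03:00.1)
 *
 * i.e. same domain, bus, and device, function 1.
 */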
4501 
4502 /**
4503  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4504  *
4505  * @adev: amdgpu_device pointer
4506  * @job: the job that triggered the hang
4507  *
4508  * Attempt to reset the GPU if it has hung (all asics).
4509  * Attempts a soft reset or full reset and reinitializes the ASIC.
4510  * Returns 0 for success or an error on failure.
4511  */
4512 
4513 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4514 			      struct amdgpu_job *job)
4515 {
4516 	struct list_head device_list, *device_list_handle =  NULL;
4517 	bool need_full_reset = false;
4518 	bool job_signaled = false;
4519 	struct amdgpu_hive_info *hive = NULL;
4520 	struct amdgpu_device *tmp_adev = NULL;
4521 	int i, r = 0;
4522 	bool need_emergency_restart = false;
4523 	bool audio_suspended = false;
4524 
4525 	/*
4526 	 * Special case: RAS triggered and full reset isn't supported
4527 	 */
4528 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4529 
4530 	/*
4531 	 * Flush RAM to disk so that after reboot
4532 	 * the user can read the log and see why the system rebooted.
4533 	 */
4534 	if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
4535 		amdgpu_ras_get_context(adev)->reboot) {
4536 		DRM_WARN("Emergency reboot.");
4537 
4538 		ksys_sync_helper();
4539 		emergency_restart();
4540 	}
4541 
4542 	dev_info(adev->dev, "GPU %s begin!\n",
4543 		need_emergency_restart ? "jobs stop":"reset");
4544 
4545 	/*
4546 	 * Here we trylock to avoid a chain of resets executing, triggered
4547 	 * either by jobs on different adevs in an XGMI hive or by jobs on
4548 	 * different schedulers for the same device, while this TO handler
4549 	 * is running. We always reset all schedulers for a device and all
4550 	 * devices in an XGMI hive, so that should take care of them too.
4551 	 */
4552 	hive = amdgpu_get_xgmi_hive(adev);
4553 	if (hive) {
4554 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4555 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4556 				job ? job->base.id : -1, hive->hive_id);
4557 			amdgpu_put_xgmi_hive(hive);
4558 			return 0;
4559 		}
4560 		mutex_lock(&hive->hive_lock);
4561 	}
4562 
4563 	/*
4564 	 * Build list of devices to reset.
4565 	 * In case we are in XGMI hive mode, resort the device list
4566 	 * to put adev in the 1st position.
4567 	 */
4568 	INIT_LIST_HEAD(&device_list);
4569 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4570 		if (!hive)
4571 			return -ENODEV;
4572 		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4573 			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4574 		device_list_handle = &hive->device_list;
4575 	} else {
4576 		list_add_tail(&adev->gmc.xgmi.head, &device_list);
4577 		device_list_handle = &device_list;
4578 	}
4579 
4580 	/* block all schedulers and reset given job's ring */
4581 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4582 		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4583 			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4584 				  job ? job->base.id : -1);
4585 			r = 0;
4586 			goto skip_recovery;
4587 		}
4588 
4589 		/*
4590 		 * Try to put the audio codec into the suspend state
4591 		 * before the gpu reset starts.
4592 		 *
4593 		 * This is needed because the power domain of the
4594 		 * graphics device is shared with the AZ power domain.
4595 		 * Without this, we may change the audio hardware from
4596 		 * behind the audio driver's back and trigger
4597 		 * audio codec errors.
4598 		 */
4599 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4600 			audio_suspended = true;
4601 
4602 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4603 
4604 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4605 
4606 		if (!amdgpu_sriov_vf(tmp_adev))
4607 			amdgpu_amdkfd_pre_reset(tmp_adev);
4608 
4609 		/*
4610 		 * Mark these ASICs to be reset as untracked first,
4611 		 * and add them back after the reset completes
4612 		 */
4613 		amdgpu_unregister_gpu_instance(tmp_adev);
4614 
4615 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4616 
4617 		/* disable ras on ALL IPs */
4618 		if (!need_emergency_restart &&
4619 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4620 			amdgpu_ras_suspend(tmp_adev);
4621 
4622 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4623 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4624 
4625 			if (!ring || !ring->sched.thread)
4626 				continue;
4627 
4628 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4629 
4630 			if (need_emergency_restart)
4631 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4632 		}
4633 	}
4634 
4635 	if (need_emergency_restart)
4636 		goto skip_sched_resume;
4637 
4638 	/*
4639 	 * Must check guilty signal here since after this point all old
4640 	 * HW fences are force signaled.
4641 	 *
4642 	 * job->base holds a reference to parent fence
4643 	 */
4644 	if (job && job->base.s_fence->parent &&
4645 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4646 		job_signaled = true;
4647 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4648 		goto skip_hw_reset;
4649 	}
4650 
4651 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4652 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4653 		r = amdgpu_device_pre_asic_reset(tmp_adev,
4654 						 (tmp_adev == adev) ? job : NULL,
4655 						 &need_full_reset);
4656 		/* TODO: should we stop? */
4657 		if (r) {
4658 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4659 				  r, adev_to_drm(tmp_adev)->unique);
4660 			tmp_adev->asic_reset_res = r;
4661 		}
4662 	}
4663 
4664 	/* Actual ASIC resets if needed. */
4665 	/* TODO Implement XGMI hive reset logic for SRIOV */
4666 	if (amdgpu_sriov_vf(adev)) {
4667 		r = amdgpu_device_reset_sriov(adev, !job);
4668 		if (r)
4669 			adev->asic_reset_res = r;
4670 	} else {
4671 		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4672 		if (r == -EAGAIN)
4673 			goto retry;
4674 	}
4675 
4676 skip_hw_reset:
4677 
4678 	/* Post ASIC reset for all devs. */
4679 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4680 
4681 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4682 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4683 
4684 			if (!ring || !ring->sched.thread)
4685 				continue;
4686 
4687 			/* No point in resubmitting jobs if we didn't HW reset */
4688 			if (!tmp_adev->asic_reset_res && !job_signaled)
4689 				drm_sched_resubmit_jobs(&ring->sched);
4690 
4691 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4692 		}
4693 
4694 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4695 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4696 		}
4697 
4698 		tmp_adev->asic_reset_res = 0;
4699 
4700 		if (r) {
4701 			/* bad news, how to tell it to userspace ? */
4702 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4703 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4704 		} else {
4705 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4706 		}
4707 	}
4708 
4709 skip_sched_resume:
4710 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4711 		/* unlock kfd: SR-IOV does it separately */
4712 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4713 			amdgpu_amdkfd_post_reset(tmp_adev);
4714 		if (audio_suspended)
4715 			amdgpu_device_resume_display_audio(tmp_adev);
4716 		amdgpu_device_unlock_adev(tmp_adev);
4717 	}
4718 
4719 skip_recovery:
4720 	if (hive) {
4721 		atomic_set(&hive->in_reset, 0);
4722 		mutex_unlock(&hive->hive_lock);
4723 		amdgpu_put_xgmi_hive(hive);
4724 	}
4725 
4726 	if (r)
4727 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4728 	return r;
4729 }
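
/*
 * Editor's note, a rough summary of the recovery sequence implemented
 * above:
 *
 *	lock adev(s) -> suspend display audio and RAS -> stop schedulers
 *	  -> pre_asic_reset (soft reset, or IP suspend before a full reset)
 *	  -> do_asic_reset / reset_sriov (HW reset plus re-init if needed)
 *	  -> resubmit jobs and restart schedulers
 *	  -> resume display audio -> unlock adev(s)
 */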
4730 
4731 /**
4732  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIe slot
4733  *
4734  * @adev: amdgpu_device pointer
4735  *
4736  * Fetches and stores in the driver the PCIe capabilities (gen speed
4737  * and lanes) of the slot the device is in. Handles APUs and
4738  * virtualized environments where PCIe config space may not be available.
4739  */
4740 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4741 {
4742 	struct pci_dev *pdev;
4743 	enum pci_bus_speed speed_cap, platform_speed_cap;
4744 	enum pcie_link_width platform_link_width;
4745 
4746 	if (amdgpu_pcie_gen_cap)
4747 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4748 
4749 	if (amdgpu_pcie_lane_cap)
4750 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4751 
4752 	/* covers APUs as well */
4753 	if (pci_is_root_bus(adev->pdev->bus)) {
4754 		if (adev->pm.pcie_gen_mask == 0)
4755 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4756 		if (adev->pm.pcie_mlw_mask == 0)
4757 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4758 		return;
4759 	}
4760 
4761 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4762 		return;
4763 
4764 	pcie_bandwidth_available(adev->pdev, NULL,
4765 				 &platform_speed_cap, &platform_link_width);
4766 
4767 	if (adev->pm.pcie_gen_mask == 0) {
4768 		/* asic caps */
4769 		pdev = adev->pdev;
4770 		speed_cap = pcie_get_speed_cap(pdev);
4771 		if (speed_cap == PCI_SPEED_UNKNOWN) {
4772 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4773 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4774 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4775 		} else {
4776 			if (speed_cap == PCIE_SPEED_16_0GT)
4777 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4778 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4779 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4780 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4781 			else if (speed_cap == PCIE_SPEED_8_0GT)
4782 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4783 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4784 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4785 			else if (speed_cap == PCIE_SPEED_5_0GT)
4786 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4787 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4788 			else
4789 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4790 		}
4791 		/* platform caps */
4792 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4793 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4794 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4795 		} else {
4796 			if (platform_speed_cap == PCIE_SPEED_16_0GT)
4797 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4798 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4799 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4800 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4801 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4802 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4803 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4804 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4805 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4806 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4807 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4808 			else
4809 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4810 
4811 		}
4812 	}
4813 	if (adev->pm.pcie_mlw_mask == 0) {
4814 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4815 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4816 		} else {
4817 			switch (platform_link_width) {
4818 			case PCIE_LNK_X32:
4819 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4820 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4821 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4822 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4823 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4824 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4825 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4826 				break;
4827 			case PCIE_LNK_X16:
4828 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4829 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4830 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4831 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4832 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4833 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4834 				break;
4835 			case PCIE_LNK_X12:
4836 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4837 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4838 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4839 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4840 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4841 				break;
4842 			case PCIE_LNK_X8:
4843 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4844 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4845 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4846 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4847 				break;
4848 			case PCIE_LNK_X4:
4849 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4850 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4851 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4852 				break;
4853 			case PCIE_LNK_X2:
4854 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4855 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4856 				break;
4857 			case PCIE_LNK_X1:
4858 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4859 				break;
4860 			default:
4861 				break;
4862 			}
4863 		}
4864 	}
4865 }
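
/*
 * Illustrative sketch (editor's note): the masks built above are
 * cumulative. For a hypothetical Gen3-capable ASIC in a Gen3 x8 slot the
 * result would be roughly
 *
 *	pcie_gen_mask: GEN1 | GEN2 | GEN3 (asic and platform variants)
 *	pcie_mlw_mask: X8 | X4 | X2 | X1
 *
 * so every speed and width at or below the cap is reported as supported.
 */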
4866 
4867 int amdgpu_device_baco_enter(struct drm_device *dev)
4868 {
4869 	struct amdgpu_device *adev = drm_to_adev(dev);
4870 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4871 
4872 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4873 		return -ENOTSUPP;
4874 
4875 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4876 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4877 
4878 	return amdgpu_dpm_baco_enter(adev);
4879 }
4880 
4881 int amdgpu_device_baco_exit(struct drm_device *dev)
4882 {
4883 	struct amdgpu_device *adev = drm_to_adev(dev);
4884 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4885 	int ret = 0;
4886 
4887 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4888 		return -ENOTSUPP;
4889 
4890 	ret = amdgpu_dpm_baco_exit(adev);
4891 	if (ret)
4892 		return ret;
4893 
4894 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4895 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4896 
4897 	return 0;
4898 }
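
/*
 * Illustrative sketch (editor's note, hypothetical caller): BACO entry and
 * exit bracket a powered-down period, e.g. in a runtime-PM path:
 *
 *	r = amdgpu_device_baco_enter(dev);
 *	... device sits in BACO (bus active, chip off) ...
 *	r = amdgpu_device_baco_exit(dev);
 */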
4899 
4900 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4901 {
4902 	int i;
4903 
4904 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4905 		struct amdgpu_ring *ring = adev->rings[i];
4906 
4907 		if (!ring || !ring->sched.thread)
4908 			continue;
4909 
4910 		cancel_delayed_work_sync(&ring->sched.work_tdr);
4911 	}
4912 }
4913 
4914 /**
4915  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4916  * @pdev: PCI device struct
4917  * @state: PCI channel state
4918  *
4919  * Description: Called when a PCI error is detected.
4920  *
4921  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4922  */
4923 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4924 {
4925 	struct drm_device *dev = pci_get_drvdata(pdev);
4926 	struct amdgpu_device *adev = drm_to_adev(dev);
4927 	int i;
4928 
4929 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4930 
4931 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4932 		DRM_WARN("No support for XGMI hive yet...");
4933 		return PCI_ERS_RESULT_DISCONNECT;
4934 	}
4935 
4936 	switch (state) {
4937 	case pci_channel_io_normal:
4938 		return PCI_ERS_RESULT_CAN_RECOVER;
4939 	/* Fatal error, prepare for slot reset */
4940 	case pci_channel_io_frozen:
4941 		/*
4942 		 * Cancel and wait for all TDRs in progress if failing to
4943 		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4944 		 *
4945 		 * Locking adev->reset_sem will prevent any external access
4946 		 * to GPU during PCI error recovery
4947 		 */
4948 		while (!amdgpu_device_lock_adev(adev, NULL))
4949 			amdgpu_cancel_all_tdr(adev);
4950 
4951 		/*
4952 		 * Block any work scheduling as we do for regular GPU reset
4953 		 * for the duration of the recovery
4954 		 */
4955 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4956 			struct amdgpu_ring *ring = adev->rings[i];
4957 
4958 			if (!ring || !ring->sched.thread)
4959 				continue;
4960 
4961 			drm_sched_stop(&ring->sched, NULL);
4962 		}
4963 		return PCI_ERS_RESULT_NEED_RESET;
4964 	case pci_channel_io_perm_failure:
4965 		/* Permanent error, prepare for device removal */
4966 		return PCI_ERS_RESULT_DISCONNECT;
4967 	}
4968 
4969 	return PCI_ERS_RESULT_NEED_RESET;
4970 }
4971 
4972 /**
4973  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4974  * @pdev: pointer to PCI device
4975  */
4976 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4977 {
4978 
4979 	DRM_INFO("PCI error: mmio enabled callback!!\n");
4980 
4981 	/* TODO - dump whatever for debugging purposes */
4982 
4983 	/* This is called only if amdgpu_pci_error_detected returns
4984 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4985 	 * works, no need to reset slot.
4986 	 */
4987 
4988 	return PCI_ERS_RESULT_RECOVERED;
4989 }
4990 
4991 /**
4992  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4993  * @pdev: PCI device struct
4994  *
4995  * Description: This routine is called by the pci error recovery
4996  * code after the PCI slot has been reset, just before we
4997  * should resume normal operations.
4998  */
4999 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5000 {
5001 	struct drm_device *dev = pci_get_drvdata(pdev);
5002 	struct amdgpu_device *adev = drm_to_adev(dev);
5003 	int r, i;
5004 	bool need_full_reset = true;
5005 	u32 memsize;
5006 	struct list_head device_list;
5007 
5008 	DRM_INFO("PCI error: slot reset callback!!\n");
5009 
5010 	INIT_LIST_HEAD(&device_list);
5011 	list_add_tail(&adev->gmc.xgmi.head, &device_list);
5012 
5013 	/* wait for asic to come out of reset */
5014 	msleep(500);
5015 
5016 	/* Restore PCI confspace */
5017 	amdgpu_device_load_pci_state(pdev);
5018 
5019 	/* confirm ASIC came out of reset */
5020 	for (i = 0; i < adev->usec_timeout; i++) {
5021 		memsize = amdgpu_asic_get_config_memsize(adev);
5022 
5023 		if (memsize != 0xffffffff)
5024 			break;
5025 		udelay(1);
5026 	}
5027 	if (memsize == 0xffffffff) {
5028 		r = -ETIME;
5029 		goto out;
5030 	}
5031 
5032 	adev->in_pci_err_recovery = true;
5033 	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5034 	adev->in_pci_err_recovery = false;
5035 	if (r)
5036 		goto out;
5037 
5038 	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5039 
5040 out:
5041 	if (!r) {
5042 		if (amdgpu_device_cache_pci_state(adev->pdev))
5043 			pci_restore_state(adev->pdev);
5044 
5045 		DRM_INFO("PCIe error recovery succeeded\n");
5046 	} else {
5047 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5048 		amdgpu_device_unlock_adev(adev);
5049 	}
5050 
5051 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5052 }
5053 
5054 /**
5055  * amdgpu_pci_resume() - resume normal ops after PCI reset
5056  * @pdev: pointer to PCI device
5057  *
5058  * Called when the error recovery driver tells us that it's
5059  * OK to resume normal operation. Use completion to allow
5060  * halted ops to resume.
5061  */
5062 void amdgpu_pci_resume(struct pci_dev *pdev)
5063 {
5064 	struct drm_device *dev = pci_get_drvdata(pdev);
5065 	struct amdgpu_device *adev = drm_to_adev(dev);
5066 	int i;
5067 
5068 
5069 	DRM_INFO("PCI error: resume callback!!\n");
5070 
5071 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5072 		struct amdgpu_ring *ring = adev->rings[i];
5073 
5074 		if (!ring || !ring->sched.thread)
5075 			continue;
5076 
5077 
5078 		drm_sched_resubmit_jobs(&ring->sched);
5079 		drm_sched_start(&ring->sched, true);
5080 	}
5081 
5082 	amdgpu_device_unlock_adev(adev);
5083 }
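
/*
 * Editor's note: the callbacks above implement the kernel's PCI error
 * recovery (AER) contract. For a frozen channel the core walks them
 * roughly as
 *
 *	amdgpu_pci_error_detected()  -> PCI_ERS_RESULT_NEED_RESET
 *	amdgpu_pci_slot_reset()      -> PCI_ERS_RESULT_RECOVERED
 *	amdgpu_pci_resume()          -> schedulers restarted
 *
 * while amdgpu_pci_mmio_enabled() runs instead after a recoverable
 * (io_normal) error.
 */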
5084 
5085 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5086 {
5087 	struct drm_device *dev = pci_get_drvdata(pdev);
5088 	struct amdgpu_device *adev = drm_to_adev(dev);
5089 	int r;
5090 
5091 	r = pci_save_state(pdev);
5092 	if (!r) {
5093 		kfree(adev->pci_state);
5094 
5095 		adev->pci_state = pci_store_saved_state(pdev);
5096 
5097 		if (!adev->pci_state) {
5098 			DRM_ERROR("Failed to store PCI saved state");
5099 			return false;
5100 		}
5101 	} else {
5102 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5103 		return false;
5104 	}
5105 
5106 	return true;
5107 }
5108 
5109 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5110 {
5111 	struct drm_device *dev = pci_get_drvdata(pdev);
5112 	struct amdgpu_device *adev = drm_to_adev(dev);
5113 	int r;
5114 
5115 	if (!adev->pci_state)
5116 		return false;
5117 
5118 	r = pci_load_saved_state(pdev, adev->pci_state);
5119 
5120 	if (!r) {
5121 		pci_restore_state(pdev);
5122 	} else {
5123 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5124 		return false;
5125 	}
5126 
5127 	return true;
5128 }
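
/*
 * Illustrative sketch (editor's note): the cache/load helpers above form a
 * save-restore pair around events that clobber PCI config space, e.g.
 *
 *	amdgpu_device_cache_pci_state(adev->pdev);	(before a reset)
 *	...
 *	amdgpu_device_load_pci_state(adev->pdev);	(after the reset)
 *
 * as done in amdgpu_pci_slot_reset() above.
 */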
5129 
5130 
5131