1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72 
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 
84 #define AMDGPU_RESUME_MS		2000
85 
86 const char *amdgpu_asic_name[] = {
87 	"TAHITI",
88 	"PITCAIRN",
89 	"VERDE",
90 	"OLAND",
91 	"HAINAN",
92 	"BONAIRE",
93 	"KAVERI",
94 	"KABINI",
95 	"HAWAII",
96 	"MULLINS",
97 	"TOPAZ",
98 	"TONGA",
99 	"FIJI",
100 	"CARRIZO",
101 	"STONEY",
102 	"POLARIS10",
103 	"POLARIS11",
104 	"POLARIS12",
105 	"VEGAM",
106 	"VEGA10",
107 	"VEGA12",
108 	"VEGA20",
109 	"RAVEN",
110 	"ARCTURUS",
111 	"RENOIR",
112 	"NAVI10",
113 	"NAVI14",
114 	"NAVI12",
115 	"SIENNA_CICHLID",
116 	"NAVY_FLOUNDER",
117 	"LAST",
118 };
119 
120 /**
121  * DOC: pcie_replay_count
122  *
123  * The amdgpu driver provides a sysfs API for reporting the total number
124  * of PCIe replays (NAKs)
125  * The file pcie_replay_count is used for this and returns the total
126  * number of replays as a sum of the NAKs generated and NAKs received
127  */
128 
129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130 		struct device_attribute *attr, char *buf)
131 {
132 	struct drm_device *ddev = dev_get_drvdata(dev);
133 	struct amdgpu_device *adev = drm_to_adev(ddev);
134 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135 
136 	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138 
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140 		amdgpu_device_get_pcie_replay_count, NULL);
141 
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143 
144 /**
145  * DOC: product_name
146  *
147  * The amdgpu driver provides a sysfs API for reporting the product name
148  * for the device
149  * The file product_name is used for this and returns the product name
150  * as returned from the FRU.
151  * NOTE: This is only available for certain server cards
152  */
153 
154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155 		struct device_attribute *attr, char *buf)
156 {
157 	struct drm_device *ddev = dev_get_drvdata(dev);
158 	struct amdgpu_device *adev = drm_to_adev(ddev);
159 
160 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162 
163 static DEVICE_ATTR(product_name, S_IRUGO,
164 		amdgpu_device_get_product_name, NULL);
165 
166 /**
167  * DOC: product_number
168  *
169  * The amdgpu driver provides a sysfs API for reporting the part number
170  * for the device
171  * The file product_number is used for this and returns the part number
172  * as returned from the FRU.
173  * NOTE: This is only available for certain server cards
174  */
175 
176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177 		struct device_attribute *attr, char *buf)
178 {
179 	struct drm_device *ddev = dev_get_drvdata(dev);
180 	struct amdgpu_device *adev = drm_to_adev(ddev);
181 
182 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184 
185 static DEVICE_ATTR(product_number, S_IRUGO,
186 		amdgpu_device_get_product_number, NULL);
187 
188 /**
189  * DOC: serial_number
190  *
191  * The amdgpu driver provides a sysfs API for reporting the serial number
192  * for the device
193  * The file serial_number is used for this and returns the serial number
194  * as returned from the FRU.
195  * NOTE: This is only available for certain server cards
196  */
197 
198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199 		struct device_attribute *attr, char *buf)
200 {
201 	struct drm_device *ddev = dev_get_drvdata(dev);
202 	struct amdgpu_device *adev = drm_to_adev(ddev);
203 
204 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206 
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208 		amdgpu_device_get_serial_number, NULL);
209 
210 /**
211  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212  *
213  * @dev: drm_device pointer
214  *
215  * Returns true if the device is a dGPU with HG/PX power control,
216  * otherwise returns false.
217  */
218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220 	struct amdgpu_device *adev = drm_to_adev(dev);
221 
222 	if (adev->flags & AMD_IS_PX)
223 		return true;
224 	return false;
225 }
226 
227 /**
228  * amdgpu_device_supports_baco - Does the device support BACO
229  *
230  * @dev: drm_device pointer
231  *
232  * Returns true if the device supports BACO,
233  * otherwise returns false.
234  */
235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237 	struct amdgpu_device *adev = drm_to_adev(dev);
238 
239 	return amdgpu_asic_supports_baco(adev);
240 }
241 
242 /*
243  * VRAM access helper functions
244  */
245 
246 /**
247  * amdgpu_device_vram_access - read/write a buffer in vram
248  *
249  * @adev: amdgpu_device pointer
250  * @pos: offset of the buffer in vram
251  * @buf: virtual address of the buffer in system memory
252  * @size: read/write size; the buffer at @buf must be at least @size bytes
253  * @write: true - write to vram, otherwise - read from vram
254  */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256 			       uint32_t *buf, size_t size, bool write)
257 {
258 	unsigned long flags;
259 	uint32_t hi = ~0;
260 	uint64_t last;
261 
262 
263 #ifdef CONFIG_64BIT
264 	last = min(pos + size, adev->gmc.visible_vram_size);
265 	if (last > pos) {
266 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267 		size_t count = last - pos;
268 
269 		if (write) {
270 			memcpy_toio(addr, buf, count);
271 			mb();
272 			amdgpu_asic_flush_hdp(adev, NULL);
273 		} else {
274 			amdgpu_asic_invalidate_hdp(adev, NULL);
275 			mb();
276 			memcpy_fromio(buf, addr, count);
277 		}
278 
279 		if (count == size)
280 			return;
281 
282 		pos += count;
283 		buf += count / 4;
284 		size -= count;
285 	}
286 #endif
287 
288 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289 	for (last = pos + size; pos < last; pos += 4) {
290 		uint32_t tmp = pos >> 31;
291 
292 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293 		if (tmp != hi) {
294 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295 			hi = tmp;
296 		}
297 		if (write)
298 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
299 		else
300 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
301 	}
302 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
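/*
 * Example (a sketch, not code from this file): reading and writing one
 * dword at a VRAM offset with the helper above; callers supply a
 * system-memory buffer of at least @size bytes:
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), false);
 *   val |= 0x1;
 *   amdgpu_device_vram_access(adev, 0x1000, &val, sizeof(val), true);
 */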
304 
305 /*
306  * register access helper functions.
307  */
308 /**
309  * amdgpu_device_rreg - read a memory mapped IO or indirect register
310  *
311  * @adev: amdgpu_device pointer
312  * @reg: dword aligned register offset
313  * @acc_flags: access flags which require special behavior
314  *
315  * Returns the 32 bit value from the offset specified.
316  */
317 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
318 			    uint32_t reg, uint32_t acc_flags)
319 {
320 	uint32_t ret;
321 
322 	if (adev->in_pci_err_recovery)
323 		return 0;
324 
325 	if ((reg * 4) < adev->rmmio_size) {
326 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
327 		    amdgpu_sriov_runtime(adev) &&
328 		    down_read_trylock(&adev->reset_sem)) {
329 			ret = amdgpu_kiq_rreg(adev, reg);
330 			up_read(&adev->reset_sem);
331 		} else {
332 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
333 		}
334 	} else {
335 		ret = adev->pcie_rreg(adev, reg * 4);
336 	}
337 
338 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
339 
340 	return ret;
341 }
342 
343 /*
344  * MMIO register read with byte offset helper function
345  * @offset: byte offset from MMIO start
346  *
347 */
348 
349 /**
350  * amdgpu_mm_rreg8 - read a memory mapped IO register
351  *
352  * @adev: amdgpu_device pointer
353  * @offset: byte aligned register offset
354  *
355  * Returns the 8 bit value from the offset specified.
356  */
357 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
358 {
359 	if (adev->in_pci_err_recovery)
360 		return 0;
361 
362 	if (offset < adev->rmmio_size)
363 		return (readb(adev->rmmio + offset));
364 	BUG();
365 }
366 
367 /*
368  * MMIO register write with byte offset helper function
369  * @offset: byte offset from MMIO start
370  * @value: the value to be written to the register
371  *
372 */
373 /**
374  * amdgpu_mm_wreg8 - write a memory mapped IO register
375  *
376  * @adev: amdgpu_device pointer
377  * @offset: byte aligned register offset
378  * @value: 8 bit value to write
379  *
380  * Writes the value specified to the offset specified.
381  */
382 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
383 {
384 	if (adev->in_pci_err_recovery)
385 		return;
386 
387 	if (offset < adev->rmmio_size)
388 		writeb(value, adev->rmmio + offset);
389 	else
390 		BUG();
391 }
392 
393 /**
394  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
395  *
396  * @adev: amdgpu_device pointer
397  * @reg: dword aligned register offset
398  * @v: 32 bit value to write to the register
399  * @acc_flags: access flags which require special behavior
400  *
401  * Writes the value specified to the offset specified.
402  */
403 void amdgpu_device_wreg(struct amdgpu_device *adev,
404 			uint32_t reg, uint32_t v,
405 			uint32_t acc_flags)
406 {
407 	if (adev->in_pci_err_recovery)
408 		return;
409 
410 	if ((reg * 4) < adev->rmmio_size) {
411 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
412 		    amdgpu_sriov_runtime(adev) &&
413 		    down_read_trylock(&adev->reset_sem)) {
414 			amdgpu_kiq_wreg(adev, reg, v);
415 			up_read(&adev->reset_sem);
416 		} else {
417 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
418 		}
419 	} else {
420 		adev->pcie_wreg(adev, reg * 4, v);
421 	}
422 
423 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
424 }
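/*
 * IP code normally does not call amdgpu_device_rreg()/amdgpu_device_wreg()
 * directly but goes through the RREG32()/WREG32() family of macros from
 * amdgpu.h, which expand roughly as sketched below (simplified, macro
 * expansions assumed):
 *
 *   val = RREG32(reg);              // amdgpu_device_rreg(adev, reg, 0)
 *   WREG32(reg, val | 0x1);         // amdgpu_device_wreg(adev, reg, v, 0)
 *   val = RREG32_NO_KIQ(reg);       // skips the KIQ path under SR-IOV
 */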
425 
426 /*
427  * amdgpu_mm_wreg_mmio_rlc - write a register via MMIO or via the RLC path if it is in range
428  *
429  * This function is invoked only for debugfs register access.
430  */
431 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
432 			     uint32_t reg, uint32_t v)
433 {
434 	if (adev->in_pci_err_recovery)
435 		return;
436 
437 	if (amdgpu_sriov_fullaccess(adev) &&
438 	    adev->gfx.rlc.funcs &&
439 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
440 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
441 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
442 	} else {
443 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
444 	}
445 }
446 
447 /**
448  * amdgpu_io_rreg - read an IO register
449  *
450  * @adev: amdgpu_device pointer
451  * @reg: dword aligned register offset
452  *
453  * Returns the 32 bit value from the offset specified.
454  */
455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
456 {
457 	if (adev->in_pci_err_recovery)
458 		return 0;
459 
460 	if ((reg * 4) < adev->rio_mem_size)
461 		return ioread32(adev->rio_mem + (reg * 4));
462 	else {
463 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
464 		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
465 	}
466 }
467 
468 /**
469  * amdgpu_io_wreg - write to an IO register
470  *
471  * @adev: amdgpu_device pointer
472  * @reg: dword aligned register offset
473  * @v: 32 bit value to write to the register
474  *
475  * Writes the value specified to the offset specified.
476  */
477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
478 {
479 	if (adev->in_pci_err_recovery)
480 		return;
481 
482 	if ((reg * 4) < adev->rio_mem_size)
483 		iowrite32(v, adev->rio_mem + (reg * 4));
484 	else {
485 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
486 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
487 	}
488 }
489 
490 /**
491  * amdgpu_mm_rdoorbell - read a doorbell dword
492  *
493  * @adev: amdgpu_device pointer
494  * @index: doorbell index
495  *
496  * Returns the value in the doorbell aperture at the
497  * requested doorbell index (CIK).
498  */
499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
500 {
501 	if (adev->in_pci_err_recovery)
502 		return 0;
503 
504 	if (index < adev->doorbell.num_doorbells) {
505 		return readl(adev->doorbell.ptr + index);
506 	} else {
507 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
508 		return 0;
509 	}
510 }
511 
512 /**
513  * amdgpu_mm_wdoorbell - write a doorbell dword
514  *
515  * @adev: amdgpu_device pointer
516  * @index: doorbell index
517  * @v: value to write
518  *
519  * Writes @v to the doorbell aperture at the
520  * requested doorbell index (CIK).
521  */
522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
523 {
524 	if (adev->in_pci_err_recovery)
525 		return;
526 
527 	if (index < adev->doorbell.num_doorbells) {
528 		writel(v, adev->doorbell.ptr + index);
529 	} else {
530 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
531 	}
532 }
533 
534 /**
535  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
536  *
537  * @adev: amdgpu_device pointer
538  * @index: doorbell index
539  *
540  * Returns the value in the doorbell aperture at the
541  * requested doorbell index (VEGA10+).
542  */
543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
544 {
545 	if (adev->in_pci_err_recovery)
546 		return 0;
547 
548 	if (index < adev->doorbell.num_doorbells) {
549 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
550 	} else {
551 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
552 		return 0;
553 	}
554 }
555 
556 /**
557  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
558  *
559  * @adev: amdgpu_device pointer
560  * @index: doorbell index
561  * @v: value to write
562  *
563  * Writes @v to the doorbell aperture at the
564  * requested doorbell index (VEGA10+).
565  */
566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
567 {
568 	if (adev->in_pci_err_recovery)
569 		return;
570 
571 	if (index < adev->doorbell.num_doorbells) {
572 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
573 	} else {
574 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
575 	}
576 }
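/*
 * Ring code usually reaches these helpers through the doorbell macros in
 * amdgpu.h. A simplified sketch of how a ring might bump its write pointer
 * (macro name and usage assumed, not taken from this file):
 *
 *   if (ring->use_doorbell)
 *           WDOORBELL64(ring->doorbell_index, ring->wptr << 2);
 */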
577 
578 /**
579  * amdgpu_device_indirect_rreg - read an indirect register
580  *
581  * @adev: amdgpu_device pointer
582  * @pcie_index: mmio register offset of the index register
583  * @pcie_data: mmio register offset of the data register
584  *
585  * Returns the value of indirect register @reg_addr
586  */
587 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
588 				u32 pcie_index, u32 pcie_data,
589 				u32 reg_addr)
590 {
591 	unsigned long flags;
592 	u32 r;
593 	void __iomem *pcie_index_offset;
594 	void __iomem *pcie_data_offset;
595 
596 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
597 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
598 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
599 
600 	writel(reg_addr, pcie_index_offset);
601 	readl(pcie_index_offset);
602 	r = readl(pcie_data_offset);
603 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
604 
605 	return r;
606 }
607 
608 /**
609  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
610  *
611  * @adev: amdgpu_device pointer
612  * @pcie_index: mmio register offset of the index register
613  * @pcie_data: mmio register offset of the data register
614  *
615  * Returns the value of indirect register @reg_addr
616  */
617 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
618 				  u32 pcie_index, u32 pcie_data,
619 				  u32 reg_addr)
620 {
621 	unsigned long flags;
622 	u64 r;
623 	void __iomem *pcie_index_offset;
624 	void __iomem *pcie_data_offset;
625 
626 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
627 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
628 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
629 
630 	/* read low 32 bits */
631 	writel(reg_addr, pcie_index_offset);
632 	readl(pcie_index_offset);
633 	r = readl(pcie_data_offset);
634 	/* read high 32 bits */
635 	writel(reg_addr + 4, pcie_index_offset);
636 	readl(pcie_index_offset);
637 	r |= ((u64)readl(pcie_data_offset) << 32);
638 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
639 
640 	return r;
641 }
642 
643 /**
644  * amdgpu_device_indirect_wreg - write an indirect register address
645  *
646  * @adev: amdgpu_device pointer
647  * @pcie_index: mmio register offset of the index register
648  * @pcie_data: mmio register offset of the data register
649  * @reg_addr: indirect register offset
650  * @reg_data: indirect register data
651  *
652  */
653 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
654 				 u32 pcie_index, u32 pcie_data,
655 				 u32 reg_addr, u32 reg_data)
656 {
657 	unsigned long flags;
658 	void __iomem *pcie_index_offset;
659 	void __iomem *pcie_data_offset;
660 
661 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
662 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
663 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
664 
665 	writel(reg_addr, pcie_index_offset);
666 	readl(pcie_index_offset);
667 	writel(reg_data, pcie_data_offset);
668 	readl(pcie_data_offset);
669 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
670 }
671 
672 /**
673  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
674  *
675  * @adev: amdgpu_device pointer
676  * @pcie_index: mmio register offset of the index register
677  * @pcie_data: mmio register offset of the data register
678  * @reg_addr: indirect register offset
679  * @reg_data: indirect register data
680  *
681  */
682 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
683 				   u32 pcie_index, u32 pcie_data,
684 				   u32 reg_addr, u64 reg_data)
685 {
686 	unsigned long flags;
687 	void __iomem *pcie_index_offset;
688 	void __iomem *pcie_data_offset;
689 
690 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693 
694 	/* write low 32 bits */
695 	writel(reg_addr, pcie_index_offset);
696 	readl(pcie_index_offset);
697 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
698 	readl(pcie_data_offset);
699 	/* write high 32 bits */
700 	writel(reg_addr + 4, pcie_index_offset);
701 	readl(pcie_index_offset);
702 	writel((u32)(reg_data >> 32), pcie_data_offset);
703 	readl(pcie_data_offset);
704 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705 }
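/*
 * ASIC files wire their adev->pcie_rreg/pcie_wreg callbacks to the helpers
 * above using the NBIO index/data register pair. A sketch of the pattern
 * (function and callback names assumed, offsets come from the per-ASIC
 * NBIO code):
 *
 *   static u32 soc_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           u32 index = adev->nbio.funcs->get_pcie_index_offset(adev);
 *           u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *           return amdgpu_device_indirect_rreg(adev, index, data, reg);
 *   }
 */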
706 
707 /**
708  * amdgpu_invalid_rreg - dummy reg read function
709  *
710  * @adev: amdgpu_device pointer
711  * @reg: offset of register
712  *
713  * Dummy register read function.  Used for register blocks
714  * that certain asics don't have (all asics).
715  * Returns the value in the register.
716  */
717 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
718 {
719 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
720 	BUG();
721 	return 0;
722 }
723 
724 /**
725  * amdgpu_invalid_wreg - dummy reg write function
726  *
727  * @adev: amdgpu_device pointer
728  * @reg: offset of register
729  * @v: value to write to the register
730  *
731  * Dummy register write function.  Used for register blocks
732  * that certain asics don't have (all asics).
733  */
734 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
735 {
736 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
737 		  reg, v);
738 	BUG();
739 }
740 
741 /**
742  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
743  *
744  * @adev: amdgpu_device pointer
745  * @reg: offset of register
746  *
747  * Dummy register read function.  Used for register blocks
748  * that certain asics don't have (all asics).
749  * Returns the value in the register.
750  */
751 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
752 {
753 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
754 	BUG();
755 	return 0;
756 }
757 
758 /**
759  * amdgpu_invalid_wreg64 - dummy reg write function
760  *
761  * @adev: amdgpu_device pointer
762  * @reg: offset of register
763  * @v: value to write to the register
764  *
765  * Dummy register write function.  Used for register blocks
766  * that certain asics don't have (all asics).
767  */
768 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
769 {
770 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
771 		  reg, v);
772 	BUG();
773 }
774 
775 /**
776  * amdgpu_block_invalid_rreg - dummy reg read function
777  *
778  * @adev: amdgpu_device pointer
779  * @block: offset of instance
780  * @reg: offset of register
781  *
782  * Dummy register read function.  Used for register blocks
783  * that certain asics don't have (all asics).
784  * Returns the value in the register.
785  */
786 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
787 					  uint32_t block, uint32_t reg)
788 {
789 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
790 		  reg, block);
791 	BUG();
792 	return 0;
793 }
794 
795 /**
796  * amdgpu_block_invalid_wreg - dummy reg write function
797  *
798  * @adev: amdgpu_device pointer
799  * @block: offset of instance
800  * @reg: offset of register
801  * @v: value to write to the register
802  *
803  * Dummy register write function.  Used for register blocks
804  * that certain asics don't have (all asics).
805  */
806 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
807 				      uint32_t block,
808 				      uint32_t reg, uint32_t v)
809 {
810 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
811 		  reg, block, v);
812 	BUG();
813 }
814 
815 /**
816  * amdgpu_device_asic_init - Wrapper for atom asic_init
817  *
818  * @adev: amdgpu_device pointer
819  *
820  * Does any asic specific work and then calls atom asic init.
821  */
822 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
823 {
824 	amdgpu_asic_pre_asic_init(adev);
825 
826 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
827 }
828 
829 /**
830  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
831  *
832  * @adev: amdgpu_device pointer
833  *
834  * Allocates a scratch page of VRAM for use by various things in the
835  * driver.
836  */
837 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
838 {
839 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
840 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
841 				       &adev->vram_scratch.robj,
842 				       &adev->vram_scratch.gpu_addr,
843 				       (void **)&adev->vram_scratch.ptr);
844 }
845 
846 /**
847  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
848  *
849  * @adev: amdgpu_device pointer
850  *
851  * Frees the VRAM scratch page.
852  */
853 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
854 {
855 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
856 }
857 
858 /**
859  * amdgpu_device_program_register_sequence - program an array of registers.
860  *
861  * @adev: amdgpu_device pointer
862  * @registers: pointer to the register array
863  * @array_size: size of the register array
864  *
865  * Programs an array of registers with AND and OR masks.
866  * This is a helper for setting golden registers.
867  */
868 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
869 					     const u32 *registers,
870 					     const u32 array_size)
871 {
872 	u32 tmp, reg, and_mask, or_mask;
873 	int i;
874 
875 	if (array_size % 3)
876 		return;
877 
878 	for (i = 0; i < array_size; i +=3) {
879 		reg = registers[i + 0];
880 		and_mask = registers[i + 1];
881 		or_mask = registers[i + 2];
882 
883 		if (and_mask == 0xffffffff) {
884 			tmp = or_mask;
885 		} else {
886 			tmp = RREG32(reg);
887 			tmp &= ~and_mask;
888 			if (adev->family >= AMDGPU_FAMILY_AI)
889 				tmp |= (or_mask & and_mask);
890 			else
891 				tmp |= or_mask;
892 		}
893 		WREG32(reg, tmp);
894 	}
895 }
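/*
 * Golden register tables consumed by the helper above are flat arrays of
 * {register, and_mask, or_mask} triplets. A sketch with made-up offsets and
 * values (the real tables live in the per-ASIC files):
 *
 *   static const u32 golden_settings_example[] = {
 *           0x1234, 0xffffffff, 0x00000100,   // full overwrite (and_mask all 1s)
 *           0x1235, 0x0000000f, 0x00000002,   // read-modify-write of the low nibble
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, golden_settings_example,
 *                                           ARRAY_SIZE(golden_settings_example));
 */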
896 
897 /**
898  * amdgpu_device_pci_config_reset - reset the GPU
899  *
900  * @adev: amdgpu_device pointer
901  *
902  * Resets the GPU using the pci config reset sequence.
903  * Only applicable to asics prior to vega10.
904  */
905 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
906 {
907 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
908 }
909 
910 /*
911  * GPU doorbell aperture helper functions.
912  */
913 /**
914  * amdgpu_device_doorbell_init - Init doorbell driver information.
915  *
916  * @adev: amdgpu_device pointer
917  *
918  * Init doorbell driver information (CIK)
919  * Returns 0 on success, error on failure.
920  */
921 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
922 {
923 
924 	/* No doorbell on SI hardware generation */
925 	if (adev->asic_type < CHIP_BONAIRE) {
926 		adev->doorbell.base = 0;
927 		adev->doorbell.size = 0;
928 		adev->doorbell.num_doorbells = 0;
929 		adev->doorbell.ptr = NULL;
930 		return 0;
931 	}
932 
933 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
934 		return -EINVAL;
935 
936 	amdgpu_asic_init_doorbell_index(adev);
937 
938 	/* doorbell bar mapping */
939 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
940 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
941 
942 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
943 					     adev->doorbell_index.max_assignment+1);
944 	if (adev->doorbell.num_doorbells == 0)
945 		return -EINVAL;
946 
947 	/* For Vega, reserve and map two pages on doorbell BAR since SDMA
948 	 * paging queue doorbells use the second page. The
949 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
950 	 * doorbells are in the first page. So with paging queue enabled,
951 	 * max num_doorbells should be increased by one page (0x400 in dwords).
952 	 */
953 	if (adev->asic_type >= CHIP_VEGA10)
954 		adev->doorbell.num_doorbells += 0x400;
955 
956 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
957 				     adev->doorbell.num_doorbells *
958 				     sizeof(u32));
959 	if (adev->doorbell.ptr == NULL)
960 		return -ENOMEM;
961 
962 	return 0;
963 }
964 
965 /**
966  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
967  *
968  * @adev: amdgpu_device pointer
969  *
970  * Tear down doorbell driver information (CIK)
971  */
972 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
973 {
974 	iounmap(adev->doorbell.ptr);
975 	adev->doorbell.ptr = NULL;
976 }
977 
978 
979 
980 /*
981  * amdgpu_device_wb_*()
982  * Writeback is the method by which the GPU updates special pages in memory
983  * with the status of certain GPU events (fences, ring pointers, etc.).
984  */
985 
986 /**
987  * amdgpu_device_wb_fini - Disable Writeback and free memory
988  *
989  * @adev: amdgpu_device pointer
990  *
991  * Disables Writeback and frees the Writeback memory (all asics).
992  * Used at driver shutdown.
993  */
994 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
995 {
996 	if (adev->wb.wb_obj) {
997 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
998 				      &adev->wb.gpu_addr,
999 				      (void **)&adev->wb.wb);
1000 		adev->wb.wb_obj = NULL;
1001 	}
1002 }
1003 
1004 /**
1005  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1006  *
1007  * @adev: amdgpu_device pointer
1008  *
1009  * Initializes writeback and allocates writeback memory (all asics).
1010  * Used at driver startup.
1011  * Returns 0 on success or a negative error code on failure.
1012  */
1013 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1014 {
1015 	int r;
1016 
1017 	if (adev->wb.wb_obj == NULL) {
1018 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1019 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1020 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1021 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1022 					    (void **)&adev->wb.wb);
1023 		if (r) {
1024 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1025 			return r;
1026 		}
1027 
1028 		adev->wb.num_wb = AMDGPU_MAX_WB;
1029 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1030 
1031 		/* clear wb memory */
1032 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1033 	}
1034 
1035 	return 0;
1036 }
1037 
1038 /**
1039  * amdgpu_device_wb_get - Allocate a wb entry
1040  *
1041  * @adev: amdgpu_device pointer
1042  * @wb: wb index
1043  *
1044  * Allocate a wb slot for use by the driver (all asics).
1045  * Returns 0 on success or -EINVAL on failure.
1046  */
1047 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1048 {
1049 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1050 
1051 	if (offset < adev->wb.num_wb) {
1052 		__set_bit(offset, adev->wb.used);
1053 		*wb = offset << 3; /* convert to dw offset */
1054 		return 0;
1055 	} else {
1056 		return -EINVAL;
1057 	}
1058 }
1059 
1060 /**
1061  * amdgpu_device_wb_free - Free a wb entry
1062  *
1063  * @adev: amdgpu_device pointer
1064  * @wb: wb index
1065  *
1066  * Free a wb slot allocated for use by the driver (all asics)
1067  */
1068 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1069 {
1070 	wb >>= 3;
1071 	if (wb < adev->wb.num_wb)
1072 		__clear_bit(wb, adev->wb.used);
1073 }
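/*
 * Typical use of the wb allocator (a sketch): the returned index is a dword
 * offset into the writeback page, so the GPU and CPU views of a slot can be
 * derived as below:
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;   // address the GPU writes to
 *           volatile u32 *cpu_ptr = &adev->wb.wb[wb];    // dword the CPU polls
 *
 *           ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */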
1074 
1075 /**
1076  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1077  *
1078  * @adev: amdgpu_device pointer
1079  *
1080  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1081  * to fail, but if any of the BARs is not accessible after the resize we abort
1082  * driver loading by returning -ENODEV.
1083  */
1084 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1085 {
1086 	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1087 	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1088 	struct pci_bus *root;
1089 	struct resource *res;
1090 	unsigned i;
1091 	u16 cmd;
1092 	int r;
1093 
1094 	/* Bypass for VF */
1095 	if (amdgpu_sriov_vf(adev))
1096 		return 0;
1097 
1098 	/* skip if the bios has already enabled large BAR */
1099 	if (adev->gmc.real_vram_size &&
1100 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1101 		return 0;
1102 
1103 	/* Check if the root BUS has 64bit memory resources */
1104 	root = adev->pdev->bus;
1105 	while (root->parent)
1106 		root = root->parent;
1107 
1108 	pci_bus_for_each_resource(root, res, i) {
1109 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1110 		    res->start > 0x100000000ull)
1111 			break;
1112 	}
1113 
1114 	/* Trying to resize is pointless without a root hub window above 4GB */
1115 	if (!res)
1116 		return 0;
1117 
1118 	/* Disable memory decoding while we change the BAR addresses and size */
1119 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1120 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1121 			      cmd & ~PCI_COMMAND_MEMORY);
1122 
1123 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1124 	amdgpu_device_doorbell_fini(adev);
1125 	if (adev->asic_type >= CHIP_BONAIRE)
1126 		pci_release_resource(adev->pdev, 2);
1127 
1128 	pci_release_resource(adev->pdev, 0);
1129 
1130 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1131 	if (r == -ENOSPC)
1132 		DRM_INFO("Not enough PCI address space for a large BAR.");
1133 	else if (r && r != -ENOTSUPP)
1134 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1135 
1136 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1137 
1138 	/* When the doorbell or fb BAR isn't available we have no chance of
1139 	 * using the device.
1140 	 */
1141 	r = amdgpu_device_doorbell_init(adev);
1142 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1143 		return -ENODEV;
1144 
1145 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1146 
1147 	return 0;
1148 }
1149 
1150 /*
1151  * GPU helper functions.
1152  */
1153 /**
1154  * amdgpu_device_need_post - check if the hw needs to be posted or not
1155  *
1156  * @adev: amdgpu_device pointer
1157  *
1158  * Check if the asic has been initialized (all asics) at driver startup,
1159  * or if a post is needed because a hw reset was performed.
1160  * Returns true if need or false if not.
1161  */
1162 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1163 {
1164 	uint32_t reg;
1165 
1166 	if (amdgpu_sriov_vf(adev))
1167 		return false;
1168 
1169 	if (amdgpu_passthrough(adev)) {
1170 		/* For FIJI: in the whole-GPU pass-through virtualization case, after a VM
1171 		 * reboot some old SMC firmware still needs the driver to do a vPost,
1172 		 * otherwise the GPU hangs. SMC firmware versions above 22.15 don't have
1173 		 * this flaw, so force vPost for SMC versions below 22.15.
1174 		 */
1175 		if (adev->asic_type == CHIP_FIJI) {
1176 			int err;
1177 			uint32_t fw_ver;
1178 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1179 			/* force vPost if an error occurred */
1180 			if (err)
1181 				return true;
1182 
1183 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1184 			if (fw_ver < 0x00160e00)
1185 				return true;
1186 		}
1187 	}
1188 
1189 	if (adev->has_hw_reset) {
1190 		adev->has_hw_reset = false;
1191 		return true;
1192 	}
1193 
1194 	/* bios scratch used on CIK+ */
1195 	if (adev->asic_type >= CHIP_BONAIRE)
1196 		return amdgpu_atombios_scratch_need_asic_init(adev);
1197 
1198 	/* check MEM_SIZE for older asics */
1199 	reg = amdgpu_asic_get_config_memsize(adev);
1200 
1201 	if ((reg != 0) && (reg != 0xffffffff))
1202 		return false;
1203 
1204 	return true;
1205 }
1206 
1207 /* if we get transitioned to only one device, take VGA back */
1208 /**
1209  * amdgpu_device_vga_set_decode - enable/disable vga decode
1210  *
1211  * @cookie: amdgpu_device pointer
1212  * @state: enable/disable vga decode
1213  *
1214  * Enable/disable vga decode (all asics).
1215  * Returns VGA resource flags.
1216  */
1217 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1218 {
1219 	struct amdgpu_device *adev = cookie;
1220 	amdgpu_asic_set_vga_state(adev, state);
1221 	if (state)
1222 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1223 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1224 	else
1225 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1226 }
1227 
1228 /**
1229  * amdgpu_device_check_block_size - validate the vm block size
1230  *
1231  * @adev: amdgpu_device pointer
1232  *
1233  * Validates the vm block size specified via module parameter.
1234  * The vm block size defines the number of bits in the page table versus the
1235  * page directory. A page is 4KB, so we have a 12-bit offset; at minimum 9 bits
1236  * index the page table and the remaining bits index the page directory.
1237  */
1238 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1239 {
1240 	/* defines number of bits in page table versus page directory,
1241 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1242 	 * page table and the remaining bits are in the page directory */
1243 	if (amdgpu_vm_block_size == -1)
1244 		return;
1245 
1246 	if (amdgpu_vm_block_size < 9) {
1247 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1248 			 amdgpu_vm_block_size);
1249 		amdgpu_vm_block_size = -1;
1250 	}
1251 }
1252 
1253 /**
1254  * amdgpu_device_check_vm_size - validate the vm size
1255  *
1256  * @adev: amdgpu_device pointer
1257  *
1258  * Validates the vm size in GB specified via module parameter.
1259  * The VM size is the size of the GPU virtual memory space in GB.
1260  */
1261 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1262 {
1263 	/* no need to check the default value */
1264 	if (amdgpu_vm_size == -1)
1265 		return;
1266 
1267 	if (amdgpu_vm_size < 1) {
1268 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1269 			 amdgpu_vm_size);
1270 		amdgpu_vm_size = -1;
1271 	}
1272 }
1273 
1274 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1275 {
1276 	struct sysinfo si;
1277 	bool is_os_64 = (sizeof(void *) == 8);
1278 	uint64_t total_memory;
1279 	uint64_t dram_size_seven_GB = 0x1B8000000;
1280 	uint64_t dram_size_three_GB = 0xB8000000;
1281 
1282 	if (amdgpu_smu_memory_pool_size == 0)
1283 		return;
1284 
1285 	if (!is_os_64) {
1286 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1287 		goto def_value;
1288 	}
1289 	si_meminfo(&si);
1290 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1291 
1292 	if ((amdgpu_smu_memory_pool_size == 1) ||
1293 		(amdgpu_smu_memory_pool_size == 2)) {
1294 		if (total_memory < dram_size_three_GB)
1295 			goto def_value1;
1296 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1297 		(amdgpu_smu_memory_pool_size == 8)) {
1298 		if (total_memory < dram_size_seven_GB)
1299 			goto def_value1;
1300 	} else {
1301 		DRM_WARN("Smu memory pool size not supported\n");
1302 		goto def_value;
1303 	}
1304 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1305 
1306 	return;
1307 
1308 def_value1:
1309 	DRM_WARN("Not enough system memory\n");
1310 def_value:
1311 	adev->pm.smu_prv_buffer_size = 0;
1312 }
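/*
 * A worked note on the shift above (interpretation assumed from the code):
 * amdgpu_smu_memory_pool_size is expressed in units of 256MB, so
 * 1 -> 256MB (1 << 28 bytes), 2 -> 512MB, 4 -> 1GB, 8 -> 2GB, with the
 * 256MB/512MB options requiring roughly 3GB of system RAM and the
 * 1GB/2GB options requiring roughly 7GB.
 */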
1313 
1314 /**
1315  * amdgpu_device_check_arguments - validate module params
1316  *
1317  * @adev: amdgpu_device pointer
1318  *
1319  * Validates certain module parameters and updates
1320  * the associated values used by the driver (all asics).
1321  */
1322 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1323 {
1324 	if (amdgpu_sched_jobs < 4) {
1325 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1326 			 amdgpu_sched_jobs);
1327 		amdgpu_sched_jobs = 4;
1328 	} else if (!is_power_of_2(amdgpu_sched_jobs)){
1329 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1330 			 amdgpu_sched_jobs);
1331 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1332 	}
1333 
1334 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1335 		/* gart size must be greater or equal to 32M */
1336 		dev_warn(adev->dev, "gart size (%d) too small\n",
1337 			 amdgpu_gart_size);
1338 		amdgpu_gart_size = -1;
1339 	}
1340 
1341 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1342 		/* gtt size must be greater or equal to 32M */
1343 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1344 				 amdgpu_gtt_size);
1345 		amdgpu_gtt_size = -1;
1346 	}
1347 
1348 	/* valid range is between 4 and 9 inclusive */
1349 	if (amdgpu_vm_fragment_size != -1 &&
1350 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1351 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1352 		amdgpu_vm_fragment_size = -1;
1353 	}
1354 
1355 	if (amdgpu_sched_hw_submission < 2) {
1356 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1357 			 amdgpu_sched_hw_submission);
1358 		amdgpu_sched_hw_submission = 2;
1359 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1360 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1361 			 amdgpu_sched_hw_submission);
1362 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1363 	}
1364 
1365 	amdgpu_device_check_smu_prv_buffer_size(adev);
1366 
1367 	amdgpu_device_check_vm_size(adev);
1368 
1369 	amdgpu_device_check_block_size(adev);
1370 
1371 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1372 
1373 	amdgpu_gmc_tmz_set(adev);
1374 
1375 	if (amdgpu_num_kcq == -1) {
1376 		amdgpu_num_kcq = 8;
1377 	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1378 		amdgpu_num_kcq = 8;
1379 		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1380 	}
1381 
1382 	amdgpu_gmc_noretry_set(adev);
1383 
1384 	return 0;
1385 }
1386 
1387 /**
1388  * amdgpu_switcheroo_set_state - set switcheroo state
1389  *
1390  * @pdev: pci dev pointer
1391  * @state: vga_switcheroo state
1392  *
1393  * Callback for the switcheroo driver.  Suspends or resumes
1394  * the asic before or after it is powered up using ACPI methods.
1395  */
1396 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1397 					enum vga_switcheroo_state state)
1398 {
1399 	struct drm_device *dev = pci_get_drvdata(pdev);
1400 	int r;
1401 
1402 	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1403 		return;
1404 
1405 	if (state == VGA_SWITCHEROO_ON) {
1406 		pr_info("switched on\n");
1407 		/* don't suspend or resume card normally */
1408 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1409 
1410 		pci_set_power_state(dev->pdev, PCI_D0);
1411 		amdgpu_device_load_pci_state(dev->pdev);
1412 		r = pci_enable_device(dev->pdev);
1413 		if (r)
1414 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1415 		amdgpu_device_resume(dev, true);
1416 
1417 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1418 		drm_kms_helper_poll_enable(dev);
1419 	} else {
1420 		pr_info("switched off\n");
1421 		drm_kms_helper_poll_disable(dev);
1422 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1423 		amdgpu_device_suspend(dev, true);
1424 		amdgpu_device_cache_pci_state(dev->pdev);
1425 		/* Shut down the device */
1426 		pci_disable_device(dev->pdev);
1427 		pci_set_power_state(dev->pdev, PCI_D3cold);
1428 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1429 	}
1430 }
1431 
1432 /**
1433  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1434  *
1435  * @pdev: pci dev pointer
1436  *
1437  * Callback for the switcheroo driver.  Checks if the switcheroo
1438  * state can be changed.
1439  * Returns true if the state can be changed, false if not.
1440  */
1441 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1442 {
1443 	struct drm_device *dev = pci_get_drvdata(pdev);
1444 
1445 	/*
1446 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1447 	* locking inversion with the driver load path. And the access here is
1448 	* completely racy anyway. So don't bother with locking for now.
1449 	*/
1450 	return atomic_read(&dev->open_count) == 0;
1451 }
1452 
1453 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1454 	.set_gpu_state = amdgpu_switcheroo_set_state,
1455 	.reprobe = NULL,
1456 	.can_switch = amdgpu_switcheroo_can_switch,
1457 };
1458 
1459 /**
1460  * amdgpu_device_ip_set_clockgating_state - set the CG state
1461  *
1462  * @dev: amdgpu_device pointer
1463  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1464  * @state: clockgating state (gate or ungate)
1465  *
1466  * Sets the requested clockgating state for all instances of
1467  * the hardware IP specified.
1468  * Returns the error code from the last instance.
1469  */
1470 int amdgpu_device_ip_set_clockgating_state(void *dev,
1471 					   enum amd_ip_block_type block_type,
1472 					   enum amd_clockgating_state state)
1473 {
1474 	struct amdgpu_device *adev = dev;
1475 	int i, r = 0;
1476 
1477 	for (i = 0; i < adev->num_ip_blocks; i++) {
1478 		if (!adev->ip_blocks[i].status.valid)
1479 			continue;
1480 		if (adev->ip_blocks[i].version->type != block_type)
1481 			continue;
1482 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1483 			continue;
1484 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1485 			(void *)adev, state);
1486 		if (r)
1487 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1488 				  adev->ip_blocks[i].version->funcs->name, r);
1489 	}
1490 	return r;
1491 }
1492 
1493 /**
1494  * amdgpu_device_ip_set_powergating_state - set the PG state
1495  *
1496  * @dev: amdgpu_device pointer
1497  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1498  * @state: powergating state (gate or ungate)
1499  *
1500  * Sets the requested powergating state for all instances of
1501  * the hardware IP specified.
1502  * Returns the error code from the last instance.
1503  */
1504 int amdgpu_device_ip_set_powergating_state(void *dev,
1505 					   enum amd_ip_block_type block_type,
1506 					   enum amd_powergating_state state)
1507 {
1508 	struct amdgpu_device *adev = dev;
1509 	int i, r = 0;
1510 
1511 	for (i = 0; i < adev->num_ip_blocks; i++) {
1512 		if (!adev->ip_blocks[i].status.valid)
1513 			continue;
1514 		if (adev->ip_blocks[i].version->type != block_type)
1515 			continue;
1516 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1517 			continue;
1518 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1519 			(void *)adev, state);
1520 		if (r)
1521 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1522 				  adev->ip_blocks[i].version->funcs->name, r);
1523 	}
1524 	return r;
1525 }
1526 
1527 /**
1528  * amdgpu_device_ip_get_clockgating_state - get the CG state
1529  *
1530  * @adev: amdgpu_device pointer
1531  * @flags: clockgating feature flags
1532  *
1533  * Walks the list of IPs on the device and updates the clockgating
1534  * flags for each IP.
1535  * Updates @flags with the feature flags for each hardware IP where
1536  * clockgating is enabled.
1537  */
1538 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1539 					    u32 *flags)
1540 {
1541 	int i;
1542 
1543 	for (i = 0; i < adev->num_ip_blocks; i++) {
1544 		if (!adev->ip_blocks[i].status.valid)
1545 			continue;
1546 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1547 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1548 	}
1549 }
1550 
1551 /**
1552  * amdgpu_device_ip_wait_for_idle - wait for idle
1553  *
1554  * @adev: amdgpu_device pointer
1555  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1556  *
1557  * Waits for the requested hardware IP to be idle.
1558  * Returns 0 for success or a negative error code on failure.
1559  */
1560 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1561 				   enum amd_ip_block_type block_type)
1562 {
1563 	int i, r;
1564 
1565 	for (i = 0; i < adev->num_ip_blocks; i++) {
1566 		if (!adev->ip_blocks[i].status.valid)
1567 			continue;
1568 		if (adev->ip_blocks[i].version->type == block_type) {
1569 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1570 			if (r)
1571 				return r;
1572 			break;
1573 		}
1574 	}
1575 	return 0;
1576 
1577 }
1578 
1579 /**
1580  * amdgpu_device_ip_is_idle - is the hardware IP idle
1581  *
1582  * @adev: amdgpu_device pointer
1583  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1584  *
1585  * Check if the hardware IP is idle or not.
1586  * Returns true if the IP is idle, false if not.
1587  */
1588 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1589 			      enum amd_ip_block_type block_type)
1590 {
1591 	int i;
1592 
1593 	for (i = 0; i < adev->num_ip_blocks; i++) {
1594 		if (!adev->ip_blocks[i].status.valid)
1595 			continue;
1596 		if (adev->ip_blocks[i].version->type == block_type)
1597 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1598 	}
1599 	return true;
1600 
1601 }
1602 
1603 /**
1604  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1605  *
1606  * @adev: amdgpu_device pointer
1607  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1608  *
1609  * Returns a pointer to the hardware IP block structure
1610  * if it exists for the asic, otherwise NULL.
1611  */
1612 struct amdgpu_ip_block *
1613 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1614 			      enum amd_ip_block_type type)
1615 {
1616 	int i;
1617 
1618 	for (i = 0; i < adev->num_ip_blocks; i++)
1619 		if (adev->ip_blocks[i].version->type == type)
1620 			return &adev->ip_blocks[i];
1621 
1622 	return NULL;
1623 }
1624 
1625 /**
1626  * amdgpu_device_ip_block_version_cmp
1627  *
1628  * @adev: amdgpu_device pointer
1629  * @type: enum amd_ip_block_type
1630  * @major: major version
1631  * @minor: minor version
1632  *
1633  * Returns 0 if the IP block's version is equal to or greater than the
1634  * requested version, 1 if it is smaller or the ip_block doesn't exist.
1635  */
1636 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1637 				       enum amd_ip_block_type type,
1638 				       u32 major, u32 minor)
1639 {
1640 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1641 
1642 	if (ip_block && ((ip_block->version->major > major) ||
1643 			((ip_block->version->major == major) &&
1644 			(ip_block->version->minor >= minor))))
1645 		return 0;
1646 
1647 	return 1;
1648 }
1649 
1650 /**
1651  * amdgpu_device_ip_block_add
1652  *
1653  * @adev: amdgpu_device pointer
1654  * @ip_block_version: pointer to the IP to add
1655  *
1656  * Adds the IP block driver information to the collection of IPs
1657  * on the asic.
1658  */
1659 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1660 			       const struct amdgpu_ip_block_version *ip_block_version)
1661 {
1662 	if (!ip_block_version)
1663 		return -EINVAL;
1664 
1665 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1666 		  ip_block_version->funcs->name);
1667 
1668 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1669 
1670 	return 0;
1671 }
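/*
 * A sketch of how an ASIC's IP-block setup registers blocks with the helper
 * above (block names assumed; order matters, since the later init/fini
 * stages walk the list in registration order):
 *
 *   r = amdgpu_device_ip_block_add(adev, &soc_common_ip_block);
 *   if (r)
 *           return r;
 *   r = amdgpu_device_ip_block_add(adev, &gmc_ip_block);
 *   if (r)
 *           return r;
 */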
1672 
1673 /**
1674  * amdgpu_device_enable_virtual_display - enable virtual display feature
1675  *
1676  * @adev: amdgpu_device pointer
1677  *
1678  * Enables the virtual display feature if the user has enabled it via
1679  * the module parameter virtual_display.  This feature provides a virtual
1680  * display hardware on headless boards or in virtualized environments.
1681  * This function parses and validates the configuration string specified by
1682  * the user and configures the virtual display configuration (number of
1683  * virtual connectors, crtcs, etc.) specified.
1684  */
1685 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1686 {
1687 	adev->enable_virtual_display = false;
1688 
1689 	if (amdgpu_virtual_display) {
1690 		struct drm_device *ddev = adev_to_drm(adev);
1691 		const char *pci_address_name = pci_name(ddev->pdev);
1692 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1693 
1694 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1695 		pciaddstr_tmp = pciaddstr;
1696 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1697 			pciaddname = strsep(&pciaddname_tmp, ",");
1698 			if (!strcmp("all", pciaddname)
1699 			    || !strcmp(pci_address_name, pciaddname)) {
1700 				long num_crtc;
1701 				int res = -1;
1702 
1703 				adev->enable_virtual_display = true;
1704 
1705 				if (pciaddname_tmp)
1706 					res = kstrtol(pciaddname_tmp, 10,
1707 						      &num_crtc);
1708 
1709 				if (!res) {
1710 					if (num_crtc < 1)
1711 						num_crtc = 1;
1712 					if (num_crtc > 6)
1713 						num_crtc = 6;
1714 					adev->mode_info.num_crtc = num_crtc;
1715 				} else {
1716 					adev->mode_info.num_crtc = 1;
1717 				}
1718 				break;
1719 			}
1720 		}
1721 
1722 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1723 			 amdgpu_virtual_display, pci_address_name,
1724 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1725 
1726 		kfree(pciaddstr);
1727 	}
1728 }
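
/*
 * Illustration only (editor's note, not part of the original file): per the
 * parsing above, virtual_display takes ';'-separated entries of the form
 * "<pci address>,<num crtcs>", or "all" to match every device.  The PCI
 * address below is a placeholder.
 *
 *	amdgpu.virtual_display=0000:26:00.0,2
 *	amdgpu.virtual_display=all,1
 */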
1729 
1730 /**
1731  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1732  *
1733  * @adev: amdgpu_device pointer
1734  *
1735  * Parses the asic configuration parameters specified in the gpu info
1736  * firmware and makes them available to the driver for use in configuring
1737  * the asic.
1738  * Returns 0 on success, -EINVAL on failure.
1739  */
1740 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1741 {
1742 	const char *chip_name;
1743 	char fw_name[40];
1744 	int err;
1745 	const struct gpu_info_firmware_header_v1_0 *hdr;
1746 
1747 	adev->firmware.gpu_info_fw = NULL;
1748 
1749 	if (adev->mman.discovery_bin) {
1750 		amdgpu_discovery_get_gfx_info(adev);
1751 
1752 		/*
1753 		 * FIXME: The bounding box is still needed by Navi12, so
1754 		 * temporarily read it from gpu_info firmware. Should be dropped
1755 		 * when DAL no longer needs it.
1756 		 */
1757 		if (adev->asic_type != CHIP_NAVI12)
1758 			return 0;
1759 	}
1760 
1761 	switch (adev->asic_type) {
1762 #ifdef CONFIG_DRM_AMDGPU_SI
1763 	case CHIP_VERDE:
1764 	case CHIP_TAHITI:
1765 	case CHIP_PITCAIRN:
1766 	case CHIP_OLAND:
1767 	case CHIP_HAINAN:
1768 #endif
1769 #ifdef CONFIG_DRM_AMDGPU_CIK
1770 	case CHIP_BONAIRE:
1771 	case CHIP_HAWAII:
1772 	case CHIP_KAVERI:
1773 	case CHIP_KABINI:
1774 	case CHIP_MULLINS:
1775 #endif
1776 	case CHIP_TOPAZ:
1777 	case CHIP_TONGA:
1778 	case CHIP_FIJI:
1779 	case CHIP_POLARIS10:
1780 	case CHIP_POLARIS11:
1781 	case CHIP_POLARIS12:
1782 	case CHIP_VEGAM:
1783 	case CHIP_CARRIZO:
1784 	case CHIP_STONEY:
1785 	case CHIP_VEGA20:
1786 	case CHIP_SIENNA_CICHLID:
1787 	case CHIP_NAVY_FLOUNDER:
1788 	default:
1789 		return 0;
1790 	case CHIP_VEGA10:
1791 		chip_name = "vega10";
1792 		break;
1793 	case CHIP_VEGA12:
1794 		chip_name = "vega12";
1795 		break;
1796 	case CHIP_RAVEN:
1797 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1798 			chip_name = "raven2";
1799 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1800 			chip_name = "picasso";
1801 		else
1802 			chip_name = "raven";
1803 		break;
1804 	case CHIP_ARCTURUS:
1805 		chip_name = "arcturus";
1806 		break;
1807 	case CHIP_RENOIR:
1808 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1809 			chip_name = "renoir";
1810 		else
1811 			chip_name = "green_sardine";
1812 		break;
1813 	case CHIP_NAVI10:
1814 		chip_name = "navi10";
1815 		break;
1816 	case CHIP_NAVI14:
1817 		chip_name = "navi14";
1818 		break;
1819 	case CHIP_NAVI12:
1820 		chip_name = "navi12";
1821 		break;
1822 	}
1823 
1824 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1825 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1826 	if (err) {
1827 		dev_err(adev->dev,
1828 			"Failed to load gpu_info firmware \"%s\"\n",
1829 			fw_name);
1830 		goto out;
1831 	}
1832 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1833 	if (err) {
1834 		dev_err(adev->dev,
1835 			"Failed to validate gpu_info firmware \"%s\"\n",
1836 			fw_name);
1837 		goto out;
1838 	}
1839 
1840 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1841 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1842 
1843 	switch (hdr->version_major) {
1844 	case 1:
1845 	{
1846 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1847 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1848 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1849 
1850 		/*
1851 		 * Should be dropped when DAL no longer needs it.
1852 		 */
1853 		if (adev->asic_type == CHIP_NAVI12)
1854 			goto parse_soc_bounding_box;
1855 
1856 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1857 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1858 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1859 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1860 		adev->gfx.config.max_texture_channel_caches =
1861 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1862 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1863 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1864 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1865 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1866 		adev->gfx.config.double_offchip_lds_buf =
1867 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1868 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1869 		adev->gfx.cu_info.max_waves_per_simd =
1870 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1871 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1872 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1873 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1874 		if (hdr->version_minor >= 1) {
1875 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1876 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1877 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1878 			adev->gfx.config.num_sc_per_sh =
1879 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1880 			adev->gfx.config.num_packer_per_sc =
1881 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1882 		}
1883 
1884 parse_soc_bounding_box:
1885 		/*
1886 		 * soc bounding box info is not integrated into the discovery table,
1887 		 * so we always need to parse it from the gpu info firmware if needed.
1888 		 */
1889 		if (hdr->version_minor == 2) {
1890 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1891 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1892 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1893 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1894 		}
1895 		break;
1896 	}
1897 	default:
1898 		dev_err(adev->dev,
1899 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1900 		err = -EINVAL;
1901 		goto out;
1902 	}
1903 out:
1904 	return err;
1905 }
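
/*
 * Illustration only (editor's note, not part of the original file): for a
 * Navi10 board the request above resolves to the firmware file
 * "amdgpu/navi10_gpu_info.bin"; the other supported chips follow the same
 * "amdgpu/<chip_name>_gpu_info.bin" pattern.
 */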
1906 
1907 /**
1908  * amdgpu_device_ip_early_init - run early init for hardware IPs
1909  *
1910  * @adev: amdgpu_device pointer
1911  *
1912  * Early initialization pass for hardware IPs.  The hardware IPs that make
1913  * up each asic are discovered and each IP's early_init callback is run.  This
1914  * is the first stage in initializing the asic.
1915  * Returns 0 on success, negative error code on failure.
1916  */
1917 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1918 {
1919 	int i, r;
1920 
1921 	amdgpu_device_enable_virtual_display(adev);
1922 
1923 	if (amdgpu_sriov_vf(adev)) {
1924 		r = amdgpu_virt_request_full_gpu(adev, true);
1925 		if (r)
1926 			return r;
1927 	}
1928 
1929 	switch (adev->asic_type) {
1930 #ifdef CONFIG_DRM_AMDGPU_SI
1931 	case CHIP_VERDE:
1932 	case CHIP_TAHITI:
1933 	case CHIP_PITCAIRN:
1934 	case CHIP_OLAND:
1935 	case CHIP_HAINAN:
1936 		adev->family = AMDGPU_FAMILY_SI;
1937 		r = si_set_ip_blocks(adev);
1938 		if (r)
1939 			return r;
1940 		break;
1941 #endif
1942 #ifdef CONFIG_DRM_AMDGPU_CIK
1943 	case CHIP_BONAIRE:
1944 	case CHIP_HAWAII:
1945 	case CHIP_KAVERI:
1946 	case CHIP_KABINI:
1947 	case CHIP_MULLINS:
1948 		if (adev->flags & AMD_IS_APU)
1949 			adev->family = AMDGPU_FAMILY_KV;
1950 		else
1951 			adev->family = AMDGPU_FAMILY_CI;
1952 
1953 		r = cik_set_ip_blocks(adev);
1954 		if (r)
1955 			return r;
1956 		break;
1957 #endif
1958 	case CHIP_TOPAZ:
1959 	case CHIP_TONGA:
1960 	case CHIP_FIJI:
1961 	case CHIP_POLARIS10:
1962 	case CHIP_POLARIS11:
1963 	case CHIP_POLARIS12:
1964 	case CHIP_VEGAM:
1965 	case CHIP_CARRIZO:
1966 	case CHIP_STONEY:
1967 		if (adev->flags & AMD_IS_APU)
1968 			adev->family = AMDGPU_FAMILY_CZ;
1969 		else
1970 			adev->family = AMDGPU_FAMILY_VI;
1971 
1972 		r = vi_set_ip_blocks(adev);
1973 		if (r)
1974 			return r;
1975 		break;
1976 	case CHIP_VEGA10:
1977 	case CHIP_VEGA12:
1978 	case CHIP_VEGA20:
1979 	case CHIP_RAVEN:
1980 	case CHIP_ARCTURUS:
1981 	case CHIP_RENOIR:
1982 		if (adev->flags & AMD_IS_APU)
1983 			adev->family = AMDGPU_FAMILY_RV;
1984 		else
1985 			adev->family = AMDGPU_FAMILY_AI;
1986 
1987 		r = soc15_set_ip_blocks(adev);
1988 		if (r)
1989 			return r;
1990 		break;
1991 	case  CHIP_NAVI10:
1992 	case  CHIP_NAVI14:
1993 	case  CHIP_NAVI12:
1994 	case  CHIP_SIENNA_CICHLID:
1995 	case  CHIP_NAVY_FLOUNDER:
1996 		adev->family = AMDGPU_FAMILY_NV;
1997 
1998 		r = nv_set_ip_blocks(adev);
1999 		if (r)
2000 			return r;
2001 		break;
2002 	default:
2003 		/* FIXME: not supported yet */
2004 		return -EINVAL;
2005 	}
2006 
2007 	amdgpu_amdkfd_device_probe(adev);
2008 
2009 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2010 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2011 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2012 
2013 	for (i = 0; i < adev->num_ip_blocks; i++) {
2014 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2015 			DRM_ERROR("disabled ip block: %d <%s>\n",
2016 				  i, adev->ip_blocks[i].version->funcs->name);
2017 			adev->ip_blocks[i].status.valid = false;
2018 		} else {
2019 			if (adev->ip_blocks[i].version->funcs->early_init) {
2020 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2021 				if (r == -ENOENT) {
2022 					adev->ip_blocks[i].status.valid = false;
2023 				} else if (r) {
2024 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2025 						  adev->ip_blocks[i].version->funcs->name, r);
2026 					return r;
2027 				} else {
2028 					adev->ip_blocks[i].status.valid = true;
2029 				}
2030 			} else {
2031 				adev->ip_blocks[i].status.valid = true;
2032 			}
2033 		}
2034 		/* get the vbios after the asic_funcs are set up */
2035 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2036 			r = amdgpu_device_parse_gpu_info_fw(adev);
2037 			if (r)
2038 				return r;
2039 
2040 			/* Read BIOS */
2041 			if (!amdgpu_get_bios(adev))
2042 				return -EINVAL;
2043 
2044 			r = amdgpu_atombios_init(adev);
2045 			if (r) {
2046 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2047 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2048 				return r;
2049 			}
2050 		}
2051 	}
2052 
2053 	adev->cg_flags &= amdgpu_cg_mask;
2054 	adev->pg_flags &= amdgpu_pg_mask;
2055 
2056 	return 0;
2057 }
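
/*
 * Illustration only (editor's note, not part of the original file): the loop
 * above honours the ip_block_mask module parameter, where bit i of the mask
 * keeps IP block i enabled.  For example, clearing bit 3 disables the fourth
 * IP block discovered for the asic (mainly useful for debugging):
 *
 *	amdgpu.ip_block_mask=0xfffffff7
 */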
2058 
2059 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2060 {
2061 	int i, r;
2062 
2063 	for (i = 0; i < adev->num_ip_blocks; i++) {
2064 		if (!adev->ip_blocks[i].status.sw)
2065 			continue;
2066 		if (adev->ip_blocks[i].status.hw)
2067 			continue;
2068 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2069 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2070 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2071 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2072 			if (r) {
2073 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2074 					  adev->ip_blocks[i].version->funcs->name, r);
2075 				return r;
2076 			}
2077 			adev->ip_blocks[i].status.hw = true;
2078 		}
2079 	}
2080 
2081 	return 0;
2082 }
2083 
2084 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2085 {
2086 	int i, r;
2087 
2088 	for (i = 0; i < adev->num_ip_blocks; i++) {
2089 		if (!adev->ip_blocks[i].status.sw)
2090 			continue;
2091 		if (adev->ip_blocks[i].status.hw)
2092 			continue;
2093 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2094 		if (r) {
2095 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2096 				  adev->ip_blocks[i].version->funcs->name, r);
2097 			return r;
2098 		}
2099 		adev->ip_blocks[i].status.hw = true;
2100 	}
2101 
2102 	return 0;
2103 }
2104 
2105 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2106 {
2107 	int r = 0;
2108 	int i;
2109 	uint32_t smu_version;
2110 
2111 	if (adev->asic_type >= CHIP_VEGA10) {
2112 		for (i = 0; i < adev->num_ip_blocks; i++) {
2113 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2114 				continue;
2115 
2116 			/* no need to do the fw loading again if already done */
2117 			if (adev->ip_blocks[i].status.hw)
2118 				break;
2119 
2120 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2121 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2122 				if (r) {
2123 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2124 							  adev->ip_blocks[i].version->funcs->name, r);
2125 					return r;
2126 				}
2127 			} else {
2128 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2129 				if (r) {
2130 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2131 							  adev->ip_blocks[i].version->funcs->name, r);
2132 					return r;
2133 				}
2134 			}
2135 
2136 			adev->ip_blocks[i].status.hw = true;
2137 			break;
2138 		}
2139 	}
2140 
2141 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2142 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2143 
2144 	return r;
2145 }
2146 
2147 /**
2148  * amdgpu_device_ip_init - run init for hardware IPs
2149  *
2150  * @adev: amdgpu_device pointer
2151  *
2152  * Main initialization pass for hardware IPs.  The list of all the hardware
2153  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2154  * are run.  sw_init initializes the software state associated with each IP
2155  * and hw_init initializes the hardware associated with each IP.
2156  * Returns 0 on success, negative error code on failure.
2157  */
2158 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2159 {
2160 	int i, r;
2161 
2162 	r = amdgpu_ras_init(adev);
2163 	if (r)
2164 		return r;
2165 
2166 	for (i = 0; i < adev->num_ip_blocks; i++) {
2167 		if (!adev->ip_blocks[i].status.valid)
2168 			continue;
2169 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2170 		if (r) {
2171 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2172 				  adev->ip_blocks[i].version->funcs->name, r);
2173 			goto init_failed;
2174 		}
2175 		adev->ip_blocks[i].status.sw = true;
2176 
2177 		/* need to do gmc hw init early so we can allocate gpu mem */
2178 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2179 			r = amdgpu_device_vram_scratch_init(adev);
2180 			if (r) {
2181 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2182 				goto init_failed;
2183 			}
2184 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2185 			if (r) {
2186 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2187 				goto init_failed;
2188 			}
2189 			r = amdgpu_device_wb_init(adev);
2190 			if (r) {
2191 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2192 				goto init_failed;
2193 			}
2194 			adev->ip_blocks[i].status.hw = true;
2195 
2196 			/* right after GMC hw init, we create CSA */
2197 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2198 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2199 								AMDGPU_GEM_DOMAIN_VRAM,
2200 								AMDGPU_CSA_SIZE);
2201 				if (r) {
2202 					DRM_ERROR("allocate CSA failed %d\n", r);
2203 					goto init_failed;
2204 				}
2205 			}
2206 		}
2207 	}
2208 
2209 	if (amdgpu_sriov_vf(adev))
2210 		amdgpu_virt_init_data_exchange(adev);
2211 
2212 	r = amdgpu_ib_pool_init(adev);
2213 	if (r) {
2214 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2215 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2216 		goto init_failed;
2217 	}
2218 
2219 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2220 	if (r)
2221 		goto init_failed;
2222 
2223 	r = amdgpu_device_ip_hw_init_phase1(adev);
2224 	if (r)
2225 		goto init_failed;
2226 
2227 	r = amdgpu_device_fw_loading(adev);
2228 	if (r)
2229 		goto init_failed;
2230 
2231 	r = amdgpu_device_ip_hw_init_phase2(adev);
2232 	if (r)
2233 		goto init_failed;
2234 
2235 	/*
2236 	 * Retired pages will be loaded from eeprom and reserved here.
2237 	 * This should be called after amdgpu_device_ip_hw_init_phase2 since
2238 	 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2239 	 * functional for I2C communication, which is only true at this point.
2240 	 *
2241 	 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2242 	 * about failures caused by a bad gpu situation and stops the amdgpu
2243 	 * init process accordingly. For other failure cases, it still releases
2244 	 * all the resources and prints an error message rather than returning
2245 	 * a negative value to the upper level.
2246 	 *
2247 	 * Note: theoretically, this should be called before all vram allocations
2248 	 * to protect retired pages from being abused.
2249 	 */
2250 	r = amdgpu_ras_recovery_init(adev);
2251 	if (r)
2252 		goto init_failed;
2253 
2254 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2255 		amdgpu_xgmi_add_device(adev);
2256 	amdgpu_amdkfd_device_init(adev);
2257 
2258 	amdgpu_fru_get_product_info(adev);
2259 
2260 init_failed:
2261 	if (amdgpu_sriov_vf(adev))
2262 		amdgpu_virt_release_full_gpu(adev, true);
2263 
2264 	return r;
2265 }
2266 
2267 /**
2268  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2269  *
2270  * @adev: amdgpu_device pointer
2271  *
2272  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2273  * this function before a GPU reset.  If the value is retained after a
2274  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2275  */
2276 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2277 {
2278 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2279 }
2280 
2281 /**
2282  * amdgpu_device_check_vram_lost - check if vram is valid
2283  *
2284  * @adev: amdgpu_device pointer
2285  *
2286  * Checks the reset magic value written to the gart pointer in VRAM.
2287  * The driver calls this after a GPU reset to see if the contents of
2288  * VRAM have been lost or not.
2289  * Returns true if vram is lost, false if not.
2290  */
2291 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2292 {
2293 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2294 			AMDGPU_RESET_MAGIC_NUM))
2295 		return true;
2296 
2297 	if (!amdgpu_in_reset(adev))
2298 		return false;
2299 
2300 	/*
2301 	 * For all ASICs with baco/mode1 reset, the VRAM is
2302 	 * always assumed to be lost.
2303 	 */
2304 	switch (amdgpu_asic_reset_method(adev)) {
2305 	case AMD_RESET_METHOD_BACO:
2306 	case AMD_RESET_METHOD_MODE1:
2307 		return true;
2308 	default:
2309 		return false;
2310 	}
2311 }
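
/*
 * Illustration only (editor's sketch, not part of the original file): how the
 * two helpers above pair up around a reset inside the driver.
 *
 *	amdgpu_device_fill_reset_magic(adev);      before the reset
 *	...perform the asic reset...
 *	if (amdgpu_device_check_vram_lost(adev))
 *		...restore buffers that lived in VRAM...
 */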
2312 
2313 /**
2314  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2315  *
2316  * @adev: amdgpu_device pointer
2317  * @state: clockgating state (gate or ungate)
2318  *
2319  * The list of all the hardware IPs that make up the asic is walked and the
2320  * set_clockgating_state callbacks are run.
2321  * During the late init pass this enables clockgating for hardware IPs;
2322  * during the fini or suspend pass it disables clockgating.
2323  * Returns 0 on success, negative error code on failure.
2324  */
2325 
2326 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2327 						enum amd_clockgating_state state)
2328 {
2329 	int i, j, r;
2330 
2331 	if (amdgpu_emu_mode == 1)
2332 		return 0;
2333 
2334 	for (j = 0; j < adev->num_ip_blocks; j++) {
2335 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2336 		if (!adev->ip_blocks[i].status.late_initialized)
2337 			continue;
2338 		/* skip CG for VCE/UVD, it's handled specially */
2339 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2340 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2341 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2342 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2343 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2344 			/* enable clockgating to save power */
2345 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2346 										     state);
2347 			if (r) {
2348 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2349 					  adev->ip_blocks[i].version->funcs->name, r);
2350 				return r;
2351 			}
2352 		}
2353 	}
2354 
2355 	return 0;
2356 }
2357 
2358 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2359 {
2360 	int i, j, r;
2361 
2362 	if (amdgpu_emu_mode == 1)
2363 		return 0;
2364 
2365 	for (j = 0; j < adev->num_ip_blocks; j++) {
2366 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2367 		if (!adev->ip_blocks[i].status.late_initialized)
2368 			continue;
2369 		/* skip PG for VCE/UVD, it's handled specially */
2370 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2371 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2372 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2373 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2374 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2375 			/* enable powergating to save power */
2376 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2377 											state);
2378 			if (r) {
2379 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2380 					  adev->ip_blocks[i].version->funcs->name, r);
2381 				return r;
2382 			}
2383 		}
2384 	}
2385 	return 0;
2386 }
2387 
2388 static int amdgpu_device_enable_mgpu_fan_boost(void)
2389 {
2390 	struct amdgpu_gpu_instance *gpu_ins;
2391 	struct amdgpu_device *adev;
2392 	int i, ret = 0;
2393 
2394 	mutex_lock(&mgpu_info.mutex);
2395 
2396 	/*
2397 	 * MGPU fan boost feature should be enabled
2398 	 * only when there are two or more dGPUs in
2399 	 * the system
2400 	 */
2401 	if (mgpu_info.num_dgpu < 2)
2402 		goto out;
2403 
2404 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2405 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2406 		adev = gpu_ins->adev;
2407 		if (!(adev->flags & AMD_IS_APU) &&
2408 		    !gpu_ins->mgpu_fan_enabled) {
2409 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2410 			if (ret)
2411 				break;
2412 
2413 			gpu_ins->mgpu_fan_enabled = 1;
2414 		}
2415 	}
2416 
2417 out:
2418 	mutex_unlock(&mgpu_info.mutex);
2419 
2420 	return ret;
2421 }
2422 
2423 /**
2424  * amdgpu_device_ip_late_init - run late init for hardware IPs
2425  *
2426  * @adev: amdgpu_device pointer
2427  *
2428  * Late initialization pass for hardware IPs.  The list of all the hardware
2429  * IPs that make up the asic is walked and the late_init callbacks are run.
2430  * late_init covers any special initialization that an IP requires
2431  * after all of the others have been initialized or something that needs to happen
2432  * late in the init process.
2433  * Returns 0 on success, negative error code on failure.
2434  */
2435 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2436 {
2437 	struct amdgpu_gpu_instance *gpu_instance;
2438 	int i = 0, r;
2439 
2440 	for (i = 0; i < adev->num_ip_blocks; i++) {
2441 		if (!adev->ip_blocks[i].status.hw)
2442 			continue;
2443 		if (adev->ip_blocks[i].version->funcs->late_init) {
2444 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2445 			if (r) {
2446 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2447 					  adev->ip_blocks[i].version->funcs->name, r);
2448 				return r;
2449 			}
2450 		}
2451 		adev->ip_blocks[i].status.late_initialized = true;
2452 	}
2453 
2454 	amdgpu_ras_set_error_query_ready(adev, true);
2455 
2456 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2457 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2458 
2459 	amdgpu_device_fill_reset_magic(adev);
2460 
2461 	r = amdgpu_device_enable_mgpu_fan_boost();
2462 	if (r)
2463 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2464 
2465 
2466 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2467 		mutex_lock(&mgpu_info.mutex);
2468 
2469 		/*
2470 		 * Reset the device p-state to low, as it was booted with high.
2471 		 *
2472 		 * This should be performed only after all devices from the same
2473 		 * hive get initialized.
2474 		 *
2475 		 * However, the number of devices in the hive is not known in
2476 		 * advance, since it is counted one by one as devices initialize.
2477 		 *
2478 		 * So we wait until all XGMI interlinked devices are initialized.
2479 		 * This may introduce some delay as those devices may come from
2480 		 * different hives, but that should be OK.
2481 		 */
2482 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2483 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2484 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2485 				if (gpu_instance->adev->flags & AMD_IS_APU)
2486 					continue;
2487 
2488 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2489 						AMDGPU_XGMI_PSTATE_MIN);
2490 				if (r) {
2491 					DRM_ERROR("pstate setting failed (%d).\n", r);
2492 					break;
2493 				}
2494 			}
2495 		}
2496 
2497 		mutex_unlock(&mgpu_info.mutex);
2498 	}
2499 
2500 	return 0;
2501 }
2502 
2503 /**
2504  * amdgpu_device_ip_fini - run fini for hardware IPs
2505  *
2506  * @adev: amdgpu_device pointer
2507  *
2508  * Main teardown pass for hardware IPs.  The list of all the hardware
2509  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2510  * are run.  hw_fini tears down the hardware associated with each IP
2511  * and sw_fini tears down any software state associated with each IP.
2512  * Returns 0 on success, negative error code on failure.
2513  */
2514 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2515 {
2516 	int i, r;
2517 
2518 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2519 		amdgpu_virt_release_ras_err_handler_data(adev);
2520 
2521 	amdgpu_ras_pre_fini(adev);
2522 
2523 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2524 		amdgpu_xgmi_remove_device(adev);
2525 
2526 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2527 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2528 
2529 	amdgpu_amdkfd_device_fini(adev);
2530 
2531 	/* need to disable SMC first */
2532 	for (i = 0; i < adev->num_ip_blocks; i++) {
2533 		if (!adev->ip_blocks[i].status.hw)
2534 			continue;
2535 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2536 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2537 			/* XXX handle errors */
2538 			if (r) {
2539 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2540 					  adev->ip_blocks[i].version->funcs->name, r);
2541 			}
2542 			adev->ip_blocks[i].status.hw = false;
2543 			break;
2544 		}
2545 	}
2546 
2547 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2548 		if (!adev->ip_blocks[i].status.hw)
2549 			continue;
2550 
2551 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2552 		/* XXX handle errors */
2553 		if (r) {
2554 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2555 				  adev->ip_blocks[i].version->funcs->name, r);
2556 		}
2557 
2558 		adev->ip_blocks[i].status.hw = false;
2559 	}
2560 
2561 
2562 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2563 		if (!adev->ip_blocks[i].status.sw)
2564 			continue;
2565 
2566 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2567 			amdgpu_ucode_free_bo(adev);
2568 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2569 			amdgpu_device_wb_fini(adev);
2570 			amdgpu_device_vram_scratch_fini(adev);
2571 			amdgpu_ib_pool_fini(adev);
2572 		}
2573 
2574 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2575 		/* XXX handle errors */
2576 		if (r) {
2577 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2578 				  adev->ip_blocks[i].version->funcs->name, r);
2579 		}
2580 		adev->ip_blocks[i].status.sw = false;
2581 		adev->ip_blocks[i].status.valid = false;
2582 	}
2583 
2584 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2585 		if (!adev->ip_blocks[i].status.late_initialized)
2586 			continue;
2587 		if (adev->ip_blocks[i].version->funcs->late_fini)
2588 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2589 		adev->ip_blocks[i].status.late_initialized = false;
2590 	}
2591 
2592 	amdgpu_ras_fini(adev);
2593 
2594 	if (amdgpu_sriov_vf(adev))
2595 		if (amdgpu_virt_release_full_gpu(adev, false))
2596 			DRM_ERROR("failed to release exclusive mode on fini\n");
2597 
2598 	return 0;
2599 }
2600 
2601 /**
2602  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2603  *
2604  * @work: work_struct.
2605  */
2606 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2607 {
2608 	struct amdgpu_device *adev =
2609 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2610 	int r;
2611 
2612 	r = amdgpu_ib_ring_tests(adev);
2613 	if (r)
2614 		DRM_ERROR("ib ring test failed (%d).\n", r);
2615 }
2616 
2617 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2618 {
2619 	struct amdgpu_device *adev =
2620 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2621 
2622 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2623 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2624 
2625 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2626 		adev->gfx.gfx_off_state = true;
2627 }
2628 
2629 /**
2630  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2631  *
2632  * @adev: amdgpu_device pointer
2633  *
2634  * Main suspend function for hardware IPs.  The list of all the hardware
2635  * IPs that make up the asic is walked, clockgating is disabled and the
2636  * suspend callbacks are run.  suspend puts the hardware and software state
2637  * in each IP into a state suitable for suspend.
2638  * Returns 0 on success, negative error code on failure.
2639  */
2640 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2641 {
2642 	int i, r;
2643 
2644 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2645 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2646 
2647 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2648 		if (!adev->ip_blocks[i].status.valid)
2649 			continue;
2650 
2651 		/* displays are handled separately */
2652 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2653 			continue;
2654 
2655 		/* XXX handle errors */
2656 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2657 		/* XXX handle errors */
2658 		if (r) {
2659 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2660 				  adev->ip_blocks[i].version->funcs->name, r);
2661 			return r;
2662 		}
2663 
2664 		adev->ip_blocks[i].status.hw = false;
2665 	}
2666 
2667 	return 0;
2668 }
2669 
2670 /**
2671  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2672  *
2673  * @adev: amdgpu_device pointer
2674  *
2675  * Main suspend function for hardware IPs.  The list of all the hardware
2676  * IPs that make up the asic is walked, clockgating is disabled and the
2677  * suspend callbacks are run.  suspend puts the hardware and software state
2678  * in each IP into a state suitable for suspend.
2679  * Returns 0 on success, negative error code on failure.
2680  */
2681 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2682 {
2683 	int i, r;
2684 
2685 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2686 		if (!adev->ip_blocks[i].status.valid)
2687 			continue;
2688 		/* displays are handled in phase1 */
2689 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2690 			continue;
2691 		/* PSP lost connection when err_event_athub occurs */
2692 		if (amdgpu_ras_intr_triggered() &&
2693 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2694 			adev->ip_blocks[i].status.hw = false;
2695 			continue;
2696 		}
2697 		/* XXX handle errors */
2698 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2699 		/* XXX handle errors */
2700 		if (r) {
2701 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2702 				  adev->ip_blocks[i].version->funcs->name, r);
2703 		}
2704 		adev->ip_blocks[i].status.hw = false;
2705 		/* handle putting the SMC in the appropriate state */
2706 		if (!amdgpu_sriov_vf(adev)) {
2707 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2708 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2709 				if (r) {
2710 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2711 							adev->mp1_state, r);
2712 					return r;
2713 				}
2714 			}
2715 		}
2716 		adev->ip_blocks[i].status.hw = false;
2717 	}
2718 
2719 	return 0;
2720 }
2721 
2722 /**
2723  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2724  *
2725  * @adev: amdgpu_device pointer
2726  *
2727  * Main suspend function for hardware IPs.  The list of all the hardware
2728  * IPs that make up the asic is walked, clockgating is disabled and the
2729  * suspend callbacks are run.  suspend puts the hardware and software state
2730  * in each IP into a state suitable for suspend.
2731  * Returns 0 on success, negative error code on failure.
2732  */
2733 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2734 {
2735 	int r;
2736 
2737 	if (amdgpu_sriov_vf(adev))
2738 		amdgpu_virt_request_full_gpu(adev, false);
2739 
2740 	r = amdgpu_device_ip_suspend_phase1(adev);
2741 	if (r)
2742 		return r;
2743 	r = amdgpu_device_ip_suspend_phase2(adev);
2744 
2745 	if (amdgpu_sriov_vf(adev))
2746 		amdgpu_virt_release_full_gpu(adev, false);
2747 
2748 	return r;
2749 }
2750 
2751 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2752 {
2753 	int i, r;
2754 
2755 	static enum amd_ip_block_type ip_order[] = {
2756 		AMD_IP_BLOCK_TYPE_GMC,
2757 		AMD_IP_BLOCK_TYPE_COMMON,
2758 		AMD_IP_BLOCK_TYPE_PSP,
2759 		AMD_IP_BLOCK_TYPE_IH,
2760 	};
2761 
2762 	for (i = 0; i < adev->num_ip_blocks; i++) {
2763 		int j;
2764 		struct amdgpu_ip_block *block;
2765 
2766 		block = &adev->ip_blocks[i];
2767 		block->status.hw = false;
2768 
2769 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2770 
2771 			if (block->version->type != ip_order[j] ||
2772 				!block->status.valid)
2773 				continue;
2774 
2775 			r = block->version->funcs->hw_init(adev);
2776 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2777 			if (r)
2778 				return r;
2779 			block->status.hw = true;
2780 		}
2781 	}
2782 
2783 	return 0;
2784 }
2785 
2786 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2787 {
2788 	int i, r;
2789 
2790 	static enum amd_ip_block_type ip_order[] = {
2791 		AMD_IP_BLOCK_TYPE_SMC,
2792 		AMD_IP_BLOCK_TYPE_DCE,
2793 		AMD_IP_BLOCK_TYPE_GFX,
2794 		AMD_IP_BLOCK_TYPE_SDMA,
2795 		AMD_IP_BLOCK_TYPE_UVD,
2796 		AMD_IP_BLOCK_TYPE_VCE,
2797 		AMD_IP_BLOCK_TYPE_VCN
2798 	};
2799 
2800 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2801 		int j;
2802 		struct amdgpu_ip_block *block;
2803 
2804 		for (j = 0; j < adev->num_ip_blocks; j++) {
2805 			block = &adev->ip_blocks[j];
2806 
2807 			if (block->version->type != ip_order[i] ||
2808 				!block->status.valid ||
2809 				block->status.hw)
2810 				continue;
2811 
2812 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2813 				r = block->version->funcs->resume(adev);
2814 			else
2815 				r = block->version->funcs->hw_init(adev);
2816 
2817 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2818 			if (r)
2819 				return r;
2820 			block->status.hw = true;
2821 		}
2822 	}
2823 
2824 	return 0;
2825 }
2826 
2827 /**
2828  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2829  *
2830  * @adev: amdgpu_device pointer
2831  *
2832  * First resume function for hardware IPs.  The list of all the hardware
2833  * IPs that make up the asic is walked and the resume callbacks are run for
2834  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2835  * after a suspend and updates the software state as necessary.  This
2836  * function is also used for restoring the GPU after a GPU reset.
2837  * Returns 0 on success, negative error code on failure.
2838  */
2839 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2840 {
2841 	int i, r;
2842 
2843 	for (i = 0; i < adev->num_ip_blocks; i++) {
2844 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2845 			continue;
2846 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2847 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2848 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2849 
2850 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2851 			if (r) {
2852 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2853 					  adev->ip_blocks[i].version->funcs->name, r);
2854 				return r;
2855 			}
2856 			adev->ip_blocks[i].status.hw = true;
2857 		}
2858 	}
2859 
2860 	return 0;
2861 }
2862 
2863 /**
2864  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2865  *
2866  * @adev: amdgpu_device pointer
2867  *
2868  * Second resume function for hardware IPs.  The list of all the hardware
2869  * IPs that make up the asic is walked and the resume callbacks are run for
2870  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2871  * functional state after a suspend and updates the software state as
2872  * necessary.  This function is also used for restoring the GPU after a GPU
2873  * reset.
2874  * Returns 0 on success, negative error code on failure.
2875  */
2876 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2877 {
2878 	int i, r;
2879 
2880 	for (i = 0; i < adev->num_ip_blocks; i++) {
2881 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2882 			continue;
2883 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2884 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2885 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2886 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2887 			continue;
2888 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2889 		if (r) {
2890 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2891 				  adev->ip_blocks[i].version->funcs->name, r);
2892 			return r;
2893 		}
2894 		adev->ip_blocks[i].status.hw = true;
2895 	}
2896 
2897 	return 0;
2898 }
2899 
2900 /**
2901  * amdgpu_device_ip_resume - run resume for hardware IPs
2902  *
2903  * @adev: amdgpu_device pointer
2904  *
2905  * Main resume function for hardware IPs.  The hardware IPs
2906  * are split into two resume functions because they are
2907  * also used in recovering from a GPU reset and some additional
2908  * steps need to be taken between them.  In this case (S3/S4) they are
2909  * run sequentially.
2910  * Returns 0 on success, negative error code on failure.
2911  */
2912 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2913 {
2914 	int r;
2915 
2916 	r = amdgpu_amdkfd_resume_iommu(adev);
2917 	if (r)
2918 		return r;
2919 
2920 	r = amdgpu_device_ip_resume_phase1(adev);
2921 	if (r)
2922 		return r;
2923 
2924 	r = amdgpu_device_fw_loading(adev);
2925 	if (r)
2926 		return r;
2927 
2928 	r = amdgpu_device_ip_resume_phase2(adev);
2929 
2930 	return r;
2931 }
2932 
2933 /**
2934  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2935  *
2936  * @adev: amdgpu_device pointer
2937  *
2938  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2939  */
2940 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2941 {
2942 	if (amdgpu_sriov_vf(adev)) {
2943 		if (adev->is_atom_fw) {
2944 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2945 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2946 		} else {
2947 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2948 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2949 		}
2950 
2951 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2952 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2953 	}
2954 }
2955 
2956 /**
2957  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2958  *
2959  * @asic_type: AMD asic type
2960  *
2961  * Check if there is DC (new modesetting infrastructure) support for an asic.
2962  * returns true if DC has support, false if not.
2963  */
2964 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2965 {
2966 	switch (asic_type) {
2967 #if defined(CONFIG_DRM_AMD_DC)
2968 #if defined(CONFIG_DRM_AMD_DC_SI)
2969 	case CHIP_TAHITI:
2970 	case CHIP_PITCAIRN:
2971 	case CHIP_VERDE:
2972 	case CHIP_OLAND:
2973 #endif
2974 	case CHIP_BONAIRE:
2975 	case CHIP_KAVERI:
2976 	case CHIP_KABINI:
2977 	case CHIP_MULLINS:
2978 		/*
2979 		 * We have systems in the wild with these ASICs that require
2980 		 * LVDS and VGA support which is not supported with DC.
2981 		 *
2982 		 * Fallback to the non-DC driver here by default so as not to
2983 		 * cause regressions.
2984 		 */
2985 		return amdgpu_dc > 0;
2986 	case CHIP_HAWAII:
2987 	case CHIP_CARRIZO:
2988 	case CHIP_STONEY:
2989 	case CHIP_POLARIS10:
2990 	case CHIP_POLARIS11:
2991 	case CHIP_POLARIS12:
2992 	case CHIP_VEGAM:
2993 	case CHIP_TONGA:
2994 	case CHIP_FIJI:
2995 	case CHIP_VEGA10:
2996 	case CHIP_VEGA12:
2997 	case CHIP_VEGA20:
2998 #if defined(CONFIG_DRM_AMD_DC_DCN)
2999 	case CHIP_RAVEN:
3000 	case CHIP_NAVI10:
3001 	case CHIP_NAVI14:
3002 	case CHIP_NAVI12:
3003 	case CHIP_RENOIR:
3004 #endif
3005 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3006 	case CHIP_SIENNA_CICHLID:
3007 	case CHIP_NAVY_FLOUNDER:
3008 #endif
3009 		return amdgpu_dc != 0;
3010 #endif
3011 	default:
3012 		if (amdgpu_dc > 0)
3013 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3014 					 "but isn't supported by ASIC, ignoring\n");
3015 		return false;
3016 	}
3017 }
3018 
3019 /**
3020  * amdgpu_device_has_dc_support - check if dc is supported
3021  *
3022  * @adev: amdgpu_device pointer
3023  *
3024  * Returns true for supported, false for not supported
3025  */
3026 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3027 {
3028 	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3029 		return false;
3030 
3031 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3032 }
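
/*
 * Illustration only (editor's sketch, not part of the original file): the
 * usual call-site pattern for the helper above.
 *
 *	if (amdgpu_device_has_dc_support(adev))
 *		...bring up the DC (atomic) display path...
 *	else
 *		...fall back to the legacy modesetting path...
 */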
3033 
3034 
3035 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3036 {
3037 	struct amdgpu_device *adev =
3038 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3039 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3040 
3041 	/* It's a bug to not have a hive within this function */
3042 	if (WARN_ON(!hive))
3043 		return;
3044 
3045 	/*
3046 	 * Use task barrier to synchronize all xgmi reset works across the
3047 	 * hive. task_barrier_enter and task_barrier_exit will block
3048 	 * until all the threads running the xgmi reset works reach
3049 	 * those points. task_barrier_full will do both blocks.
3050 	 */
3051 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3052 
3053 		task_barrier_enter(&hive->tb);
3054 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3055 
3056 		if (adev->asic_reset_res)
3057 			goto fail;
3058 
3059 		task_barrier_exit(&hive->tb);
3060 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3061 
3062 		if (adev->asic_reset_res)
3063 			goto fail;
3064 
3065 		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3066 			adev->mmhub.funcs->reset_ras_error_count(adev);
3067 	} else {
3068 
3069 		task_barrier_full(&hive->tb);
3070 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3071 	}
3072 
3073 fail:
3074 	if (adev->asic_reset_res)
3075 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3076 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3077 	amdgpu_put_xgmi_hive(hive);
3078 }
3079 
3080 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3081 {
3082 	char *input = amdgpu_lockup_timeout;
3083 	char *timeout_setting = NULL;
3084 	int index = 0;
3085 	long timeout;
3086 	int ret = 0;
3087 
3088 	/*
3089 	 * By default the timeout for non-compute jobs is 10000 ms,
3090 	 * and there is no timeout enforced on compute jobs.
3091 	 * In SR-IOV or passthrough mode, the default timeout for
3092 	 * compute jobs is 60000 ms.
3093 	 */
3094 	adev->gfx_timeout = msecs_to_jiffies(10000);
3095 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3096 	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3097 		adev->compute_timeout =  msecs_to_jiffies(60000);
3098 	else
3099 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3100 
3101 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3102 		while ((timeout_setting = strsep(&input, ",")) &&
3103 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3104 			ret = kstrtol(timeout_setting, 0, &timeout);
3105 			if (ret)
3106 				return ret;
3107 
3108 			if (timeout == 0) {
3109 				index++;
3110 				continue;
3111 			} else if (timeout < 0) {
3112 				timeout = MAX_SCHEDULE_TIMEOUT;
3113 			} else {
3114 				timeout = msecs_to_jiffies(timeout);
3115 			}
3116 
3117 			switch (index++) {
3118 			case 0:
3119 				adev->gfx_timeout = timeout;
3120 				break;
3121 			case 1:
3122 				adev->compute_timeout = timeout;
3123 				break;
3124 			case 2:
3125 				adev->sdma_timeout = timeout;
3126 				break;
3127 			case 3:
3128 				adev->video_timeout = timeout;
3129 				break;
3130 			default:
3131 				break;
3132 			}
3133 		}
3134 		/*
3135 		 * There is only one value specified and
3136 		 * it should apply to all non-compute jobs.
3137 		 */
3138 		if (index == 1) {
3139 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3140 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3141 				adev->compute_timeout = adev->gfx_timeout;
3142 		}
3143 	}
3144 
3145 	return ret;
3146 }
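
/*
 * Illustration only (editor's note, not part of the original file): per the
 * parsing above, lockup_timeout accepts up to four comma-separated values in
 * the order gfx, compute, sdma, video, in milliseconds; 0 keeps the default
 * and a negative value disables the timeout, e.g.:
 *
 *	amdgpu.lockup_timeout=10000,60000,10000,10000
 */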
3147 
3148 static const struct attribute *amdgpu_dev_attributes[] = {
3149 	&dev_attr_product_name.attr,
3150 	&dev_attr_product_number.attr,
3151 	&dev_attr_serial_number.attr,
3152 	&dev_attr_pcie_replay_count.attr,
3153 	NULL
3154 };
3155 
3156 
3157 /**
3158  * amdgpu_device_init - initialize the driver
3159  *
3160  * @adev: amdgpu_device pointer
3161  * @flags: driver flags
3162  *
3163  * Initializes the driver info and hw (all asics).
3164  * Returns 0 for success or an error on failure.
3165  * Called at driver startup.
3166  */
3167 int amdgpu_device_init(struct amdgpu_device *adev,
3168 		       uint32_t flags)
3169 {
3170 	struct drm_device *ddev = adev_to_drm(adev);
3171 	struct pci_dev *pdev = adev->pdev;
3172 	int r, i;
3173 	bool boco = false;
3174 	u32 max_MBps;
3175 
3176 	adev->shutdown = false;
3177 	adev->flags = flags;
3178 
3179 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3180 		adev->asic_type = amdgpu_force_asic_type;
3181 	else
3182 		adev->asic_type = flags & AMD_ASIC_MASK;
3183 
3184 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3185 	if (amdgpu_emu_mode == 1)
3186 		adev->usec_timeout *= 10;
3187 	adev->gmc.gart_size = 512 * 1024 * 1024;
3188 	adev->accel_working = false;
3189 	adev->num_rings = 0;
3190 	adev->mman.buffer_funcs = NULL;
3191 	adev->mman.buffer_funcs_ring = NULL;
3192 	adev->vm_manager.vm_pte_funcs = NULL;
3193 	adev->vm_manager.vm_pte_num_scheds = 0;
3194 	adev->gmc.gmc_funcs = NULL;
3195 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3196 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3197 
3198 	adev->smc_rreg = &amdgpu_invalid_rreg;
3199 	adev->smc_wreg = &amdgpu_invalid_wreg;
3200 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3201 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3202 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3203 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3204 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3205 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3206 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3207 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3208 	adev->didt_rreg = &amdgpu_invalid_rreg;
3209 	adev->didt_wreg = &amdgpu_invalid_wreg;
3210 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3211 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3212 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3213 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3214 
3215 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3216 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3217 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3218 
3219 	/* mutex initializations are all done here so we
3220 	 * can call functions again without locking issues */
3221 	atomic_set(&adev->irq.ih.lock, 0);
3222 	mutex_init(&adev->firmware.mutex);
3223 	mutex_init(&adev->pm.mutex);
3224 	mutex_init(&adev->gfx.gpu_clock_mutex);
3225 	mutex_init(&adev->srbm_mutex);
3226 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3227 	mutex_init(&adev->gfx.gfx_off_mutex);
3228 	mutex_init(&adev->grbm_idx_mutex);
3229 	mutex_init(&adev->mn_lock);
3230 	mutex_init(&adev->virt.vf_errors.lock);
3231 	hash_init(adev->mn_hash);
3232 	atomic_set(&adev->in_gpu_reset, 0);
3233 	init_rwsem(&adev->reset_sem);
3234 	mutex_init(&adev->psp.mutex);
3235 	mutex_init(&adev->notifier_lock);
3236 
3237 	r = amdgpu_device_check_arguments(adev);
3238 	if (r)
3239 		return r;
3240 
3241 	spin_lock_init(&adev->mmio_idx_lock);
3242 	spin_lock_init(&adev->smc_idx_lock);
3243 	spin_lock_init(&adev->pcie_idx_lock);
3244 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3245 	spin_lock_init(&adev->didt_idx_lock);
3246 	spin_lock_init(&adev->gc_cac_idx_lock);
3247 	spin_lock_init(&adev->se_cac_idx_lock);
3248 	spin_lock_init(&adev->audio_endpt_idx_lock);
3249 	spin_lock_init(&adev->mm_stats.lock);
3250 
3251 	INIT_LIST_HEAD(&adev->shadow_list);
3252 	mutex_init(&adev->shadow_list_lock);
3253 
3254 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3255 			  amdgpu_device_delayed_init_work_handler);
3256 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3257 			  amdgpu_device_delay_enable_gfx_off);
3258 
3259 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3260 
3261 	adev->gfx.gfx_off_req_count = 1;
3262 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3263 
3264 	atomic_set(&adev->throttling_logging_enabled, 1);
3265 	/*
3266 	 * If throttling continues, logging will be performed every minute
3267 	 * to avoid log flooding. "-1" is subtracted since the thermal
3268 	 * throttling interrupt comes every second. Thus, the total logging
3269 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3270 	 * for throttling interrupt) = 60 seconds.
3271 	 */
3272 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3273 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3274 
3275 	/* Registers mapping */
3276 	/* TODO: block userspace mapping of io register */
3277 	if (adev->asic_type >= CHIP_BONAIRE) {
3278 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3279 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3280 	} else {
3281 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3282 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3283 	}
3284 
3285 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3286 	if (adev->rmmio == NULL) {
3287 		return -ENOMEM;
3288 	}
3289 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3290 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3291 
3292 	/* io port mapping */
3293 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3294 		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3295 			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3296 			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3297 			break;
3298 		}
3299 	}
3300 	if (adev->rio_mem == NULL)
3301 		DRM_INFO("PCI I/O BAR is not found.\n");
3302 
3303 	/* enable PCIE atomic ops */
3304 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3305 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3306 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3307 	if (r) {
3308 		adev->have_atomics_support = false;
3309 		DRM_INFO("PCIE atomic ops is not supported\n");
3310 	} else {
3311 		adev->have_atomics_support = true;
3312 	}
3313 
3314 	amdgpu_device_get_pcie_info(adev);
3315 
3316 	if (amdgpu_mcbp)
3317 		DRM_INFO("MCBP is enabled\n");
3318 
3319 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3320 		adev->enable_mes = true;
3321 
3322 	/* detect hw virtualization here */
3323 	amdgpu_detect_virtualization(adev);
3324 
3325 	r = amdgpu_device_get_job_timeout_settings(adev);
3326 	if (r) {
3327 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3328 		return r;
3329 	}
3330 
3331 	/* early init functions */
3332 	r = amdgpu_device_ip_early_init(adev);
3333 	if (r)
3334 		return r;
3335 
3336 	/* doorbell bar mapping and doorbell index init*/
3337 	amdgpu_device_doorbell_init(adev);
3338 
3339 	/* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3340 	/* this will fail for cards that aren't VGA class devices, just
3341 	 * ignore it */
3342 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3343 
3344 	if (amdgpu_device_supports_boco(ddev))
3345 		boco = true;
3346 	if (amdgpu_has_atpx() &&
3347 	    (amdgpu_is_atpx_hybrid() ||
3348 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3349 	    !pci_is_thunderbolt_attached(adev->pdev))
3350 		vga_switcheroo_register_client(adev->pdev,
3351 					       &amdgpu_switcheroo_ops, boco);
3352 	if (boco)
3353 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3354 
3355 	if (amdgpu_emu_mode == 1) {
3356 		/* post the asic on emulation mode */
3357 		emu_soc_asic_init(adev);
3358 		goto fence_driver_init;
3359 	}
3360 
3361 	/* detect if we are with an SRIOV vbios */
3362 	amdgpu_device_detect_sriov_bios(adev);
3363 
3364 	/* check if we need to reset the asic
3365 	 *  E.g., driver was not cleanly unloaded previously, etc.
3366 	 */
3367 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3368 		r = amdgpu_asic_reset(adev);
3369 		if (r) {
3370 			dev_err(adev->dev, "asic reset on init failed\n");
3371 			goto failed;
3372 		}
3373 	}
3374 
3375 	pci_enable_pcie_error_reporting(adev->ddev.pdev);
3376 
3377 	/* Post card if necessary */
3378 	if (amdgpu_device_need_post(adev)) {
3379 		if (!adev->bios) {
3380 			dev_err(adev->dev, "no vBIOS found\n");
3381 			r = -EINVAL;
3382 			goto failed;
3383 		}
3384 		DRM_INFO("GPU posting now...\n");
3385 		r = amdgpu_device_asic_init(adev);
3386 		if (r) {
3387 			dev_err(adev->dev, "gpu post error!\n");
3388 			goto failed;
3389 		}
3390 	}
3391 
3392 	if (adev->is_atom_fw) {
3393 		/* Initialize clocks */
3394 		r = amdgpu_atomfirmware_get_clock_info(adev);
3395 		if (r) {
3396 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3397 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3398 			goto failed;
3399 		}
3400 	} else {
3401 		/* Initialize clocks */
3402 		r = amdgpu_atombios_get_clock_info(adev);
3403 		if (r) {
3404 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3405 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3406 			goto failed;
3407 		}
3408 		/* init i2c buses */
3409 		if (!amdgpu_device_has_dc_support(adev))
3410 			amdgpu_atombios_i2c_init(adev);
3411 	}
3412 
3413 fence_driver_init:
3414 	/* Fence driver */
3415 	r = amdgpu_fence_driver_init(adev);
3416 	if (r) {
3417 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3418 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3419 		goto failed;
3420 	}
3421 
3422 	/* init the mode config */
3423 	drm_mode_config_init(adev_to_drm(adev));
3424 
3425 	r = amdgpu_device_ip_init(adev);
3426 	if (r) {
3427 		/* failed in exclusive mode due to timeout */
3428 		if (amdgpu_sriov_vf(adev) &&
3429 		    !amdgpu_sriov_runtime(adev) &&
3430 		    amdgpu_virt_mmio_blocked(adev) &&
3431 		    !amdgpu_virt_wait_reset(adev)) {
3432 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3433 			/* Don't send request since VF is inactive. */
3434 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3435 			adev->virt.ops = NULL;
3436 			r = -EAGAIN;
3437 			goto failed;
3438 		}
3439 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3440 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3441 		goto failed;
3442 	}
3443 
3444 	dev_info(adev->dev,
3445 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3446 			adev->gfx.config.max_shader_engines,
3447 			adev->gfx.config.max_sh_per_se,
3448 			adev->gfx.config.max_cu_per_sh,
3449 			adev->gfx.cu_info.number);
3450 
3451 	adev->accel_working = true;
3452 
3453 	amdgpu_vm_check_compute_bug(adev);
3454 
3455 	/* Initialize the buffer migration limit. */
3456 	if (amdgpu_moverate >= 0)
3457 		max_MBps = amdgpu_moverate;
3458 	else
3459 		max_MBps = 8; /* Allow 8 MB/s. */
3460 	/* Get a log2 for easy divisions. */
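	/* Keeping the rate as a shift lets the buffer-move accounting trade
	 * divides for shifts; roughly, 1 MB/s is treated as 1 byte/us in the
	 * conversions elsewhere in the driver. */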
3461 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3462 
3463 	amdgpu_fbdev_init(adev);
3464 
3465 	r = amdgpu_pm_sysfs_init(adev);
3466 	if (r) {
3467 		adev->pm_sysfs_en = false;
3468 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3469 	} else
3470 		adev->pm_sysfs_en = true;
3471 
3472 	r = amdgpu_ucode_sysfs_init(adev);
3473 	if (r) {
3474 		adev->ucode_sysfs_en = false;
3475 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3476 	} else
3477 		adev->ucode_sysfs_en = true;
3478 
3479 	if ((amdgpu_testing & 1)) {
3480 		if (adev->accel_working)
3481 			amdgpu_test_moves(adev);
3482 		else
3483 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3484 	}
3485 	if (amdgpu_benchmarking) {
3486 		if (adev->accel_working)
3487 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3488 		else
3489 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3490 	}
3491 
3492 	/*
3493 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3494 	 * Otherwise the mgpu fan boost feature will be skipped because the
3495 	 * gpu instance count would come up short.
3496 	 */
3497 	amdgpu_register_gpu_instance(adev);
3498 
3499 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3500 	 * explicit gating rather than handling it automatically.
3501 	 */
3502 	r = amdgpu_device_ip_late_init(adev);
3503 	if (r) {
3504 		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3505 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3506 		goto failed;
3507 	}
3508 
3509 	/* must succeed. */
3510 	amdgpu_ras_resume(adev);
3511 
3512 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3513 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3514 
3515 	if (amdgpu_sriov_vf(adev))
3516 		flush_delayed_work(&adev->delayed_init_work);
3517 
3518 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3519 	if (r)
3520 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3521 
3522 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3523 		r = amdgpu_pmu_init(adev);
3524 	if (r)
3525 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3526 
3527 	/* Keep a cached copy of the PCI config space at hand so it can be restored after a sudden PCI error */
3528 	if (amdgpu_device_cache_pci_state(adev->pdev))
3529 		pci_restore_state(pdev);
3530 
3531 	return 0;
3532 
3533 failed:
3534 	amdgpu_vf_error_trans_all(adev);
3535 	if (boco)
3536 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3537 
3538 	return r;
3539 }
3540 
3541 /**
3542  * amdgpu_device_fini - tear down the driver
3543  *
3544  * @adev: amdgpu_device pointer
3545  *
3546  * Tear down the driver info (all asics).
3547  * Called at driver shutdown.
3548  */
3549 void amdgpu_device_fini(struct amdgpu_device *adev)
3550 {
3551 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3552 	flush_delayed_work(&adev->delayed_init_work);
3553 	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3554 	adev->shutdown = true;
3555 
3556 	kfree(adev->pci_state);
3557 
3558 	/* make sure the IB test has finished before entering exclusive mode
3559 	 * to avoid preemption on the IB test
3560 	 */
3561 	if (amdgpu_sriov_vf(adev)) {
3562 		amdgpu_virt_request_full_gpu(adev, false);
3563 		amdgpu_virt_fini_data_exchange(adev);
3564 	}
3565 
3566 	/* disable all interrupts */
3567 	amdgpu_irq_disable_all(adev);
3568 	if (adev->mode_info.mode_config_initialized) {
3569 		if (!amdgpu_device_has_dc_support(adev))
3570 			drm_helper_force_disable_all(adev_to_drm(adev));
3571 		else
3572 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3573 	}
3574 	amdgpu_fence_driver_fini(adev);
3575 	if (adev->pm_sysfs_en)
3576 		amdgpu_pm_sysfs_fini(adev);
3577 	amdgpu_fbdev_fini(adev);
3578 	amdgpu_device_ip_fini(adev);
3579 	release_firmware(adev->firmware.gpu_info_fw);
3580 	adev->firmware.gpu_info_fw = NULL;
3581 	adev->accel_working = false;
3582 	/* free i2c buses */
3583 	if (!amdgpu_device_has_dc_support(adev))
3584 		amdgpu_i2c_fini(adev);
3585 
3586 	if (amdgpu_emu_mode != 1)
3587 		amdgpu_atombios_fini(adev);
3588 
3589 	kfree(adev->bios);
3590 	adev->bios = NULL;
3591 	if (amdgpu_has_atpx() &&
3592 	    (amdgpu_is_atpx_hybrid() ||
3593 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3594 	    !pci_is_thunderbolt_attached(adev->pdev))
3595 		vga_switcheroo_unregister_client(adev->pdev);
3596 	if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3597 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3598 	vga_client_register(adev->pdev, NULL, NULL, NULL);
3599 	if (adev->rio_mem)
3600 		pci_iounmap(adev->pdev, adev->rio_mem);
3601 	adev->rio_mem = NULL;
3602 	iounmap(adev->rmmio);
3603 	adev->rmmio = NULL;
3604 	amdgpu_device_doorbell_fini(adev);
3605 
3606 	if (adev->ucode_sysfs_en)
3607 		amdgpu_ucode_sysfs_fini(adev);
3608 
3609 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3610 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3611 		amdgpu_pmu_fini(adev);
3612 	if (adev->mman.discovery_bin)
3613 		amdgpu_discovery_fini(adev);
3614 }
3615 
3616 
3617 /*
3618  * Suspend & resume.
3619  */
3620 /**
3621  * amdgpu_device_suspend - initiate device suspend
3622  *
3623  * @dev: drm dev pointer
3624  * @fbcon : notify the fbdev of suspend
3625  *
3626  * Puts the hw in the suspend state (all asics).
3627  * Returns 0 for success or an error on failure.
3628  * Called at driver suspend.
3629  */
3630 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3631 {
3632 	struct amdgpu_device *adev;
3633 	struct drm_crtc *crtc;
3634 	struct drm_connector *connector;
3635 	struct drm_connector_list_iter iter;
3636 	int r;
3637 
3638 	adev = drm_to_adev(dev);
3639 
3640 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3641 		return 0;
3642 
3643 	adev->in_suspend = true;
3644 	drm_kms_helper_poll_disable(dev);
3645 
3646 	if (fbcon)
3647 		amdgpu_fbdev_set_suspend(adev, 1);
3648 
3649 	cancel_delayed_work_sync(&adev->delayed_init_work);
3650 
3651 	if (!amdgpu_device_has_dc_support(adev)) {
3652 		/* turn off display hw */
3653 		drm_modeset_lock_all(dev);
3654 		drm_connector_list_iter_begin(dev, &iter);
3655 		drm_for_each_connector_iter(connector, &iter)
3656 			drm_helper_connector_dpms(connector,
3657 						  DRM_MODE_DPMS_OFF);
3658 		drm_connector_list_iter_end(&iter);
3659 		drm_modeset_unlock_all(dev);
3660 		/* unpin the front buffers and cursors */
3661 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3662 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3663 			struct drm_framebuffer *fb = crtc->primary->fb;
3664 			struct amdgpu_bo *robj;
3665 
3666 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3667 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3668 				r = amdgpu_bo_reserve(aobj, true);
3669 				if (r == 0) {
3670 					amdgpu_bo_unpin(aobj);
3671 					amdgpu_bo_unreserve(aobj);
3672 				}
3673 			}
3674 
3675 			if (fb == NULL || fb->obj[0] == NULL) {
3676 				continue;
3677 			}
3678 			robj = gem_to_amdgpu_bo(fb->obj[0]);
3679 			/* don't unpin kernel fb objects */
3680 			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3681 				r = amdgpu_bo_reserve(robj, true);
3682 				if (r == 0) {
3683 					amdgpu_bo_unpin(robj);
3684 					amdgpu_bo_unreserve(robj);
3685 				}
3686 			}
3687 		}
3688 	}
3689 
3690 	amdgpu_ras_suspend(adev);
3691 
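	/* Phase 1 quiesces the display hardware first; the remaining IP blocks
	 * are suspended in phase 2 below, once VRAM has been evicted and the
	 * fence driver put to sleep. */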
3692 	r = amdgpu_device_ip_suspend_phase1(adev);
3693 
3694 	amdgpu_amdkfd_suspend(adev, !fbcon);
3695 
3696 	/* evict vram memory */
3697 	amdgpu_bo_evict_vram(adev);
3698 
3699 	amdgpu_fence_driver_suspend(adev);
3700 
3701 	r = amdgpu_device_ip_suspend_phase2(adev);
3702 
3703 	/* evict remaining vram memory
3704 	 * This second call to evict vram is to evict the gart page table
3705 	 * using the CPU.
3706 	 */
3707 	amdgpu_bo_evict_vram(adev);
3708 
3709 	return 0;
3710 }
3711 
3712 /**
3713  * amdgpu_device_resume - initiate device resume
3714  *
3715  * @dev: drm dev pointer
3716  * @fbcon : notify the fbdev of resume
3717  *
3718  * Bring the hw back to operating state (all asics).
3719  * Returns 0 for success or an error on failure.
3720  * Called at driver resume.
3721  */
3722 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3723 {
3724 	struct drm_connector *connector;
3725 	struct drm_connector_list_iter iter;
3726 	struct amdgpu_device *adev = drm_to_adev(dev);
3727 	struct drm_crtc *crtc;
3728 	int r = 0;
3729 
3730 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3731 		return 0;
3732 
3733 	/* post card */
3734 	if (amdgpu_device_need_post(adev)) {
3735 		r = amdgpu_device_asic_init(adev);
3736 		if (r)
3737 			dev_err(adev->dev, "amdgpu asic init failed\n");
3738 	}
3739 
3740 	r = amdgpu_device_ip_resume(adev);
3741 	if (r) {
3742 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3743 		return r;
3744 	}
3745 	amdgpu_fence_driver_resume(adev);
3746 
3747 
3748 	r = amdgpu_device_ip_late_init(adev);
3749 	if (r)
3750 		return r;
3751 
3752 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3753 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3754 
3755 	if (!amdgpu_device_has_dc_support(adev)) {
3756 		/* pin cursors */
3757 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3758 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3759 
3760 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3761 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3762 				r = amdgpu_bo_reserve(aobj, true);
3763 				if (r == 0) {
3764 					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3765 					if (r != 0)
3766 						dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3767 					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3768 					amdgpu_bo_unreserve(aobj);
3769 				}
3770 			}
3771 		}
3772 	}
3773 	r = amdgpu_amdkfd_resume(adev, !fbcon);
3774 	if (r)
3775 		return r;
3776 
3777 	/* Make sure IB tests flushed */
3778 	flush_delayed_work(&adev->delayed_init_work);
3779 
3780 	/* blat the mode back in */
3781 	if (fbcon) {
3782 		if (!amdgpu_device_has_dc_support(adev)) {
3783 			/* pre DCE11 */
3784 			drm_helper_resume_force_mode(dev);
3785 
3786 			/* turn on display hw */
3787 			drm_modeset_lock_all(dev);
3788 
3789 			drm_connector_list_iter_begin(dev, &iter);
3790 			drm_for_each_connector_iter(connector, &iter)
3791 				drm_helper_connector_dpms(connector,
3792 							  DRM_MODE_DPMS_ON);
3793 			drm_connector_list_iter_end(&iter);
3794 
3795 			drm_modeset_unlock_all(dev);
3796 		}
3797 		amdgpu_fbdev_set_suspend(adev, 0);
3798 	}
3799 
3800 	drm_kms_helper_poll_enable(dev);
3801 
3802 	amdgpu_ras_resume(adev);
3803 
3804 	/*
3805 	 * Most of the connector probing functions try to acquire runtime pm
3806 	 * refs to ensure that the GPU is powered on when connector polling is
3807 	 * performed. Since we're calling this from a runtime PM callback,
3808 	 * trying to acquire rpm refs will cause us to deadlock.
3809 	 *
3810 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3811 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3812 	 */
3813 #ifdef CONFIG_PM
3814 	dev->dev->power.disable_depth++;
3815 #endif
3816 	if (!amdgpu_device_has_dc_support(adev))
3817 		drm_helper_hpd_irq_event(dev);
3818 	else
3819 		drm_kms_helper_hotplug_event(dev);
3820 #ifdef CONFIG_PM
3821 	dev->dev->power.disable_depth--;
3822 #endif
3823 	adev->in_suspend = false;
3824 
3825 	return 0;
3826 }
3827 
3828 /**
3829  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3830  *
3831  * @adev: amdgpu_device pointer
3832  *
3833  * The list of all the hardware IPs that make up the asic is walked and
3834  * the check_soft_reset callbacks are run.  check_soft_reset determines
3835  * if the asic is still hung or not.
3836  * Returns true if any of the IPs are still in a hung state, false if not.
3837  */
3838 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3839 {
3840 	int i;
3841 	bool asic_hang = false;
3842 
3843 	if (amdgpu_sriov_vf(adev))
3844 		return true;
3845 
3846 	if (amdgpu_asic_need_full_reset(adev))
3847 		return true;
3848 
3849 	for (i = 0; i < adev->num_ip_blocks; i++) {
3850 		if (!adev->ip_blocks[i].status.valid)
3851 			continue;
3852 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3853 			adev->ip_blocks[i].status.hang =
3854 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3855 		if (adev->ip_blocks[i].status.hang) {
3856 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3857 			asic_hang = true;
3858 		}
3859 	}
3860 	return asic_hang;
3861 }
3862 
3863 /**
3864  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3865  *
3866  * @adev: amdgpu_device pointer
3867  *
3868  * The list of all the hardware IPs that make up the asic is walked and the
3869  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3870  * handles any IP specific hardware or software state changes that are
3871  * necessary for a soft reset to succeed.
3872  * Returns 0 on success, negative error code on failure.
3873  */
3874 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3875 {
3876 	int i, r = 0;
3877 
3878 	for (i = 0; i < adev->num_ip_blocks; i++) {
3879 		if (!adev->ip_blocks[i].status.valid)
3880 			continue;
3881 		if (adev->ip_blocks[i].status.hang &&
3882 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3883 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3884 			if (r)
3885 				return r;
3886 		}
3887 	}
3888 
3889 	return 0;
3890 }
3891 
3892 /**
3893  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3894  *
3895  * @adev: amdgpu_device pointer
3896  *
3897  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3898  * reset is necessary to recover.
3899  * Returns true if a full asic reset is required, false if not.
3900  */
3901 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3902 {
3903 	int i;
3904 
3905 	if (amdgpu_asic_need_full_reset(adev))
3906 		return true;
3907 
3908 	for (i = 0; i < adev->num_ip_blocks; i++) {
3909 		if (!adev->ip_blocks[i].status.valid)
3910 			continue;
3911 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3912 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3913 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3914 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3915 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3916 			if (adev->ip_blocks[i].status.hang) {
3917 				dev_info(adev->dev, "Some block need full reset!\n");
3918 				return true;
3919 			}
3920 		}
3921 	}
3922 	return false;
3923 }
3924 
3925 /**
3926  * amdgpu_device_ip_soft_reset - do a soft reset
3927  *
3928  * @adev: amdgpu_device pointer
3929  *
3930  * The list of all the hardware IPs that make up the asic is walked and the
3931  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3932  * IP specific hardware or software state changes that are necessary to soft
3933  * reset the IP.
3934  * Returns 0 on success, negative error code on failure.
3935  */
3936 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3937 {
3938 	int i, r = 0;
3939 
3940 	for (i = 0; i < adev->num_ip_blocks; i++) {
3941 		if (!adev->ip_blocks[i].status.valid)
3942 			continue;
3943 		if (adev->ip_blocks[i].status.hang &&
3944 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3945 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3946 			if (r)
3947 				return r;
3948 		}
3949 	}
3950 
3951 	return 0;
3952 }
3953 
3954 /**
3955  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3956  *
3957  * @adev: amdgpu_device pointer
3958  *
3959  * The list of all the hardware IPs that make up the asic is walked and the
3960  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3961  * handles any IP specific hardware or software state changes that are
3962  * necessary after the IP has been soft reset.
3963  * Returns 0 on success, negative error code on failure.
3964  */
3965 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3966 {
3967 	int i, r = 0;
3968 
3969 	for (i = 0; i < adev->num_ip_blocks; i++) {
3970 		if (!adev->ip_blocks[i].status.valid)
3971 			continue;
3972 		if (adev->ip_blocks[i].status.hang &&
3973 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
3974 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3975 		if (r)
3976 			return r;
3977 	}
3978 
3979 	return 0;
3980 }
3981 
3982 /**
3983  * amdgpu_device_recover_vram - Recover some VRAM contents
3984  *
3985  * @adev: amdgpu_device pointer
3986  *
3987  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3988  * restore things like GPUVM page tables after a GPU reset where
3989  * the contents of VRAM might be lost.
3990  *
3991  * Returns:
3992  * 0 on success, negative error code on failure.
3993  */
3994 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3995 {
3996 	struct dma_fence *fence = NULL, *next = NULL;
3997 	struct amdgpu_bo *shadow;
3998 	long r = 1, tmo;
3999 
4000 	if (amdgpu_sriov_runtime(adev))
4001 		tmo = msecs_to_jiffies(8000);
4002 	else
4003 		tmo = msecs_to_jiffies(100);
4004 
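	/*
	 * Walk the shadow list and kick off a copy from each GTT shadow back
	 * into VRAM.  The waits are pipelined: while the current restore is in
	 * flight we only wait on the previous fence, so the copies overlap
	 * instead of being fully serialized.
	 */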
4005 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4006 	mutex_lock(&adev->shadow_list_lock);
4007 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4008 
4009 		/* No need to recover an evicted BO */
4010 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4011 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4012 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4013 			continue;
4014 
4015 		r = amdgpu_bo_restore_shadow(shadow, &next);
4016 		if (r)
4017 			break;
4018 
4019 		if (fence) {
4020 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4021 			dma_fence_put(fence);
4022 			fence = next;
4023 			if (tmo == 0) {
4024 				r = -ETIMEDOUT;
4025 				break;
4026 			} else if (tmo < 0) {
4027 				r = tmo;
4028 				break;
4029 			}
4030 		} else {
4031 			fence = next;
4032 		}
4033 	}
4034 	mutex_unlock(&adev->shadow_list_lock);
4035 
4036 	if (fence)
4037 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4038 	dma_fence_put(fence);
4039 
4040 	if (r < 0 || tmo <= 0) {
4041 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4042 		return -EIO;
4043 	}
4044 
4045 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4046 	return 0;
4047 }
4048 
4049 
4050 /**
4051  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4052  *
4053  * @adev: amdgpu_device pointer
4054  * @from_hypervisor: request from hypervisor
4055  *
4056  * Do a VF FLR and reinitialize the ASIC.
4057  * Returns 0 on success, a negative error code otherwise.
4058  */
4059 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4060 				     bool from_hypervisor)
4061 {
4062 	int r;
4063 
4064 	if (from_hypervisor)
4065 		r = amdgpu_virt_request_full_gpu(adev, true);
4066 	else
4067 		r = amdgpu_virt_reset_gpu(adev);
4068 	if (r)
4069 		return r;
4070 
4071 	amdgpu_amdkfd_pre_reset(adev);
4072 
4073 	/* Resume IP prior to SMC */
4074 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4075 	if (r)
4076 		goto error;
4077 
4078 	amdgpu_virt_init_data_exchange(adev);
4079 	/* we need recover gart prior to run SMC/CP/SDMA resume */
4080 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4081 
4082 	r = amdgpu_device_fw_loading(adev);
4083 	if (r)
4084 		return r;
4085 
4086 	/* now we are okay to resume SMC/CP/SDMA */
4087 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4088 	if (r)
4089 		goto error;
4090 
4091 	amdgpu_irq_gpu_reset_resume_helper(adev);
4092 	r = amdgpu_ib_ring_tests(adev);
4093 	amdgpu_amdkfd_post_reset(adev);
4094 
4095 error:
4096 	amdgpu_virt_release_full_gpu(adev, true);
4097 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4098 		amdgpu_inc_vram_lost(adev);
4099 		r = amdgpu_device_recover_vram(adev);
4100 	}
4101 
4102 	return r;
4103 }
4104 
4105 /**
4106  * amdgpu_device_has_job_running - check if there is any job in mirror list
4107  *
4108  * @adev: amdgpu_device pointer
4109  *
4110  * check if there is any job in mirror list
4111  */
4112 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4113 {
4114 	int i;
4115 	struct drm_sched_job *job;
4116 
4117 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4118 		struct amdgpu_ring *ring = adev->rings[i];
4119 
4120 		if (!ring || !ring->sched.thread)
4121 			continue;
4122 
4123 		spin_lock(&ring->sched.job_list_lock);
4124 		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4125 				struct drm_sched_job, node);
4126 		spin_unlock(&ring->sched.job_list_lock);
4127 		if (job)
4128 			return true;
4129 	}
4130 	return false;
4131 }
4132 
4133 /**
4134  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4135  *
4136  * @adev: amdgpu_device pointer
4137  *
4138  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4139  * a hung GPU.
4140  */
4141 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4142 {
4143 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4144 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4145 		return false;
4146 	}
4147 
4148 	if (amdgpu_gpu_recovery == 0)
4149 		goto disabled;
4150 
4151 	if (amdgpu_sriov_vf(adev))
4152 		return true;
4153 
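	/*
	 * amdgpu_gpu_recovery == -1 means "auto": recovery defaults to enabled
	 * only on the ASICs listed below and stays disabled on everything else.
	 */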
4154 	if (amdgpu_gpu_recovery == -1) {
4155 		switch (adev->asic_type) {
4156 		case CHIP_BONAIRE:
4157 		case CHIP_HAWAII:
4158 		case CHIP_TOPAZ:
4159 		case CHIP_TONGA:
4160 		case CHIP_FIJI:
4161 		case CHIP_POLARIS10:
4162 		case CHIP_POLARIS11:
4163 		case CHIP_POLARIS12:
4164 		case CHIP_VEGAM:
4165 		case CHIP_VEGA20:
4166 		case CHIP_VEGA10:
4167 		case CHIP_VEGA12:
4168 		case CHIP_RAVEN:
4169 		case CHIP_ARCTURUS:
4170 		case CHIP_RENOIR:
4171 		case CHIP_NAVI10:
4172 		case CHIP_NAVI14:
4173 		case CHIP_NAVI12:
4174 		case CHIP_SIENNA_CICHLID:
4175 			break;
4176 		default:
4177 			goto disabled;
4178 		}
4179 	}
4180 
4181 	return true;
4182 
4183 disabled:
4184 		dev_info(adev->dev, "GPU recovery disabled.\n");
4185 		return false;
4186 }
4187 
4188 
4189 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4190 					struct amdgpu_job *job,
4191 					bool *need_full_reset_arg)
4192 {
4193 	int i, r = 0;
4194 	bool need_full_reset  = *need_full_reset_arg;
4195 
4196 	amdgpu_debugfs_wait_dump(adev);
4197 
4198 	if (amdgpu_sriov_vf(adev)) {
4199 		/* stop the data exchange thread */
4200 		amdgpu_virt_fini_data_exchange(adev);
4201 	}
4202 
4203 	/* block all schedulers and reset given job's ring */
4204 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4205 		struct amdgpu_ring *ring = adev->rings[i];
4206 
4207 		if (!ring || !ring->sched.thread)
4208 			continue;
4209 
4210 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4211 		amdgpu_fence_driver_force_completion(ring);
4212 	}
4213 
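	/*
	 * If a specific job triggered this reset, bump its karma so the
	 * scheduler can eventually flag the offending context as guilty
	 * after repeated hangs.
	 */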
4214 	if (job)
4215 		drm_sched_increase_karma(&job->base);
4216 
4217 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4218 	if (!amdgpu_sriov_vf(adev)) {
4219 
4220 		if (!need_full_reset)
4221 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4222 
4223 		if (!need_full_reset) {
4224 			amdgpu_device_ip_pre_soft_reset(adev);
4225 			r = amdgpu_device_ip_soft_reset(adev);
4226 			amdgpu_device_ip_post_soft_reset(adev);
4227 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4228 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4229 				need_full_reset = true;
4230 			}
4231 		}
4232 
4233 		if (need_full_reset)
4234 			r = amdgpu_device_ip_suspend(adev);
4235 
4236 		*need_full_reset_arg = need_full_reset;
4237 	}
4238 
4239 	return r;
4240 }
4241 
4242 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4243 			       struct list_head *device_list_handle,
4244 			       bool *need_full_reset_arg,
4245 			       bool skip_hw_reset)
4246 {
4247 	struct amdgpu_device *tmp_adev = NULL;
4248 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4249 	int r = 0;
4250 
4251 	/*
4252 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4253 	 * to allow proper links negotiation in FW (within 1 sec)
4254 	 */
4255 	if (!skip_hw_reset && need_full_reset) {
4256 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4257 			/* For XGMI run all resets in parallel to speed up the process */
4258 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4259 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4260 					r = -EALREADY;
4261 			} else
4262 				r = amdgpu_asic_reset(tmp_adev);
4263 
4264 			if (r) {
4265 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4266 					 r, adev_to_drm(tmp_adev)->unique);
4267 				break;
4268 			}
4269 		}
4270 
4271 		/* For XGMI wait for all resets to complete before proceed */
4272 		if (!r) {
4273 			list_for_each_entry(tmp_adev, device_list_handle,
4274 					    gmc.xgmi.head) {
4275 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4276 					flush_work(&tmp_adev->xgmi_reset_work);
4277 					r = tmp_adev->asic_reset_res;
4278 					if (r)
4279 						break;
4280 				}
4281 			}
4282 		}
4283 	}
4284 
4285 	if (!r && amdgpu_ras_intr_triggered()) {
4286 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4287 			if (tmp_adev->mmhub.funcs &&
4288 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
4289 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4290 		}
4291 
4292 		amdgpu_ras_intr_cleared();
4293 	}
4294 
4295 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4296 		if (need_full_reset) {
4297 			/* post card */
4298 			if (amdgpu_device_asic_init(tmp_adev))
4299 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4300 
4301 			if (!r) {
4302 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4303 				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4304 				if (r)
4305 					goto out;
4306 
4307 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4308 				if (r)
4309 					goto out;
4310 
4311 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4312 				if (vram_lost) {
4313 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4314 					amdgpu_inc_vram_lost(tmp_adev);
4315 				}
4316 
4317 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4318 				if (r)
4319 					goto out;
4320 
4321 				r = amdgpu_device_fw_loading(tmp_adev);
4322 				if (r)
4323 					return r;
4324 
4325 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4326 				if (r)
4327 					goto out;
4328 
4329 				if (vram_lost)
4330 					amdgpu_device_fill_reset_magic(tmp_adev);
4331 
4332 				/*
4333 				 * Add this ASIC as tracked as reset was already
4334 				 * complete successfully.
4335 				 */
4336 				amdgpu_register_gpu_instance(tmp_adev);
4337 
4338 				r = amdgpu_device_ip_late_init(tmp_adev);
4339 				if (r)
4340 					goto out;
4341 
4342 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4343 
4344 				/*
4345 				 * The GPU enters a bad state once the number of
4346 				 * faulty pages flagged by ECC reaches the
4347 				 * threshold, and RAS recovery is scheduled next.
4348 				 * So check here and break recovery if the bad
4349 				 * page threshold has indeed been exceeded, and
4350 				 * remind the user to either retire this GPU or
4351 				 * set a larger bad_page_threshold before probing
4352 				 * the driver again.
4353 				 */
4354 				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4355 					/* must succeed. */
4356 					amdgpu_ras_resume(tmp_adev);
4357 				} else {
4358 					r = -EINVAL;
4359 					goto out;
4360 				}
4361 
4362 				/* Update PSP FW topology after reset */
4363 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4364 					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4365 			}
4366 		}
4367 
4368 out:
4369 		if (!r) {
4370 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4371 			r = amdgpu_ib_ring_tests(tmp_adev);
4372 			if (r) {
4373 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4374 				need_full_reset = true;
4375 				r = -EAGAIN;
4376 				goto end;
4377 			}
4378 		}
4379 
4380 		if (!r)
4381 			r = amdgpu_device_recover_vram(tmp_adev);
4382 		else
4383 			tmp_adev->asic_reset_res = r;
4384 	}
4385 
4386 end:
4387 	*need_full_reset_arg = need_full_reset;
4388 	return r;
4389 }
4390 
4391 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4392 				struct amdgpu_hive_info *hive)
4393 {
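	/*
	 * Only one reset may own the device at a time: the cmpxchg of
	 * in_gpu_reset from 0 to 1 acts as a trylock, so a concurrent
	 * reset attempt simply bails out here.
	 */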
4394 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4395 		return false;
4396 
4397 	if (hive) {
4398 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4399 	} else {
4400 		down_write(&adev->reset_sem);
4401 	}
4402 
4403 	atomic_inc(&adev->gpu_reset_counter);
4404 	switch (amdgpu_asic_reset_method(adev)) {
4405 	case AMD_RESET_METHOD_MODE1:
4406 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4407 		break;
4408 	case AMD_RESET_METHOD_MODE2:
4409 		adev->mp1_state = PP_MP1_STATE_RESET;
4410 		break;
4411 	default:
4412 		adev->mp1_state = PP_MP1_STATE_NONE;
4413 		break;
4414 	}
4415 
4416 	return true;
4417 }
4418 
4419 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4420 {
4421 	amdgpu_vf_error_trans_all(adev);
4422 	adev->mp1_state = PP_MP1_STATE_NONE;
4423 	atomic_set(&adev->in_gpu_reset, 0);
4424 	up_write(&adev->reset_sem);
4425 }
4426 
4427 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4428 {
4429 	struct pci_dev *p = NULL;
4430 
4431 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4432 			adev->pdev->bus->number, 1);
4433 	if (p) {
4434 		pm_runtime_enable(&(p->dev));
4435 		pm_runtime_resume(&(p->dev));
4436 	}
4437 }
4438 
4439 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4440 {
4441 	enum amd_reset_method reset_method;
4442 	struct pci_dev *p = NULL;
4443 	u64 expires;
4444 
4445 	/*
4446 	 * For now, only BACO and mode1 reset are confirmed to
4447 	 * suffer from the audio issue if not properly suspended.
4448 	 */
4449 	reset_method = amdgpu_asic_reset_method(adev);
4450 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4451 	     (reset_method != AMD_RESET_METHOD_MODE1))
4452 		return -EINVAL;
4453 
4454 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4455 			adev->pdev->bus->number, 1);
4456 	if (!p)
4457 		return -ENODEV;
4458 
4459 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4460 	if (!expires)
4461 		/*
4462 		 * If we cannot get the audio device autosuspend delay,
4463 		 * fall back to a fixed 4 second interval. Since the audio
4464 		 * controller's default autosuspend delay is 3 seconds,
4465 		 * 4 seconds is guaranteed to cover it.
4466 		 */
4467 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4468 
4469 	while (!pm_runtime_status_suspended(&(p->dev))) {
4470 		if (!pm_runtime_suspend(&(p->dev)))
4471 			break;
4472 
4473 		if (expires < ktime_get_mono_fast_ns()) {
4474 			dev_warn(adev->dev, "failed to suspend display audio\n");
4475 			/* TODO: abort the succeeding gpu reset? */
4476 			return -ETIMEDOUT;
4477 		}
4478 	}
4479 
4480 	pm_runtime_disable(&(p->dev));
4481 
4482 	return 0;
4483 }
4484 
4485 /**
4486  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4487  *
4488  * @adev: amdgpu_device pointer
4489  * @job: which job trigger hang
4490  *
4491  * Attempt to reset the GPU if it has hung (all asics).
4492  * Attempt to do soft-reset or full-reset and reinitialize Asic
4493  * Returns 0 for success or an error on failure.
4494  */
4495 
4496 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4497 			      struct amdgpu_job *job)
4498 {
4499 	struct list_head device_list, *device_list_handle =  NULL;
4500 	bool need_full_reset = false;
4501 	bool job_signaled = false;
4502 	struct amdgpu_hive_info *hive = NULL;
4503 	struct amdgpu_device *tmp_adev = NULL;
4504 	int i, r = 0;
4505 	bool need_emergency_restart = false;
4506 	bool audio_suspended = false;
4507 
4508 	/*
4509 	 * Special case: RAS triggered and full reset isn't supported
4510 	 */
4511 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4512 
4513 	/*
4514 	 * Flush RAM to disk so that after reboot
4515 	 * the user can read log and see why the system rebooted.
4516 	 */
4517 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4518 		DRM_WARN("Emergency reboot.");
4519 
4520 		ksys_sync_helper();
4521 		emergency_restart();
4522 	}
4523 
4524 	dev_info(adev->dev, "GPU %s begin!\n",
4525 		need_emergency_restart ? "jobs stop":"reset");
4526 
4527 	/*
4528 	 * Here we trylock to avoid a chain of resets executing from
4529 	 * either jobs triggered on different adevs in an XGMI hive or jobs on
4530 	 * different schedulers for the same device while this TO handler is running.
4531 	 * We always reset all schedulers for a device and all devices in an XGMI
4532 	 * hive, so that should take care of them too.
4533 	 */
4534 	hive = amdgpu_get_xgmi_hive(adev);
4535 	if (hive) {
4536 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4537 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4538 				job ? job->base.id : -1, hive->hive_id);
4539 			amdgpu_put_xgmi_hive(hive);
4540 			return 0;
4541 		}
4542 		mutex_lock(&hive->hive_lock);
4543 	}
4544 
4545 	/*
4546 	 * Build list of devices to reset.
4547 	 * In case we are in XGMI hive mode, resort the device list
4548 	 * to put adev in the 1st position.
4549 	 */
4550 	INIT_LIST_HEAD(&device_list);
4551 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4552 		if (!hive)
4553 			return -ENODEV;
4554 		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4555 			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4556 		device_list_handle = &hive->device_list;
4557 	} else {
4558 		list_add_tail(&adev->gmc.xgmi.head, &device_list);
4559 		device_list_handle = &device_list;
4560 	}
4561 
4562 	/* block all schedulers and reset given job's ring */
4563 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4564 		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4565 			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4566 				  job ? job->base.id : -1);
4567 			r = 0;
4568 			goto skip_recovery;
4569 		}
4570 
4571 		/*
4572 		 * Try to put the audio codec into suspend state
4573 		 * before gpu reset started.
4574 		 *
4575 		 * The power domain of the graphics device is shared
4576 		 * with the AZ (audio) power domain. Without this,
4577 		 * we may touch the audio hardware from behind
4578 		 * the audio driver's back, which will trigger
4579 		 * some audio codec errors.
4580 		 */
4581 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4582 			audio_suspended = true;
4583 
4584 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4585 
4586 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4587 
4588 		if (!amdgpu_sriov_vf(tmp_adev))
4589 			amdgpu_amdkfd_pre_reset(tmp_adev);
4590 
4591 		/*
4592 		 * Mark these ASICs to be reset as untracked first,
4593 		 * and add them back after the reset completes.
4594 		 */
4595 		amdgpu_unregister_gpu_instance(tmp_adev);
4596 
4597 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4598 
4599 		/* disable ras on ALL IPs */
4600 		if (!need_emergency_restart &&
4601 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4602 			amdgpu_ras_suspend(tmp_adev);
4603 
4604 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4605 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4606 
4607 			if (!ring || !ring->sched.thread)
4608 				continue;
4609 
4610 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4611 
4612 			if (need_emergency_restart)
4613 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4614 		}
4615 	}
4616 
4617 	if (need_emergency_restart)
4618 		goto skip_sched_resume;
4619 
4620 	/*
4621 	 * Must check guilty signal here since after this point all old
4622 	 * HW fences are force signaled.
4623 	 *
4624 	 * job->base holds a reference to parent fence
4625 	 */
4626 	if (job && job->base.s_fence->parent &&
4627 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4628 		job_signaled = true;
4629 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4630 		goto skip_hw_reset;
4631 	}
4632 
4633 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4634 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4635 		r = amdgpu_device_pre_asic_reset(tmp_adev,
4636 						 (tmp_adev == adev) ? job : NULL,
4637 						 &need_full_reset);
4638 		/*TODO Should we stop ?*/
4639 		if (r) {
4640 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4641 				  r, adev_to_drm(tmp_adev)->unique);
4642 			tmp_adev->asic_reset_res = r;
4643 		}
4644 	}
4645 
4646 	/* Actual ASIC resets if needed.*/
4647 	/* TODO Implement XGMI hive reset logic for SRIOV */
4648 	if (amdgpu_sriov_vf(adev)) {
4649 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4650 		if (r)
4651 			adev->asic_reset_res = r;
4652 	} else {
4653 		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4654 		if (r && r == -EAGAIN)
4655 			goto retry;
4656 	}
4657 
4658 skip_hw_reset:
4659 
4660 	/* Post ASIC reset for all devs .*/
4661 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4662 
4663 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4664 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4665 
4666 			if (!ring || !ring->sched.thread)
4667 				continue;
4668 
4669 			/* No point to resubmit jobs if we didn't HW reset*/
4670 			if (!tmp_adev->asic_reset_res && !job_signaled)
4671 				drm_sched_resubmit_jobs(&ring->sched);
4672 
4673 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4674 		}
4675 
4676 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4677 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4678 		}
4679 
4680 		tmp_adev->asic_reset_res = 0;
4681 
4682 		if (r) {
4683 			/* bad news, how to tell it to userspace ? */
4684 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4685 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4686 		} else {
4687 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4688 		}
4689 	}
4690 
4691 skip_sched_resume:
4692 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4693 		/* unlock kfd: SRIOV would do it separately */
4694 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4695 			amdgpu_amdkfd_post_reset(tmp_adev);
4696 		if (audio_suspended)
4697 			amdgpu_device_resume_display_audio(tmp_adev);
4698 		amdgpu_device_unlock_adev(tmp_adev);
4699 	}
4700 
4701 skip_recovery:
4702 	if (hive) {
4703 		atomic_set(&hive->in_reset, 0);
4704 		mutex_unlock(&hive->hive_lock);
4705 		amdgpu_put_xgmi_hive(hive);
4706 	}
4707 
4708 	if (r)
4709 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4710 	return r;
4711 }
4712 
4713 /**
4714  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4715  *
4716  * @adev: amdgpu_device pointer
4717  *
4718  * Fetches and stores in the driver the PCIE capabilities (gen speed
4719  * and lanes) of the slot the device is in. Handles APUs and
4720  * virtualized environments where PCIE config space may not be available.
4721  */
4722 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4723 {
4724 	struct pci_dev *pdev;
4725 	enum pci_bus_speed speed_cap, platform_speed_cap;
4726 	enum pcie_link_width platform_link_width;
4727 
4728 	if (amdgpu_pcie_gen_cap)
4729 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4730 
4731 	if (amdgpu_pcie_lane_cap)
4732 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4733 
4734 	/* covers APUs as well */
4735 	if (pci_is_root_bus(adev->pdev->bus)) {
4736 		if (adev->pm.pcie_gen_mask == 0)
4737 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4738 		if (adev->pm.pcie_mlw_mask == 0)
4739 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4740 		return;
4741 	}
4742 
4743 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4744 		return;
4745 
4746 	pcie_bandwidth_available(adev->pdev, NULL,
4747 				 &platform_speed_cap, &platform_link_width);
4748 
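	/* Two sources feed the masks below: the "asic" caps describe what the
	 * GPU itself supports, while the "platform" caps reflect what the slot
	 * and upstream links can actually deliver. */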
4749 	if (adev->pm.pcie_gen_mask == 0) {
4750 		/* asic caps */
4751 		pdev = adev->pdev;
4752 		speed_cap = pcie_get_speed_cap(pdev);
4753 		if (speed_cap == PCI_SPEED_UNKNOWN) {
4754 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4755 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4756 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4757 		} else {
4758 			if (speed_cap == PCIE_SPEED_16_0GT)
4759 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4760 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4761 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4762 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4763 			else if (speed_cap == PCIE_SPEED_8_0GT)
4764 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4765 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4766 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4767 			else if (speed_cap == PCIE_SPEED_5_0GT)
4768 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4769 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4770 			else
4771 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4772 		}
4773 		/* platform caps */
4774 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4775 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4776 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4777 		} else {
4778 			if (platform_speed_cap == PCIE_SPEED_16_0GT)
4779 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4780 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4781 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4782 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4783 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4784 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4785 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4786 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4787 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4788 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4789 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4790 			else
4791 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4792 
4793 		}
4794 	}
4795 	if (adev->pm.pcie_mlw_mask == 0) {
4796 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4797 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4798 		} else {
4799 			switch (platform_link_width) {
4800 			case PCIE_LNK_X32:
4801 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4802 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4803 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4804 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4805 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4806 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4807 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4808 				break;
4809 			case PCIE_LNK_X16:
4810 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4811 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4812 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4813 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4814 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4815 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4816 				break;
4817 			case PCIE_LNK_X12:
4818 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4819 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4820 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4821 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4822 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4823 				break;
4824 			case PCIE_LNK_X8:
4825 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4826 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4827 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4828 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4829 				break;
4830 			case PCIE_LNK_X4:
4831 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4832 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4833 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4834 				break;
4835 			case PCIE_LNK_X2:
4836 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4837 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4838 				break;
4839 			case PCIE_LNK_X1:
4840 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4841 				break;
4842 			default:
4843 				break;
4844 			}
4845 		}
4846 	}
4847 }
4848 
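/**
 * amdgpu_device_baco_enter - enter the BACO (Bus Active, Chip Off) state
 *
 * @dev: drm dev pointer
 *
 * Puts the device into the BACO runtime power state.  Doorbell interrupts
 * are disabled first when RAS is supported.
 * Returns 0 on success or a negative error code on failure.
 */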
4849 int amdgpu_device_baco_enter(struct drm_device *dev)
4850 {
4851 	struct amdgpu_device *adev = drm_to_adev(dev);
4852 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4853 
4854 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4855 		return -ENOTSUPP;
4856 
4857 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4858 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4859 
4860 	return amdgpu_dpm_baco_enter(adev);
4861 }
4862 
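/**
 * amdgpu_device_baco_exit - bring the device back out of BACO
 *
 * @dev: drm dev pointer
 *
 * Exits the BACO runtime power state and re-enables doorbell interrupts
 * when RAS is supported.
 * Returns 0 on success or a negative error code on failure.
 */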
4863 int amdgpu_device_baco_exit(struct drm_device *dev)
4864 {
4865 	struct amdgpu_device *adev = drm_to_adev(dev);
4866 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4867 	int ret = 0;
4868 
4869 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4870 		return -ENOTSUPP;
4871 
4872 	ret = amdgpu_dpm_baco_exit(adev);
4873 	if (ret)
4874 		return ret;
4875 
4876 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4877 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4878 
4879 	return 0;
4880 }
4881 
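/*
 * Cancel the pending timeout (TDR) work on every ring's scheduler and wait
 * for any handler that is already running to finish.
 */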
4882 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4883 {
4884 	int i;
4885 
4886 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4887 		struct amdgpu_ring *ring = adev->rings[i];
4888 
4889 		if (!ring || !ring->sched.thread)
4890 			continue;
4891 
4892 		cancel_delayed_work_sync(&ring->sched.work_tdr);
4893 	}
4894 }
4895 
4896 /**
4897  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4898  * @pdev: PCI device struct
4899  * @state: PCI channel state
4900  *
4901  * Description: Called when a PCI error is detected.
4902  *
4903  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4904  */
4905 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4906 {
4907 	struct drm_device *dev = pci_get_drvdata(pdev);
4908 	struct amdgpu_device *adev = drm_to_adev(dev);
4909 	int i;
4910 
4911 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4912 
4913 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4914 		DRM_WARN("No support for XGMI hive yet...");
4915 		return PCI_ERS_RESULT_DISCONNECT;
4916 	}
4917 
4918 	switch (state) {
4919 	case pci_channel_io_normal:
4920 		return PCI_ERS_RESULT_CAN_RECOVER;
4921 	/* Fatal error, prepare for slot reset */
4922 	case pci_channel_io_frozen:
4923 		/*
4924 		 * Cancel and wait for all TDRs in progress if failing to
4925 		 * set  adev->in_gpu_reset in amdgpu_device_lock_adev
4926 		 *
4927 		 * Locking adev->reset_sem will prevent any external access
4928 		 * to GPU during PCI error recovery
4929 		 */
4930 		while (!amdgpu_device_lock_adev(adev, NULL))
4931 			amdgpu_cancel_all_tdr(adev);
4932 
4933 		/*
4934 		 * Block any work scheduling as we do for regular GPU reset
4935 		 * for the duration of the recovery
4936 		 */
4937 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4938 			struct amdgpu_ring *ring = adev->rings[i];
4939 
4940 			if (!ring || !ring->sched.thread)
4941 				continue;
4942 
4943 			drm_sched_stop(&ring->sched, NULL);
4944 		}
4945 		return PCI_ERS_RESULT_NEED_RESET;
4946 	case pci_channel_io_perm_failure:
4947 		/* Permanent error, prepare for device removal */
4948 		return PCI_ERS_RESULT_DISCONNECT;
4949 	}
4950 
4951 	return PCI_ERS_RESULT_NEED_RESET;
4952 }
4953 
4954 /**
4955  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4956  * @pdev: pointer to PCI device
4957  */
4958 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4959 {
4960 
4961 	DRM_INFO("PCI error: mmio enabled callback!!\n");
4962 
4963 	/* TODO - dump whatever for debugging purposes */
4964 
4965 	/* This is called only if amdgpu_pci_error_detected returns
4966 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4967 	 * works, no need to reset slot.
4968 	 */
4969 
4970 	return PCI_ERS_RESULT_RECOVERED;
4971 }
4972 
4973 /**
4974  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4975  * @pdev: PCI device struct
4976  *
4977  * Description: This routine is called by the pci error recovery
4978  * code after the PCI slot has been reset, just before we
4979  * should resume normal operations.
4980  */
4981 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4982 {
4983 	struct drm_device *dev = pci_get_drvdata(pdev);
4984 	struct amdgpu_device *adev = drm_to_adev(dev);
4985 	int r, i;
4986 	bool need_full_reset = true;
4987 	u32 memsize;
4988 	struct list_head device_list;
4989 
4990 	DRM_INFO("PCI error: slot reset callback!!\n");
4991 
4992 	INIT_LIST_HEAD(&device_list);
4993 	list_add_tail(&adev->gmc.xgmi.head, &device_list);
4994 
4995 	/* wait for asic to come out of reset */
4996 	msleep(500);
4997 
4998 	/* Restore PCI confspace */
4999 	amdgpu_device_load_pci_state(pdev);
5000 
5001 	/* confirm  ASIC came out of reset */
5002 	for (i = 0; i < adev->usec_timeout; i++) {
5003 		memsize = amdgpu_asic_get_config_memsize(adev);
5004 
5005 		if (memsize != 0xffffffff)
5006 			break;
5007 		udelay(1);
5008 	}
5009 	if (memsize == 0xffffffff) {
5010 		r = -ETIME;
5011 		goto out;
5012 	}
5013 
5014 	adev->in_pci_err_recovery = true;
5015 	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5016 	adev->in_pci_err_recovery = false;
5017 	if (r)
5018 		goto out;
5019 
5020 	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5021 
5022 out:
5023 	if (!r) {
5024 		if (amdgpu_device_cache_pci_state(adev->pdev))
5025 			pci_restore_state(adev->pdev);
5026 
5027 		DRM_INFO("PCIe error recovery succeeded\n");
5028 	} else {
5029 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5030 		amdgpu_device_unlock_adev(adev);
5031 	}
5032 
5033 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5034 }
5035 
5036 /**
5037  * amdgpu_pci_resume() - resume normal ops after PCI reset
5038  * @pdev: pointer to PCI device
5039  *
5040  * Called when the error recovery driver tells us that it is
5041  * OK to resume normal operation.  Restart the ring schedulers
5042  * so halted jobs can resume.
5043  */
5044 void amdgpu_pci_resume(struct pci_dev *pdev)
5045 {
5046 	struct drm_device *dev = pci_get_drvdata(pdev);
5047 	struct amdgpu_device *adev = drm_to_adev(dev);
5048 	int i;
5049 
5050 
5051 	DRM_INFO("PCI error: resume callback!!\n");
5052 
5053 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5054 		struct amdgpu_ring *ring = adev->rings[i];
5055 
5056 		if (!ring || !ring->sched.thread)
5057 			continue;
5058 
5059 
5060 		drm_sched_resubmit_jobs(&ring->sched);
5061 		drm_sched_start(&ring->sched, true);
5062 	}
5063 
5064 	amdgpu_device_unlock_adev(adev);
5065 }
5066 
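/**
 * amdgpu_device_cache_pci_state - cache the PCI config space
 *
 * @pdev: PCI device struct
 *
 * Saves the current PCI config space and stashes a kernel copy in
 * adev->pci_state so it can be restored after a sudden PCI error.
 * Returns true on success, false otherwise.
 */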
5067 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5068 {
5069 	struct drm_device *dev = pci_get_drvdata(pdev);
5070 	struct amdgpu_device *adev = drm_to_adev(dev);
5071 	int r;
5072 
5073 	r = pci_save_state(pdev);
5074 	if (!r) {
5075 		kfree(adev->pci_state);
5076 
5077 		adev->pci_state = pci_store_saved_state(pdev);
5078 
5079 		if (!adev->pci_state) {
5080 			DRM_ERROR("Failed to store PCI saved state");
5081 			return false;
5082 		}
5083 	} else {
5084 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5085 		return false;
5086 	}
5087 
5088 	return true;
5089 }
5090 
5091 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5092 {
5093 	struct drm_device *dev = pci_get_drvdata(pdev);
5094 	struct amdgpu_device *adev = drm_to_adev(dev);
5095 	int r;
5096 
5097 	if (!adev->pci_state)
5098 		return false;
5099 
5100 	r = pci_load_saved_state(pdev, adev->pci_state);
5101 
5102 	if (!r) {
5103 		pci_restore_state(pdev);
5104 	} else {
5105 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5106 		return false;
5107 	}
5108 
5109 	return true;
5110 }
5111 
5112 
5113