1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60 
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63 
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68 
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72 
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 
84 #define AMDGPU_RESUME_MS		2000
85 
86 const char *amdgpu_asic_name[] = {
87 	"TAHITI",
88 	"PITCAIRN",
89 	"VERDE",
90 	"OLAND",
91 	"HAINAN",
92 	"BONAIRE",
93 	"KAVERI",
94 	"KABINI",
95 	"HAWAII",
96 	"MULLINS",
97 	"TOPAZ",
98 	"TONGA",
99 	"FIJI",
100 	"CARRIZO",
101 	"STONEY",
102 	"POLARIS10",
103 	"POLARIS11",
104 	"POLARIS12",
105 	"VEGAM",
106 	"VEGA10",
107 	"VEGA12",
108 	"VEGA20",
109 	"RAVEN",
110 	"ARCTURUS",
111 	"RENOIR",
112 	"NAVI10",
113 	"NAVI14",
114 	"NAVI12",
115 	"SIENNA_CICHLID",
116 	"NAVY_FLOUNDER",
117 	"LAST",
118 };
119 
120 /**
121  * DOC: pcie_replay_count
122  *
123  * The amdgpu driver provides a sysfs API for reporting the total number
124  * of PCIe replays (NAKs).
125  * The file pcie_replay_count is used for this and returns the total
126  * number of replays as a sum of the NAKs generated and the NAKs received.
127  */
128 
129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130 		struct device_attribute *attr, char *buf)
131 {
132 	struct drm_device *ddev = dev_get_drvdata(dev);
133 	struct amdgpu_device *adev = drm_to_adev(ddev);
134 	uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135 
136 	return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138 
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140 		amdgpu_device_get_pcie_replay_count, NULL);
141 
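/*
 * Illustrative usage sketch (not part of the driver): the pcie_replay_count
 * attribute above is read from userspace like any other sysfs file. The
 * card0 path below is an assumption; the actual card index depends on the
 * system.
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 */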
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143 
144 /**
145  * DOC: product_name
146  *
147  * The amdgpu driver provides a sysfs API for reporting the product name
148  * for the device.
149  * The file product_name is used for this and returns the product name
150  * as returned from the FRU.
151  * NOTE: This is only available for certain server cards.
152  */
153 
154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155 		struct device_attribute *attr, char *buf)
156 {
157 	struct drm_device *ddev = dev_get_drvdata(dev);
158 	struct amdgpu_device *adev = drm_to_adev(ddev);
159 
160 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162 
163 static DEVICE_ATTR(product_name, S_IRUGO,
164 		amdgpu_device_get_product_name, NULL);
165 
166 /**
167  * DOC: product_number
168  *
169  * The amdgpu driver provides a sysfs API for reporting the part number
170  * for the device.
171  * The file product_number is used for this and returns the part number
172  * as returned from the FRU.
173  * NOTE: This is only available for certain server cards.
174  */
175 
176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177 		struct device_attribute *attr, char *buf)
178 {
179 	struct drm_device *ddev = dev_get_drvdata(dev);
180 	struct amdgpu_device *adev = drm_to_adev(ddev);
181 
182 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184 
185 static DEVICE_ATTR(product_number, S_IRUGO,
186 		amdgpu_device_get_product_number, NULL);
187 
188 /**
189  * DOC: serial_number
190  *
191  * The amdgpu driver provides a sysfs API for reporting the serial number
192  * for the device.
193  * The file serial_number is used for this and returns the serial number
194  * as returned from the FRU.
195  * NOTE: This is only available for certain server cards.
196  */
197 
198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199 		struct device_attribute *attr, char *buf)
200 {
201 	struct drm_device *ddev = dev_get_drvdata(dev);
202 	struct amdgpu_device *adev = drm_to_adev(ddev);
203 
204 	return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206 
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208 		amdgpu_device_get_serial_number, NULL);
209 
210 /**
211  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212  *
213  * @dev: drm_device pointer
214  *
215  * Returns true if the device is a dGPU with HG/PX power control,
216  * otherwise returns false.
217  */
218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220 	struct amdgpu_device *adev = drm_to_adev(dev);
221 
222 	if (adev->flags & AMD_IS_PX)
223 		return true;
224 	return false;
225 }
226 
227 /**
228  * amdgpu_device_supports_baco - Does the device support BACO
229  *
230  * @dev: drm_device pointer
231  *
232  * Returns true if the device supports BACO,
233  * otherwise returns false.
234  */
235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237 	struct amdgpu_device *adev = drm_to_adev(dev);
238 
239 	return amdgpu_asic_supports_baco(adev);
240 }
241 
242 /*
243  * VRAM access helper functions
244  */
245 
246 /**
247  * amdgpu_device_vram_access - read/write a buffer in vram
248  *
249  * @adev: amdgpu_device pointer
250  * @pos: offset of the buffer in vram
251  * @buf: virtual address of the buffer in system memory
252  * @size: read/write size in bytes; @buf must be at least @size bytes
253  * @write: true - write to vram, otherwise - read from vram
254  */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256 			       uint32_t *buf, size_t size, bool write)
257 {
258 	unsigned long flags;
259 	uint32_t hi = ~0;
260 	uint64_t last;
261 
262 
263 #ifdef CONFIG_64BIT
264 	last = min(pos + size, adev->gmc.visible_vram_size);
265 	if (last > pos) {
266 		void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267 		size_t count = last - pos;
268 
269 		if (write) {
270 			memcpy_toio(addr, buf, count);
271 			mb();
272 			amdgpu_asic_flush_hdp(adev, NULL);
273 		} else {
274 			amdgpu_asic_invalidate_hdp(adev, NULL);
275 			mb();
276 			memcpy_fromio(buf, addr, count);
277 		}
278 
279 		if (count == size)
280 			return;
281 
282 		pos += count;
283 		buf += count / 4;
284 		size -= count;
285 	}
286 #endif
287 
288 	spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289 	for (last = pos + size; pos < last; pos += 4) {
290 		uint32_t tmp = pos >> 31;
291 
292 		WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293 		if (tmp != hi) {
294 			WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295 			hi = tmp;
296 		}
297 		if (write)
298 			WREG32_NO_KIQ(mmMM_DATA, *buf++);
299 		else
300 			*buf++ = RREG32_NO_KIQ(mmMM_DATA);
301 	}
302 	spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
304 
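/*
 * Illustrative usage sketch (not part of the driver): reading the first
 * 16 bytes of VRAM into a dword buffer with the helper above. "adev" is
 * assumed to be a fully initialized device.
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 */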
305 /*
306  * register access helper functions.
307  */
308 /**
309  * amdgpu_device_rreg - read a memory mapped IO or indirect register
310  *
311  * @adev: amdgpu_device pointer
312  * @reg: dword aligned register offset
313  * @acc_flags: access flags which require special behavior
314  *
315  * Returns the 32 bit value from the offset specified.
316  */
317 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
318 			    uint32_t reg, uint32_t acc_flags)
319 {
320 	uint32_t ret;
321 
322 	if (adev->in_pci_err_recovery)
323 		return 0;
324 
325 	if ((reg * 4) < adev->rmmio_size) {
326 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
327 		    amdgpu_sriov_runtime(adev) &&
328 		    down_read_trylock(&adev->reset_sem)) {
329 			ret = amdgpu_kiq_rreg(adev, reg);
330 			up_read(&adev->reset_sem);
331 		} else {
332 			ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
333 		}
334 	} else {
335 		ret = adev->pcie_rreg(adev, reg * 4);
336 	}
337 
338 	trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
339 
340 	return ret;
341 }
342 
343 /*
344  * MMIO register read with byte offset helper function
345  * @offset: byte offset from MMIO start
346  *
347  */
348 
349 /**
350  * amdgpu_mm_rreg8 - read a memory mapped IO register
351  *
352  * @adev: amdgpu_device pointer
353  * @offset: byte aligned register offset
354  *
355  * Returns the 8 bit value from the offset specified.
356  */
357 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
358 {
359 	if (adev->in_pci_err_recovery)
360 		return 0;
361 
362 	if (offset < adev->rmmio_size)
363 		return (readb(adev->rmmio + offset));
364 	BUG();
365 }
366 
367 /*
368  * MMIO register write with byte offset helper function
369  * @offset: byte offset from MMIO start
370  * @value: the value to be written to the register
371  *
372  */
373 /**
374  * amdgpu_mm_wreg8 - write a memory mapped IO register
375  *
376  * @adev: amdgpu_device pointer
377  * @offset: byte aligned register offset
378  * @value: 8 bit value to write
379  *
380  * Writes the value specified to the offset specified.
381  */
382 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
383 {
384 	if (adev->in_pci_err_recovery)
385 		return;
386 
387 	if (offset < adev->rmmio_size)
388 		writeb(value, adev->rmmio + offset);
389 	else
390 		BUG();
391 }
392 
393 /**
394  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
395  *
396  * @adev: amdgpu_device pointer
397  * @reg: dword aligned register offset
398  * @v: 32 bit value to write to the register
399  * @acc_flags: access flags which require special behavior
400  *
401  * Writes the value specified to the offset specified.
402  */
403 void amdgpu_device_wreg(struct amdgpu_device *adev,
404 			uint32_t reg, uint32_t v,
405 			uint32_t acc_flags)
406 {
407 	if (adev->in_pci_err_recovery)
408 		return;
409 
410 	if ((reg * 4) < adev->rmmio_size) {
411 		if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
412 		    amdgpu_sriov_runtime(adev) &&
413 		    down_read_trylock(&adev->reset_sem)) {
414 			amdgpu_kiq_wreg(adev, reg, v);
415 			up_read(&adev->reset_sem);
416 		} else {
417 			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
418 		}
419 	} else {
420 		adev->pcie_wreg(adev, reg * 4, v);
421 	}
422 
423 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
424 }
425 
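/*
 * Illustrative usage sketch (not part of the driver): a read-modify-write of
 * a dword register via the helpers above. mmFOO_CTRL and FOO_CTRL__EN_MASK
 * are hypothetical names, not real register definitions.
 *
 *	uint32_t tmp;
 *
 *	tmp = amdgpu_device_rreg(adev, mmFOO_CTRL, 0);
 *	tmp |= FOO_CTRL__EN_MASK;
 *	amdgpu_device_wreg(adev, mmFOO_CTRL, tmp, 0);
 */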
426 /*
427  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if it is in range
428  *
429  * this function is invoked only for debugfs register access
430  */
431 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
432 			     uint32_t reg, uint32_t v)
433 {
434 	if (adev->in_pci_err_recovery)
435 		return;
436 
437 	if (amdgpu_sriov_fullaccess(adev) &&
438 	    adev->gfx.rlc.funcs &&
439 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
440 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
441 			return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
442 	} else {
443 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
444 	}
445 }
446 
447 /**
448  * amdgpu_io_rreg - read an IO register
449  *
450  * @adev: amdgpu_device pointer
451  * @reg: dword aligned register offset
452  *
453  * Returns the 32 bit value from the offset specified.
454  */
455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
456 {
457 	if (adev->in_pci_err_recovery)
458 		return 0;
459 
460 	if ((reg * 4) < adev->rio_mem_size)
461 		return ioread32(adev->rio_mem + (reg * 4));
462 	else {
463 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
464 		return ioread32(adev->rio_mem + (mmMM_DATA * 4));
465 	}
466 }
467 
468 /**
469  * amdgpu_io_wreg - write to an IO register
470  *
471  * @adev: amdgpu_device pointer
472  * @reg: dword aligned register offset
473  * @v: 32 bit value to write to the register
474  *
475  * Writes the value specified to the offset specified.
476  */
477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
478 {
479 	if (adev->in_pci_err_recovery)
480 		return;
481 
482 	if ((reg * 4) < adev->rio_mem_size)
483 		iowrite32(v, adev->rio_mem + (reg * 4));
484 	else {
485 		iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
486 		iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
487 	}
488 }
489 
490 /**
491  * amdgpu_mm_rdoorbell - read a doorbell dword
492  *
493  * @adev: amdgpu_device pointer
494  * @index: doorbell index
495  *
496  * Returns the value in the doorbell aperture at the
497  * requested doorbell index (CIK).
498  */
499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
500 {
501 	if (adev->in_pci_err_recovery)
502 		return 0;
503 
504 	if (index < adev->doorbell.num_doorbells) {
505 		return readl(adev->doorbell.ptr + index);
506 	} else {
507 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
508 		return 0;
509 	}
510 }
511 
512 /**
513  * amdgpu_mm_wdoorbell - write a doorbell dword
514  *
515  * @adev: amdgpu_device pointer
516  * @index: doorbell index
517  * @v: value to write
518  *
519  * Writes @v to the doorbell aperture at the
520  * requested doorbell index (CIK).
521  */
522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
523 {
524 	if (adev->in_pci_err_recovery)
525 		return;
526 
527 	if (index < adev->doorbell.num_doorbells) {
528 		writel(v, adev->doorbell.ptr + index);
529 	} else {
530 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
531 	}
532 }
533 
534 /**
535  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
536  *
537  * @adev: amdgpu_device pointer
538  * @index: doorbell index
539  *
540  * Returns the value in the doorbell aperture at the
541  * requested doorbell index (VEGA10+).
542  */
543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
544 {
545 	if (adev->in_pci_err_recovery)
546 		return 0;
547 
548 	if (index < adev->doorbell.num_doorbells) {
549 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
550 	} else {
551 		DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
552 		return 0;
553 	}
554 }
555 
556 /**
557  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
558  *
559  * @adev: amdgpu_device pointer
560  * @index: doorbell index
561  * @v: value to write
562  *
563  * Writes @v to the doorbell aperture at the
564  * requested doorbell index (VEGA10+).
565  */
566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
567 {
568 	if (adev->in_pci_err_recovery)
569 		return;
570 
571 	if (index < adev->doorbell.num_doorbells) {
572 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
573 	} else {
574 		DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
575 	}
576 }
577 
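/*
 * Illustrative usage sketch (not part of the driver): a ring could publish
 * its new write pointer through the 64-bit doorbell helper above. "ring" is
 * a hypothetical amdgpu_ring with a valid doorbell_index.
 *
 *	amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 */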
578 /**
579  * amdgpu_device_indirect_rreg - read an indirect register
580  *
581  * @adev: amdgpu_device pointer
582  * @pcie_index: mmio register offset
583  * @pcie_data: mmio register offset
584  *
585  * Returns the value of indirect register @reg_addr
586  */
587 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
588 				u32 pcie_index, u32 pcie_data,
589 				u32 reg_addr)
590 {
591 	unsigned long flags;
592 	u32 r;
593 	void __iomem *pcie_index_offset;
594 	void __iomem *pcie_data_offset;
595 
596 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
597 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
598 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
599 
600 	writel(reg_addr, pcie_index_offset);
601 	readl(pcie_index_offset);
602 	r = readl(pcie_data_offset);
603 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
604 
605 	return r;
606 }
607 
608 /**
609  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
610  *
611  * @adev: amdgpu_device pointer
612  * @pcie_index: mmio register offset
613  * @pcie_data: mmio register offset
614  *
615  * Returns the value of indirect register @reg_addr
616  */
617 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
618 				  u32 pcie_index, u32 pcie_data,
619 				  u32 reg_addr)
620 {
621 	unsigned long flags;
622 	u64 r;
623 	void __iomem *pcie_index_offset;
624 	void __iomem *pcie_data_offset;
625 
626 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
627 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
628 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
629 
630 	/* read low 32 bits */
631 	writel(reg_addr, pcie_index_offset);
632 	readl(pcie_index_offset);
633 	r = readl(pcie_data_offset);
634 	/* read high 32 bits */
635 	writel(reg_addr + 4, pcie_index_offset);
636 	readl(pcie_index_offset);
637 	r |= ((u64)readl(pcie_data_offset) << 32);
638 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
639 
640 	return r;
641 }
642 
643 /**
644  * amdgpu_device_indirect_wreg - write an indirect register address
645  *
646  * @adev: amdgpu_device pointer
647  * @pcie_index: mmio register offset
648  * @pcie_data: mmio register offset
649  * @reg_addr: indirect register offset
650  * @reg_data: indirect register data
651  *
652  */
653 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
654 				 u32 pcie_index, u32 pcie_data,
655 				 u32 reg_addr, u32 reg_data)
656 {
657 	unsigned long flags;
658 	void __iomem *pcie_index_offset;
659 	void __iomem *pcie_data_offset;
660 
661 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
662 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
663 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
664 
665 	writel(reg_addr, pcie_index_offset);
666 	readl(pcie_index_offset);
667 	writel(reg_data, pcie_data_offset);
668 	readl(pcie_data_offset);
669 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
670 }
671 
672 /**
673  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
674  *
675  * @adev: amdgpu_device pointer
676  * @pcie_index: mmio register offset
677  * @pcie_data: mmio register offset
678  * @reg_addr: indirect register offset
679  * @reg_data: indirect register data
680  *
681  */
682 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
683 				   u32 pcie_index, u32 pcie_data,
684 				   u32 reg_addr, u64 reg_data)
685 {
686 	unsigned long flags;
687 	void __iomem *pcie_index_offset;
688 	void __iomem *pcie_data_offset;
689 
690 	spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691 	pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692 	pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693 
694 	/* write low 32 bits */
695 	writel(reg_addr, pcie_index_offset);
696 	readl(pcie_index_offset);
697 	writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
698 	readl(pcie_data_offset);
699 	/* write high 32 bits */
700 	writel(reg_addr + 4, pcie_index_offset);
701 	readl(pcie_index_offset);
702 	writel((u32)(reg_data >> 32), pcie_data_offset);
703 	readl(pcie_data_offset);
704 	spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705 }
706 
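/*
 * Illustrative usage sketch (not part of the driver): an asic-specific PCIE
 * register callback could simply wrap the indirect helpers above. The
 * mmEXAMPLE_INDEX2/mmEXAMPLE_DATA2 offsets are placeholders; real values
 * come from the asic register headers.
 *
 *	static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *	{
 *		return amdgpu_device_indirect_rreg(adev, mmEXAMPLE_INDEX2,
 *						   mmEXAMPLE_DATA2, reg);
 *	}
 */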
707 /**
708  * amdgpu_invalid_rreg - dummy reg read function
709  *
710  * @adev: amdgpu_device pointer
711  * @reg: offset of register
712  *
713  * Dummy register read function.  Used for register blocks
714  * that certain asics don't have (all asics).
715  * Returns the value in the register.
716  */
717 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
718 {
719 	DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
720 	BUG();
721 	return 0;
722 }
723 
724 /**
725  * amdgpu_invalid_wreg - dummy reg write function
726  *
727  * @adev: amdgpu_device pointer
728  * @reg: offset of register
729  * @v: value to write to the register
730  *
731  * Dummy register write function.  Used for register blocks
732  * that certain asics don't have (all asics).
733  */
734 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
735 {
736 	DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
737 		  reg, v);
738 	BUG();
739 }
740 
741 /**
742  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
743  *
744  * @adev: amdgpu_device pointer
745  * @reg: offset of register
746  *
747  * Dummy register read function.  Used for register blocks
748  * that certain asics don't have (all asics).
749  * Returns the value in the register.
750  */
751 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
752 {
753 	DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
754 	BUG();
755 	return 0;
756 }
757 
758 /**
759  * amdgpu_invalid_wreg64 - dummy reg write function
760  *
761  * @adev: amdgpu_device pointer
762  * @reg: offset of register
763  * @v: value to write to the register
764  *
765  * Dummy register write function.  Used for register blocks
766  * that certain asics don't have (all asics).
767  */
768 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
769 {
770 	DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
771 		  reg, v);
772 	BUG();
773 }
774 
775 /**
776  * amdgpu_block_invalid_rreg - dummy reg read function
777  *
778  * @adev: amdgpu_device pointer
779  * @block: offset of instance
780  * @reg: offset of register
781  *
782  * Dummy register read function.  Used for register blocks
783  * that certain asics don't have (all asics).
784  * Returns the value in the register.
785  */
786 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
787 					  uint32_t block, uint32_t reg)
788 {
789 	DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
790 		  reg, block);
791 	BUG();
792 	return 0;
793 }
794 
795 /**
796  * amdgpu_block_invalid_wreg - dummy reg write function
797  *
798  * @adev: amdgpu_device pointer
799  * @block: offset of instance
800  * @reg: offset of register
801  * @v: value to write to the register
802  *
803  * Dummy register write function.  Used for register blocks
804  * that certain asics don't have (all asics).
805  */
806 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
807 				      uint32_t block,
808 				      uint32_t reg, uint32_t v)
809 {
810 	DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
811 		  reg, block, v);
812 	BUG();
813 }
814 
815 /**
816  * amdgpu_device_asic_init - Wrapper for atom asic_init
817  *
818  * @adev: amdgpu_device pointer
819  *
820  * Does any asic specific work and then calls atom asic init.
821  */
822 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
823 {
824 	amdgpu_asic_pre_asic_init(adev);
825 
826 	return amdgpu_atom_asic_init(adev->mode_info.atom_context);
827 }
828 
829 /**
830  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
831  *
832  * @adev: amdgpu_device pointer
833  *
834  * Allocates a scratch page of VRAM for use by various things in the
835  * driver.
836  */
837 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
838 {
839 	return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
840 				       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
841 				       &adev->vram_scratch.robj,
842 				       &adev->vram_scratch.gpu_addr,
843 				       (void **)&adev->vram_scratch.ptr);
844 }
845 
846 /**
847  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
848  *
849  * @adev: amdgpu_device pointer
850  *
851  * Frees the VRAM scratch page.
852  */
853 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
854 {
855 	amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
856 }
857 
858 /**
859  * amdgpu_device_program_register_sequence - program an array of registers.
860  *
861  * @adev: amdgpu_device pointer
862  * @registers: pointer to the register array
863  * @array_size: size of the register array
864  *
865  * Programs an array of registers with AND and OR masks.
866  * This is a helper for setting golden registers.
867  */
868 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
869 					     const u32 *registers,
870 					     const u32 array_size)
871 {
872 	u32 tmp, reg, and_mask, or_mask;
873 	int i;
874 
875 	if (array_size % 3)
876 		return;
877 
878 	for (i = 0; i < array_size; i += 3) {
879 		reg = registers[i + 0];
880 		and_mask = registers[i + 1];
881 		or_mask = registers[i + 2];
882 
883 		if (and_mask == 0xffffffff) {
884 			tmp = or_mask;
885 		} else {
886 			tmp = RREG32(reg);
887 			tmp &= ~and_mask;
888 			if (adev->family >= AMDGPU_FAMILY_AI)
889 				tmp |= (or_mask & and_mask);
890 			else
891 				tmp |= or_mask;
892 		}
893 		WREG32(reg, tmp);
894 	}
895 }
896 
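/*
 * Illustrative usage sketch (not part of the driver): golden register lists
 * are flat arrays of {reg, and_mask, or_mask} triplets. mmFOO_CTRL and the
 * mask values below are made up for illustration.
 *
 *	static const u32 example_golden_settings[] = {
 *		mmFOO_CTRL, 0xffffff0f, 0x00000030,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */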
897 /**
898  * amdgpu_device_pci_config_reset - reset the GPU
899  *
900  * @adev: amdgpu_device pointer
901  *
902  * Resets the GPU using the pci config reset sequence.
903  * Only applicable to asics prior to vega10.
904  */
905 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
906 {
907 	pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
908 }
909 
910 /*
911  * GPU doorbell aperture helper functions.
912  */
913 /**
914  * amdgpu_device_doorbell_init - Init doorbell driver information.
915  *
916  * @adev: amdgpu_device pointer
917  *
918  * Init doorbell driver information (CIK)
919  * Returns 0 on success, error on failure.
920  */
921 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
922 {
923 
924 	/* No doorbell on SI hardware generation */
925 	if (adev->asic_type < CHIP_BONAIRE) {
926 		adev->doorbell.base = 0;
927 		adev->doorbell.size = 0;
928 		adev->doorbell.num_doorbells = 0;
929 		adev->doorbell.ptr = NULL;
930 		return 0;
931 	}
932 
933 	if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
934 		return -EINVAL;
935 
936 	amdgpu_asic_init_doorbell_index(adev);
937 
938 	/* doorbell bar mapping */
939 	adev->doorbell.base = pci_resource_start(adev->pdev, 2);
940 	adev->doorbell.size = pci_resource_len(adev->pdev, 2);
941 
942 	adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
943 					     adev->doorbell_index.max_assignment+1);
944 	if (adev->doorbell.num_doorbells == 0)
945 		return -EINVAL;
946 
947 	/* For Vega, reserve and map two pages on the doorbell BAR since the SDMA
948 	 * paging queue doorbell uses the second page. The
949 	 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
950 	 * doorbells are in the first page. So with the paging queue enabled,
951 	 * the max num_doorbells should be increased by 1 page (0x400 in dwords).
952 	 */
953 	if (adev->asic_type >= CHIP_VEGA10)
954 		adev->doorbell.num_doorbells += 0x400;
955 
956 	adev->doorbell.ptr = ioremap(adev->doorbell.base,
957 				     adev->doorbell.num_doorbells *
958 				     sizeof(u32));
959 	if (adev->doorbell.ptr == NULL)
960 		return -ENOMEM;
961 
962 	return 0;
963 }
964 
965 /**
966  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
967  *
968  * @adev: amdgpu_device pointer
969  *
970  * Tear down doorbell driver information (CIK)
971  */
972 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
973 {
974 	iounmap(adev->doorbell.ptr);
975 	adev->doorbell.ptr = NULL;
976 }
977 
978 
979 
980 /*
981  * amdgpu_device_wb_*()
982  * Writeback is the method by which the GPU updates special pages in memory
983  * with the status of certain GPU events (fences, ring pointers, etc.).
984  */
985 
986 /**
987  * amdgpu_device_wb_fini - Disable Writeback and free memory
988  *
989  * @adev: amdgpu_device pointer
990  *
991  * Disables Writeback and frees the Writeback memory (all asics).
992  * Used at driver shutdown.
993  */
994 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
995 {
996 	if (adev->wb.wb_obj) {
997 		amdgpu_bo_free_kernel(&adev->wb.wb_obj,
998 				      &adev->wb.gpu_addr,
999 				      (void **)&adev->wb.wb);
1000 		adev->wb.wb_obj = NULL;
1001 	}
1002 }
1003 
1004 /**
1005  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1006  *
1007  * @adev: amdgpu_device pointer
1008  *
1009  * Initializes writeback and allocates writeback memory (all asics).
1010  * Used at driver startup.
1011  * Returns 0 on success or a negative error code on failure.
1012  */
1013 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1014 {
1015 	int r;
1016 
1017 	if (adev->wb.wb_obj == NULL) {
1018 		/* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1019 		r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1020 					    PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1021 					    &adev->wb.wb_obj, &adev->wb.gpu_addr,
1022 					    (void **)&adev->wb.wb);
1023 		if (r) {
1024 			dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1025 			return r;
1026 		}
1027 
1028 		adev->wb.num_wb = AMDGPU_MAX_WB;
1029 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1030 
1031 		/* clear wb memory */
1032 		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1033 	}
1034 
1035 	return 0;
1036 }
1037 
1038 /**
1039  * amdgpu_device_wb_get - Allocate a wb entry
1040  *
1041  * @adev: amdgpu_device pointer
1042  * @wb: wb index
1043  *
1044  * Allocate a wb slot for use by the driver (all asics).
1045  * Returns 0 on success or -EINVAL on failure.
1046  */
1047 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1048 {
1049 	unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1050 
1051 	if (offset < adev->wb.num_wb) {
1052 		__set_bit(offset, adev->wb.used);
1053 		*wb = offset << 3; /* convert to dw offset */
1054 		return 0;
1055 	} else {
1056 		return -EINVAL;
1057 	}
1058 }
1059 
1060 /**
1061  * amdgpu_device_wb_free - Free a wb entry
1062  *
1063  * @adev: amdgpu_device pointer
1064  * @wb: wb index
1065  *
1066  * Free a wb slot allocated for use by the driver (all asics)
1067  */
1068 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1069 {
1070 	wb >>= 3;
1071 	if (wb < adev->wb.num_wb)
1072 		__clear_bit(wb, adev->wb.used);
1073 }
1074 
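/*
 * Illustrative usage sketch (not part of the driver): allocating a writeback
 * slot, deriving its GPU address, and releasing it again. Error handling is
 * trimmed for brevity.
 *
 *	u32 wb;
 *	u64 gpu_addr;
 *
 *	if (amdgpu_device_wb_get(adev, &wb))
 *		return -EINVAL;
 *	gpu_addr = adev->wb.gpu_addr + wb * 4;
 *	... hand gpu_addr to the hw, read the value back via adev->wb.wb[wb] ...
 *	amdgpu_device_wb_free(adev, wb);
 */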
1075 /**
1076  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1077  *
1078  * @adev: amdgpu_device pointer
1079  *
1080  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1081  * to fail, but if any of the BARs is not accessible after the resize we abort
1082  * driver loading by returning -ENODEV.
1083  */
1084 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1085 {
1086 	u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1087 	u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1088 	struct pci_bus *root;
1089 	struct resource *res;
1090 	unsigned i;
1091 	u16 cmd;
1092 	int r;
1093 
1094 	/* Bypass for VF */
1095 	if (amdgpu_sriov_vf(adev))
1096 		return 0;
1097 
1098 	/* skip if the bios has already enabled large BAR */
1099 	if (adev->gmc.real_vram_size &&
1100 	    (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1101 		return 0;
1102 
1103 	/* Check if the root BUS has 64bit memory resources */
1104 	root = adev->pdev->bus;
1105 	while (root->parent)
1106 		root = root->parent;
1107 
1108 	pci_bus_for_each_resource(root, res, i) {
1109 		if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1110 		    res->start > 0x100000000ull)
1111 			break;
1112 	}
1113 
1114 	/* Trying to resize is pointless without a root hub window above 4GB */
1115 	if (!res)
1116 		return 0;
1117 
1118 	/* Disable memory decoding while we change the BAR addresses and size */
1119 	pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1120 	pci_write_config_word(adev->pdev, PCI_COMMAND,
1121 			      cmd & ~PCI_COMMAND_MEMORY);
1122 
1123 	/* Free the VRAM and doorbell BAR, we most likely need to move both. */
1124 	amdgpu_device_doorbell_fini(adev);
1125 	if (adev->asic_type >= CHIP_BONAIRE)
1126 		pci_release_resource(adev->pdev, 2);
1127 
1128 	pci_release_resource(adev->pdev, 0);
1129 
1130 	r = pci_resize_resource(adev->pdev, 0, rbar_size);
1131 	if (r == -ENOSPC)
1132 		DRM_INFO("Not enough PCI address space for a large BAR.");
1133 	else if (r && r != -ENOTSUPP)
1134 		DRM_ERROR("Problem resizing BAR0 (%d).", r);
1135 
1136 	pci_assign_unassigned_bus_resources(adev->pdev->bus);
1137 
1138 	/* When the doorbell or fb BAR isn't available we have no chance of
1139 	 * using the device.
1140 	 */
1141 	r = amdgpu_device_doorbell_init(adev);
1142 	if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1143 		return -ENODEV;
1144 
1145 	pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1146 
1147 	return 0;
1148 }
1149 
1150 /*
1151  * GPU helper functions.
1152  */
1153 /**
1154  * amdgpu_device_need_post - check if the hw needs to be posted or not
1155  *
1156  * @adev: amdgpu_device pointer
1157  *
1158  * Check if the asic has been initialized (all asics) at driver startup
1159  * or if a post is needed because a hw reset was performed.
1160  * Returns true if a post is needed, false if not.
1161  */
1162 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1163 {
1164 	uint32_t reg;
1165 
1166 	if (amdgpu_sriov_vf(adev))
1167 		return false;
1168 
1169 	if (amdgpu_passthrough(adev)) {
1170 		/* For FIJI: in the whole-GPU pass-through virtualization case, after a VM reboot
1171 		 * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU
1172 		 * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force a
1173 		 * vPost to be executed for SMC versions below 22.15.
1174 		 */
1175 		if (adev->asic_type == CHIP_FIJI) {
1176 			int err;
1177 			uint32_t fw_ver;
1178 			err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1179 			/* force vPost if an error occurred */
1180 			if (err)
1181 				return true;
1182 
1183 			fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1184 			if (fw_ver < 0x00160e00)
1185 				return true;
1186 		}
1187 	}
1188 
1189 	if (adev->has_hw_reset) {
1190 		adev->has_hw_reset = false;
1191 		return true;
1192 	}
1193 
1194 	/* bios scratch used on CIK+ */
1195 	if (adev->asic_type >= CHIP_BONAIRE)
1196 		return amdgpu_atombios_scratch_need_asic_init(adev);
1197 
1198 	/* check MEM_SIZE for older asics */
1199 	reg = amdgpu_asic_get_config_memsize(adev);
1200 
1201 	if ((reg != 0) && (reg != 0xffffffff))
1202 		return false;
1203 
1204 	return true;
1205 }
1206 
1207 /* if we get transitioned to only one device, take VGA back */
1208 /**
1209  * amdgpu_device_vga_set_decode - enable/disable vga decode
1210  *
1211  * @cookie: amdgpu_device pointer
1212  * @state: enable/disable vga decode
1213  *
1214  * Enable/disable vga decode (all asics).
1215  * Returns VGA resource flags.
1216  */
1217 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1218 {
1219 	struct amdgpu_device *adev = cookie;
1220 	amdgpu_asic_set_vga_state(adev, state);
1221 	if (state)
1222 		return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1223 		       VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1224 	else
1225 		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1226 }
1227 
1228 /**
1229  * amdgpu_device_check_block_size - validate the vm block size
1230  *
1231  * @adev: amdgpu_device pointer
1232  *
1233  * Validates the vm block size specified via module parameter.
1234  * The vm block size defines the number of bits in the page table versus the page directory.
1235  * A page is 4KB, so we have a 12-bit offset, a minimum of 9 bits in the
1236  * page table, and the remaining bits in the page directory.
1237  */
1238 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1239 {
1240 	/* defines number of bits in page table versus page directory,
1241 	 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1242 	 * page table and the remaining bits are in the page directory */
1243 	if (amdgpu_vm_block_size == -1)
1244 		return;
1245 
1246 	if (amdgpu_vm_block_size < 9) {
1247 		dev_warn(adev->dev, "VM page table size (%d) too small\n",
1248 			 amdgpu_vm_block_size);
1249 		amdgpu_vm_block_size = -1;
1250 	}
1251 }
1252 
1253 /**
1254  * amdgpu_device_check_vm_size - validate the vm size
1255  *
1256  * @adev: amdgpu_device pointer
1257  *
1258  * Validates the vm size in GB specified via module parameter.
1259  * The VM size is the size of the GPU virtual memory space in GB.
1260  */
1261 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1262 {
1263 	/* no need to check the default value */
1264 	if (amdgpu_vm_size == -1)
1265 		return;
1266 
1267 	if (amdgpu_vm_size < 1) {
1268 		dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1269 			 amdgpu_vm_size);
1270 		amdgpu_vm_size = -1;
1271 	}
1272 }
1273 
1274 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1275 {
1276 	struct sysinfo si;
1277 	bool is_os_64 = (sizeof(void *) == 8);
1278 	uint64_t total_memory;
1279 	uint64_t dram_size_seven_GB = 0x1B8000000;
1280 	uint64_t dram_size_three_GB = 0xB8000000;
1281 
1282 	if (amdgpu_smu_memory_pool_size == 0)
1283 		return;
1284 
1285 	if (!is_os_64) {
1286 		DRM_WARN("Not 64-bit OS, feature not supported\n");
1287 		goto def_value;
1288 	}
1289 	si_meminfo(&si);
1290 	total_memory = (uint64_t)si.totalram * si.mem_unit;
1291 
1292 	if ((amdgpu_smu_memory_pool_size == 1) ||
1293 		(amdgpu_smu_memory_pool_size == 2)) {
1294 		if (total_memory < dram_size_three_GB)
1295 			goto def_value1;
1296 	} else if ((amdgpu_smu_memory_pool_size == 4) ||
1297 		(amdgpu_smu_memory_pool_size == 8)) {
1298 		if (total_memory < dram_size_seven_GB)
1299 			goto def_value1;
1300 	} else {
1301 		DRM_WARN("Smu memory pool size not supported\n");
1302 		goto def_value;
1303 	}
1304 	adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1305 
1306 	return;
1307 
1308 def_value1:
1309 	DRM_WARN("No enough system memory\n");
1310 def_value:
1311 	adev->pm.smu_prv_buffer_size = 0;
1312 }
1313 
1314 /**
1315  * amdgpu_device_check_arguments - validate module params
1316  *
1317  * @adev: amdgpu_device pointer
1318  *
1319  * Validates certain module parameters and updates
1320  * the associated values used by the driver (all asics).
1321  */
1322 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1323 {
1324 	if (amdgpu_sched_jobs < 4) {
1325 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1326 			 amdgpu_sched_jobs);
1327 		amdgpu_sched_jobs = 4;
1328 	} else if (!is_power_of_2(amdgpu_sched_jobs)){
1329 		dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1330 			 amdgpu_sched_jobs);
1331 		amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1332 	}
1333 
1334 	if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1335 		/* gart size must be greater or equal to 32M */
1336 		dev_warn(adev->dev, "gart size (%d) too small\n",
1337 			 amdgpu_gart_size);
1338 		amdgpu_gart_size = -1;
1339 	}
1340 
1341 	if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1342 		/* gtt size must be greater or equal to 32M */
1343 		dev_warn(adev->dev, "gtt size (%d) too small\n",
1344 				 amdgpu_gtt_size);
1345 		amdgpu_gtt_size = -1;
1346 	}
1347 
1348 	/* valid range is between 4 and 9 inclusive */
1349 	if (amdgpu_vm_fragment_size != -1 &&
1350 	    (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1351 		dev_warn(adev->dev, "valid range is between 4 and 9\n");
1352 		amdgpu_vm_fragment_size = -1;
1353 	}
1354 
1355 	if (amdgpu_sched_hw_submission < 2) {
1356 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1357 			 amdgpu_sched_hw_submission);
1358 		amdgpu_sched_hw_submission = 2;
1359 	} else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1360 		dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1361 			 amdgpu_sched_hw_submission);
1362 		amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1363 	}
1364 
1365 	amdgpu_device_check_smu_prv_buffer_size(adev);
1366 
1367 	amdgpu_device_check_vm_size(adev);
1368 
1369 	amdgpu_device_check_block_size(adev);
1370 
1371 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1372 
1373 	amdgpu_gmc_tmz_set(adev);
1374 
1375 	if (amdgpu_num_kcq == -1) {
1376 		amdgpu_num_kcq = 8;
1377 	} else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1378 		amdgpu_num_kcq = 8;
1379 		dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1380 	}
1381 
1382 	amdgpu_gmc_noretry_set(adev);
1383 
1384 	return 0;
1385 }
1386 
1387 /**
1388  * amdgpu_switcheroo_set_state - set switcheroo state
1389  *
1390  * @pdev: pci dev pointer
1391  * @state: vga_switcheroo state
1392  *
1393  * Callback for the switcheroo driver.  Suspends or resumes
1394  * the asic before or after it is powered up using ACPI methods.
1395  */
1396 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1397 					enum vga_switcheroo_state state)
1398 {
1399 	struct drm_device *dev = pci_get_drvdata(pdev);
1400 	int r;
1401 
1402 	if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1403 		return;
1404 
1405 	if (state == VGA_SWITCHEROO_ON) {
1406 		pr_info("switched on\n");
1407 		/* don't suspend or resume card normally */
1408 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1409 
1410 		pci_set_power_state(dev->pdev, PCI_D0);
1411 		amdgpu_device_load_pci_state(dev->pdev);
1412 		r = pci_enable_device(dev->pdev);
1413 		if (r)
1414 			DRM_WARN("pci_enable_device failed (%d)\n", r);
1415 		amdgpu_device_resume(dev, true);
1416 
1417 		dev->switch_power_state = DRM_SWITCH_POWER_ON;
1418 		drm_kms_helper_poll_enable(dev);
1419 	} else {
1420 		pr_info("switched off\n");
1421 		drm_kms_helper_poll_disable(dev);
1422 		dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1423 		amdgpu_device_suspend(dev, true);
1424 		amdgpu_device_cache_pci_state(dev->pdev);
1425 		/* Shut down the device */
1426 		pci_disable_device(dev->pdev);
1427 		pci_set_power_state(dev->pdev, PCI_D3cold);
1428 		dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1429 	}
1430 }
1431 
1432 /**
1433  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1434  *
1435  * @pdev: pci dev pointer
1436  *
1437  * Callback for the switcheroo driver.  Check if the switcheroo
1438  * state can be changed.
1439  * Returns true if the state can be changed, false if not.
1440  */
1441 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1442 {
1443 	struct drm_device *dev = pci_get_drvdata(pdev);
1444 
1445 	/*
1446 	* FIXME: open_count is protected by drm_global_mutex but that would lead to
1447 	* locking inversion with the driver load path. And the access here is
1448 	* completely racy anyway. So don't bother with locking for now.
1449 	*/
1450 	return atomic_read(&dev->open_count) == 0;
1451 }
1452 
1453 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1454 	.set_gpu_state = amdgpu_switcheroo_set_state,
1455 	.reprobe = NULL,
1456 	.can_switch = amdgpu_switcheroo_can_switch,
1457 };
1458 
1459 /**
1460  * amdgpu_device_ip_set_clockgating_state - set the CG state
1461  *
1462  * @dev: amdgpu_device pointer
1463  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1464  * @state: clockgating state (gate or ungate)
1465  *
1466  * Sets the requested clockgating state for all instances of
1467  * the hardware IP specified.
1468  * Returns the error code from the last instance.
1469  */
1470 int amdgpu_device_ip_set_clockgating_state(void *dev,
1471 					   enum amd_ip_block_type block_type,
1472 					   enum amd_clockgating_state state)
1473 {
1474 	struct amdgpu_device *adev = dev;
1475 	int i, r = 0;
1476 
1477 	for (i = 0; i < adev->num_ip_blocks; i++) {
1478 		if (!adev->ip_blocks[i].status.valid)
1479 			continue;
1480 		if (adev->ip_blocks[i].version->type != block_type)
1481 			continue;
1482 		if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1483 			continue;
1484 		r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1485 			(void *)adev, state);
1486 		if (r)
1487 			DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1488 				  adev->ip_blocks[i].version->funcs->name, r);
1489 	}
1490 	return r;
1491 }
1492 
1493 /**
1494  * amdgpu_device_ip_set_powergating_state - set the PG state
1495  *
1496  * @dev: amdgpu_device pointer
1497  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1498  * @state: powergating state (gate or ungate)
1499  *
1500  * Sets the requested powergating state for all instances of
1501  * the hardware IP specified.
1502  * Returns the error code from the last instance.
1503  */
1504 int amdgpu_device_ip_set_powergating_state(void *dev,
1505 					   enum amd_ip_block_type block_type,
1506 					   enum amd_powergating_state state)
1507 {
1508 	struct amdgpu_device *adev = dev;
1509 	int i, r = 0;
1510 
1511 	for (i = 0; i < adev->num_ip_blocks; i++) {
1512 		if (!adev->ip_blocks[i].status.valid)
1513 			continue;
1514 		if (adev->ip_blocks[i].version->type != block_type)
1515 			continue;
1516 		if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1517 			continue;
1518 		r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1519 			(void *)adev, state);
1520 		if (r)
1521 			DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1522 				  adev->ip_blocks[i].version->funcs->name, r);
1523 	}
1524 	return r;
1525 }
1526 
1527 /**
1528  * amdgpu_device_ip_get_clockgating_state - get the CG state
1529  *
1530  * @adev: amdgpu_device pointer
1531  * @flags: clockgating feature flags
1532  *
1533  * Walks the list of IPs on the device and updates the clockgating
1534  * flags for each IP.
1535  * Updates @flags with the feature flags for each hardware IP where
1536  * clockgating is enabled.
1537  */
1538 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1539 					    u32 *flags)
1540 {
1541 	int i;
1542 
1543 	for (i = 0; i < adev->num_ip_blocks; i++) {
1544 		if (!adev->ip_blocks[i].status.valid)
1545 			continue;
1546 		if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1547 			adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1548 	}
1549 }
1550 
1551 /**
1552  * amdgpu_device_ip_wait_for_idle - wait for idle
1553  *
1554  * @adev: amdgpu_device pointer
1555  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1556  *
1557  * Waits for the requested hardware IP to be idle.
1558  * Returns 0 for success or a negative error code on failure.
1559  */
1560 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1561 				   enum amd_ip_block_type block_type)
1562 {
1563 	int i, r;
1564 
1565 	for (i = 0; i < adev->num_ip_blocks; i++) {
1566 		if (!adev->ip_blocks[i].status.valid)
1567 			continue;
1568 		if (adev->ip_blocks[i].version->type == block_type) {
1569 			r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1570 			if (r)
1571 				return r;
1572 			break;
1573 		}
1574 	}
1575 	return 0;
1576 
1577 }
1578 
1579 /**
1580  * amdgpu_device_ip_is_idle - is the hardware IP idle
1581  *
1582  * @adev: amdgpu_device pointer
1583  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1584  *
1585  * Check if the hardware IP is idle or not.
1586  * Returns true if the IP is idle, false if not.
1587  */
1588 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1589 			      enum amd_ip_block_type block_type)
1590 {
1591 	int i;
1592 
1593 	for (i = 0; i < adev->num_ip_blocks; i++) {
1594 		if (!adev->ip_blocks[i].status.valid)
1595 			continue;
1596 		if (adev->ip_blocks[i].version->type == block_type)
1597 			return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1598 	}
1599 	return true;
1600 
1601 }
1602 
1603 /**
1604  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1605  *
1606  * @adev: amdgpu_device pointer
1607  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1608  *
1609  * Returns a pointer to the hardware IP block structure
1610  * if it exists for the asic, otherwise NULL.
1611  */
1612 struct amdgpu_ip_block *
1613 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1614 			      enum amd_ip_block_type type)
1615 {
1616 	int i;
1617 
1618 	for (i = 0; i < adev->num_ip_blocks; i++)
1619 		if (adev->ip_blocks[i].version->type == type)
1620 			return &adev->ip_blocks[i];
1621 
1622 	return NULL;
1623 }
1624 
1625 /**
1626  * amdgpu_device_ip_block_version_cmp
1627  *
1628  * @adev: amdgpu_device pointer
1629  * @type: enum amd_ip_block_type
1630  * @major: major version
1631  * @minor: minor version
1632  *
1633  * Returns 0 if the IP block's version is equal to or greater than the one specified,
1634  * or 1 if it is smaller or the ip_block doesn't exist.
1635  */
1636 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1637 				       enum amd_ip_block_type type,
1638 				       u32 major, u32 minor)
1639 {
1640 	struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1641 
1642 	if (ip_block && ((ip_block->version->major > major) ||
1643 			((ip_block->version->major == major) &&
1644 			(ip_block->version->minor >= minor))))
1645 		return 0;
1646 
1647 	return 1;
1648 }
1649 
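/*
 * Illustrative usage sketch (not part of the driver): gating a feature on a
 * minimum GFX IP version using the comparison helper above.
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 8, 0))
 *		... GFX IP is version 8.0 or newer ...
 */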
1650 /**
1651  * amdgpu_device_ip_block_add
1652  *
1653  * @adev: amdgpu_device pointer
1654  * @ip_block_version: pointer to the IP to add
1655  *
1656  * Adds the IP block driver information to the collection of IPs
1657  * on the asic.
1658  */
1659 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1660 			       const struct amdgpu_ip_block_version *ip_block_version)
1661 {
1662 	if (!ip_block_version)
1663 		return -EINVAL;
1664 
1665 	DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1666 		  ip_block_version->funcs->name);
1667 
1668 	adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1669 
1670 	return 0;
1671 }
1672 
1673 /**
1674  * amdgpu_device_enable_virtual_display - enable virtual display feature
1675  *
1676  * @adev: amdgpu_device pointer
1677  *
1678  * Enables the virtual display feature if the user has enabled it via
1679  * the module parameter virtual_display.  This feature provides virtual
1680  * display hardware on headless boards or in virtualized environments.
1681  * This function parses and validates the configuration string specified by
1682  * the user and configures the virtual display configuration (number of
1683  * virtual connectors, crtcs, etc.) specified.
1684  */
1685 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1686 {
1687 	adev->enable_virtual_display = false;
1688 
1689 	if (amdgpu_virtual_display) {
1690 		struct drm_device *ddev = adev_to_drm(adev);
1691 		const char *pci_address_name = pci_name(ddev->pdev);
1692 		char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1693 
1694 		pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1695 		pciaddstr_tmp = pciaddstr;
1696 		while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1697 			pciaddname = strsep(&pciaddname_tmp, ",");
1698 			if (!strcmp("all", pciaddname)
1699 			    || !strcmp(pci_address_name, pciaddname)) {
1700 				long num_crtc;
1701 				int res = -1;
1702 
1703 				adev->enable_virtual_display = true;
1704 
1705 				if (pciaddname_tmp)
1706 					res = kstrtol(pciaddname_tmp, 10,
1707 						      &num_crtc);
1708 
1709 				if (!res) {
1710 					if (num_crtc < 1)
1711 						num_crtc = 1;
1712 					if (num_crtc > 6)
1713 						num_crtc = 6;
1714 					adev->mode_info.num_crtc = num_crtc;
1715 				} else {
1716 					adev->mode_info.num_crtc = 1;
1717 				}
1718 				break;
1719 			}
1720 		}
1721 
1722 		DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1723 			 amdgpu_virtual_display, pci_address_name,
1724 			 adev->enable_virtual_display, adev->mode_info.num_crtc);
1725 
1726 		kfree(pciaddstr);
1727 	}
1728 }
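
/*
 * For reference, the virtual_display string parsed above is a
 * semicolon-separated list of "<pci address>,<number of crtcs>" entries,
 * or "all" to match every device.  Hypothetical examples:
 *
 *	modprobe amdgpu virtual_display=0000:01:00.0,2
 *	modprobe amdgpu virtual_display=all,1
 */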
1729 
1730 /**
1731  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1732  *
1733  * @adev: amdgpu_device pointer
1734  *
1735  * Parses the asic configuration parameters specified in the gpu info
1736  * firmware and makes them available to the driver for use in configuring
1737  * the asic.
1738  * Returns 0 on success, -EINVAL on failure.
1739  */
1740 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1741 {
1742 	const char *chip_name;
1743 	char fw_name[40];
1744 	int err;
1745 	const struct gpu_info_firmware_header_v1_0 *hdr;
1746 
1747 	adev->firmware.gpu_info_fw = NULL;
1748 
1749 	if (adev->mman.discovery_bin) {
1750 		amdgpu_discovery_get_gfx_info(adev);
1751 
1752 		/*
1753 		 * FIXME: The bounding box is still needed by Navi12, so
1754 		 * temporarily read it from gpu_info firmware. Should be dropped
1755 		 * when DAL no longer needs it.
1756 		 */
1757 		if (adev->asic_type != CHIP_NAVI12)
1758 			return 0;
1759 	}
1760 
1761 	switch (adev->asic_type) {
1762 #ifdef CONFIG_DRM_AMDGPU_SI
1763 	case CHIP_VERDE:
1764 	case CHIP_TAHITI:
1765 	case CHIP_PITCAIRN:
1766 	case CHIP_OLAND:
1767 	case CHIP_HAINAN:
1768 #endif
1769 #ifdef CONFIG_DRM_AMDGPU_CIK
1770 	case CHIP_BONAIRE:
1771 	case CHIP_HAWAII:
1772 	case CHIP_KAVERI:
1773 	case CHIP_KABINI:
1774 	case CHIP_MULLINS:
1775 #endif
1776 	case CHIP_TOPAZ:
1777 	case CHIP_TONGA:
1778 	case CHIP_FIJI:
1779 	case CHIP_POLARIS10:
1780 	case CHIP_POLARIS11:
1781 	case CHIP_POLARIS12:
1782 	case CHIP_VEGAM:
1783 	case CHIP_CARRIZO:
1784 	case CHIP_STONEY:
1785 	case CHIP_VEGA20:
1786 	case CHIP_SIENNA_CICHLID:
1787 	case CHIP_NAVY_FLOUNDER:
1788 	default:
1789 		return 0;
1790 	case CHIP_VEGA10:
1791 		chip_name = "vega10";
1792 		break;
1793 	case CHIP_VEGA12:
1794 		chip_name = "vega12";
1795 		break;
1796 	case CHIP_RAVEN:
1797 		if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1798 			chip_name = "raven2";
1799 		else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1800 			chip_name = "picasso";
1801 		else
1802 			chip_name = "raven";
1803 		break;
1804 	case CHIP_ARCTURUS:
1805 		chip_name = "arcturus";
1806 		break;
1807 	case CHIP_RENOIR:
1808 		if (adev->apu_flags & AMD_APU_IS_RENOIR)
1809 			chip_name = "renoir";
1810 		else
1811 			chip_name = "green_sardine";
1812 		break;
1813 	case CHIP_NAVI10:
1814 		chip_name = "navi10";
1815 		break;
1816 	case CHIP_NAVI14:
1817 		chip_name = "navi14";
1818 		break;
1819 	case CHIP_NAVI12:
1820 		chip_name = "navi12";
1821 		break;
1822 	}
1823 
1824 	snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1825 	err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1826 	if (err) {
1827 		dev_err(adev->dev,
1828 			"Failed to load gpu_info firmware \"%s\"\n",
1829 			fw_name);
1830 		goto out;
1831 	}
1832 	err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1833 	if (err) {
1834 		dev_err(adev->dev,
1835 			"Failed to validate gpu_info firmware \"%s\"\n",
1836 			fw_name);
1837 		goto out;
1838 	}
1839 
1840 	hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1841 	amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1842 
1843 	switch (hdr->version_major) {
1844 	case 1:
1845 	{
1846 		const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1847 			(const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1848 								le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1849 
1850 		/*
1851 		 * Should be dropped when DAL no longer needs it.
1852 		 */
1853 		if (adev->asic_type == CHIP_NAVI12)
1854 			goto parse_soc_bounding_box;
1855 
1856 		adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1857 		adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1858 		adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1859 		adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1860 		adev->gfx.config.max_texture_channel_caches =
1861 			le32_to_cpu(gpu_info_fw->gc_num_tccs);
1862 		adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1863 		adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1864 		adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1865 		adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1866 		adev->gfx.config.double_offchip_lds_buf =
1867 			le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1868 		adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1869 		adev->gfx.cu_info.max_waves_per_simd =
1870 			le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1871 		adev->gfx.cu_info.max_scratch_slots_per_cu =
1872 			le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1873 		adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1874 		if (hdr->version_minor >= 1) {
1875 			const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1876 				(const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1877 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1878 			adev->gfx.config.num_sc_per_sh =
1879 				le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1880 			adev->gfx.config.num_packer_per_sc =
1881 				le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1882 		}
1883 
1884 parse_soc_bounding_box:
1885 		/*
1886 		 * SOC bounding box info is not integrated into the discovery table,
1887 		 * so we always need to parse it from the gpu_info firmware when needed.
1888 		 */
1889 		if (hdr->version_minor == 2) {
1890 			const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1891 				(const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1892 									le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1893 			adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1894 		}
1895 		break;
1896 	}
1897 	default:
1898 		dev_err(adev->dev,
1899 			"Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1900 		err = -EINVAL;
1901 		goto out;
1902 	}
1903 out:
1904 	return err;
1905 }
1906 
1907 /**
1908  * amdgpu_device_ip_early_init - run early init for hardware IPs
1909  *
1910  * @adev: amdgpu_device pointer
1911  *
1912  * Early initialization pass for hardware IPs.  The hardware IPs that make
1913  * up each asic are discovered and each IP's early_init callback is run.  This
1914  * is the first stage in initializing the asic.
1915  * Returns 0 on success, negative error code on failure.
1916  */
1917 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1918 {
1919 	int i, r;
1920 
1921 	amdgpu_device_enable_virtual_display(adev);
1922 
1923 	if (amdgpu_sriov_vf(adev)) {
1924 		r = amdgpu_virt_request_full_gpu(adev, true);
1925 		if (r)
1926 			return r;
1927 	}
1928 
1929 	switch (adev->asic_type) {
1930 #ifdef CONFIG_DRM_AMDGPU_SI
1931 	case CHIP_VERDE:
1932 	case CHIP_TAHITI:
1933 	case CHIP_PITCAIRN:
1934 	case CHIP_OLAND:
1935 	case CHIP_HAINAN:
1936 		adev->family = AMDGPU_FAMILY_SI;
1937 		r = si_set_ip_blocks(adev);
1938 		if (r)
1939 			return r;
1940 		break;
1941 #endif
1942 #ifdef CONFIG_DRM_AMDGPU_CIK
1943 	case CHIP_BONAIRE:
1944 	case CHIP_HAWAII:
1945 	case CHIP_KAVERI:
1946 	case CHIP_KABINI:
1947 	case CHIP_MULLINS:
1948 		if (adev->flags & AMD_IS_APU)
1949 			adev->family = AMDGPU_FAMILY_KV;
1950 		else
1951 			adev->family = AMDGPU_FAMILY_CI;
1952 
1953 		r = cik_set_ip_blocks(adev);
1954 		if (r)
1955 			return r;
1956 		break;
1957 #endif
1958 	case CHIP_TOPAZ:
1959 	case CHIP_TONGA:
1960 	case CHIP_FIJI:
1961 	case CHIP_POLARIS10:
1962 	case CHIP_POLARIS11:
1963 	case CHIP_POLARIS12:
1964 	case CHIP_VEGAM:
1965 	case CHIP_CARRIZO:
1966 	case CHIP_STONEY:
1967 		if (adev->flags & AMD_IS_APU)
1968 			adev->family = AMDGPU_FAMILY_CZ;
1969 		else
1970 			adev->family = AMDGPU_FAMILY_VI;
1971 
1972 		r = vi_set_ip_blocks(adev);
1973 		if (r)
1974 			return r;
1975 		break;
1976 	case CHIP_VEGA10:
1977 	case CHIP_VEGA12:
1978 	case CHIP_VEGA20:
1979 	case CHIP_RAVEN:
1980 	case CHIP_ARCTURUS:
1981 	case CHIP_RENOIR:
1982 		if (adev->flags & AMD_IS_APU)
1983 			adev->family = AMDGPU_FAMILY_RV;
1984 		else
1985 			adev->family = AMDGPU_FAMILY_AI;
1986 
1987 		r = soc15_set_ip_blocks(adev);
1988 		if (r)
1989 			return r;
1990 		break;
1991 	case  CHIP_NAVI10:
1992 	case  CHIP_NAVI14:
1993 	case  CHIP_NAVI12:
1994 	case  CHIP_SIENNA_CICHLID:
1995 	case  CHIP_NAVY_FLOUNDER:
1996 		adev->family = AMDGPU_FAMILY_NV;
1997 
1998 		r = nv_set_ip_blocks(adev);
1999 		if (r)
2000 			return r;
2001 		break;
2002 	default:
2003 		/* FIXME: not supported yet */
2004 		return -EINVAL;
2005 	}
2006 
2007 	amdgpu_amdkfd_device_probe(adev);
2008 
2009 	adev->pm.pp_feature = amdgpu_pp_feature_mask;
2010 	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2011 		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2012 
2013 	for (i = 0; i < adev->num_ip_blocks; i++) {
2014 		if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2015 			DRM_ERROR("disabled ip block: %d <%s>\n",
2016 				  i, adev->ip_blocks[i].version->funcs->name);
2017 			adev->ip_blocks[i].status.valid = false;
2018 		} else {
2019 			if (adev->ip_blocks[i].version->funcs->early_init) {
2020 				r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2021 				if (r == -ENOENT) {
2022 					adev->ip_blocks[i].status.valid = false;
2023 				} else if (r) {
2024 					DRM_ERROR("early_init of IP block <%s> failed %d\n",
2025 						  adev->ip_blocks[i].version->funcs->name, r);
2026 					return r;
2027 				} else {
2028 					adev->ip_blocks[i].status.valid = true;
2029 				}
2030 			} else {
2031 				adev->ip_blocks[i].status.valid = true;
2032 			}
2033 		}
2034 		/* get the vbios after the asic_funcs are set up */
2035 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2036 			r = amdgpu_device_parse_gpu_info_fw(adev);
2037 			if (r)
2038 				return r;
2039 
2040 			/* Read BIOS */
2041 			if (!amdgpu_get_bios(adev))
2042 				return -EINVAL;
2043 
2044 			r = amdgpu_atombios_init(adev);
2045 			if (r) {
2046 				dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2047 				amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2048 				return r;
2049 			}
2050 
2051 			/* get pf2vf msg info at its earliest time */
2052 			if (amdgpu_sriov_vf(adev))
2053 				amdgpu_virt_init_data_exchange(adev);
2054 
2055 		}
2056 	}
2057 
2058 	adev->cg_flags &= amdgpu_cg_mask;
2059 	adev->pg_flags &= amdgpu_pg_mask;
2060 
2061 	return 0;
2062 }
2063 
2064 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2065 {
2066 	int i, r;
2067 
2068 	for (i = 0; i < adev->num_ip_blocks; i++) {
2069 		if (!adev->ip_blocks[i].status.sw)
2070 			continue;
2071 		if (adev->ip_blocks[i].status.hw)
2072 			continue;
2073 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2074 		    (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2075 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2076 			r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2077 			if (r) {
2078 				DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2079 					  adev->ip_blocks[i].version->funcs->name, r);
2080 				return r;
2081 			}
2082 			adev->ip_blocks[i].status.hw = true;
2083 		}
2084 	}
2085 
2086 	return 0;
2087 }
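
/*
 * Note: phase 1 above intentionally brings up only the COMMON and IH
 * blocks (plus PSP when running as an SR-IOV VF) so that firmware loading
 * can happen before the remaining blocks are initialized in phase 2 below;
 * see amdgpu_device_ip_init() for the ordering.
 */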
2088 
2089 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2090 {
2091 	int i, r;
2092 
2093 	for (i = 0; i < adev->num_ip_blocks; i++) {
2094 		if (!adev->ip_blocks[i].status.sw)
2095 			continue;
2096 		if (adev->ip_blocks[i].status.hw)
2097 			continue;
2098 		r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2099 		if (r) {
2100 			DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2101 				  adev->ip_blocks[i].version->funcs->name, r);
2102 			return r;
2103 		}
2104 		adev->ip_blocks[i].status.hw = true;
2105 	}
2106 
2107 	return 0;
2108 }
2109 
2110 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2111 {
2112 	int r = 0;
2113 	int i;
2114 	uint32_t smu_version;
2115 
2116 	if (adev->asic_type >= CHIP_VEGA10) {
2117 		for (i = 0; i < adev->num_ip_blocks; i++) {
2118 			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2119 				continue;
2120 
2121 			/* no need to do the fw loading again if already done */
2122 			if (adev->ip_blocks[i].status.hw == true)
2123 				break;
2124 
2125 			if (amdgpu_in_reset(adev) || adev->in_suspend) {
2126 				r = adev->ip_blocks[i].version->funcs->resume(adev);
2127 				if (r) {
2128 					DRM_ERROR("resume of IP block <%s> failed %d\n",
2129 							  adev->ip_blocks[i].version->funcs->name, r);
2130 					return r;
2131 				}
2132 			} else {
2133 				r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2134 				if (r) {
2135 					DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2136 							  adev->ip_blocks[i].version->funcs->name, r);
2137 					return r;
2138 				}
2139 			}
2140 
2141 			adev->ip_blocks[i].status.hw = true;
2142 			break;
2143 		}
2144 	}
2145 
2146 	if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2147 		r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2148 
2149 	return r;
2150 }
2151 
2152 /**
2153  * amdgpu_device_ip_init - run init for hardware IPs
2154  *
2155  * @adev: amdgpu_device pointer
2156  *
2157  * Main initialization pass for hardware IPs.  The list of all the hardware
2158  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2159  * are run.  sw_init initializes the software state associated with each IP
2160  * and hw_init initializes the hardware associated with each IP.
2161  * Returns 0 on success, negative error code on failure.
2162  */
2163 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2164 {
2165 	int i, r;
2166 
2167 	r = amdgpu_ras_init(adev);
2168 	if (r)
2169 		return r;
2170 
2171 	for (i = 0; i < adev->num_ip_blocks; i++) {
2172 		if (!adev->ip_blocks[i].status.valid)
2173 			continue;
2174 		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2175 		if (r) {
2176 			DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2177 				  adev->ip_blocks[i].version->funcs->name, r);
2178 			goto init_failed;
2179 		}
2180 		adev->ip_blocks[i].status.sw = true;
2181 
2182 		/* need to do gmc hw init early so we can allocate gpu mem */
2183 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2184 			/* Try to reserve bad pages early */
2185 			if (amdgpu_sriov_vf(adev))
2186 				amdgpu_virt_exchange_data(adev);
2187 
2188 			r = amdgpu_device_vram_scratch_init(adev);
2189 			if (r) {
2190 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2191 				goto init_failed;
2192 			}
2193 			r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2194 			if (r) {
2195 				DRM_ERROR("hw_init %d failed %d\n", i, r);
2196 				goto init_failed;
2197 			}
2198 			r = amdgpu_device_wb_init(adev);
2199 			if (r) {
2200 				DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2201 				goto init_failed;
2202 			}
2203 			adev->ip_blocks[i].status.hw = true;
2204 
2205 			/* right after GMC hw init, we create CSA */
2206 			if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2207 				r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2208 								AMDGPU_GEM_DOMAIN_VRAM,
2209 								AMDGPU_CSA_SIZE);
2210 				if (r) {
2211 					DRM_ERROR("allocate CSA failed %d\n", r);
2212 					goto init_failed;
2213 				}
2214 			}
2215 		}
2216 	}
2217 
2218 	if (amdgpu_sriov_vf(adev))
2219 		amdgpu_virt_init_data_exchange(adev);
2220 
2221 	r = amdgpu_ib_pool_init(adev);
2222 	if (r) {
2223 		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2224 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2225 		goto init_failed;
2226 	}
2227 
2228 	r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2229 	if (r)
2230 		goto init_failed;
2231 
2232 	r = amdgpu_device_ip_hw_init_phase1(adev);
2233 	if (r)
2234 		goto init_failed;
2235 
2236 	r = amdgpu_device_fw_loading(adev);
2237 	if (r)
2238 		goto init_failed;
2239 
2240 	r = amdgpu_device_ip_hw_init_phase2(adev);
2241 	if (r)
2242 		goto init_failed;
2243 
2244 	/*
2245 	 * Retired pages will be loaded from eeprom and reserved here;
2246 	 * this should be called after amdgpu_device_ip_hw_init_phase2 since,
2247 	 * for some ASICs, the RAS EEPROM code relies on the SMU being fully
2248 	 * functional for I2C communication, which is only true at this point.
2249 	 *
2250 	 * amdgpu_ras_recovery_init may fail, but the upper level only cares
2251 	 * about failures caused by a bad GPU state and stops the amdgpu init
2252 	 * process accordingly. For other failure cases it still releases all
2253 	 * the resources and prints an error message, rather than returning a
2254 	 * negative value to the upper level.
2255 	 *
2256 	 * Note: theoretically, this should be called before all VRAM allocations
2257 	 * to protect retired pages from being abused.
2258 	 */
2259 	r = amdgpu_ras_recovery_init(adev);
2260 	if (r)
2261 		goto init_failed;
2262 
2263 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2264 		amdgpu_xgmi_add_device(adev);
2265 	amdgpu_amdkfd_device_init(adev);
2266 
2267 	amdgpu_fru_get_product_info(adev);
2268 
2269 init_failed:
2270 	if (amdgpu_sriov_vf(adev))
2271 		amdgpu_virt_release_full_gpu(adev, true);
2272 
2273 	return r;
2274 }
2275 
2276 /**
2277  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2278  *
2279  * @adev: amdgpu_device pointer
2280  *
2281  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2282  * this function before a GPU reset.  If the value is retained after a
2283  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2284  */
2285 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2286 {
2287 	memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2288 }
2289 
2290 /**
2291  * amdgpu_device_check_vram_lost - check if vram is valid
2292  *
2293  * @adev: amdgpu_device pointer
2294  *
2295  * Checks the reset magic value written to the gart pointer in VRAM.
2296  * The driver calls this after a GPU reset to see if the contents of
2297  * VRAM is lost or not.
2298  * returns true if vram is lost, false if not.
2299  */
2300 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2301 {
2302 	if (memcmp(adev->gart.ptr, adev->reset_magic,
2303 			AMDGPU_RESET_MAGIC_NUM))
2304 		return true;
2305 
2306 	if (!amdgpu_in_reset(adev))
2307 		return false;
2308 
2309 	/*
2310 	 * For all ASICs with baco/mode1 reset, the VRAM is
2311 	 * always assumed to be lost.
2312 	 */
2313 	switch (amdgpu_asic_reset_method(adev)) {
2314 	case AMD_RESET_METHOD_BACO:
2315 	case AMD_RESET_METHOD_MODE1:
2316 		return true;
2317 	default:
2318 		return false;
2319 	}
2320 }
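
/*
 * Simplified sketch of how the two helpers above pair up (the real flow
 * lives in the reset/recovery and resume paths): the magic is written
 * while VRAM is known to be good, and checked again after a reset.
 *
 *	amdgpu_device_fill_reset_magic(adev);
 *	... ASIC reset ...
 *	vram_lost = amdgpu_device_check_vram_lost(adev);
 *	if (vram_lost)
 *		... re-upload firmware and restore kernel buffer objects ...
 */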
2321 
2322 /**
2323  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2324  *
2325  * @adev: amdgpu_device pointer
2326  * @state: clockgating state (gate or ungate)
2327  *
2328  * The list of all the hardware IPs that make up the asic is walked and the
2329  * set_clockgating_state callbacks are run.
2330  * During the late init pass this is used to enable clockgating;
2331  * during fini or suspend it is used to disable clockgating.
2332  * Returns 0 on success, negative error code on failure.
2333  */
2334 
2335 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2336 						enum amd_clockgating_state state)
2337 {
2338 	int i, j, r;
2339 
2340 	if (amdgpu_emu_mode == 1)
2341 		return 0;
2342 
2343 	for (j = 0; j < adev->num_ip_blocks; j++) {
2344 		i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2345 		if (!adev->ip_blocks[i].status.late_initialized)
2346 			continue;
2347 		/* skip CG for VCE/UVD, it's handled specially */
2348 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2349 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2350 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2351 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2352 		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2353 			/* enable clockgating to save power */
2354 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2355 										     state);
2356 			if (r) {
2357 				DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2358 					  adev->ip_blocks[i].version->funcs->name, r);
2359 				return r;
2360 			}
2361 		}
2362 	}
2363 
2364 	return 0;
2365 }
2366 
2367 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2368 {
2369 	int i, j, r;
2370 
2371 	if (amdgpu_emu_mode == 1)
2372 		return 0;
2373 
2374 	for (j = 0; j < adev->num_ip_blocks; j++) {
2375 		i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2376 		if (!adev->ip_blocks[i].status.late_initialized)
2377 			continue;
2378 		/* skip PG for VCE/UVD, it's handled specially */
2379 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2380 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2381 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2382 		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2383 		    adev->ip_blocks[i].version->funcs->set_powergating_state) {
2384 			/* enable powergating to save power */
2385 			r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2386 											state);
2387 			if (r) {
2388 				DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2389 					  adev->ip_blocks[i].version->funcs->name, r);
2390 				return r;
2391 			}
2392 		}
2393 	}
2394 	return 0;
2395 }
2396 
2397 static int amdgpu_device_enable_mgpu_fan_boost(void)
2398 {
2399 	struct amdgpu_gpu_instance *gpu_ins;
2400 	struct amdgpu_device *adev;
2401 	int i, ret = 0;
2402 
2403 	mutex_lock(&mgpu_info.mutex);
2404 
2405 	/*
2406 	 * MGPU fan boost feature should be enabled
2407 	 * only when there are two or more dGPUs in
2408 	 * the system
2409 	 */
2410 	if (mgpu_info.num_dgpu < 2)
2411 		goto out;
2412 
2413 	for (i = 0; i < mgpu_info.num_dgpu; i++) {
2414 		gpu_ins = &(mgpu_info.gpu_ins[i]);
2415 		adev = gpu_ins->adev;
2416 		if (!(adev->flags & AMD_IS_APU) &&
2417 		    !gpu_ins->mgpu_fan_enabled) {
2418 			ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2419 			if (ret)
2420 				break;
2421 
2422 			gpu_ins->mgpu_fan_enabled = 1;
2423 		}
2424 	}
2425 
2426 out:
2427 	mutex_unlock(&mgpu_info.mutex);
2428 
2429 	return ret;
2430 }
2431 
2432 /**
2433  * amdgpu_device_ip_late_init - run late init for hardware IPs
2434  *
2435  * @adev: amdgpu_device pointer
2436  *
2437  * Late initialization pass for hardware IPs.  The list of all the hardware
2438  * IPs that make up the asic is walked and the late_init callbacks are run.
2439  * late_init covers any special initialization that an IP requires
2440  * after all of them have been initialized or something that needs to happen
2441  * late in the init process.
2442  * Returns 0 on success, negative error code on failure.
2443  */
2444 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2445 {
2446 	struct amdgpu_gpu_instance *gpu_instance;
2447 	int i = 0, r;
2448 
2449 	for (i = 0; i < adev->num_ip_blocks; i++) {
2450 		if (!adev->ip_blocks[i].status.hw)
2451 			continue;
2452 		if (adev->ip_blocks[i].version->funcs->late_init) {
2453 			r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2454 			if (r) {
2455 				DRM_ERROR("late_init of IP block <%s> failed %d\n",
2456 					  adev->ip_blocks[i].version->funcs->name, r);
2457 				return r;
2458 			}
2459 		}
2460 		adev->ip_blocks[i].status.late_initialized = true;
2461 	}
2462 
2463 	amdgpu_ras_set_error_query_ready(adev, true);
2464 
2465 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2466 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2467 
2468 	amdgpu_device_fill_reset_magic(adev);
2469 
2470 	r = amdgpu_device_enable_mgpu_fan_boost();
2471 	if (r)
2472 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2473 
2474 
2475 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
2476 		mutex_lock(&mgpu_info.mutex);
2477 
2478 		/*
2479 		 * Reset the device p-state to low, as it was booted with high.
2480 		 *
2481 		 * This should be performed only after all devices from the same
2482 		 * hive get initialized.
2483 		 *
2484 		 * However, the number of devices in the hive is not known in
2485 		 * advance; it is counted one by one as devices are initialized.
2486 		 *
2487 		 * So we wait until all XGMI-interlinked devices are initialized.
2488 		 * This may introduce some delay as those devices may come from
2489 		 * different hives, but that should be OK.
2490 		 */
2491 		if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2492 			for (i = 0; i < mgpu_info.num_gpu; i++) {
2493 				gpu_instance = &(mgpu_info.gpu_ins[i]);
2494 				if (gpu_instance->adev->flags & AMD_IS_APU)
2495 					continue;
2496 
2497 				r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2498 						AMDGPU_XGMI_PSTATE_MIN);
2499 				if (r) {
2500 					DRM_ERROR("pstate setting failed (%d).\n", r);
2501 					break;
2502 				}
2503 			}
2504 		}
2505 
2506 		mutex_unlock(&mgpu_info.mutex);
2507 	}
2508 
2509 	return 0;
2510 }
2511 
2512 /**
2513  * amdgpu_device_ip_fini - run fini for hardware IPs
2514  *
2515  * @adev: amdgpu_device pointer
2516  *
2517  * Main teardown pass for hardware IPs.  The list of all the hardware
2518  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2519  * are run.  hw_fini tears down the hardware associated with each IP
2520  * and sw_fini tears down any software state associated with each IP.
2521  * Returns 0 on success, negative error code on failure.
2522  */
2523 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2524 {
2525 	int i, r;
2526 
2527 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2528 		amdgpu_virt_release_ras_err_handler_data(adev);
2529 
2530 	amdgpu_ras_pre_fini(adev);
2531 
2532 	if (adev->gmc.xgmi.num_physical_nodes > 1)
2533 		amdgpu_xgmi_remove_device(adev);
2534 
2535 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2536 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2537 
2538 	amdgpu_amdkfd_device_fini(adev);
2539 
2540 	/* need to disable SMC first */
2541 	for (i = 0; i < adev->num_ip_blocks; i++) {
2542 		if (!adev->ip_blocks[i].status.hw)
2543 			continue;
2544 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2545 			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2546 			/* XXX handle errors */
2547 			if (r) {
2548 				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2549 					  adev->ip_blocks[i].version->funcs->name, r);
2550 			}
2551 			adev->ip_blocks[i].status.hw = false;
2552 			break;
2553 		}
2554 	}
2555 
2556 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2557 		if (!adev->ip_blocks[i].status.hw)
2558 			continue;
2559 
2560 		r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2561 		/* XXX handle errors */
2562 		if (r) {
2563 			DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2564 				  adev->ip_blocks[i].version->funcs->name, r);
2565 		}
2566 
2567 		adev->ip_blocks[i].status.hw = false;
2568 	}
2569 
2570 
2571 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2572 		if (!adev->ip_blocks[i].status.sw)
2573 			continue;
2574 
2575 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2576 			amdgpu_ucode_free_bo(adev);
2577 			amdgpu_free_static_csa(&adev->virt.csa_obj);
2578 			amdgpu_device_wb_fini(adev);
2579 			amdgpu_device_vram_scratch_fini(adev);
2580 			amdgpu_ib_pool_fini(adev);
2581 		}
2582 
2583 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2584 		/* XXX handle errors */
2585 		if (r) {
2586 			DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2587 				  adev->ip_blocks[i].version->funcs->name, r);
2588 		}
2589 		adev->ip_blocks[i].status.sw = false;
2590 		adev->ip_blocks[i].status.valid = false;
2591 	}
2592 
2593 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2594 		if (!adev->ip_blocks[i].status.late_initialized)
2595 			continue;
2596 		if (adev->ip_blocks[i].version->funcs->late_fini)
2597 			adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2598 		adev->ip_blocks[i].status.late_initialized = false;
2599 	}
2600 
2601 	amdgpu_ras_fini(adev);
2602 
2603 	if (amdgpu_sriov_vf(adev))
2604 		if (amdgpu_virt_release_full_gpu(adev, false))
2605 			DRM_ERROR("failed to release exclusive mode on fini\n");
2606 
2607 	return 0;
2608 }
2609 
2610 /**
2611  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2612  *
2613  * @work: work_struct.
2614  */
2615 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2616 {
2617 	struct amdgpu_device *adev =
2618 		container_of(work, struct amdgpu_device, delayed_init_work.work);
2619 	int r;
2620 
2621 	r = amdgpu_ib_ring_tests(adev);
2622 	if (r)
2623 		DRM_ERROR("ib ring test failed (%d).\n", r);
2624 }
2625 
2626 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2627 {
2628 	struct amdgpu_device *adev =
2629 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2630 
2631 	WARN_ON_ONCE(adev->gfx.gfx_off_state);
2632 	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2633 
2634 	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2635 		adev->gfx.gfx_off_state = true;
2636 }
2637 
2638 /**
2639  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2640  *
2641  * @adev: amdgpu_device pointer
2642  *
2643  * Main suspend function for hardware IPs.  The list of all the hardware
2644  * IPs that make up the asic is walked, clockgating is disabled and the
2645  * suspend callbacks are run.  suspend puts the hardware and software state
2646  * in each IP into a state suitable for suspend.
2647  * Returns 0 on success, negative error code on failure.
2648  */
2649 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2650 {
2651 	int i, r;
2652 
2653 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2654 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2655 
2656 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2657 		if (!adev->ip_blocks[i].status.valid)
2658 			continue;
2659 
2660 		/* displays are handled separately */
2661 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2662 			continue;
2663 
2664 		/* XXX handle errors */
2665 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2666 		/* XXX handle errors */
2667 		if (r) {
2668 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2669 				  adev->ip_blocks[i].version->funcs->name, r);
2670 			return r;
2671 		}
2672 
2673 		adev->ip_blocks[i].status.hw = false;
2674 	}
2675 
2676 	return 0;
2677 }
2678 
2679 /**
2680  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2681  *
2682  * @adev: amdgpu_device pointer
2683  *
2684  * Main suspend function for hardware IPs.  The list of all the hardware
2685  * IPs that make up the asic is walked, clockgating is disabled and the
2686  * suspend callbacks are run.  suspend puts the hardware and software state
2687  * in each IP into a state suitable for suspend.
2688  * Returns 0 on success, negative error code on failure.
2689  */
2690 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2691 {
2692 	int i, r;
2693 
2694 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2695 		if (!adev->ip_blocks[i].status.valid)
2696 			continue;
2697 		/* displays are handled in phase1 */
2698 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2699 			continue;
2700 		/* PSP lost connection when err_event_athub occurs */
2701 		if (amdgpu_ras_intr_triggered() &&
2702 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2703 			adev->ip_blocks[i].status.hw = false;
2704 			continue;
2705 		}
2706 		/* XXX handle errors */
2707 		r = adev->ip_blocks[i].version->funcs->suspend(adev);
2708 		/* XXX handle errors */
2709 		if (r) {
2710 			DRM_ERROR("suspend of IP block <%s> failed %d\n",
2711 				  adev->ip_blocks[i].version->funcs->name, r);
2712 		}
2713 		adev->ip_blocks[i].status.hw = false;
2714 		/* handle putting the SMC in the appropriate state */
2715 		if (!amdgpu_sriov_vf(adev)) {
2716 			if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2717 				r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2718 				if (r) {
2719 					DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2720 							adev->mp1_state, r);
2721 					return r;
2722 				}
2723 			}
2724 		}
2725 		adev->ip_blocks[i].status.hw = false;
2726 	}
2727 
2728 	return 0;
2729 }
2730 
2731 /**
2732  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2733  *
2734  * @adev: amdgpu_device pointer
2735  *
2736  * Main suspend function for hardware IPs.  The list of all the hardware
2737  * IPs that make up the asic is walked, clockgating is disabled and the
2738  * suspend callbacks are run.  suspend puts the hardware and software state
2739  * in each IP into a state suitable for suspend.
2740  * Returns 0 on success, negative error code on failure.
2741  */
2742 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2743 {
2744 	int r;
2745 
2746 	if (amdgpu_sriov_vf(adev))
2747 		amdgpu_virt_request_full_gpu(adev, false);
2748 
2749 	r = amdgpu_device_ip_suspend_phase1(adev);
2750 	if (r)
2751 		return r;
2752 	r = amdgpu_device_ip_suspend_phase2(adev);
2753 
2754 	if (amdgpu_sriov_vf(adev))
2755 		amdgpu_virt_release_full_gpu(adev, false);
2756 
2757 	return r;
2758 }
2759 
2760 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2761 {
2762 	int i, r;
2763 
2764 	static enum amd_ip_block_type ip_order[] = {
2765 		AMD_IP_BLOCK_TYPE_GMC,
2766 		AMD_IP_BLOCK_TYPE_COMMON,
2767 		AMD_IP_BLOCK_TYPE_PSP,
2768 		AMD_IP_BLOCK_TYPE_IH,
2769 	};
2770 
2771 	for (i = 0; i < adev->num_ip_blocks; i++) {
2772 		int j;
2773 		struct amdgpu_ip_block *block;
2774 
2775 		block = &adev->ip_blocks[i];
2776 		block->status.hw = false;
2777 
2778 		for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2779 
2780 			if (block->version->type != ip_order[j] ||
2781 				!block->status.valid)
2782 				continue;
2783 
2784 			r = block->version->funcs->hw_init(adev);
2785 			DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2786 			if (r)
2787 				return r;
2788 			block->status.hw = true;
2789 		}
2790 	}
2791 
2792 	return 0;
2793 }
2794 
2795 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2796 {
2797 	int i, r;
2798 
2799 	static enum amd_ip_block_type ip_order[] = {
2800 		AMD_IP_BLOCK_TYPE_SMC,
2801 		AMD_IP_BLOCK_TYPE_DCE,
2802 		AMD_IP_BLOCK_TYPE_GFX,
2803 		AMD_IP_BLOCK_TYPE_SDMA,
2804 		AMD_IP_BLOCK_TYPE_UVD,
2805 		AMD_IP_BLOCK_TYPE_VCE,
2806 		AMD_IP_BLOCK_TYPE_VCN
2807 	};
2808 
2809 	for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2810 		int j;
2811 		struct amdgpu_ip_block *block;
2812 
2813 		for (j = 0; j < adev->num_ip_blocks; j++) {
2814 			block = &adev->ip_blocks[j];
2815 
2816 			if (block->version->type != ip_order[i] ||
2817 				!block->status.valid ||
2818 				block->status.hw)
2819 				continue;
2820 
2821 			if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2822 				r = block->version->funcs->resume(adev);
2823 			else
2824 				r = block->version->funcs->hw_init(adev);
2825 
2826 			DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2827 			if (r)
2828 				return r;
2829 			block->status.hw = true;
2830 		}
2831 	}
2832 
2833 	return 0;
2834 }
2835 
2836 /**
2837  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2838  *
2839  * @adev: amdgpu_device pointer
2840  *
2841  * First resume function for hardware IPs.  The list of all the hardware
2842  * IPs that make up the asic is walked and the resume callbacks are run for
2843  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2844  * after a suspend and updates the software state as necessary.  This
2845  * function is also used for restoring the GPU after a GPU reset.
2846  * Returns 0 on success, negative error code on failure.
2847  */
2848 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2849 {
2850 	int i, r;
2851 
2852 	for (i = 0; i < adev->num_ip_blocks; i++) {
2853 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2854 			continue;
2855 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2856 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2857 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2858 
2859 			r = adev->ip_blocks[i].version->funcs->resume(adev);
2860 			if (r) {
2861 				DRM_ERROR("resume of IP block <%s> failed %d\n",
2862 					  adev->ip_blocks[i].version->funcs->name, r);
2863 				return r;
2864 			}
2865 			adev->ip_blocks[i].status.hw = true;
2866 		}
2867 	}
2868 
2869 	return 0;
2870 }
2871 
2872 /**
2873  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2874  *
2875  * @adev: amdgpu_device pointer
2876  *
2877  * Second resume function for hardware IPs.  The list of all the hardware
2878  * IPs that make up the asic is walked and the resume callbacks are run for
2879  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2880  * functional state after a suspend and updates the software state as
2881  * necessary.  This function is also used for restoring the GPU after a GPU
2882  * reset.
2883  * Returns 0 on success, negative error code on failure.
2884  */
2885 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2886 {
2887 	int i, r;
2888 
2889 	for (i = 0; i < adev->num_ip_blocks; i++) {
2890 		if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2891 			continue;
2892 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2893 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2894 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2895 		    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2896 			continue;
2897 		r = adev->ip_blocks[i].version->funcs->resume(adev);
2898 		if (r) {
2899 			DRM_ERROR("resume of IP block <%s> failed %d\n",
2900 				  adev->ip_blocks[i].version->funcs->name, r);
2901 			return r;
2902 		}
2903 		adev->ip_blocks[i].status.hw = true;
2904 	}
2905 
2906 	return 0;
2907 }
2908 
2909 /**
2910  * amdgpu_device_ip_resume - run resume for hardware IPs
2911  *
2912  * @adev: amdgpu_device pointer
2913  *
2914  * Main resume function for hardware IPs.  The hardware IPs
2915  * are split into two resume functions because they are
2916  * also used in recovering from a GPU reset, and some additional
2917  * steps need to be taken between them.  In this case (S3/S4) they are
2918  * run sequentially.
2919  * Returns 0 on success, negative error code on failure.
2920  */
2921 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2922 {
2923 	int r;
2924 
2925 	r = amdgpu_amdkfd_resume_iommu(adev);
2926 	if (r)
2927 		return r;
2928 
2929 	r = amdgpu_device_ip_resume_phase1(adev);
2930 	if (r)
2931 		return r;
2932 
2933 	r = amdgpu_device_fw_loading(adev);
2934 	if (r)
2935 		return r;
2936 
2937 	r = amdgpu_device_ip_resume_phase2(adev);
2938 
2939 	return r;
2940 }
2941 
2942 /**
2943  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2944  *
2945  * @adev: amdgpu_device pointer
2946  *
2947  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2948  */
2949 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2950 {
2951 	if (amdgpu_sriov_vf(adev)) {
2952 		if (adev->is_atom_fw) {
2953 			if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2954 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2955 		} else {
2956 			if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2957 				adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2958 		}
2959 
2960 		if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2961 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2962 	}
2963 }
2964 
2965 /**
2966  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2967  *
2968  * @asic_type: AMD asic type
2969  *
2970  * Check if there is DC (new modesetting infrastructure) support for an asic.
2971  * returns true if DC has support, false if not.
2972  */
2973 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2974 {
2975 	switch (asic_type) {
2976 #if defined(CONFIG_DRM_AMD_DC)
2977 #if defined(CONFIG_DRM_AMD_DC_SI)
2978 	case CHIP_TAHITI:
2979 	case CHIP_PITCAIRN:
2980 	case CHIP_VERDE:
2981 	case CHIP_OLAND:
2982 #endif
2983 	case CHIP_BONAIRE:
2984 	case CHIP_KAVERI:
2985 	case CHIP_KABINI:
2986 	case CHIP_MULLINS:
2987 		/*
2988 		 * We have systems in the wild with these ASICs that require
2989 		 * LVDS and VGA support which is not supported with DC.
2990 		 *
2991 		 * Fall back to the non-DC driver here by default so as not to
2992 		 * cause regressions.
2993 		 */
2994 		return amdgpu_dc > 0;
2995 	case CHIP_HAWAII:
2996 	case CHIP_CARRIZO:
2997 	case CHIP_STONEY:
2998 	case CHIP_POLARIS10:
2999 	case CHIP_POLARIS11:
3000 	case CHIP_POLARIS12:
3001 	case CHIP_VEGAM:
3002 	case CHIP_TONGA:
3003 	case CHIP_FIJI:
3004 	case CHIP_VEGA10:
3005 	case CHIP_VEGA12:
3006 	case CHIP_VEGA20:
3007 #if defined(CONFIG_DRM_AMD_DC_DCN)
3008 	case CHIP_RAVEN:
3009 	case CHIP_NAVI10:
3010 	case CHIP_NAVI14:
3011 	case CHIP_NAVI12:
3012 	case CHIP_RENOIR:
3013 #endif
3014 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3015 	case CHIP_SIENNA_CICHLID:
3016 	case CHIP_NAVY_FLOUNDER:
3017 #endif
3018 		return amdgpu_dc != 0;
3019 #endif
3020 	default:
3021 		if (amdgpu_dc > 0)
3022 			DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3023 					 "but isn't supported by ASIC, ignoring\n");
3024 		return false;
3025 	}
3026 }
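
/*
 * For reference, the amdgpu.dc module parameter consulted above overrides
 * the per-ASIC default: -1 (default) keeps the behaviour chosen in this
 * switch, 0 disables DC, and 1 requests DC even on ASICs that default to
 * the legacy display path.  Hypothetical example:
 *
 *	modprobe amdgpu dc=1
 */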
3027 
3028 /**
3029  * amdgpu_device_has_dc_support - check if dc is supported
3030  *
3031  * @adev: amdgpu_device pointer
3032  *
3033  * Returns true for supported, false for not supported
3034  */
3035 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3036 {
3037 	if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3038 		return false;
3039 
3040 	return amdgpu_device_asic_has_dc_support(adev->asic_type);
3041 }
3042 
3043 
3044 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3045 {
3046 	struct amdgpu_device *adev =
3047 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
3048 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3049 
3050 	/* It's a bug to not have a hive within this function */
3051 	if (WARN_ON(!hive))
3052 		return;
3053 
3054 	/*
3055 	 * Use task barrier to synchronize all xgmi reset works across the
3056 	 * hive. task_barrier_enter and task_barrier_exit will block
3057 	 * until all the threads running the xgmi reset works reach
3058 	 * those points. task_barrier_full will do both blocks.
3059 	 */
3060 	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3061 
3062 		task_barrier_enter(&hive->tb);
3063 		adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3064 
3065 		if (adev->asic_reset_res)
3066 			goto fail;
3067 
3068 		task_barrier_exit(&hive->tb);
3069 		adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3070 
3071 		if (adev->asic_reset_res)
3072 			goto fail;
3073 
3074 		if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3075 			adev->mmhub.funcs->reset_ras_error_count(adev);
3076 	} else {
3077 
3078 		task_barrier_full(&hive->tb);
3079 		adev->asic_reset_res =  amdgpu_asic_reset(adev);
3080 	}
3081 
3082 fail:
3083 	if (adev->asic_reset_res)
3084 		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3085 			 adev->asic_reset_res, adev_to_drm(adev)->unique);
3086 	amdgpu_put_xgmi_hive(hive);
3087 }
3088 
3089 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3090 {
3091 	char *input = amdgpu_lockup_timeout;
3092 	char *timeout_setting = NULL;
3093 	int index = 0;
3094 	long timeout;
3095 	int ret = 0;
3096 
3097 	/*
3098 	 * By default the timeout for non-compute jobs is 10000 ms,
3099 	 * and there is no timeout enforced on compute jobs.
3100 	 * In SR-IOV or passthrough mode, the default timeout for
3101 	 * compute jobs is 60000 ms.
3102 	 */
3103 	adev->gfx_timeout = msecs_to_jiffies(10000);
3104 	adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3105 	if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3106 		adev->compute_timeout =  msecs_to_jiffies(60000);
3107 	else
3108 		adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3109 
3110 	if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3111 		while ((timeout_setting = strsep(&input, ",")) &&
3112 				strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3113 			ret = kstrtol(timeout_setting, 0, &timeout);
3114 			if (ret)
3115 				return ret;
3116 
3117 			if (timeout == 0) {
3118 				index++;
3119 				continue;
3120 			} else if (timeout < 0) {
3121 				timeout = MAX_SCHEDULE_TIMEOUT;
3122 			} else {
3123 				timeout = msecs_to_jiffies(timeout);
3124 			}
3125 
3126 			switch (index++) {
3127 			case 0:
3128 				adev->gfx_timeout = timeout;
3129 				break;
3130 			case 1:
3131 				adev->compute_timeout = timeout;
3132 				break;
3133 			case 2:
3134 				adev->sdma_timeout = timeout;
3135 				break;
3136 			case 3:
3137 				adev->video_timeout = timeout;
3138 				break;
3139 			default:
3140 				break;
3141 			}
3142 		}
3143 		/*
3144 		 * If only one value was specified, it should
3145 		 * apply to all non-compute jobs.
3146 		 */
3147 		if (index == 1) {
3148 			adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3149 			if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3150 				adev->compute_timeout = adev->gfx_timeout;
3151 		}
3152 	}
3153 
3154 	return ret;
3155 }
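
/*
 * For reference, the lockup_timeout string parsed above takes up to four
 * comma-separated values in milliseconds, applied in the order gfx,
 * compute, sdma, video; 0 keeps the default and a negative value disables
 * the timeout.  A single value applies to all non-compute queues.
 * Hypothetical example:
 *
 *	modprobe amdgpu lockup_timeout=10000,60000,10000,10000
 */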
3156 
3157 static const struct attribute *amdgpu_dev_attributes[] = {
3158 	&dev_attr_product_name.attr,
3159 	&dev_attr_product_number.attr,
3160 	&dev_attr_serial_number.attr,
3161 	&dev_attr_pcie_replay_count.attr,
3162 	NULL
3163 };
3164 
3165 
3166 /**
3167  * amdgpu_device_init - initialize the driver
3168  *
3169  * @adev: amdgpu_device pointer
3170  * @flags: driver flags
3171  *
3172  * Initializes the driver info and hw (all asics).
3173  * Returns 0 for success or an error on failure.
3174  * Called at driver startup.
3175  */
3176 int amdgpu_device_init(struct amdgpu_device *adev,
3177 		       uint32_t flags)
3178 {
3179 	struct drm_device *ddev = adev_to_drm(adev);
3180 	struct pci_dev *pdev = adev->pdev;
3181 	int r, i;
3182 	bool boco = false;
3183 	u32 max_MBps;
3184 
3185 	adev->shutdown = false;
3186 	adev->flags = flags;
3187 
3188 	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3189 		adev->asic_type = amdgpu_force_asic_type;
3190 	else
3191 		adev->asic_type = flags & AMD_ASIC_MASK;
3192 
3193 	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3194 	if (amdgpu_emu_mode == 1)
3195 		adev->usec_timeout *= 10;
3196 	adev->gmc.gart_size = 512 * 1024 * 1024;
3197 	adev->accel_working = false;
3198 	adev->num_rings = 0;
3199 	adev->mman.buffer_funcs = NULL;
3200 	adev->mman.buffer_funcs_ring = NULL;
3201 	adev->vm_manager.vm_pte_funcs = NULL;
3202 	adev->vm_manager.vm_pte_num_scheds = 0;
3203 	adev->gmc.gmc_funcs = NULL;
3204 	adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3205 	bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3206 
3207 	adev->smc_rreg = &amdgpu_invalid_rreg;
3208 	adev->smc_wreg = &amdgpu_invalid_wreg;
3209 	adev->pcie_rreg = &amdgpu_invalid_rreg;
3210 	adev->pcie_wreg = &amdgpu_invalid_wreg;
3211 	adev->pciep_rreg = &amdgpu_invalid_rreg;
3212 	adev->pciep_wreg = &amdgpu_invalid_wreg;
3213 	adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3214 	adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3215 	adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3216 	adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3217 	adev->didt_rreg = &amdgpu_invalid_rreg;
3218 	adev->didt_wreg = &amdgpu_invalid_wreg;
3219 	adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3220 	adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3221 	adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3222 	adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3223 
3224 	DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3225 		 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3226 		 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3227 
3228 	/* mutex initializations are all done here so we
3229 	 * can call these functions without locking issues */
3230 	atomic_set(&adev->irq.ih.lock, 0);
3231 	mutex_init(&adev->firmware.mutex);
3232 	mutex_init(&adev->pm.mutex);
3233 	mutex_init(&adev->gfx.gpu_clock_mutex);
3234 	mutex_init(&adev->srbm_mutex);
3235 	mutex_init(&adev->gfx.pipe_reserve_mutex);
3236 	mutex_init(&adev->gfx.gfx_off_mutex);
3237 	mutex_init(&adev->grbm_idx_mutex);
3238 	mutex_init(&adev->mn_lock);
3239 	mutex_init(&adev->virt.vf_errors.lock);
3240 	hash_init(adev->mn_hash);
3241 	atomic_set(&adev->in_gpu_reset, 0);
3242 	init_rwsem(&adev->reset_sem);
3243 	mutex_init(&adev->psp.mutex);
3244 	mutex_init(&adev->notifier_lock);
3245 
3246 	r = amdgpu_device_check_arguments(adev);
3247 	if (r)
3248 		return r;
3249 
3250 	spin_lock_init(&adev->mmio_idx_lock);
3251 	spin_lock_init(&adev->smc_idx_lock);
3252 	spin_lock_init(&adev->pcie_idx_lock);
3253 	spin_lock_init(&adev->uvd_ctx_idx_lock);
3254 	spin_lock_init(&adev->didt_idx_lock);
3255 	spin_lock_init(&adev->gc_cac_idx_lock);
3256 	spin_lock_init(&adev->se_cac_idx_lock);
3257 	spin_lock_init(&adev->audio_endpt_idx_lock);
3258 	spin_lock_init(&adev->mm_stats.lock);
3259 
3260 	INIT_LIST_HEAD(&adev->shadow_list);
3261 	mutex_init(&adev->shadow_list_lock);
3262 
3263 	INIT_DELAYED_WORK(&adev->delayed_init_work,
3264 			  amdgpu_device_delayed_init_work_handler);
3265 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3266 			  amdgpu_device_delay_enable_gfx_off);
3267 
3268 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3269 
3270 	adev->gfx.gfx_off_req_count = 1;
3271 	adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3272 
3273 	atomic_set(&adev->throttling_logging_enabled, 1);
3274 	/*
3275 	 * If throttling continues, logging will be performed every minute
3276 	 * to avoid log flooding. "-1" is subtracted since the thermal
3277 	 * throttling interrupt comes every second. Thus, the total logging
3278 	 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3279 	 * for the throttling interrupt) = 60 seconds.
3280 	 */
3281 	ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3282 	ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3283 
3284 	/* Registers mapping */
3285 	/* TODO: block userspace mapping of io register */
3286 	if (adev->asic_type >= CHIP_BONAIRE) {
3287 		adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3288 		adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3289 	} else {
3290 		adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3291 		adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3292 	}
3293 
3294 	adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3295 	if (adev->rmmio == NULL) {
3296 		return -ENOMEM;
3297 	}
3298 	DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3299 	DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3300 
3301 	/* io port mapping */
3302 	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3303 		if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3304 			adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3305 			adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3306 			break;
3307 		}
3308 	}
3309 	if (adev->rio_mem == NULL)
3310 		DRM_INFO("PCI I/O BAR is not found.\n");
3311 
3312 	/* enable PCIE atomic ops */
3313 	r = pci_enable_atomic_ops_to_root(adev->pdev,
3314 					  PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3315 					  PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3316 	if (r) {
3317 		adev->have_atomics_support = false;
3318 		DRM_INFO("PCIE atomic ops is not supported\n");
3319 	} else {
3320 		adev->have_atomics_support = true;
3321 	}
3322 
3323 	amdgpu_device_get_pcie_info(adev);
3324 
3325 	if (amdgpu_mcbp)
3326 		DRM_INFO("MCBP is enabled\n");
3327 
3328 	if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3329 		adev->enable_mes = true;
3330 
3331 	/* detect hw virtualization here */
3332 	amdgpu_detect_virtualization(adev);
3333 
3334 	r = amdgpu_device_get_job_timeout_settings(adev);
3335 	if (r) {
3336 		dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3337 		return r;
3338 	}
3339 
3340 	/* early init functions */
3341 	r = amdgpu_device_ip_early_init(adev);
3342 	if (r)
3343 		return r;
3344 
3345 	/* doorbell bar mapping and doorbell index init*/
3346 	amdgpu_device_doorbell_init(adev);
3347 
3348 	/* if we have more than one VGA card, disable the amdgpu VGA resources */
3349 	/* this will fail for cards that aren't VGA class devices, just
3350 	 * ignore it */
3351 	vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3352 
3353 	if (amdgpu_device_supports_boco(ddev))
3354 		boco = true;
3355 	if (amdgpu_has_atpx() &&
3356 	    (amdgpu_is_atpx_hybrid() ||
3357 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3358 	    !pci_is_thunderbolt_attached(adev->pdev))
3359 		vga_switcheroo_register_client(adev->pdev,
3360 					       &amdgpu_switcheroo_ops, boco);
3361 	if (boco)
3362 		vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3363 
3364 	if (amdgpu_emu_mode == 1) {
3365 		/* post the asic in emulation mode */
3366 		emu_soc_asic_init(adev);
3367 		goto fence_driver_init;
3368 	}
3369 
3370 	/* detect if we are with an SRIOV vbios */
3371 	amdgpu_device_detect_sriov_bios(adev);
3372 
3373 	/* check if we need to reset the asic
3374 	 *  E.g., driver was not cleanly unloaded previously, etc.
3375 	 */
3376 	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3377 		r = amdgpu_asic_reset(adev);
3378 		if (r) {
3379 			dev_err(adev->dev, "asic reset on init failed\n");
3380 			goto failed;
3381 		}
3382 	}
3383 
3384 	pci_enable_pcie_error_reporting(adev->ddev.pdev);
3385 
3386 	/* Post card if necessary */
3387 	if (amdgpu_device_need_post(adev)) {
3388 		if (!adev->bios) {
3389 			dev_err(adev->dev, "no vBIOS found\n");
3390 			r = -EINVAL;
3391 			goto failed;
3392 		}
3393 		DRM_INFO("GPU posting now...\n");
3394 		r = amdgpu_device_asic_init(adev);
3395 		if (r) {
3396 			dev_err(adev->dev, "gpu post error!\n");
3397 			goto failed;
3398 		}
3399 	}
3400 
3401 	if (adev->is_atom_fw) {
3402 		/* Initialize clocks */
3403 		r = amdgpu_atomfirmware_get_clock_info(adev);
3404 		if (r) {
3405 			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3406 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3407 			goto failed;
3408 		}
3409 	} else {
3410 		/* Initialize clocks */
3411 		r = amdgpu_atombios_get_clock_info(adev);
3412 		if (r) {
3413 			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3414 			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3415 			goto failed;
3416 		}
3417 		/* init i2c buses */
3418 		if (!amdgpu_device_has_dc_support(adev))
3419 			amdgpu_atombios_i2c_init(adev);
3420 	}
3421 
3422 fence_driver_init:
3423 	/* Fence driver */
3424 	r = amdgpu_fence_driver_init(adev);
3425 	if (r) {
3426 		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3427 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3428 		goto failed;
3429 	}
3430 
3431 	/* init the mode config */
3432 	drm_mode_config_init(adev_to_drm(adev));
3433 
3434 	r = amdgpu_device_ip_init(adev);
3435 	if (r) {
3436 		/* failed in exclusive mode due to timeout */
3437 		if (amdgpu_sriov_vf(adev) &&
3438 		    !amdgpu_sriov_runtime(adev) &&
3439 		    amdgpu_virt_mmio_blocked(adev) &&
3440 		    !amdgpu_virt_wait_reset(adev)) {
3441 			dev_err(adev->dev, "VF exclusive mode timeout\n");
3442 			/* Don't send request since VF is inactive. */
3443 			adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3444 			adev->virt.ops = NULL;
3445 			r = -EAGAIN;
3446 			goto failed;
3447 		}
3448 		dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3449 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3450 		goto failed;
3451 	}
3452 
3453 	dev_info(adev->dev,
3454 		"SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3455 			adev->gfx.config.max_shader_engines,
3456 			adev->gfx.config.max_sh_per_se,
3457 			adev->gfx.config.max_cu_per_sh,
3458 			adev->gfx.cu_info.number);
3459 
3460 	adev->accel_working = true;
3461 
3462 	amdgpu_vm_check_compute_bug(adev);
3463 
3464 	/* Initialize the buffer migration limit. */
3465 	if (amdgpu_moverate >= 0)
3466 		max_MBps = amdgpu_moverate;
3467 	else
3468 		max_MBps = 8; /* Allow 8 MB/s. */
3469 	/* Get a log2 for easy divisions. */
3470 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
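	/* Keeping the limit as a log2 presumably lets the CS migration
	 * throttling convert MB/s budgets into byte counts with shifts
	 * instead of 64-bit divisions in the submission hot path.
	 */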
3471 
3472 	amdgpu_fbdev_init(adev);
3473 
3474 	r = amdgpu_pm_sysfs_init(adev);
3475 	if (r) {
3476 		adev->pm_sysfs_en = false;
3477 		DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3478 	} else
3479 		adev->pm_sysfs_en = true;
3480 
3481 	r = amdgpu_ucode_sysfs_init(adev);
3482 	if (r) {
3483 		adev->ucode_sysfs_en = false;
3484 		DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3485 	} else
3486 		adev->ucode_sysfs_en = true;
3487 
3488 	if ((amdgpu_testing & 1)) {
3489 		if (adev->accel_working)
3490 			amdgpu_test_moves(adev);
3491 		else
3492 			DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3493 	}
3494 	if (amdgpu_benchmarking) {
3495 		if (adev->accel_working)
3496 			amdgpu_benchmark(adev, amdgpu_benchmarking);
3497 		else
3498 			DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3499 	}
3500 
3501 	/*
3502 	 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3503 	 * Otherwise the mgpu fan boost feature will be skipped because the
3504 	 * gpu instance count would be too low.
3505 	 */
3506 	amdgpu_register_gpu_instance(adev);
3507 
3508 	/* enable clockgating, etc. after ib tests, etc. since some blocks require
3509 	 * explicit gating rather than handling it automatically.
3510 	 */
3511 	r = amdgpu_device_ip_late_init(adev);
3512 	if (r) {
3513 		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3514 		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3515 		goto failed;
3516 	}
3517 
3518 	/* must succeed. */
3519 	amdgpu_ras_resume(adev);
3520 
3521 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3522 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3523 
3524 	if (amdgpu_sriov_vf(adev))
3525 		flush_delayed_work(&adev->delayed_init_work);
3526 
3527 	r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3528 	if (r)
3529 		dev_err(adev->dev, "Could not create amdgpu device attr\n");
3530 
3531 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3532 		r = amdgpu_pmu_init(adev);
3533 	if (IS_ENABLED(CONFIG_PERF_EVENTS) && r)
3534 		dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3535 
3536 	/* Keep the stored PCI config space at hand for restore in case of a sudden PCI error */
3537 	if (amdgpu_device_cache_pci_state(adev->pdev))
3538 		pci_restore_state(pdev);
3539 
3540 	return 0;
3541 
3542 failed:
3543 	amdgpu_vf_error_trans_all(adev);
3544 	if (boco)
3545 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3546 
3547 	return r;
3548 }
3549 
3550 /**
3551  * amdgpu_device_fini - tear down the driver
3552  *
3553  * @adev: amdgpu_device pointer
3554  *
3555  * Tear down the driver info (all asics).
3556  * Called at driver shutdown.
3557  */
3558 void amdgpu_device_fini(struct amdgpu_device *adev)
3559 {
3560 	dev_info(adev->dev, "amdgpu: finishing device.\n");
3561 	flush_delayed_work(&adev->delayed_init_work);
3562 	ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3563 	adev->shutdown = true;
3564 
3565 	kfree(adev->pci_state);
3566 
3567 	/* make sure IB test finished before entering exclusive mode
3568 	 * to avoid preemption on IB test
3569 	 */
3570 	if (amdgpu_sriov_vf(adev)) {
3571 		amdgpu_virt_request_full_gpu(adev, false);
3572 		amdgpu_virt_fini_data_exchange(adev);
3573 	}
3574 
3575 	/* disable all interrupts */
3576 	amdgpu_irq_disable_all(adev);
3577 	if (adev->mode_info.mode_config_initialized){
3578 		if (!amdgpu_device_has_dc_support(adev))
3579 			drm_helper_force_disable_all(adev_to_drm(adev));
3580 		else
3581 			drm_atomic_helper_shutdown(adev_to_drm(adev));
3582 	}
3583 	amdgpu_fence_driver_fini(adev);
3584 	if (adev->pm_sysfs_en)
3585 		amdgpu_pm_sysfs_fini(adev);
3586 	amdgpu_fbdev_fini(adev);
3587 	amdgpu_device_ip_fini(adev);
3588 	release_firmware(adev->firmware.gpu_info_fw);
3589 	adev->firmware.gpu_info_fw = NULL;
3590 	adev->accel_working = false;
3591 	/* free i2c buses */
3592 	if (!amdgpu_device_has_dc_support(adev))
3593 		amdgpu_i2c_fini(adev);
3594 
3595 	if (amdgpu_emu_mode != 1)
3596 		amdgpu_atombios_fini(adev);
3597 
3598 	kfree(adev->bios);
3599 	adev->bios = NULL;
3600 	if (amdgpu_has_atpx() &&
3601 	    (amdgpu_is_atpx_hybrid() ||
3602 	     amdgpu_has_atpx_dgpu_power_cntl()) &&
3603 	    !pci_is_thunderbolt_attached(adev->pdev))
3604 		vga_switcheroo_unregister_client(adev->pdev);
3605 	if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3606 		vga_switcheroo_fini_domain_pm_ops(adev->dev);
3607 	vga_client_register(adev->pdev, NULL, NULL, NULL);
3608 	if (adev->rio_mem)
3609 		pci_iounmap(adev->pdev, adev->rio_mem);
3610 	adev->rio_mem = NULL;
3611 	iounmap(adev->rmmio);
3612 	adev->rmmio = NULL;
3613 	amdgpu_device_doorbell_fini(adev);
3614 
3615 	if (adev->ucode_sysfs_en)
3616 		amdgpu_ucode_sysfs_fini(adev);
3617 
3618 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3619 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
3620 		amdgpu_pmu_fini(adev);
3621 	if (adev->mman.discovery_bin)
3622 		amdgpu_discovery_fini(adev);
3623 }
3624 
3625 
3626 /*
3627  * Suspend & resume.
3628  */
3629 /**
3630  * amdgpu_device_suspend - initiate device suspend
3631  *
3632  * @dev: drm dev pointer
3633  * @fbcon: notify the fbdev of suspend
3634  *
3635  * Puts the hw in the suspend state (all asics).
3636  * Returns 0 for success or an error on failure.
3637  * Called at driver suspend.
3638  */
3639 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3640 {
3641 	struct amdgpu_device *adev;
3642 	struct drm_crtc *crtc;
3643 	struct drm_connector *connector;
3644 	struct drm_connector_list_iter iter;
3645 	int r;
3646 
3647 	adev = drm_to_adev(dev);
3648 
3649 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3650 		return 0;
3651 
3652 	adev->in_suspend = true;
3653 	drm_kms_helper_poll_disable(dev);
3654 
3655 	if (fbcon)
3656 		amdgpu_fbdev_set_suspend(adev, 1);
3657 
3658 	cancel_delayed_work_sync(&adev->delayed_init_work);
3659 
3660 	if (!amdgpu_device_has_dc_support(adev)) {
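		/* Legacy (non-DC) display path: turn the displays off and
		 * unpin scanout/cursor BOs by hand; with DC this is handled
		 * by the display IP block's own suspend hooks.
		 */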
3661 		/* turn off display hw */
3662 		drm_modeset_lock_all(dev);
3663 		drm_connector_list_iter_begin(dev, &iter);
3664 		drm_for_each_connector_iter(connector, &iter)
3665 			drm_helper_connector_dpms(connector,
3666 						  DRM_MODE_DPMS_OFF);
3667 		drm_connector_list_iter_end(&iter);
3668 		drm_modeset_unlock_all(dev);
3669 		/* unpin the front buffers and cursors */
3670 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3671 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3672 			struct drm_framebuffer *fb = crtc->primary->fb;
3673 			struct amdgpu_bo *robj;
3674 
3675 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3676 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3677 				r = amdgpu_bo_reserve(aobj, true);
3678 				if (r == 0) {
3679 					amdgpu_bo_unpin(aobj);
3680 					amdgpu_bo_unreserve(aobj);
3681 				}
3682 			}
3683 
3684 			if (fb == NULL || fb->obj[0] == NULL) {
3685 				continue;
3686 			}
3687 			robj = gem_to_amdgpu_bo(fb->obj[0]);
3688 			/* don't unpin kernel fb objects */
3689 			if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3690 				r = amdgpu_bo_reserve(robj, true);
3691 				if (r == 0) {
3692 					amdgpu_bo_unpin(robj);
3693 					amdgpu_bo_unreserve(robj);
3694 				}
3695 			}
3696 		}
3697 	}
3698 
3699 	amdgpu_ras_suspend(adev);
3700 
3701 	r = amdgpu_device_ip_suspend_phase1(adev);
3702 
3703 	amdgpu_amdkfd_suspend(adev, !fbcon);
3704 
3705 	/* evict vram memory */
3706 	amdgpu_bo_evict_vram(adev);
3707 
3708 	amdgpu_fence_driver_suspend(adev);
3709 
3710 	r = amdgpu_device_ip_suspend_phase2(adev);
3711 
3712 	/* evict remaining vram memory
3713 	 * This second call to evict vram is to evict the gart page table
3714 	 * using the CPU.
3715 	 */
3716 	amdgpu_bo_evict_vram(adev);
3717 
3718 	return 0;
3719 }
3720 
3721 /**
3722  * amdgpu_device_resume - initiate device resume
3723  *
3724  * @dev: drm dev pointer
3725  * @fbcon: notify the fbdev of resume
3726  *
3727  * Bring the hw back to operating state (all asics).
3728  * Returns 0 for success or an error on failure.
3729  * Called at driver resume.
3730  */
3731 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3732 {
3733 	struct drm_connector *connector;
3734 	struct drm_connector_list_iter iter;
3735 	struct amdgpu_device *adev = drm_to_adev(dev);
3736 	struct drm_crtc *crtc;
3737 	int r = 0;
3738 
3739 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3740 		return 0;
3741 
3742 	/* post card */
3743 	if (amdgpu_device_need_post(adev)) {
3744 		r = amdgpu_device_asic_init(adev);
3745 		if (r)
3746 			dev_err(adev->dev, "amdgpu asic init failed\n");
3747 	}
3748 
3749 	r = amdgpu_device_ip_resume(adev);
3750 	if (r) {
3751 		dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3752 		return r;
3753 	}
3754 	amdgpu_fence_driver_resume(adev);
3755 
3756 
3757 	r = amdgpu_device_ip_late_init(adev);
3758 	if (r)
3759 		return r;
3760 
3761 	queue_delayed_work(system_wq, &adev->delayed_init_work,
3762 			   msecs_to_jiffies(AMDGPU_RESUME_MS));
3763 
3764 	if (!amdgpu_device_has_dc_support(adev)) {
3765 		/* pin cursors */
3766 		list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3767 			struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3768 
3769 			if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3770 				struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3771 				r = amdgpu_bo_reserve(aobj, true);
3772 				if (r == 0) {
3773 					r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3774 					if (r != 0)
3775 						dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3776 					amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3777 					amdgpu_bo_unreserve(aobj);
3778 				}
3779 			}
3780 		}
3781 	}
3782 	r = amdgpu_amdkfd_resume(adev, !fbcon);
3783 	if (r)
3784 		return r;
3785 
3786 	/* Make sure IB tests flushed */
3787 	flush_delayed_work(&adev->delayed_init_work);
3788 
3789 	/* blat the mode back in */
3790 	if (fbcon) {
3791 		if (!amdgpu_device_has_dc_support(adev)) {
3792 			/* pre DCE11 */
3793 			drm_helper_resume_force_mode(dev);
3794 
3795 			/* turn on display hw */
3796 			drm_modeset_lock_all(dev);
3797 
3798 			drm_connector_list_iter_begin(dev, &iter);
3799 			drm_for_each_connector_iter(connector, &iter)
3800 				drm_helper_connector_dpms(connector,
3801 							  DRM_MODE_DPMS_ON);
3802 			drm_connector_list_iter_end(&iter);
3803 
3804 			drm_modeset_unlock_all(dev);
3805 		}
3806 		amdgpu_fbdev_set_suspend(adev, 0);
3807 	}
3808 
3809 	drm_kms_helper_poll_enable(dev);
3810 
3811 	amdgpu_ras_resume(adev);
3812 
3813 	/*
3814 	 * Most of the connector probing functions try to acquire runtime pm
3815 	 * refs to ensure that the GPU is powered on when connector polling is
3816 	 * performed. Since we're calling this from a runtime PM callback,
3817 	 * trying to acquire rpm refs will cause us to deadlock.
3818 	 *
3819 	 * Since we're guaranteed to be holding the rpm lock, it's safe to
3820 	 * temporarily disable the rpm helpers so this doesn't deadlock us.
3821 	 */
3822 #ifdef CONFIG_PM
3823 	dev->dev->power.disable_depth++;
3824 #endif
3825 	if (!amdgpu_device_has_dc_support(adev))
3826 		drm_helper_hpd_irq_event(dev);
3827 	else
3828 		drm_kms_helper_hotplug_event(dev);
3829 #ifdef CONFIG_PM
3830 	dev->dev->power.disable_depth--;
3831 #endif
3832 	adev->in_suspend = false;
3833 
3834 	return 0;
3835 }
3836 
3837 /**
3838  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3839  *
3840  * @adev: amdgpu_device pointer
3841  *
3842  * The list of all the hardware IPs that make up the asic is walked and
3843  * the check_soft_reset callbacks are run.  check_soft_reset determines
3844  * if the asic is still hung or not.
3845  * Returns true if any of the IPs are still in a hung state, false if not.
3846  */
3847 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3848 {
3849 	int i;
3850 	bool asic_hang = false;
3851 
3852 	if (amdgpu_sriov_vf(adev))
3853 		return true;
3854 
3855 	if (amdgpu_asic_need_full_reset(adev))
3856 		return true;
3857 
3858 	for (i = 0; i < adev->num_ip_blocks; i++) {
3859 		if (!adev->ip_blocks[i].status.valid)
3860 			continue;
3861 		if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3862 			adev->ip_blocks[i].status.hang =
3863 				adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3864 		if (adev->ip_blocks[i].status.hang) {
3865 			dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3866 			asic_hang = true;
3867 		}
3868 	}
3869 	return asic_hang;
3870 }
3871 
3872 /**
3873  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3874  *
3875  * @adev: amdgpu_device pointer
3876  *
3877  * The list of all the hardware IPs that make up the asic is walked and the
3878  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3879  * handles any IP specific hardware or software state changes that are
3880  * necessary for a soft reset to succeed.
3881  * Returns 0 on success, negative error code on failure.
3882  */
3883 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3884 {
3885 	int i, r = 0;
3886 
3887 	for (i = 0; i < adev->num_ip_blocks; i++) {
3888 		if (!adev->ip_blocks[i].status.valid)
3889 			continue;
3890 		if (adev->ip_blocks[i].status.hang &&
3891 		    adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3892 			r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3893 			if (r)
3894 				return r;
3895 		}
3896 	}
3897 
3898 	return 0;
3899 }
3900 
3901 /**
3902  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3903  *
3904  * @adev: amdgpu_device pointer
3905  *
3906  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3907  * reset is necessary to recover.
3908  * Returns true if a full asic reset is required, false if not.
3909  */
3910 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3911 {
3912 	int i;
3913 
3914 	if (amdgpu_asic_need_full_reset(adev))
3915 		return true;
3916 
3917 	for (i = 0; i < adev->num_ip_blocks; i++) {
3918 		if (!adev->ip_blocks[i].status.valid)
3919 			continue;
3920 		if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3921 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3922 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3923 		    (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3924 		     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3925 			if (adev->ip_blocks[i].status.hang) {
3926 				dev_info(adev->dev, "Some block need full reset!\n");
3927 				return true;
3928 			}
3929 		}
3930 	}
3931 	return false;
3932 }
3933 
3934 /**
3935  * amdgpu_device_ip_soft_reset - do a soft reset
3936  *
3937  * @adev: amdgpu_device pointer
3938  *
3939  * The list of all the hardware IPs that make up the asic is walked and the
3940  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3941  * IP specific hardware or software state changes that are necessary to soft
3942  * reset the IP.
3943  * Returns 0 on success, negative error code on failure.
3944  */
3945 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3946 {
3947 	int i, r = 0;
3948 
3949 	for (i = 0; i < adev->num_ip_blocks; i++) {
3950 		if (!adev->ip_blocks[i].status.valid)
3951 			continue;
3952 		if (adev->ip_blocks[i].status.hang &&
3953 		    adev->ip_blocks[i].version->funcs->soft_reset) {
3954 			r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3955 			if (r)
3956 				return r;
3957 		}
3958 	}
3959 
3960 	return 0;
3961 }
3962 
3963 /**
3964  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3965  *
3966  * @adev: amdgpu_device pointer
3967  *
3968  * The list of all the hardware IPs that make up the asic is walked and the
3969  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3970  * handles any IP specific hardware or software state changes that are
3971  * necessary after the IP has been soft reset.
3972  * Returns 0 on success, negative error code on failure.
3973  */
3974 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3975 {
3976 	int i, r = 0;
3977 
3978 	for (i = 0; i < adev->num_ip_blocks; i++) {
3979 		if (!adev->ip_blocks[i].status.valid)
3980 			continue;
3981 		if (adev->ip_blocks[i].status.hang &&
3982 		    adev->ip_blocks[i].version->funcs->post_soft_reset)
3983 			r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3984 		if (r)
3985 			return r;
3986 	}
3987 
3988 	return 0;
3989 }
3990 
3991 /**
3992  * amdgpu_device_recover_vram - Recover some VRAM contents
3993  *
3994  * @adev: amdgpu_device pointer
3995  *
3996  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3997  * restore things like GPUVM page tables after a GPU reset where
3998  * the contents of VRAM might be lost.
3999  *
4000  * Returns:
4001  * 0 on success, negative error code on failure.
4002  */
4003 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4004 {
4005 	struct dma_fence *fence = NULL, *next = NULL;
4006 	struct amdgpu_bo *shadow;
4007 	long r = 1, tmo;
4008 
4009 	if (amdgpu_sriov_runtime(adev))
4010 		tmo = msecs_to_jiffies(8000);
4011 	else
4012 		tmo = msecs_to_jiffies(100);
4013 
4014 	dev_info(adev->dev, "recover vram bo from shadow start\n");
4015 	mutex_lock(&adev->shadow_list_lock);
4016 	list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4017 
4018 		/* No need to recover an evicted BO */
4019 		if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4020 		    shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4021 		    shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4022 			continue;
4023 
4024 		r = amdgpu_bo_restore_shadow(shadow, &next);
4025 		if (r)
4026 			break;
4027 
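		/* Pipeline the copies: wait on the fence of the previous
		 * restore while the one just issued is still in flight, so
		 * successive shadow restores overlap instead of serializing.
		 */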
4028 		if (fence) {
4029 			tmo = dma_fence_wait_timeout(fence, false, tmo);
4030 			dma_fence_put(fence);
4031 			fence = next;
4032 			if (tmo == 0) {
4033 				r = -ETIMEDOUT;
4034 				break;
4035 			} else if (tmo < 0) {
4036 				r = tmo;
4037 				break;
4038 			}
4039 		} else {
4040 			fence = next;
4041 		}
4042 	}
4043 	mutex_unlock(&adev->shadow_list_lock);
4044 
4045 	if (fence)
4046 		tmo = dma_fence_wait_timeout(fence, false, tmo);
4047 	dma_fence_put(fence);
4048 
4049 	if (r < 0 || tmo <= 0) {
4050 		dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4051 		return -EIO;
4052 	}
4053 
4054 	dev_info(adev->dev, "recover vram bo from shadow done\n");
4055 	return 0;
4056 }
4057 
4058 
4059 /**
4060  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4061  *
4062  * @adev: amdgpu_device pointer
4063  * @from_hypervisor: request from hypervisor
4064  *
4065  * Do a VF FLR and reinitialize the ASIC.
4066  * Returns 0 on success, negative error code on failure.
4067  */
4068 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4069 				     bool from_hypervisor)
4070 {
4071 	int r;
4072 
4073 	if (from_hypervisor)
4074 		r = amdgpu_virt_request_full_gpu(adev, true);
4075 	else
4076 		r = amdgpu_virt_reset_gpu(adev);
4077 	if (r)
4078 		return r;
4079 
4080 	amdgpu_amdkfd_pre_reset(adev);
4081 
4082 	/* Resume IP prior to SMC */
4083 	r = amdgpu_device_ip_reinit_early_sriov(adev);
4084 	if (r)
4085 		goto error;
4086 
4087 	amdgpu_virt_init_data_exchange(adev);
4088 	/* we need to recover the gart prior to resuming SMC/CP/SDMA */
4089 	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4090 
4091 	r = amdgpu_device_fw_loading(adev);
4092 	if (r)
4093 		return r;
4094 
4095 	/* now we are okay to resume SMC/CP/SDMA */
4096 	r = amdgpu_device_ip_reinit_late_sriov(adev);
4097 	if (r)
4098 		goto error;
4099 
4100 	amdgpu_irq_gpu_reset_resume_helper(adev);
4101 	r = amdgpu_ib_ring_tests(adev);
4102 	amdgpu_amdkfd_post_reset(adev);
4103 
4104 error:
4105 	amdgpu_virt_release_full_gpu(adev, true);
4106 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4107 		amdgpu_inc_vram_lost(adev);
4108 		r = amdgpu_device_recover_vram(adev);
4109 	}
4110 
4111 	return r;
4112 }
4113 
4114 /**
4115  * amdgpu_device_has_job_running - check if there is any job in mirror list
4116  *
4117  * @adev: amdgpu_device pointer
4118  *
4119  * check if there is any job in mirror list
4120  */
4121 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4122 {
4123 	int i;
4124 	struct drm_sched_job *job;
4125 
4126 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4127 		struct amdgpu_ring *ring = adev->rings[i];
4128 
4129 		if (!ring || !ring->sched.thread)
4130 			continue;
4131 
4132 		spin_lock(&ring->sched.job_list_lock);
4133 		job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4134 				struct drm_sched_job, node);
4135 		spin_unlock(&ring->sched.job_list_lock);
4136 		if (job)
4137 			return true;
4138 	}
4139 	return false;
4140 }
4141 
4142 /**
4143  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4144  *
4145  * @adev: amdgpu_device pointer
4146  *
4147  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4148  * a hung GPU.
4149  */
4150 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4151 {
4152 	if (!amdgpu_device_ip_check_soft_reset(adev)) {
4153 		dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4154 		return false;
4155 	}
4156 
4157 	if (amdgpu_gpu_recovery == 0)
4158 		goto disabled;
4159 
4160 	if (amdgpu_sriov_vf(adev))
4161 		return true;
4162 
4163 	if (amdgpu_gpu_recovery == -1) {
4164 		switch (adev->asic_type) {
4165 		case CHIP_BONAIRE:
4166 		case CHIP_HAWAII:
4167 		case CHIP_TOPAZ:
4168 		case CHIP_TONGA:
4169 		case CHIP_FIJI:
4170 		case CHIP_POLARIS10:
4171 		case CHIP_POLARIS11:
4172 		case CHIP_POLARIS12:
4173 		case CHIP_VEGAM:
4174 		case CHIP_VEGA20:
4175 		case CHIP_VEGA10:
4176 		case CHIP_VEGA12:
4177 		case CHIP_RAVEN:
4178 		case CHIP_ARCTURUS:
4179 		case CHIP_RENOIR:
4180 		case CHIP_NAVI10:
4181 		case CHIP_NAVI14:
4182 		case CHIP_NAVI12:
4183 		case CHIP_SIENNA_CICHLID:
4184 			break;
4185 		default:
4186 			goto disabled;
4187 		}
4188 	}
4189 
4190 	return true;
4191 
4192 disabled:
4193 		dev_info(adev->dev, "GPU recovery disabled.\n");
4194 		return false;
4195 }
4196 
4197 
4198 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4199 					struct amdgpu_job *job,
4200 					bool *need_full_reset_arg)
4201 {
4202 	int i, r = 0;
4203 	bool need_full_reset  = *need_full_reset_arg;
4204 
4205 	amdgpu_debugfs_wait_dump(adev);
4206 
4207 	if (amdgpu_sriov_vf(adev)) {
4208 		/* stop the data exchange thread */
4209 		amdgpu_virt_fini_data_exchange(adev);
4210 	}
4211 
4212 	/* block all schedulers and reset given job's ring */
4213 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4214 		struct amdgpu_ring *ring = adev->rings[i];
4215 
4216 		if (!ring || !ring->sched.thread)
4217 			continue;
4218 
4219 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4220 		amdgpu_fence_driver_force_completion(ring);
4221 	}
4222 
4223 	if (job)
4224 		drm_sched_increase_karma(&job->base);
4225 
4226 	/* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4227 	if (!amdgpu_sriov_vf(adev)) {
4228 
4229 		if (!need_full_reset)
4230 			need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4231 
4232 		if (!need_full_reset) {
4233 			amdgpu_device_ip_pre_soft_reset(adev);
4234 			r = amdgpu_device_ip_soft_reset(adev);
4235 			amdgpu_device_ip_post_soft_reset(adev);
4236 			if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4237 				dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4238 				need_full_reset = true;
4239 			}
4240 		}
4241 
4242 		if (need_full_reset)
4243 			r = amdgpu_device_ip_suspend(adev);
4244 
4245 		*need_full_reset_arg = need_full_reset;
4246 	}
4247 
4248 	return r;
4249 }
4250 
4251 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4252 			       struct list_head *device_list_handle,
4253 			       bool *need_full_reset_arg,
4254 			       bool skip_hw_reset)
4255 {
4256 	struct amdgpu_device *tmp_adev = NULL;
4257 	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4258 	int r = 0;
4259 
4260 	/*
4261 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4262 	 * to allow proper links negotiation in FW (within 1 sec)
4263 	 */
4264 	if (!skip_hw_reset && need_full_reset) {
4265 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4266 			/* For XGMI run all resets in parallel to speed up the process */
4267 			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4268 				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4269 					r = -EALREADY;
4270 			} else
4271 				r = amdgpu_asic_reset(tmp_adev);
4272 
4273 			if (r) {
4274 				dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4275 					 r, adev_to_drm(tmp_adev)->unique);
4276 				break;
4277 			}
4278 		}
4279 
4280 		/* For XGMI wait for all resets to complete before proceed */
4281 		if (!r) {
4282 			list_for_each_entry(tmp_adev, device_list_handle,
4283 					    gmc.xgmi.head) {
4284 				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4285 					flush_work(&tmp_adev->xgmi_reset_work);
4286 					r = tmp_adev->asic_reset_res;
4287 					if (r)
4288 						break;
4289 				}
4290 			}
4291 		}
4292 	}
4293 
4294 	if (!r && amdgpu_ras_intr_triggered()) {
4295 		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4296 			if (tmp_adev->mmhub.funcs &&
4297 			    tmp_adev->mmhub.funcs->reset_ras_error_count)
4298 				tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4299 		}
4300 
4301 		amdgpu_ras_intr_cleared();
4302 	}
4303 
4304 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4305 		if (need_full_reset) {
4306 			/* post card */
4307 			if (amdgpu_device_asic_init(tmp_adev))
4308 				dev_warn(tmp_adev->dev, "asic atom init failed!");
4309 
4310 			if (!r) {
4311 				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4312 				r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4313 				if (r)
4314 					goto out;
4315 
4316 				r = amdgpu_device_ip_resume_phase1(tmp_adev);
4317 				if (r)
4318 					goto out;
4319 
4320 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4321 				if (vram_lost) {
4322 					DRM_INFO("VRAM is lost due to GPU reset!\n");
4323 					amdgpu_inc_vram_lost(tmp_adev);
4324 				}
4325 
4326 				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4327 				if (r)
4328 					goto out;
4329 
4330 				r = amdgpu_device_fw_loading(tmp_adev);
4331 				if (r)
4332 					return r;
4333 
4334 				r = amdgpu_device_ip_resume_phase2(tmp_adev);
4335 				if (r)
4336 					goto out;
4337 
4338 				if (vram_lost)
4339 					amdgpu_device_fill_reset_magic(tmp_adev);
4340 
4341 				/*
4342 				 * Add this ASIC back as tracked since the reset
4343 				 * already completed successfully.
4344 				 */
4345 				amdgpu_register_gpu_instance(tmp_adev);
4346 
4347 				r = amdgpu_device_ip_late_init(tmp_adev);
4348 				if (r)
4349 					goto out;
4350 
4351 				amdgpu_fbdev_set_suspend(tmp_adev, 0);
4352 
4353 				/*
4354 				 * The GPU enters a bad state once the number of
4355 				 * faulty pages reported by ECC reaches the
4356 				 * threshold, and RAS recovery is scheduled next.
4357 				 * So check here to abort recovery if the bad
4358 				 * page threshold has indeed been exceeded, and
4359 				 * remind the user to either retire this GPU or
4360 				 * set a bigger bad_page_threshold value the next
4361 				 * time the driver is probed.
4362 				 */
4363 				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4364 					/* must succeed. */
4365 					amdgpu_ras_resume(tmp_adev);
4366 				} else {
4367 					r = -EINVAL;
4368 					goto out;
4369 				}
4370 
4371 				/* Update PSP FW topology after reset */
4372 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4373 					r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4374 			}
4375 		}
4376 
4377 out:
4378 		if (!r) {
4379 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4380 			r = amdgpu_ib_ring_tests(tmp_adev);
4381 			if (r) {
4382 				dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4383 				need_full_reset = true;
4384 				r = -EAGAIN;
4385 				goto end;
4386 			}
4387 		}
4388 
4389 		if (!r)
4390 			r = amdgpu_device_recover_vram(tmp_adev);
4391 		else
4392 			tmp_adev->asic_reset_res = r;
4393 	}
4394 
4395 end:
4396 	*need_full_reset_arg = need_full_reset;
4397 	return r;
4398 }
4399 
4400 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4401 				struct amdgpu_hive_info *hive)
4402 {
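	/* Try-lock semantics: the cmpxchg only succeeds if no reset is
	 * already in flight for this device; nesting reset_sem under the
	 * hive lock documents the lock ordering for lockdep during XGMI
	 * hive-wide resets.
	 */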
4403 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4404 		return false;
4405 
4406 	if (hive) {
4407 		down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4408 	} else {
4409 		down_write(&adev->reset_sem);
4410 	}
4411 
4412 	atomic_inc(&adev->gpu_reset_counter);
4413 	switch (amdgpu_asic_reset_method(adev)) {
4414 	case AMD_RESET_METHOD_MODE1:
4415 		adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4416 		break;
4417 	case AMD_RESET_METHOD_MODE2:
4418 		adev->mp1_state = PP_MP1_STATE_RESET;
4419 		break;
4420 	default:
4421 		adev->mp1_state = PP_MP1_STATE_NONE;
4422 		break;
4423 	}
4424 
4425 	return true;
4426 }
4427 
4428 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4429 {
4430 	amdgpu_vf_error_trans_all(adev);
4431 	adev->mp1_state = PP_MP1_STATE_NONE;
4432 	atomic_set(&adev->in_gpu_reset, 0);
4433 	up_write(&adev->reset_sem);
4434 }
4435 
4436 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4437 {
4438 	struct pci_dev *p = NULL;
4439 
4440 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4441 			adev->pdev->bus->number, 1);
4442 	if (p) {
4443 		pm_runtime_enable(&(p->dev));
4444 		pm_runtime_resume(&(p->dev));
4445 	}
4446 
4447 	pci_dev_put(p);
4448 }
4449 
4450 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4451 {
4452 	enum amd_reset_method reset_method;
4453 	struct pci_dev *p = NULL;
4454 	u64 expires;
4455 
4456 	/*
4457 	 * For now, only BACO and mode1 reset are confirmed
4458 	 * to suffer from the audio issue if not properly suspended.
4459 	 */
4460 	reset_method = amdgpu_asic_reset_method(adev);
4461 	if ((reset_method != AMD_RESET_METHOD_BACO) &&
4462 	     (reset_method != AMD_RESET_METHOD_MODE1))
4463 		return -EINVAL;
4464 
4465 	p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4466 			adev->pdev->bus->number, 1);
4467 	if (!p)
4468 		return -ENODEV;
4469 
4470 	expires = pm_runtime_autosuspend_expiration(&(p->dev));
4471 	if (!expires)
4472 		/*
4473 		 * If we cannot get the audio device autosuspend delay,
4474 		 * a fixed 4s interval will be used. Since 3s is the audio
4475 		 * controller's default autosuspend delay, the 4s used here
4476 		 * is guaranteed to cover it.
4477 		 */
4478 		expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4479 
4480 	while (!pm_runtime_status_suspended(&(p->dev))) {
4481 		if (!pm_runtime_suspend(&(p->dev)))
4482 			break;
4483 
4484 		if (expires < ktime_get_mono_fast_ns()) {
4485 			dev_warn(adev->dev, "failed to suspend display audio\n");
4486 			pci_dev_put(p);
4487 			/* TODO: abort the succeeding gpu reset? */
4488 			return -ETIMEDOUT;
4489 		}
4490 	}
4491 
4492 	pm_runtime_disable(&(p->dev));
4493 
4494 	pci_dev_put(p);
4495 	return 0;
4496 }
4497 
4498 /**
4499  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4500  *
4501  * @adev: amdgpu_device pointer
4502  * @job: which job triggered the hang
4503  *
4504  * Attempt to reset the GPU if it has hung (all asics).
4505  * Attempt to do a soft reset or full reset and reinitialize the ASIC.
4506  * Returns 0 for success or an error on failure.
4507  */
4508 
4509 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4510 			      struct amdgpu_job *job)
4511 {
4512 	struct list_head device_list, *device_list_handle =  NULL;
4513 	bool need_full_reset = false;
4514 	bool job_signaled = false;
4515 	struct amdgpu_hive_info *hive = NULL;
4516 	struct amdgpu_device *tmp_adev = NULL;
4517 	int i, r = 0;
4518 	bool need_emergency_restart = false;
4519 	bool audio_suspended = false;
4520 
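	/* This is the common entry point for timeout-driven recovery; it is
	 * typically reached from, e.g., the job timeout handler or the RAS
	 * recovery work (the exact set of callers varies by kernel version).
	 */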
4521 	/*
4522 	 * Special case: RAS triggered and full reset isn't supported
4523 	 */
4524 	need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4525 
4526 	/*
4527 	 * Flush RAM to disk so that after reboot
4528 	 * the user can read log and see why the system rebooted.
4529 	 */
4530 	if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4531 		DRM_WARN("Emergency reboot.");
4532 
4533 		ksys_sync_helper();
4534 		emergency_restart();
4535 	}
4536 
4537 	dev_info(adev->dev, "GPU %s begin!\n",
4538 		need_emergency_restart ? "jobs stop":"reset");
4539 
4540 	/*
4541 	 * Here we trylock to avoid a chain of resets executing from
4542 	 * either a trigger by jobs on different adevs in an XGMI hive or jobs on
4543 	 * different schedulers for the same device while this TO handler is running.
4544 	 * We always reset all schedulers for a device and all devices in an XGMI
4545 	 * hive, so that should take care of them too.
4546 	 */
4547 	hive = amdgpu_get_xgmi_hive(adev);
4548 	if (hive) {
4549 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4550 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4551 				job ? job->base.id : -1, hive->hive_id);
4552 			amdgpu_put_xgmi_hive(hive);
4553 			return 0;
4554 		}
4555 		mutex_lock(&hive->hive_lock);
4556 	}
4557 
4558 	/*
4559 	 * Build list of devices to reset.
4560 	 * In case we are in XGMI hive mode, resort the device list
4561 	 * to put adev in the 1st position.
4562 	 */
4563 	INIT_LIST_HEAD(&device_list);
4564 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4565 		if (!hive)
4566 			return -ENODEV;
4567 		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4568 			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4569 		device_list_handle = &hive->device_list;
4570 	} else {
4571 		list_add_tail(&adev->gmc.xgmi.head, &device_list);
4572 		device_list_handle = &device_list;
4573 	}
4574 
4575 	/* block all schedulers and reset given job's ring */
4576 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4577 		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4578 			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4579 				  job ? job->base.id : -1);
4580 			r = 0;
4581 			goto skip_recovery;
4582 		}
4583 
4584 		/*
4585 		 * Try to put the audio codec into suspend state
4586 		 * before the gpu reset starts.
4587 		 *
4588 		 * The power domain of the graphics device is shared
4589 		 * with the AZ (audio) power domain. Without this,
4590 		 * we may change the audio hardware from behind
4591 		 * the audio driver's back, which would trigger
4592 		 * audio codec errors.
4593 		 */
4594 		if (!amdgpu_device_suspend_display_audio(tmp_adev))
4595 			audio_suspended = true;
4596 
4597 		amdgpu_ras_set_error_query_ready(tmp_adev, false);
4598 
4599 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4600 
4601 		if (!amdgpu_sriov_vf(tmp_adev))
4602 			amdgpu_amdkfd_pre_reset(tmp_adev);
4603 
4604 		/*
4605 		 * Mark these ASICs to be reset as untracked first,
4606 		 * and add them back after the reset completes.
4607 		 */
4608 		amdgpu_unregister_gpu_instance(tmp_adev);
4609 
4610 		amdgpu_fbdev_set_suspend(tmp_adev, 1);
4611 
4612 		/* disable ras on ALL IPs */
4613 		if (!need_emergency_restart &&
4614 		      amdgpu_device_ip_need_full_reset(tmp_adev))
4615 			amdgpu_ras_suspend(tmp_adev);
4616 
4617 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4618 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4619 
4620 			if (!ring || !ring->sched.thread)
4621 				continue;
4622 
4623 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4624 
4625 			if (need_emergency_restart)
4626 				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4627 		}
4628 	}
4629 
4630 	if (need_emergency_restart)
4631 		goto skip_sched_resume;
4632 
4633 	/*
4634 	 * Must check guilty signal here since after this point all old
4635 	 * HW fences are force signaled.
4636 	 *
4637 	 * job->base holds a reference to parent fence
4638 	 */
4639 	if (job && job->base.s_fence->parent &&
4640 	    dma_fence_is_signaled(job->base.s_fence->parent)) {
4641 		job_signaled = true;
4642 		dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4643 		goto skip_hw_reset;
4644 	}
4645 
4646 retry:	/* Rest of adevs pre asic reset from XGMI hive. */
4647 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4648 		r = amdgpu_device_pre_asic_reset(tmp_adev,
4649 						 (tmp_adev == adev) ? job : NULL,
4650 						 &need_full_reset);
4651 		/* TODO: should we stop? */
4652 		if (r) {
4653 			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4654 				  r, adev_to_drm(tmp_adev)->unique);
4655 			tmp_adev->asic_reset_res = r;
4656 		}
4657 	}
4658 
4659 	/* Actual ASIC resets if needed. */
4660 	/* TODO Implement XGMI hive reset logic for SRIOV */
4661 	if (amdgpu_sriov_vf(adev)) {
4662 		r = amdgpu_device_reset_sriov(adev, job ? false : true);
4663 		if (r)
4664 			adev->asic_reset_res = r;
4665 	} else {
4666 		r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4667 		if (r && r == -EAGAIN)
4668 			goto retry;
4669 	}
4670 
4671 skip_hw_reset:
4672 
4673 	/* Post ASIC reset for all devs. */
4674 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4675 
4676 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4677 			struct amdgpu_ring *ring = tmp_adev->rings[i];
4678 
4679 			if (!ring || !ring->sched.thread)
4680 				continue;
4681 
4682 			/* No point in resubmitting jobs if we didn't HW reset */
4683 			if (!tmp_adev->asic_reset_res && !job_signaled)
4684 				drm_sched_resubmit_jobs(&ring->sched);
4685 
4686 			drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4687 		}
4688 
4689 		if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4690 			drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4691 		}
4692 
4693 		tmp_adev->asic_reset_res = 0;
4694 
4695 		if (r) {
4696 			/* bad news, how to tell it to userspace ? */
4697 			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4698 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4699 		} else {
4700 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4701 		}
4702 	}
4703 
4704 skip_sched_resume:
4705 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4706 		/* unlock kfd: SRIOV would do it separately */
4707 		if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4708 			amdgpu_amdkfd_post_reset(tmp_adev);
4709 		if (audio_suspended)
4710 			amdgpu_device_resume_display_audio(tmp_adev);
4711 		amdgpu_device_unlock_adev(tmp_adev);
4712 	}
4713 
4714 skip_recovery:
4715 	if (hive) {
4716 		atomic_set(&hive->in_reset, 0);
4717 		mutex_unlock(&hive->hive_lock);
4718 		amdgpu_put_xgmi_hive(hive);
4719 	}
4720 
4721 	if (r)
4722 		dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4723 	return r;
4724 }
4725 
4726 /**
4727  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4728  *
4729  * @adev: amdgpu_device pointer
4730  *
4731  * Fetches and stores in the driver the PCIE capabilities (gen speed
4732  * and lanes) of the slot the device is in. Handles APUs and
4733  * virtualized environments where PCIE config space may not be available.
4734  */
4735 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4736 {
4737 	struct pci_dev *pdev;
4738 	enum pci_bus_speed speed_cap, platform_speed_cap;
4739 	enum pcie_link_width platform_link_width;
4740 
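	/* Module parameters take precedence over probed values; the gen and
	 * lane-width masks built here are later consumed by the power
	 * management code, e.g. to pick the allowed PCIe DPM levels.
	 */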
4741 	if (amdgpu_pcie_gen_cap)
4742 		adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4743 
4744 	if (amdgpu_pcie_lane_cap)
4745 		adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4746 
4747 	/* covers APUs as well */
4748 	if (pci_is_root_bus(adev->pdev->bus)) {
4749 		if (adev->pm.pcie_gen_mask == 0)
4750 			adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4751 		if (adev->pm.pcie_mlw_mask == 0)
4752 			adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4753 		return;
4754 	}
4755 
4756 	if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4757 		return;
4758 
4759 	pcie_bandwidth_available(adev->pdev, NULL,
4760 				 &platform_speed_cap, &platform_link_width);
4761 
4762 	if (adev->pm.pcie_gen_mask == 0) {
4763 		/* asic caps */
4764 		pdev = adev->pdev;
4765 		speed_cap = pcie_get_speed_cap(pdev);
4766 		if (speed_cap == PCI_SPEED_UNKNOWN) {
4767 			adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4768 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4769 						  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4770 		} else {
4771 			if (speed_cap == PCIE_SPEED_16_0GT)
4772 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4773 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4774 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4775 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4776 			else if (speed_cap == PCIE_SPEED_8_0GT)
4777 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4778 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4779 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4780 			else if (speed_cap == PCIE_SPEED_5_0GT)
4781 				adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4782 							  CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4783 			else
4784 				adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4785 		}
4786 		/* platform caps */
4787 		if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4788 			adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4789 						   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4790 		} else {
4791 			if (platform_speed_cap == PCIE_SPEED_16_0GT)
4792 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4793 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4794 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4795 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4796 			else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4797 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4798 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4799 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4800 			else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4801 				adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4802 							   CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4803 			else
4804 				adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4805 
4806 		}
4807 	}
4808 	if (adev->pm.pcie_mlw_mask == 0) {
4809 		if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4810 			adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4811 		} else {
4812 			switch (platform_link_width) {
4813 			case PCIE_LNK_X32:
4814 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4815 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4816 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4817 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4818 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4819 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4820 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4821 				break;
4822 			case PCIE_LNK_X16:
4823 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4824 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4825 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4826 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4827 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4828 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4829 				break;
4830 			case PCIE_LNK_X12:
4831 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4832 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4833 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4834 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4835 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4836 				break;
4837 			case PCIE_LNK_X8:
4838 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4839 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4840 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4841 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4842 				break;
4843 			case PCIE_LNK_X4:
4844 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4845 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4846 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4847 				break;
4848 			case PCIE_LNK_X2:
4849 				adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4850 							  CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4851 				break;
4852 			case PCIE_LNK_X1:
4853 				adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4854 				break;
4855 			default:
4856 				break;
4857 			}
4858 		}
4859 	}
4860 }
4861 
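/* BACO (Bus Active, Chip Off) keeps the PCIe link alive while the rest of
 * the chip is powered down; it is used both for runtime PM and as a reset
 * method on ASICs that support it.
 */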
4862 int amdgpu_device_baco_enter(struct drm_device *dev)
4863 {
4864 	struct amdgpu_device *adev = drm_to_adev(dev);
4865 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4866 
4867 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4868 		return -ENOTSUPP;
4869 
4870 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4871 		adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4872 
4873 	return amdgpu_dpm_baco_enter(adev);
4874 }
4875 
4876 int amdgpu_device_baco_exit(struct drm_device *dev)
4877 {
4878 	struct amdgpu_device *adev = drm_to_adev(dev);
4879 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4880 	int ret = 0;
4881 
4882 	if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4883 		return -ENOTSUPP;
4884 
4885 	ret = amdgpu_dpm_baco_exit(adev);
4886 	if (ret)
4887 		return ret;
4888 
4889 	if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4890 		adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4891 
4892 	return 0;
4893 }
4894 
4895 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4896 {
4897 	int i;
4898 
4899 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4900 		struct amdgpu_ring *ring = adev->rings[i];
4901 
4902 		if (!ring || !ring->sched.thread)
4903 			continue;
4904 
4905 		cancel_delayed_work_sync(&ring->sched.work_tdr);
4906 	}
4907 }
4908 
4909 /**
4910  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4911  * @pdev: PCI device struct
4912  * @state: PCI channel state
4913  *
4914  * Description: Called when a PCI error is detected.
4915  *
4916  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4917  */
4918 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4919 {
4920 	struct drm_device *dev = pci_get_drvdata(pdev);
4921 	struct amdgpu_device *adev = drm_to_adev(dev);
4922 	int i;
4923 
4924 	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4925 
4926 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
4927 		DRM_WARN("No support for XGMI hive yet...");
4928 		return PCI_ERS_RESULT_DISCONNECT;
4929 	}
4930 
4931 	switch (state) {
4932 	case pci_channel_io_normal:
4933 		return PCI_ERS_RESULT_CAN_RECOVER;
4934 	/* Fatal error, prepare for slot reset */
4935 	case pci_channel_io_frozen:
4936 		/*
4937 		 * Cancel and wait for all TDRs in progress if failing to
4938 		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4939 		 *
4940 		 * Locking adev->reset_sem will prevent any external access
4941 		 * to GPU during PCI error recovery
4942 		 */
4943 		while (!amdgpu_device_lock_adev(adev, NULL))
4944 			amdgpu_cancel_all_tdr(adev);
4945 
4946 		/*
4947 		 * Block any work scheduling as we do for regular GPU reset
4948 		 * for the duration of the recovery
4949 		 */
4950 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4951 			struct amdgpu_ring *ring = adev->rings[i];
4952 
4953 			if (!ring || !ring->sched.thread)
4954 				continue;
4955 
4956 			drm_sched_stop(&ring->sched, NULL);
4957 		}
4958 		return PCI_ERS_RESULT_NEED_RESET;
4959 	case pci_channel_io_perm_failure:
4960 		/* Permanent error, prepare for device removal */
4961 		return PCI_ERS_RESULT_DISCONNECT;
4962 	}
4963 
4964 	return PCI_ERS_RESULT_NEED_RESET;
4965 }
4966 
4967 /**
4968  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4969  * @pdev: pointer to PCI device
4970  */
4971 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4972 {
4973 
4974 	DRM_INFO("PCI error: mmio enabled callback!!\n");
4975 
4976 	/* TODO - dump whatever for debugging purposes */
4977 
4978 	/* This is called only if amdgpu_pci_error_detected returns
4979 	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4980 	 * works, so there is no need to reset the slot.
4981 	 */
4982 
4983 	return PCI_ERS_RESULT_RECOVERED;
4984 }
4985 
4986 /**
4987  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4988  * @pdev: PCI device struct
4989  *
4990  * Description: This routine is called by the pci error recovery
4991  * code after the PCI slot has been reset, just before we
4992  * should resume normal operations.
4993  */
4994 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4995 {
4996 	struct drm_device *dev = pci_get_drvdata(pdev);
4997 	struct amdgpu_device *adev = drm_to_adev(dev);
4998 	int r, i;
4999 	bool need_full_reset = true;
5000 	u32 memsize;
5001 	struct list_head device_list;
5002 
5003 	DRM_INFO("PCI error: slot reset callback!!\n");
5004 
5005 	INIT_LIST_HEAD(&device_list);
5006 	list_add_tail(&adev->gmc.xgmi.head, &device_list);
5007 
5008 	/* wait for asic to come out of reset */
5009 	msleep(500);
5010 
5011 	/* Restore PCI confspace */
5012 	amdgpu_device_load_pci_state(pdev);
5013 
5014 	/* confirm ASIC came out of reset */
5015 	for (i = 0; i < adev->usec_timeout; i++) {
5016 		memsize = amdgpu_asic_get_config_memsize(adev);
5017 
5018 		if (memsize != 0xffffffff)
5019 			break;
5020 		udelay(1);
5021 	}
5022 	if (memsize == 0xffffffff) {
5023 		r = -ETIME;
5024 		goto out;
5025 	}
5026 
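	/* in_pci_err_recovery is meant to tell the register access helpers
	 * that MMIO may not be reliable yet, so they can back off while the
	 * reset below is prepared (an assumption; the exact behaviour depends
	 * on the register accessor implementation in this kernel version).
	 */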
5027 	adev->in_pci_err_recovery = true;
5028 	r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5029 	adev->in_pci_err_recovery = false;
5030 	if (r)
5031 		goto out;
5032 
5033 	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5034 
5035 out:
5036 	if (!r) {
5037 		if (amdgpu_device_cache_pci_state(adev->pdev))
5038 			pci_restore_state(adev->pdev);
5039 
5040 		DRM_INFO("PCIe error recovery succeeded\n");
5041 	} else {
5042 		DRM_ERROR("PCIe error recovery failed, err:%d", r);
5043 		amdgpu_device_unlock_adev(adev);
5044 	}
5045 
5046 	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5047 }
5048 
5049 /**
5050  * amdgpu_pci_resume() - resume normal ops after PCI reset
5051  * @pdev: pointer to PCI device
5052  *
5053  * Called when the error recovery driver tells us that it is
5054  * OK to resume normal operation. Restart the schedulers so
5055  * halted jobs can resume.
5056  */
5057 void amdgpu_pci_resume(struct pci_dev *pdev)
5058 {
5059 	struct drm_device *dev = pci_get_drvdata(pdev);
5060 	struct amdgpu_device *adev = drm_to_adev(dev);
5061 	int i;
5062 
5063 
5064 	DRM_INFO("PCI error: resume callback!!\n");
5065 
5066 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5067 		struct amdgpu_ring *ring = adev->rings[i];
5068 
5069 		if (!ring || !ring->sched.thread)
5070 			continue;
5071 
5072 
5073 		drm_sched_resubmit_jobs(&ring->sched);
5074 		drm_sched_start(&ring->sched, true);
5075 	}
5076 
5077 	amdgpu_device_unlock_adev(adev);
5078 }
5079 
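/* Snapshot the PCI config space so it can be re-applied after a surprise
 * link reset (see amdgpu_pci_slot_reset above).
 */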
5080 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5081 {
5082 	struct drm_device *dev = pci_get_drvdata(pdev);
5083 	struct amdgpu_device *adev = drm_to_adev(dev);
5084 	int r;
5085 
5086 	r = pci_save_state(pdev);
5087 	if (!r) {
5088 		kfree(adev->pci_state);
5089 
5090 		adev->pci_state = pci_store_saved_state(pdev);
5091 
5092 		if (!adev->pci_state) {
5093 			DRM_ERROR("Failed to store PCI saved state");
5094 			return false;
5095 		}
5096 	} else {
5097 		DRM_WARN("Failed to save PCI state, err:%d\n", r);
5098 		return false;
5099 	}
5100 
5101 	return true;
5102 }
5103 
5104 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5105 {
5106 	struct drm_device *dev = pci_get_drvdata(pdev);
5107 	struct amdgpu_device *adev = drm_to_adev(dev);
5108 	int r;
5109 
5110 	if (!adev->pci_state)
5111 		return false;
5112 
5113 	r = pci_load_saved_state(pdev, adev->pci_state);
5114 
5115 	if (!r) {
5116 		pci_restore_state(pdev);
5117 	} else {
5118 		DRM_WARN("Failed to load PCI state, err:%d\n", r);
5119 		return false;
5120 	}
5121 
5122 	return true;
5123 }
5124 
5125 
5126