1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33 #include <linux/pci.h>
34
35 #include <drm/drm_aperture.h>
36 #include <drm/drm_atomic_helper.h>
37 #include <drm/drm_probe_helper.h>
38 #include <drm/amdgpu_drm.h>
39 #include <linux/vgaarb.h>
40 #include <linux/vga_switcheroo.h>
41 #include <linux/efi.h>
42 #include "amdgpu.h"
43 #include "amdgpu_trace.h"
44 #include "amdgpu_i2c.h"
45 #include "atom.h"
46 #include "amdgpu_atombios.h"
47 #include "amdgpu_atomfirmware.h"
48 #include "amd_pcie.h"
49 #ifdef CONFIG_DRM_AMDGPU_SI
50 #include "si.h"
51 #endif
52 #ifdef CONFIG_DRM_AMDGPU_CIK
53 #include "cik.h"
54 #endif
55 #include "vi.h"
56 #include "soc15.h"
57 #include "nv.h"
58 #include "bif/bif_4_1_d.h"
59 #include <linux/pci.h>
60 #include <linux/firmware.h>
61 #include "amdgpu_vf_error.h"
62
63 #include "amdgpu_amdkfd.h"
64 #include "amdgpu_pm.h"
65
66 #include "amdgpu_xgmi.h"
67 #include "amdgpu_ras.h"
68 #include "amdgpu_pmu.h"
69 #include "amdgpu_fru_eeprom.h"
70 #include "amdgpu_reset.h"
71
72 #include <linux/suspend.h>
73 #include <drm/task_barrier.h>
74 #include <linux/pm_runtime.h>
75
76 #include <drm/drm_drv.h>
77
78 #if IS_ENABLED(CONFIG_X86)
79 #include <asm/intel-family.h>
80 #endif
81
82 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
84 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
85 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
86 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
87 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
88 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
89 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
90 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
91 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
92 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
93 MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin");
94
95 #define AMDGPU_RESUME_MS 2000
96
97 static const struct drm_driver amdgpu_kms_driver;
98
99 const char *amdgpu_asic_name[] = {
100 "TAHITI",
101 "PITCAIRN",
102 "VERDE",
103 "OLAND",
104 "HAINAN",
105 "BONAIRE",
106 "KAVERI",
107 "KABINI",
108 "HAWAII",
109 "MULLINS",
110 "TOPAZ",
111 "TONGA",
112 "FIJI",
113 "CARRIZO",
114 "STONEY",
115 "POLARIS10",
116 "POLARIS11",
117 "POLARIS12",
118 "VEGAM",
119 "VEGA10",
120 "VEGA12",
121 "VEGA20",
122 "RAVEN",
123 "ARCTURUS",
124 "RENOIR",
125 "ALDEBARAN",
126 "NAVI10",
127 "CYAN_SKILLFISH",
128 "NAVI14",
129 "NAVI12",
130 "SIENNA_CICHLID",
131 "NAVY_FLOUNDER",
132 "VANGOGH",
133 "DIMGREY_CAVEFISH",
134 "BEIGE_GOBY",
135 "YELLOW_CARP",
136 "LAST",
137 };
138
139 /**
140 * DOC: pcie_replay_count
141 *
142 * The amdgpu driver provides a sysfs API for reporting the total number
143 * of PCIe replays (NAKs)
144 * The file pcie_replay_count is used for this and returns the total
145 * number of replays as a sum of the NAKs generated and NAKs received
146 */
147
148 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
149 struct device_attribute *attr, char *buf)
150 {
151 struct drm_device *ddev = dev_get_drvdata(dev);
152 struct amdgpu_device *adev = drm_to_adev(ddev);
153 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
154
155 return sysfs_emit(buf, "%llu\n", cnt);
156 }
157
158 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
159 amdgpu_device_get_pcie_replay_count, NULL);
160
161 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
162
163 /**
164 * DOC: product_name
165 *
166 * The amdgpu driver provides a sysfs API for reporting the product name
167 * for the device
168 * The file product_name is used for this and returns the product name
169 * as returned from the FRU.
170 * NOTE: This is only available for certain server cards
171 */
172
173 static ssize_t amdgpu_device_get_product_name(struct device *dev,
174 struct device_attribute *attr, char *buf)
175 {
176 struct drm_device *ddev = dev_get_drvdata(dev);
177 struct amdgpu_device *adev = drm_to_adev(ddev);
178
179 return sysfs_emit(buf, "%s\n", adev->product_name);
180 }
181
182 static DEVICE_ATTR(product_name, S_IRUGO,
183 amdgpu_device_get_product_name, NULL);
184
185 /**
186 * DOC: product_number
187 *
188 * The amdgpu driver provides a sysfs API for reporting the part number
189 * for the device
190 * The file product_number is used for this and returns the part number
191 * as returned from the FRU.
192 * NOTE: This is only available for certain server cards
193 */
194
195 static ssize_t amdgpu_device_get_product_number(struct device *dev,
196 struct device_attribute *attr, char *buf)
197 {
198 struct drm_device *ddev = dev_get_drvdata(dev);
199 struct amdgpu_device *adev = drm_to_adev(ddev);
200
201 return sysfs_emit(buf, "%s\n", adev->product_number);
202 }
203
204 static DEVICE_ATTR(product_number, S_IRUGO,
205 amdgpu_device_get_product_number, NULL);
206
207 /**
208 * DOC: serial_number
209 *
210 * The amdgpu driver provides a sysfs API for reporting the serial number
211 * for the device
212 * The file serial_number is used for this and returns the serial number
213 * as returned from the FRU.
214 * NOTE: This is only available for certain server cards
215 */
216
217 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
218 struct device_attribute *attr, char *buf)
219 {
220 struct drm_device *ddev = dev_get_drvdata(dev);
221 struct amdgpu_device *adev = drm_to_adev(ddev);
222
223 return sysfs_emit(buf, "%s\n", adev->serial);
224 }
225
226 static DEVICE_ATTR(serial_number, S_IRUGO,
227 amdgpu_device_get_serial_number, NULL);
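/*
 * Illustrative usage sketch (not part of the driver): the attributes above
 * are created on the PCI device, so from userspace they are typically read
 * as plain sysfs files, e.g.
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 *   cat /sys/class/drm/card0/device/serial_number
 *
 * The card0 index is an assumption and varies per system; the FRU based
 * attributes are only meaningful on boards that expose FRU data (see the
 * notes above).
 */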
228
229 /**
230 * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
231 *
232 * @dev: drm_device pointer
233 *
234 * Returns true if the device is a dGPU with ATPX power control,
235 * otherwise return false.
236 */
237 bool amdgpu_device_supports_px(struct drm_device *dev)
238 {
239 struct amdgpu_device *adev = drm_to_adev(dev);
240
241 if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
242 return true;
243 return false;
244 }
245
246 /**
247 * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
248 *
249 * @dev: drm_device pointer
250 *
251 * Returns true if the device is a dGPU with ACPI power control,
252 * otherwise return false.
253 */
254 bool amdgpu_device_supports_boco(struct drm_device *dev)
255 {
256 struct amdgpu_device *adev = drm_to_adev(dev);
257
258 if (adev->has_pr3 ||
259 ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
260 return true;
261 return false;
262 }
263
264 /**
265 * amdgpu_device_supports_baco - Does the device support BACO
266 *
267 * @dev: drm_device pointer
268 *
269 * Returns true if the device supports BACO,
270 * otherwise return false.
271 */
272 bool amdgpu_device_supports_baco(struct drm_device *dev)
273 {
274 struct amdgpu_device *adev = drm_to_adev(dev);
275
276 return amdgpu_asic_supports_baco(adev);
277 }
278
279 /**
280 * amdgpu_device_supports_smart_shift - Is the device dGPU with
281 * smart shift support
282 *
283 * @dev: drm_device pointer
284 *
285 * Returns true if the device is a dGPU with Smart Shift support,
286 * otherwise returns false.
287 */
288 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
289 {
290 return (amdgpu_device_supports_boco(dev) &&
291 amdgpu_acpi_is_power_shift_control_supported());
292 }
293
294 /*
295 * VRAM access helper functions
296 */
297
298 /**
299 * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
300 *
301 * @adev: amdgpu_device pointer
302 * @pos: offset of the buffer in vram
303 * @buf: virtual address of the buffer in system memory
304 * @size: read/write size, sizeof(@buf) must be >= @size
305 * @write: true - write to vram, otherwise - read from vram
306 */
307 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
308 void *buf, size_t size, bool write)
309 {
310 unsigned long flags;
311 uint32_t hi = ~0, tmp = 0;
312 uint32_t *data = buf;
313 uint64_t last;
314 int idx;
315
316 if (!drm_dev_enter(&adev->ddev, &idx))
317 return;
318
319 BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
320
321 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
322 for (last = pos + size; pos < last; pos += 4) {
323 tmp = pos >> 31;
324
325 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
326 if (tmp != hi) {
327 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
328 hi = tmp;
329 }
330 if (write)
331 WREG32_NO_KIQ(mmMM_DATA, *data++);
332 else
333 *data++ = RREG32_NO_KIQ(mmMM_DATA);
334 }
335
336 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
337 drm_dev_exit(idx);
338 }
339
340 /**
341 * amdgpu_device_aper_access - access vram by vram aperture
342 *
343 * @adev: amdgpu_device pointer
344 * @pos: offset of the buffer in vram
345 * @buf: virtual address of the buffer in system memory
346 * @size: read/write size, sizeof(@buf) must be >= @size
347 * @write: true - write to vram, otherwise - read from vram
348 *
349 * The return value means how many bytes have been transferred.
350 */
351 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
352 void *buf, size_t size, bool write)
353 {
354 #ifdef CONFIG_64BIT
355 void __iomem *addr;
356 size_t count = 0;
357 uint64_t last;
358
359 if (!adev->mman.aper_base_kaddr)
360 return 0;
361
362 last = min(pos + size, adev->gmc.visible_vram_size);
363 if (last > pos) {
364 addr = adev->mman.aper_base_kaddr + pos;
365 count = last - pos;
366
367 if (write) {
368 memcpy_toio(addr, buf, count);
369 mb();
370 amdgpu_device_flush_hdp(adev, NULL);
371 } else {
372 amdgpu_device_invalidate_hdp(adev, NULL);
373 mb();
374 memcpy_fromio(buf, addr, count);
375 }
376
377 }
378
379 return count;
380 #else
381 return 0;
382 #endif
383 }
384
385 /**
386 * amdgpu_device_vram_access - read/write a buffer in vram
387 *
388 * @adev: amdgpu_device pointer
389 * @pos: offset of the buffer in vram
390 * @buf: virtual address of the buffer in system memory
391 * @size: read/write size, sizeof(@buf) must be >= @size
392 * @write: true - write to vram, otherwise - read from vram
393 */
394 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
395 void *buf, size_t size, bool write)
396 {
397 size_t count;
398
399 /* try using the vram aperture to access vram first */
400 count = amdgpu_device_aper_access(adev, pos, buf, size, write);
401 size -= count;
402 if (size) {
403 /* use MM to access the rest of vram */
404 pos += count;
405 buf += count;
406 amdgpu_device_mm_access(adev, pos, buf, size, write);
407 }
408 }
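/*
 * Illustrative usage sketch (an assumed caller, not taken from the driver):
 * reading one dword out of VRAM through the helper above.
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, vram_offset, &val, sizeof(val), false);
 *
 * vram_offset is a placeholder byte offset into VRAM. The helper first tries
 * the CPU visible aperture and only falls back to the MM register window for
 * whatever part of the request the aperture could not cover; offset and size
 * must be dword aligned if that fallback path is taken.
 */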
409
410 /*
411 * register access helper functions.
412 */
413
414 /* Check if hw access should be skipped because of hotplug or device error */
415 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
416 {
417 if (adev->no_hw_access)
418 return true;
419
420 #ifdef CONFIG_LOCKDEP
421 /*
422 * This is a bit complicated to understand, so worth a comment. What we assert
423 * here is that the GPU reset is not running on another thread in parallel.
424 *
425 * For this we trylock the read side of the reset semaphore, if that succeeds
426 * we know that the reset is not running in parallel.
427 *
428 * If the trylock fails we assert that we are either already holding the read
429 * side of the lock or are the reset thread itself and hold the write side of
430 * the lock.
431 */
432 if (in_task()) {
433 if (down_read_trylock(&adev->reset_sem))
434 up_read(&adev->reset_sem);
435 else
436 lockdep_assert_held(&adev->reset_sem);
437 }
438 #endif
439 return false;
440 }
441
442 /**
443 * amdgpu_device_rreg - read a memory mapped IO or indirect register
444 *
445 * @adev: amdgpu_device pointer
446 * @reg: dword aligned register offset
447 * @acc_flags: access flags which require special behavior
448 *
449 * Returns the 32 bit value from the offset specified.
450 */
451 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
452 uint32_t reg, uint32_t acc_flags)
453 {
454 uint32_t ret;
455
456 if (amdgpu_device_skip_hw_access(adev))
457 return 0;
458
459 if ((reg * 4) < adev->rmmio_size) {
460 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
461 amdgpu_sriov_runtime(adev) &&
462 down_read_trylock(&adev->reset_sem)) {
463 ret = amdgpu_kiq_rreg(adev, reg);
464 up_read(&adev->reset_sem);
465 } else {
466 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
467 }
468 } else {
469 ret = adev->pcie_rreg(adev, reg * 4);
470 }
471
472 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
473
474 return ret;
475 }
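/*
 * Usage note (sketch): callers normally go through the RREG32()/WREG32()
 * style macros in amdgpu.h, which pass the appropriate acc_flags, rather
 * than calling amdgpu_device_rreg() directly. An equivalent direct call
 * would look like
 *
 *   uint32_t tmp = amdgpu_device_rreg(adev, reg_offset, AMDGPU_REGS_NO_KIQ);
 *
 * where reg_offset is a placeholder dword register offset.
 */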
476
477 /*
478 * MMIO register read with bytes helper functions
479 * @offset:bytes offset from MMIO start
480 *
481 */
482
483 /**
484 * amdgpu_mm_rreg8 - read a memory mapped IO register
485 *
486 * @adev: amdgpu_device pointer
487 * @offset: byte aligned register offset
488 *
489 * Returns the 8 bit value from the offset specified.
490 */
491 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
492 {
493 if (amdgpu_device_skip_hw_access(adev))
494 return 0;
495
496 if (offset < adev->rmmio_size)
497 return (readb(adev->rmmio + offset));
498 BUG();
499 }
500
501 /*
502 * MMIO register write with bytes helper functions
503 * @offset:bytes offset from MMIO start
504 * @value: the value want to be written to the register
505 *
506 */
507 /**
508 * amdgpu_mm_wreg8 - write a memory mapped IO register
509 *
510 * @adev: amdgpu_device pointer
511 * @offset: byte aligned register offset
512 * @value: 8 bit value to write
513 *
514 * Writes the value specified to the offset specified.
515 */
516 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
517 {
518 if (amdgpu_device_skip_hw_access(adev))
519 return;
520
521 if (offset < adev->rmmio_size)
522 writeb(value, adev->rmmio + offset);
523 else
524 BUG();
525 }
526
527 /**
528 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
529 *
530 * @adev: amdgpu_device pointer
531 * @reg: dword aligned register offset
532 * @v: 32 bit value to write to the register
533 * @acc_flags: access flags which require special behavior
534 *
535 * Writes the value specified to the offset specified.
536 */
537 void amdgpu_device_wreg(struct amdgpu_device *adev,
538 uint32_t reg, uint32_t v,
539 uint32_t acc_flags)
540 {
541 if (amdgpu_device_skip_hw_access(adev))
542 return;
543
544 if ((reg * 4) < adev->rmmio_size) {
545 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
546 amdgpu_sriov_runtime(adev) &&
547 down_read_trylock(&adev->reset_sem)) {
548 amdgpu_kiq_wreg(adev, reg, v);
549 up_read(&adev->reset_sem);
550 } else {
551 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
552 }
553 } else {
554 adev->pcie_wreg(adev, reg * 4, v);
555 }
556
557 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
558 }
559
560 /*
561 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
562 *
563 * This function is invoked only for debugfs register access.
564 */
565 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
566 uint32_t reg, uint32_t v)
567 {
568 if (amdgpu_device_skip_hw_access(adev))
569 return;
570
571 if (amdgpu_sriov_fullaccess(adev) &&
572 adev->gfx.rlc.funcs &&
573 adev->gfx.rlc.funcs->is_rlcg_access_range) {
574 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
575 return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);
576 } else {
577 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
578 }
579 }
580
581 /**
582 * amdgpu_mm_rdoorbell - read a doorbell dword
583 *
584 * @adev: amdgpu_device pointer
585 * @index: doorbell index
586 *
587 * Returns the value in the doorbell aperture at the
588 * requested doorbell index (CIK).
589 */
590 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
591 {
592 if (amdgpu_device_skip_hw_access(adev))
593 return 0;
594
595 if (index < adev->doorbell.num_doorbells) {
596 return readl(adev->doorbell.ptr + index);
597 } else {
598 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
599 return 0;
600 }
601 }
602
603 /**
604 * amdgpu_mm_wdoorbell - write a doorbell dword
605 *
606 * @adev: amdgpu_device pointer
607 * @index: doorbell index
608 * @v: value to write
609 *
610 * Writes @v to the doorbell aperture at the
611 * requested doorbell index (CIK).
612 */
613 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
614 {
615 if (amdgpu_device_skip_hw_access(adev))
616 return;
617
618 if (index < adev->doorbell.num_doorbells) {
619 writel(v, adev->doorbell.ptr + index);
620 } else {
621 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
622 }
623 }
624
625 /**
626 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
627 *
628 * @adev: amdgpu_device pointer
629 * @index: doorbell index
630 *
631 * Returns the value in the doorbell aperture at the
632 * requested doorbell index (VEGA10+).
633 */
634 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
635 {
636 if (amdgpu_device_skip_hw_access(adev))
637 return 0;
638
639 if (index < adev->doorbell.num_doorbells) {
640 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
641 } else {
642 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
643 return 0;
644 }
645 }
646
647 /**
648 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
649 *
650 * @adev: amdgpu_device pointer
651 * @index: doorbell index
652 * @v: value to write
653 *
654 * Writes @v to the doorbell aperture at the
655 * requested doorbell index (VEGA10+).
656 */
657 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
658 {
659 if (amdgpu_device_skip_hw_access(adev))
660 return;
661
662 if (index < adev->doorbell.num_doorbells) {
663 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
664 } else {
665 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
666 }
667 }
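/*
 * Usage sketch (illustrative only): ring code usually reaches these helpers
 * through the WDOORBELL32()/WDOORBELL64() style wrappers, e.g. kicking a
 * ring by publishing its write pointer might look like
 *
 *   amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 *
 * assuming ring->doorbell_index was assigned during doorbell init; the exact
 * wptr encoding is engine specific.
 */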
668
669 /**
670 * amdgpu_device_indirect_rreg - read an indirect register
671 *
672 * @adev: amdgpu_device pointer
673 * @pcie_index: mmio register offset
674 * @pcie_data: mmio register offset
675 * @reg_addr: indirect register address to read from
676 *
677 * Returns the value of indirect register @reg_addr
678 */
679 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
680 u32 pcie_index, u32 pcie_data,
681 u32 reg_addr)
682 {
683 unsigned long flags;
684 u32 r;
685 void __iomem *pcie_index_offset;
686 void __iomem *pcie_data_offset;
687
688 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
689 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
690 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
691
692 writel(reg_addr, pcie_index_offset);
693 readl(pcie_index_offset);
694 r = readl(pcie_data_offset);
695 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
696
697 return r;
698 }
699
700 /**
701 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
702 *
703 * @adev: amdgpu_device pointer
704 * @pcie_index: mmio register offset
705 * @pcie_data: mmio register offset
706 * @reg_addr: indirect register address to read from
707 *
708 * Returns the value of indirect register @reg_addr
709 */
710 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
711 u32 pcie_index, u32 pcie_data,
712 u32 reg_addr)
713 {
714 unsigned long flags;
715 u64 r;
716 void __iomem *pcie_index_offset;
717 void __iomem *pcie_data_offset;
718
719 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
720 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
721 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
722
723 /* read low 32 bits */
724 writel(reg_addr, pcie_index_offset);
725 readl(pcie_index_offset);
726 r = readl(pcie_data_offset);
727 /* read high 32 bits */
728 writel(reg_addr + 4, pcie_index_offset);
729 readl(pcie_index_offset);
730 r |= ((u64)readl(pcie_data_offset) << 32);
731 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
732
733 return r;
734 }
735
736 /**
737 * amdgpu_device_indirect_wreg - write an indirect register address
738 *
739 * @adev: amdgpu_device pointer
740 * @pcie_index: mmio register offset
741 * @pcie_data: mmio register offset
742 * @reg_addr: indirect register offset
743 * @reg_data: indirect register data
744 *
745 */
746 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
747 u32 pcie_index, u32 pcie_data,
748 u32 reg_addr, u32 reg_data)
749 {
750 unsigned long flags;
751 void __iomem *pcie_index_offset;
752 void __iomem *pcie_data_offset;
753
754 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
755 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
756 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
757
758 writel(reg_addr, pcie_index_offset);
759 readl(pcie_index_offset);
760 writel(reg_data, pcie_data_offset);
761 readl(pcie_data_offset);
762 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
763 }
764
765 /**
766 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
767 *
768 * @adev: amdgpu_device pointer
769 * @pcie_index: mmio register offset
770 * @pcie_data: mmio register offset
771 * @reg_addr: indirect register offset
772 * @reg_data: indirect register data
773 *
774 */
775 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
776 u32 pcie_index, u32 pcie_data,
777 u32 reg_addr, u64 reg_data)
778 {
779 unsigned long flags;
780 void __iomem *pcie_index_offset;
781 void __iomem *pcie_data_offset;
782
783 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
784 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
785 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
786
787 /* write low 32 bits */
788 writel(reg_addr, pcie_index_offset);
789 readl(pcie_index_offset);
790 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
791 readl(pcie_data_offset);
792 /* write high 32 bits */
793 writel(reg_addr + 4, pcie_index_offset);
794 readl(pcie_index_offset);
795 writel((u32)(reg_data >> 32), pcie_data_offset);
796 readl(pcie_data_offset);
797 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
798 }
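/*
 * Sketch of how an ASIC typically wires these helpers up (illustrative only,
 * the wrapper name is a placeholder): the SoC code looks up its PCIE
 * index/data register pair and plugs a thin wrapper into adev->pcie_rreg and
 * adev->pcie_wreg, e.g.
 *
 *   static u32 soc_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           u32 index = adev->nbio.funcs->get_pcie_index_offset(adev);
 *           u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *           return amdgpu_device_indirect_rreg(adev, index, data, reg);
 *   }
 */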
799
800 /**
801 * amdgpu_invalid_rreg - dummy reg read function
802 *
803 * @adev: amdgpu_device pointer
804 * @reg: offset of register
805 *
806 * Dummy register read function. Used for register blocks
807 * that certain asics don't have (all asics).
808 * Returns the value in the register.
809 */
810 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
811 {
812 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
813 BUG();
814 return 0;
815 }
816
817 /**
818 * amdgpu_invalid_wreg - dummy reg write function
819 *
820 * @adev: amdgpu_device pointer
821 * @reg: offset of register
822 * @v: value to write to the register
823 *
824 * Dummy register write function. Used for register blocks
825 * that certain asics don't have (all asics).
826 */
827 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
828 {
829 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
830 reg, v);
831 BUG();
832 }
833
834 /**
835 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
836 *
837 * @adev: amdgpu_device pointer
838 * @reg: offset of register
839 *
840 * Dummy register read function. Used for register blocks
841 * that certain asics don't have (all asics).
842 * Returns the value in the register.
843 */
844 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
845 {
846 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
847 BUG();
848 return 0;
849 }
850
851 /**
852 * amdgpu_invalid_wreg64 - dummy reg write function
853 *
854 * @adev: amdgpu_device pointer
855 * @reg: offset of register
856 * @v: value to write to the register
857 *
858 * Dummy register write function. Used for register blocks
859 * that certain asics don't have (all asics).
860 */
861 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
862 {
863 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
864 reg, v);
865 BUG();
866 }
867
868 /**
869 * amdgpu_block_invalid_rreg - dummy reg read function
870 *
871 * @adev: amdgpu_device pointer
872 * @block: offset of instance
873 * @reg: offset of register
874 *
875 * Dummy register read function. Used for register blocks
876 * that certain asics don't have (all asics).
877 * Returns the value in the register.
878 */
879 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
880 uint32_t block, uint32_t reg)
881 {
882 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
883 reg, block);
884 BUG();
885 return 0;
886 }
887
888 /**
889 * amdgpu_block_invalid_wreg - dummy reg write function
890 *
891 * @adev: amdgpu_device pointer
892 * @block: offset of instance
893 * @reg: offset of register
894 * @v: value to write to the register
895 *
896 * Dummy register write function. Used for register blocks
897 * that certain asics don't have (all asics).
898 */
899 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
900 uint32_t block,
901 uint32_t reg, uint32_t v)
902 {
903 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
904 reg, block, v);
905 BUG();
906 }
907
908 /**
909 * amdgpu_device_asic_init - Wrapper for atom asic_init
910 *
911 * @adev: amdgpu_device pointer
912 *
913 * Does any asic specific work and then calls atom asic init.
914 */
915 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
916 {
917 amdgpu_asic_pre_asic_init(adev);
918
919 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
920 }
921
922 /**
923 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
924 *
925 * @adev: amdgpu_device pointer
926 *
927 * Allocates a scratch page of VRAM for use by various things in the
928 * driver.
929 */
930 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
931 {
932 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
933 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
934 &adev->vram_scratch.robj,
935 &adev->vram_scratch.gpu_addr,
936 (void **)&adev->vram_scratch.ptr);
937 }
938
939 /**
940 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
941 *
942 * @adev: amdgpu_device pointer
943 *
944 * Frees the VRAM scratch page.
945 */
946 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
947 {
948 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
949 }
950
951 /**
952 * amdgpu_device_program_register_sequence - program an array of registers.
953 *
954 * @adev: amdgpu_device pointer
955 * @registers: pointer to the register array
956 * @array_size: size of the register array
957 *
958 * Programs an array of registers with AND and OR masks.
959 * This is a helper for setting golden registers.
960 */
961 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
962 const u32 *registers,
963 const u32 array_size)
964 {
965 u32 tmp, reg, and_mask, or_mask;
966 int i;
967
968 if (array_size % 3)
969 return;
970
971 for (i = 0; i < array_size; i +=3) {
972 reg = registers[i + 0];
973 and_mask = registers[i + 1];
974 or_mask = registers[i + 2];
975
976 if (and_mask == 0xffffffff) {
977 tmp = or_mask;
978 } else {
979 tmp = RREG32(reg);
980 tmp &= ~and_mask;
981 if (adev->family >= AMDGPU_FAMILY_AI)
982 tmp |= (or_mask & and_mask);
983 else
984 tmp |= or_mask;
985 }
986 WREG32(reg, tmp);
987 }
988 }
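/*
 * Illustrative example (not a real golden-settings table): the register
 * array is a flat list of {offset, and_mask, or_mask} triples, so a caller
 * might look like
 *
 *   static const u32 example_golden_settings[] = {
 *           mmEXAMPLE_REG, 0x0000000f, 0x00000002,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 *
 * mmEXAMPLE_REG is a placeholder register offset.
 */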
989
990 /**
991 * amdgpu_device_pci_config_reset - reset the GPU
992 *
993 * @adev: amdgpu_device pointer
994 *
995 * Resets the GPU using the pci config reset sequence.
996 * Only applicable to asics prior to vega10.
997 */
998 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
999 {
1000 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1001 }
1002
1003 /**
1004 * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1005 *
1006 * @adev: amdgpu_device pointer
1007 *
1008 * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1009 */
1010 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1011 {
1012 return pci_reset_function(adev->pdev);
1013 }
1014
1015 /*
1016 * GPU doorbell aperture helpers function.
1017 */
1018 /**
1019 * amdgpu_device_doorbell_init - Init doorbell driver information.
1020 *
1021 * @adev: amdgpu_device pointer
1022 *
1023 * Init doorbell driver information (CIK)
1024 * Returns 0 on success, error on failure.
1025 */
1026 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
1027 {
1028
1029 /* No doorbell on SI hardware generation */
1030 if (adev->asic_type < CHIP_BONAIRE) {
1031 adev->doorbell.base = 0;
1032 adev->doorbell.size = 0;
1033 adev->doorbell.num_doorbells = 0;
1034 adev->doorbell.ptr = NULL;
1035 return 0;
1036 }
1037
1038 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
1039 return -EINVAL;
1040
1041 amdgpu_asic_init_doorbell_index(adev);
1042
1043 /* doorbell bar mapping */
1044 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
1045 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
1046
1047 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
1048 adev->doorbell_index.max_assignment+1);
1049 if (adev->doorbell.num_doorbells == 0)
1050 return -EINVAL;
1051
1052 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
1053 * paging queue doorbell use the second page. The
1054 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
1055 * doorbells are in the first page. So with paging queue enabled,
1056 * the max num_doorbells should be increased by one page (0x400 in dwords).
1057 */
1058 if (adev->asic_type >= CHIP_VEGA10)
1059 adev->doorbell.num_doorbells += 0x400;
1060
1061 adev->doorbell.ptr = ioremap(adev->doorbell.base,
1062 adev->doorbell.num_doorbells *
1063 sizeof(u32));
1064 if (adev->doorbell.ptr == NULL)
1065 return -ENOMEM;
1066
1067 return 0;
1068 }
1069
1070 /**
1071 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1072 *
1073 * @adev: amdgpu_device pointer
1074 *
1075 * Tear down doorbell driver information (CIK)
1076 */
1077 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1078 {
1079 iounmap(adev->doorbell.ptr);
1080 adev->doorbell.ptr = NULL;
1081 }
1082
1083
1084
1085 /*
1086 * amdgpu_device_wb_*()
1087 * Writeback is the method by which the GPU updates special pages in memory
1088 * with the status of certain GPU events (fences, ring pointers,etc.).
1089 */
1090
1091 /**
1092 * amdgpu_device_wb_fini - Disable Writeback and free memory
1093 *
1094 * @adev: amdgpu_device pointer
1095 *
1096 * Disables Writeback and frees the Writeback memory (all asics).
1097 * Used at driver shutdown.
1098 */
1099 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1100 {
1101 if (adev->wb.wb_obj) {
1102 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1103 &adev->wb.gpu_addr,
1104 (void **)&adev->wb.wb);
1105 adev->wb.wb_obj = NULL;
1106 }
1107 }
1108
1109 /**
1110 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1111 *
1112 * @adev: amdgpu_device pointer
1113 *
1114 * Initializes writeback and allocates writeback memory (all asics).
1115 * Used at driver startup.
1116 * Returns 0 on success or an -error on failure.
1117 */
1118 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1119 {
1120 int r;
1121
1122 if (adev->wb.wb_obj == NULL) {
1123 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1124 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1125 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1126 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1127 (void **)&adev->wb.wb);
1128 if (r) {
1129 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1130 return r;
1131 }
1132
1133 adev->wb.num_wb = AMDGPU_MAX_WB;
1134 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1135
1136 /* clear wb memory */
1137 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1138 }
1139
1140 return 0;
1141 }
1142
1143 /**
1144 * amdgpu_device_wb_get - Allocate a wb entry
1145 *
1146 * @adev: amdgpu_device pointer
1147 * @wb: wb index
1148 *
1149 * Allocate a wb slot for use by the driver (all asics).
1150 * Returns 0 on success or -EINVAL on failure.
1151 */
1152 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1153 {
1154 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1155
1156 if (offset < adev->wb.num_wb) {
1157 __set_bit(offset, adev->wb.used);
1158 *wb = offset << 3; /* convert to dw offset */
1159 return 0;
1160 } else {
1161 return -EINVAL;
1162 }
1163 }
1164
1165 /**
1166 * amdgpu_device_wb_free - Free a wb entry
1167 *
1168 * @adev: amdgpu_device pointer
1169 * @wb: wb index
1170 *
1171 * Free a wb slot allocated for use by the driver (all asics)
1172 */
1173 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1174 {
1175 wb >>= 3;
1176 if (wb < adev->wb.num_wb)
1177 __clear_bit(wb, adev->wb.used);
1178 }
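/*
 * Usage sketch for the writeback helpers above (illustrative, not a real
 * caller): a slot is allocated, addressed either through the CPU pointer or
 * the GPU address, and released when no longer needed.
 *
 *   u32 wb;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           volatile u32 *cpu_ptr = &adev->wb.wb[wb];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb * 4;
 *
 *           ... let the GPU write status to gpu_addr, poll *cpu_ptr ...
 *
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */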
1179
1180 /**
1181 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1182 *
1183 * @adev: amdgpu_device pointer
1184 *
1185 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1186 * to fail, but if any of the BARs is not accessible after the resize we abort
1187 * driver loading by returning -ENODEV.
1188 */
1189 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1190 {
1191 int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1192 struct pci_bus *root;
1193 struct resource *res;
1194 unsigned i;
1195 u16 cmd;
1196 int r;
1197
1198 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1199 return 0;
1200
1201 /* Bypass for VF */
1202 if (amdgpu_sriov_vf(adev))
1203 return 0;
1204
1205 /* skip if the bios has already enabled large BAR */
1206 if (adev->gmc.real_vram_size &&
1207 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1208 return 0;
1209
1210 /* Check if the root BUS has 64bit memory resources */
1211 root = adev->pdev->bus;
1212 while (root->parent)
1213 root = root->parent;
1214
1215 pci_bus_for_each_resource(root, res, i) {
1216 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1217 res->start > 0x100000000ull)
1218 break;
1219 }
1220
1221 /* Trying to resize is pointless without a root hub window above 4GB */
1222 if (!res)
1223 return 0;
1224
1225 /* Limit the BAR size to what is available */
1226 rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1227 rbar_size);
1228
1229 /* Disable memory decoding while we change the BAR addresses and size */
1230 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1231 pci_write_config_word(adev->pdev, PCI_COMMAND,
1232 cmd & ~PCI_COMMAND_MEMORY);
1233
1234 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1235 amdgpu_device_doorbell_fini(adev);
1236 if (adev->asic_type >= CHIP_BONAIRE)
1237 pci_release_resource(adev->pdev, 2);
1238
1239 pci_release_resource(adev->pdev, 0);
1240
1241 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1242 if (r == -ENOSPC)
1243 DRM_INFO("Not enough PCI address space for a large BAR.");
1244 else if (r && r != -ENOTSUPP)
1245 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1246
1247 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1248
1249 /* When the doorbell or fb BAR isn't available we have no chance of
1250 * using the device.
1251 */
1252 r = amdgpu_device_doorbell_init(adev);
1253 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1254 return -ENODEV;
1255
1256 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1257
1258 return 0;
1259 }
1260
1261 /*
1262 * GPU helpers function.
1263 */
1264 /**
1265 * amdgpu_device_need_post - check if the hw needs post or not
1266 *
1267 * @adev: amdgpu_device pointer
1268 *
1269 * Check if the asic has been initialized (all asics) at driver startup,
1270 * or if post is needed because a hw reset was performed.
1271 * Returns true if post is needed, false if not.
1272 */
1273 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1274 {
1275 uint32_t reg;
1276
1277 if (amdgpu_sriov_vf(adev))
1278 return false;
1279
1280 if (amdgpu_passthrough(adev)) {
1281 /* for FIJI: in the whole GPU pass-through virtualization case, some old SMC
1282 * firmware still needs the driver to do a vPost after a VM reboot, otherwise
1283 * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
1284 * force vPost for SMC versions below 22.15.
1285 */
1286 if (adev->asic_type == CHIP_FIJI) {
1287 int err;
1288 uint32_t fw_ver;
1289 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1290 /* force vPost if error occurred */
1291 if (err)
1292 return true;
1293
1294 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1295 release_firmware(adev->pm.fw);
1296 if (fw_ver < 0x00160e00)
1297 return true;
1298 }
1299 }
1300
1301 /* Don't post if we need to reset whole hive on init */
1302 if (adev->gmc.xgmi.pending_reset)
1303 return false;
1304
1305 if (adev->has_hw_reset) {
1306 adev->has_hw_reset = false;
1307 return true;
1308 }
1309
1310 /* bios scratch used on CIK+ */
1311 if (adev->asic_type >= CHIP_BONAIRE)
1312 return amdgpu_atombios_scratch_need_asic_init(adev);
1313
1314 /* check MEM_SIZE for older asics */
1315 reg = amdgpu_asic_get_config_memsize(adev);
1316
1317 if ((reg != 0) && (reg != 0xffffffff))
1318 return false;
1319
1320 return true;
1321 }
1322
1323 /*
1324 * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1325 * speed switching. Until we have confirmation from Intel that a specific host
1326 * supports it, it's safer that we keep it disabled for all.
1327 *
1328 * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1329 * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1330 */
1331 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1332 {
1333 #if IS_ENABLED(CONFIG_X86)
1334 struct cpuinfo_x86 *c = &cpu_data(0);
1335
1336 if (c->x86_vendor == X86_VENDOR_INTEL)
1337 return false;
1338 #endif
1339 return true;
1340 }
1341
1342 /**
1343 * amdgpu_device_should_use_aspm - check if the device should program ASPM
1344 *
1345 * @adev: amdgpu_device pointer
1346 *
1347 * Confirm whether the module parameter and pcie bridge agree that ASPM should
1348 * be set for this device.
1349 *
1350 * Returns true if it should be used or false if not.
1351 */
1352 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1353 {
1354 switch (amdgpu_aspm) {
1355 case -1:
1356 break;
1357 case 0:
1358 return false;
1359 case 1:
1360 return true;
1361 default:
1362 return false;
1363 }
1364 return pcie_aspm_enabled(adev->pdev);
1365 }
1366
1367 bool amdgpu_device_aspm_support_quirk(void)
1368 {
1369 #if IS_ENABLED(CONFIG_X86)
1370 struct cpuinfo_x86 *c = &cpu_data(0);
1371
1372 return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1373 #else
1374 return true;
1375 #endif
1376 }
1377
1378 /* if we get transitioned to only one device, take VGA back */
1379 /**
1380 * amdgpu_device_vga_set_decode - enable/disable vga decode
1381 *
1382 * @pdev: PCI device pointer
1383 * @state: enable/disable vga decode
1384 *
1385 * Enable/disable vga decode (all asics).
1386 * Returns VGA resource flags.
1387 */
1388 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1389 bool state)
1390 {
1391 struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1392 amdgpu_asic_set_vga_state(adev, state);
1393 if (state)
1394 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1395 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1396 else
1397 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1398 }
1399
1400 /**
1401 * amdgpu_device_check_block_size - validate the vm block size
1402 *
1403 * @adev: amdgpu_device pointer
1404 *
1405 * Validates the vm block size specified via module parameter.
1406 * The vm block size defines number of bits in page table versus page directory,
1407 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1408 * page table and the remaining bits are in the page directory.
1409 */
1410 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1411 {
1412 /* defines number of bits in page table versus page directory,
1413 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1414 * page table and the remaining bits are in the page directory */
1415 if (amdgpu_vm_block_size == -1)
1416 return;
1417
1418 if (amdgpu_vm_block_size < 9) {
1419 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1420 amdgpu_vm_block_size);
1421 amdgpu_vm_block_size = -1;
1422 }
1423 }
1424
1425 /**
1426 * amdgpu_device_check_vm_size - validate the vm size
1427 *
1428 * @adev: amdgpu_device pointer
1429 *
1430 * Validates the vm size in GB specified via module parameter.
1431 * The VM size is the size of the GPU virtual memory space in GB.
1432 */
1433 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1434 {
1435 /* no need to check the default value */
1436 if (amdgpu_vm_size == -1)
1437 return;
1438
1439 if (amdgpu_vm_size < 1) {
1440 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1441 amdgpu_vm_size);
1442 amdgpu_vm_size = -1;
1443 }
1444 }
1445
1446 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1447 {
1448 struct sysinfo si;
1449 bool is_os_64 = (sizeof(void *) == 8);
1450 uint64_t total_memory;
1451 uint64_t dram_size_seven_GB = 0x1B8000000;
1452 uint64_t dram_size_three_GB = 0xB8000000;
1453
1454 if (amdgpu_smu_memory_pool_size == 0)
1455 return;
1456
1457 if (!is_os_64) {
1458 DRM_WARN("Not 64-bit OS, feature not supported\n");
1459 goto def_value;
1460 }
1461 si_meminfo(&si);
1462 total_memory = (uint64_t)si.totalram * si.mem_unit;
1463
1464 if ((amdgpu_smu_memory_pool_size == 1) ||
1465 (amdgpu_smu_memory_pool_size == 2)) {
1466 if (total_memory < dram_size_three_GB)
1467 goto def_value1;
1468 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1469 (amdgpu_smu_memory_pool_size == 8)) {
1470 if (total_memory < dram_size_seven_GB)
1471 goto def_value1;
1472 } else {
1473 DRM_WARN("Smu memory pool size not supported\n");
1474 goto def_value;
1475 }
1476 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1477
1478 return;
1479
1480 def_value1:
1481 DRM_WARN("No enough system memory\n");
1482 def_value:
1483 adev->pm.smu_prv_buffer_size = 0;
1484 }
1485
1486 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1487 {
1488 if (!(adev->flags & AMD_IS_APU) ||
1489 adev->asic_type < CHIP_RAVEN)
1490 return 0;
1491
1492 switch (adev->asic_type) {
1493 case CHIP_RAVEN:
1494 if (adev->pdev->device == 0x15dd)
1495 adev->apu_flags |= AMD_APU_IS_RAVEN;
1496 if (adev->pdev->device == 0x15d8)
1497 adev->apu_flags |= AMD_APU_IS_PICASSO;
1498 break;
1499 case CHIP_RENOIR:
1500 if ((adev->pdev->device == 0x1636) ||
1501 (adev->pdev->device == 0x164c))
1502 adev->apu_flags |= AMD_APU_IS_RENOIR;
1503 else
1504 adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1505 break;
1506 case CHIP_VANGOGH:
1507 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1508 break;
1509 case CHIP_YELLOW_CARP:
1510 break;
1511 case CHIP_CYAN_SKILLFISH:
1512 if (adev->pdev->device == 0x13FE)
1513 adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1514 break;
1515 default:
1516 return -EINVAL;
1517 }
1518
1519 return 0;
1520 }
1521
1522 /**
1523 * amdgpu_device_check_arguments - validate module params
1524 *
1525 * @adev: amdgpu_device pointer
1526 *
1527 * Validates certain module parameters and updates
1528 * the associated values used by the driver (all asics).
1529 */
1530 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1531 {
1532 if (amdgpu_sched_jobs < 4) {
1533 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1534 amdgpu_sched_jobs);
1535 amdgpu_sched_jobs = 4;
1536 } else if (!is_power_of_2(amdgpu_sched_jobs)){
1537 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1538 amdgpu_sched_jobs);
1539 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1540 }
1541
1542 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1543 /* gart size must be greater or equal to 32M */
1544 dev_warn(adev->dev, "gart size (%d) too small\n",
1545 amdgpu_gart_size);
1546 amdgpu_gart_size = -1;
1547 }
1548
1549 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1550 /* gtt size must be greater or equal to 32M */
1551 dev_warn(adev->dev, "gtt size (%d) too small\n",
1552 amdgpu_gtt_size);
1553 amdgpu_gtt_size = -1;
1554 }
1555
1556 /* valid range is between 4 and 9 inclusive */
1557 if (amdgpu_vm_fragment_size != -1 &&
1558 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1559 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1560 amdgpu_vm_fragment_size = -1;
1561 }
1562
1563 if (amdgpu_sched_hw_submission < 2) {
1564 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1565 amdgpu_sched_hw_submission);
1566 amdgpu_sched_hw_submission = 2;
1567 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1568 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1569 amdgpu_sched_hw_submission);
1570 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1571 }
1572
1573 amdgpu_device_check_smu_prv_buffer_size(adev);
1574
1575 amdgpu_device_check_vm_size(adev);
1576
1577 amdgpu_device_check_block_size(adev);
1578
1579 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1580
1581 amdgpu_gmc_tmz_set(adev);
1582
1583 amdgpu_gmc_noretry_set(adev);
1584
1585 return 0;
1586 }
1587
1588 /**
1589 * amdgpu_switcheroo_set_state - set switcheroo state
1590 *
1591 * @pdev: pci dev pointer
1592 * @state: vga_switcheroo state
1593 *
1594 * Callback for the switcheroo driver. Suspends or resumes the
1595 * asics before or after it is powered up using ACPI methods.
1596 */
1597 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1598 enum vga_switcheroo_state state)
1599 {
1600 struct drm_device *dev = pci_get_drvdata(pdev);
1601 int r;
1602
1603 if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1604 return;
1605
1606 if (state == VGA_SWITCHEROO_ON) {
1607 pr_info("switched on\n");
1608 /* don't suspend or resume card normally */
1609 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1610
1611 pci_set_power_state(pdev, PCI_D0);
1612 amdgpu_device_load_pci_state(pdev);
1613 r = pci_enable_device(pdev);
1614 if (r)
1615 DRM_WARN("pci_enable_device failed (%d)\n", r);
1616 amdgpu_device_resume(dev, true);
1617
1618 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1619 } else {
1620 pr_info("switched off\n");
1621 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1622 amdgpu_device_suspend(dev, true);
1623 amdgpu_device_cache_pci_state(pdev);
1624 /* Shut down the device */
1625 pci_disable_device(pdev);
1626 pci_set_power_state(pdev, PCI_D3cold);
1627 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1628 }
1629 }
1630
1631 /**
1632 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1633 *
1634 * @pdev: pci dev pointer
1635 *
1636 * Callback for the switcheroo driver. Check if the switcheroo
1637 * state can be changed.
1638 * Returns true if the state can be changed, false if not.
1639 */
1640 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1641 {
1642 struct drm_device *dev = pci_get_drvdata(pdev);
1643
1644 /*
1645 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1646 * locking inversion with the driver load path. And the access here is
1647 * completely racy anyway. So don't bother with locking for now.
1648 */
1649 return atomic_read(&dev->open_count) == 0;
1650 }
1651
1652 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1653 .set_gpu_state = amdgpu_switcheroo_set_state,
1654 .reprobe = NULL,
1655 .can_switch = amdgpu_switcheroo_can_switch,
1656 };
1657
1658 /**
1659 * amdgpu_device_ip_set_clockgating_state - set the CG state
1660 *
1661 * @dev: amdgpu_device pointer
1662 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1663 * @state: clockgating state (gate or ungate)
1664 *
1665 * Sets the requested clockgating state for all instances of
1666 * the hardware IP specified.
1667 * Returns the error code from the last instance.
1668 */
1669 int amdgpu_device_ip_set_clockgating_state(void *dev,
1670 enum amd_ip_block_type block_type,
1671 enum amd_clockgating_state state)
1672 {
1673 struct amdgpu_device *adev = dev;
1674 int i, r = 0;
1675
1676 for (i = 0; i < adev->num_ip_blocks; i++) {
1677 if (!adev->ip_blocks[i].status.valid)
1678 continue;
1679 if (adev->ip_blocks[i].version->type != block_type)
1680 continue;
1681 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1682 continue;
1683 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1684 (void *)adev, state);
1685 if (r)
1686 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1687 adev->ip_blocks[i].version->funcs->name, r);
1688 }
1689 return r;
1690 }
1691
1692 /**
1693 * amdgpu_device_ip_set_powergating_state - set the PG state
1694 *
1695 * @dev: amdgpu_device pointer
1696 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1697 * @state: powergating state (gate or ungate)
1698 *
1699 * Sets the requested powergating state for all instances of
1700 * the hardware IP specified.
1701 * Returns the error code from the last instance.
1702 */
1703 int amdgpu_device_ip_set_powergating_state(void *dev,
1704 enum amd_ip_block_type block_type,
1705 enum amd_powergating_state state)
1706 {
1707 struct amdgpu_device *adev = dev;
1708 int i, r = 0;
1709
1710 for (i = 0; i < adev->num_ip_blocks; i++) {
1711 if (!adev->ip_blocks[i].status.valid)
1712 continue;
1713 if (adev->ip_blocks[i].version->type != block_type)
1714 continue;
1715 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1716 continue;
1717 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1718 (void *)adev, state);
1719 if (r)
1720 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1721 adev->ip_blocks[i].version->funcs->name, r);
1722 }
1723 return r;
1724 }
1725
1726 /**
1727 * amdgpu_device_ip_get_clockgating_state - get the CG state
1728 *
1729 * @adev: amdgpu_device pointer
1730 * @flags: clockgating feature flags
1731 *
1732 * Walks the list of IPs on the device and updates the clockgating
1733 * flags for each IP.
1734 * Updates @flags with the feature flags for each hardware IP where
1735 * clockgating is enabled.
1736 */
1737 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1738 u32 *flags)
1739 {
1740 int i;
1741
1742 for (i = 0; i < adev->num_ip_blocks; i++) {
1743 if (!adev->ip_blocks[i].status.valid)
1744 continue;
1745 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1746 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1747 }
1748 }
1749
1750 /**
1751 * amdgpu_device_ip_wait_for_idle - wait for idle
1752 *
1753 * @adev: amdgpu_device pointer
1754 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1755 *
1756 * Waits for the requested hardware IP to be idle.
1757 * Returns 0 for success or a negative error code on failure.
1758 */
1759 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1760 enum amd_ip_block_type block_type)
1761 {
1762 int i, r;
1763
1764 for (i = 0; i < adev->num_ip_blocks; i++) {
1765 if (!adev->ip_blocks[i].status.valid)
1766 continue;
1767 if (adev->ip_blocks[i].version->type == block_type) {
1768 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1769 if (r)
1770 return r;
1771 break;
1772 }
1773 }
1774 return 0;
1775
1776 }
1777
1778 /**
1779 * amdgpu_device_ip_is_idle - is the hardware IP idle
1780 *
1781 * @adev: amdgpu_device pointer
1782 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1783 *
1784 * Check if the hardware IP is idle or not.
1785 * Returns true if the IP is idle, false if not.
1786 */
1787 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1788 enum amd_ip_block_type block_type)
1789 {
1790 int i;
1791
1792 for (i = 0; i < adev->num_ip_blocks; i++) {
1793 if (!adev->ip_blocks[i].status.valid)
1794 continue;
1795 if (adev->ip_blocks[i].version->type == block_type)
1796 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1797 }
1798 return true;
1799
1800 }
1801
1802 /**
1803 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1804 *
1805 * @adev: amdgpu_device pointer
1806 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1807 *
1808 * Returns a pointer to the hardware IP block structure
1809 * if it exists for the asic, otherwise NULL.
1810 */
1811 struct amdgpu_ip_block *
1812 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1813 enum amd_ip_block_type type)
1814 {
1815 int i;
1816
1817 for (i = 0; i < adev->num_ip_blocks; i++)
1818 if (adev->ip_blocks[i].version->type == type)
1819 return &adev->ip_blocks[i];
1820
1821 return NULL;
1822 }
1823
1824 /**
1825 * amdgpu_device_ip_block_version_cmp
1826 *
1827 * @adev: amdgpu_device pointer
1828 * @type: enum amd_ip_block_type
1829 * @major: major version
1830 * @minor: minor version
1831 *
1832 * return 0 if equal or greater
1833 * return 1 if smaller or the ip_block doesn't exist
1834 */
1835 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1836 enum amd_ip_block_type type,
1837 u32 major, u32 minor)
1838 {
1839 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1840
1841 if (ip_block && ((ip_block->version->major > major) ||
1842 ((ip_block->version->major == major) &&
1843 (ip_block->version->minor >= minor))))
1844 return 0;
1845
1846 return 1;
1847 }
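/*
 * Illustrative example (hypothetical caller): checking that the SMC block is
 * at least version 7.0 would be
 *
 *   if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_SMC, 7, 0))
 *           ... SMC is v7.0 or newer ...
 */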
1848
1849 /**
1850 * amdgpu_device_ip_block_add
1851 *
1852 * @adev: amdgpu_device pointer
1853 * @ip_block_version: pointer to the IP to add
1854 *
1855 * Adds the IP block driver information to the collection of IPs
1856 * on the asic.
1857 */
1858 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1859 const struct amdgpu_ip_block_version *ip_block_version)
1860 {
1861 if (!ip_block_version)
1862 return -EINVAL;
1863
1864 switch (ip_block_version->type) {
1865 case AMD_IP_BLOCK_TYPE_VCN:
1866 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1867 return 0;
1868 break;
1869 case AMD_IP_BLOCK_TYPE_JPEG:
1870 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1871 return 0;
1872 break;
1873 default:
1874 break;
1875 }
1876
1877 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1878 ip_block_version->funcs->name);
1879
1880 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1881
1882 return 0;
1883 }
1884
1885 /**
1886 * amdgpu_device_enable_virtual_display - enable virtual display feature
1887 *
1888 * @adev: amdgpu_device pointer
1889 *
1890 * Enables the virtual display feature if the user has enabled it via
1891 * the module parameter virtual_display. This feature provides a virtual
1892 * display hardware on headless boards or in virtualized environments.
1893 * This function parses and validates the configuration string specified by
1894 * the user and configures the virtual display settings (number of
1895 * virtual connectors, crtcs, etc.) specified.
1896 */
1897 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1898 {
1899 adev->enable_virtual_display = false;
1900
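	/*
	 * The virtual_display module parameter is a semicolon separated list
	 * of "<pci bus id>[,<crtc count>]" entries, where "all" matches every
	 * device; e.g. (illustrative address) "0000:03:00.0,2".
	 */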
1901 if (amdgpu_virtual_display) {
1902 const char *pci_address_name = pci_name(adev->pdev);
1903 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1904
1905 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1906 pciaddstr_tmp = pciaddstr;
1907 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1908 pciaddname = strsep(&pciaddname_tmp, ",");
1909 if (!strcmp("all", pciaddname)
1910 || !strcmp(pci_address_name, pciaddname)) {
1911 long num_crtc;
1912 int res = -1;
1913
1914 adev->enable_virtual_display = true;
1915
1916 if (pciaddname_tmp)
1917 res = kstrtol(pciaddname_tmp, 10,
1918 &num_crtc);
1919
1920 if (!res) {
1921 if (num_crtc < 1)
1922 num_crtc = 1;
1923 if (num_crtc > 6)
1924 num_crtc = 6;
1925 adev->mode_info.num_crtc = num_crtc;
1926 } else {
1927 adev->mode_info.num_crtc = 1;
1928 }
1929 break;
1930 }
1931 }
1932
1933 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1934 amdgpu_virtual_display, pci_address_name,
1935 adev->enable_virtual_display, adev->mode_info.num_crtc);
1936
1937 kfree(pciaddstr);
1938 }
1939 }
1940
1941 /**
1942 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1943 *
1944 * @adev: amdgpu_device pointer
1945 *
1946 * Parses the asic configuration parameters specified in the gpu info
1947 * firmware and makes them available to the driver for use in configuring
1948 * the asic.
1949 * Returns 0 on success, -EINVAL on failure.
1950 */
1951 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1952 {
1953 const char *chip_name;
1954 char fw_name[40];
1955 int err;
1956 const struct gpu_info_firmware_header_v1_0 *hdr;
1957
1958 adev->firmware.gpu_info_fw = NULL;
1959
1960 if (adev->mman.discovery_bin) {
1961 amdgpu_discovery_get_gfx_info(adev);
1962
1963 /*
1964 * FIXME: The bounding box is still needed by Navi12, so
1965 * temporarily read it from gpu_info firmware. Should be dropped
1966 * when DAL no longer needs it.
1967 */
1968 if (adev->asic_type != CHIP_NAVI12)
1969 return 0;
1970 }
1971
1972 switch (adev->asic_type) {
1973 #ifdef CONFIG_DRM_AMDGPU_SI
1974 case CHIP_VERDE:
1975 case CHIP_TAHITI:
1976 case CHIP_PITCAIRN:
1977 case CHIP_OLAND:
1978 case CHIP_HAINAN:
1979 #endif
1980 #ifdef CONFIG_DRM_AMDGPU_CIK
1981 case CHIP_BONAIRE:
1982 case CHIP_HAWAII:
1983 case CHIP_KAVERI:
1984 case CHIP_KABINI:
1985 case CHIP_MULLINS:
1986 #endif
1987 case CHIP_TOPAZ:
1988 case CHIP_TONGA:
1989 case CHIP_FIJI:
1990 case CHIP_POLARIS10:
1991 case CHIP_POLARIS11:
1992 case CHIP_POLARIS12:
1993 case CHIP_VEGAM:
1994 case CHIP_CARRIZO:
1995 case CHIP_STONEY:
1996 case CHIP_VEGA20:
1997 case CHIP_ALDEBARAN:
1998 case CHIP_SIENNA_CICHLID:
1999 case CHIP_NAVY_FLOUNDER:
2000 case CHIP_DIMGREY_CAVEFISH:
2001 case CHIP_BEIGE_GOBY:
2002 default:
2003 return 0;
2004 case CHIP_VEGA10:
2005 chip_name = "vega10";
2006 break;
2007 case CHIP_VEGA12:
2008 chip_name = "vega12";
2009 break;
2010 case CHIP_RAVEN:
2011 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
2012 chip_name = "raven2";
2013 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
2014 chip_name = "picasso";
2015 else
2016 chip_name = "raven";
2017 break;
2018 case CHIP_ARCTURUS:
2019 chip_name = "arcturus";
2020 break;
2021 case CHIP_RENOIR:
2022 if (adev->apu_flags & AMD_APU_IS_RENOIR)
2023 chip_name = "renoir";
2024 else
2025 chip_name = "green_sardine";
2026 break;
2027 case CHIP_NAVI10:
2028 chip_name = "navi10";
2029 break;
2030 case CHIP_NAVI14:
2031 chip_name = "navi14";
2032 break;
2033 case CHIP_NAVI12:
2034 chip_name = "navi12";
2035 break;
2036 case CHIP_VANGOGH:
2037 chip_name = "vangogh";
2038 break;
2039 case CHIP_YELLOW_CARP:
2040 chip_name = "yellow_carp";
2041 break;
2042 }
2043
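	/* gpu_info firmware files follow the "amdgpu/<chip>_gpu_info.bin"
	 * naming scheme, e.g. "amdgpu/vega10_gpu_info.bin" for "vega10".
	 */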
2044 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
2045 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
2046 if (err) {
2047 dev_err(adev->dev,
2048 "Failed to load gpu_info firmware \"%s\"\n",
2049 fw_name);
2050 goto out;
2051 }
2052 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
2053 if (err) {
2054 dev_err(adev->dev,
2055 "Failed to validate gpu_info firmware \"%s\"\n",
2056 fw_name);
2057 goto out;
2058 }
2059
2060 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
2061 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
2062
2063 switch (hdr->version_major) {
2064 case 1:
2065 {
2066 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2067 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2068 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2069
2070 /*
2071 * Should be dropped when DAL no longer needs it.
2072 */
2073 if (adev->asic_type == CHIP_NAVI12)
2074 goto parse_soc_bounding_box;
2075
2076 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2077 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2078 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2079 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2080 adev->gfx.config.max_texture_channel_caches =
2081 le32_to_cpu(gpu_info_fw->gc_num_tccs);
2082 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2083 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2084 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2085 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2086 adev->gfx.config.double_offchip_lds_buf =
2087 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2088 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2089 adev->gfx.cu_info.max_waves_per_simd =
2090 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2091 adev->gfx.cu_info.max_scratch_slots_per_cu =
2092 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2093 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2094 if (hdr->version_minor >= 1) {
2095 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2096 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2097 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2098 adev->gfx.config.num_sc_per_sh =
2099 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2100 adev->gfx.config.num_packer_per_sc =
2101 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2102 }
2103
2104 parse_soc_bounding_box:
2105 /*
2106 * SOC bounding box info is not integrated in the discovery table,
2107 * so we always need to parse it from the gpu_info firmware when needed.
2108 */
2109 if (hdr->version_minor == 2) {
2110 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2111 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2112 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2113 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2114 }
2115 break;
2116 }
2117 default:
2118 dev_err(adev->dev,
2119 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2120 err = -EINVAL;
2121 goto out;
2122 }
2123 out:
2124 return err;
2125 }
2126
2127 /**
2128 * amdgpu_device_ip_early_init - run early init for hardware IPs
2129 *
2130 * @adev: amdgpu_device pointer
2131 *
2132 * Early initialization pass for hardware IPs. The hardware IPs that make
2133 * up each asic are discovered and each IP's early_init callback is run. This
2134 * is the first stage in initializing the asic.
2135 * Returns 0 on success, negative error code on failure.
2136 */
2137 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2138 {
2139 struct drm_device *dev = adev_to_drm(adev);
2140 struct pci_dev *parent;
2141 int i, r;
2142
2143 amdgpu_device_enable_virtual_display(adev);
2144
2145 if (amdgpu_sriov_vf(adev)) {
2146 r = amdgpu_virt_request_full_gpu(adev, true);
2147 if (r)
2148 return r;
2149 }
2150
2151 switch (adev->asic_type) {
2152 #ifdef CONFIG_DRM_AMDGPU_SI
2153 case CHIP_VERDE:
2154 case CHIP_TAHITI:
2155 case CHIP_PITCAIRN:
2156 case CHIP_OLAND:
2157 case CHIP_HAINAN:
2158 adev->family = AMDGPU_FAMILY_SI;
2159 r = si_set_ip_blocks(adev);
2160 if (r)
2161 return r;
2162 break;
2163 #endif
2164 #ifdef CONFIG_DRM_AMDGPU_CIK
2165 case CHIP_BONAIRE:
2166 case CHIP_HAWAII:
2167 case CHIP_KAVERI:
2168 case CHIP_KABINI:
2169 case CHIP_MULLINS:
2170 if (adev->flags & AMD_IS_APU)
2171 adev->family = AMDGPU_FAMILY_KV;
2172 else
2173 adev->family = AMDGPU_FAMILY_CI;
2174
2175 r = cik_set_ip_blocks(adev);
2176 if (r)
2177 return r;
2178 break;
2179 #endif
2180 case CHIP_TOPAZ:
2181 case CHIP_TONGA:
2182 case CHIP_FIJI:
2183 case CHIP_POLARIS10:
2184 case CHIP_POLARIS11:
2185 case CHIP_POLARIS12:
2186 case CHIP_VEGAM:
2187 case CHIP_CARRIZO:
2188 case CHIP_STONEY:
2189 if (adev->flags & AMD_IS_APU)
2190 adev->family = AMDGPU_FAMILY_CZ;
2191 else
2192 adev->family = AMDGPU_FAMILY_VI;
2193
2194 r = vi_set_ip_blocks(adev);
2195 if (r)
2196 return r;
2197 break;
2198 case CHIP_VEGA10:
2199 case CHIP_VEGA12:
2200 case CHIP_VEGA20:
2201 case CHIP_RAVEN:
2202 case CHIP_ARCTURUS:
2203 case CHIP_RENOIR:
2204 case CHIP_ALDEBARAN:
2205 if (adev->flags & AMD_IS_APU)
2206 adev->family = AMDGPU_FAMILY_RV;
2207 else
2208 adev->family = AMDGPU_FAMILY_AI;
2209
2210 r = soc15_set_ip_blocks(adev);
2211 if (r)
2212 return r;
2213 break;
2214 case CHIP_NAVI10:
2215 case CHIP_NAVI14:
2216 case CHIP_NAVI12:
2217 case CHIP_SIENNA_CICHLID:
2218 case CHIP_NAVY_FLOUNDER:
2219 case CHIP_DIMGREY_CAVEFISH:
2220 case CHIP_BEIGE_GOBY:
2221 case CHIP_VANGOGH:
2222 case CHIP_YELLOW_CARP:
2223 case CHIP_CYAN_SKILLFISH:
2224 if (adev->asic_type == CHIP_VANGOGH)
2225 adev->family = AMDGPU_FAMILY_VGH;
2226 else if (adev->asic_type == CHIP_YELLOW_CARP)
2227 adev->family = AMDGPU_FAMILY_YC;
2228 else
2229 adev->family = AMDGPU_FAMILY_NV;
2230
2231 r = nv_set_ip_blocks(adev);
2232 if (r)
2233 return r;
2234 break;
2235 default:
2236 /* FIXME: not supported yet */
2237 return -EINVAL;
2238 }
2239
2240 if (amdgpu_has_atpx() &&
2241 (amdgpu_is_atpx_hybrid() ||
2242 amdgpu_has_atpx_dgpu_power_cntl()) &&
2243 ((adev->flags & AMD_IS_APU) == 0) &&
2244 !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2245 adev->flags |= AMD_IS_PX;
2246
2247 if (!(adev->flags & AMD_IS_APU)) {
2248 parent = pcie_find_root_port(adev->pdev);
2249 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2250 }
2251
2252 amdgpu_amdkfd_device_probe(adev);
2253
2254 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2255 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2256 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2257 if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2258 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2259
2260 for (i = 0; i < adev->num_ip_blocks; i++) {
2261 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2262 DRM_ERROR("disabled ip block: %d <%s>\n",
2263 i, adev->ip_blocks[i].version->funcs->name);
2264 adev->ip_blocks[i].status.valid = false;
2265 } else {
2266 if (adev->ip_blocks[i].version->funcs->early_init) {
2267 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2268 if (r == -ENOENT) {
2269 adev->ip_blocks[i].status.valid = false;
2270 } else if (r) {
2271 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2272 adev->ip_blocks[i].version->funcs->name, r);
2273 return r;
2274 } else {
2275 adev->ip_blocks[i].status.valid = true;
2276 }
2277 } else {
2278 adev->ip_blocks[i].status.valid = true;
2279 }
2280 }
2281 /* get the vbios after the asic_funcs are set up */
2282 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2283 r = amdgpu_device_parse_gpu_info_fw(adev);
2284 if (r)
2285 return r;
2286
2287 /* Read BIOS */
2288 if (!amdgpu_get_bios(adev))
2289 return -EINVAL;
2290
2291 r = amdgpu_atombios_init(adev);
2292 if (r) {
2293 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2294 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2295 return r;
2296 }
2297
2298 /* get pf2vf msg info at its earliest time */
2299 if (amdgpu_sriov_vf(adev))
2300 amdgpu_virt_init_data_exchange(adev);
2301
2302 }
2303 }
2304
2305 adev->cg_flags &= amdgpu_cg_mask;
2306 adev->pg_flags &= amdgpu_pg_mask;
2307
2308 return 0;
2309 }
2310
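/*
 * Phase 1 of hw_init: bring up only the COMMON and IH blocks (plus PSP when
 * running as an SR-IOV VF) so firmware loading can happen before the
 * remaining blocks are initialized in phase 2.
 */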
2311 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2312 {
2313 int i, r;
2314
2315 for (i = 0; i < adev->num_ip_blocks; i++) {
2316 if (!adev->ip_blocks[i].status.sw)
2317 continue;
2318 if (adev->ip_blocks[i].status.hw)
2319 continue;
2320 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2321 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2322 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2323 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2324 if (r) {
2325 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2326 adev->ip_blocks[i].version->funcs->name, r);
2327 return r;
2328 }
2329 adev->ip_blocks[i].status.hw = true;
2330 }
2331 }
2332
2333 return 0;
2334 }
2335
2336 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2337 {
2338 int i, r;
2339
2340 for (i = 0; i < adev->num_ip_blocks; i++) {
2341 if (!adev->ip_blocks[i].status.sw)
2342 continue;
2343 if (adev->ip_blocks[i].status.hw)
2344 continue;
2345 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2346 if (r) {
2347 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2348 adev->ip_blocks[i].version->funcs->name, r);
2349 return r;
2350 }
2351 adev->ip_blocks[i].status.hw = true;
2352 }
2353
2354 return 0;
2355 }
2356
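/*
 * Firmware loading step: on VEGA10 and newer the PSP block is resumed (when
 * coming back from reset/suspend) or hw_init'ed here, then the SMU firmware
 * is loaded (skipped for SR-IOV VFs except on Tonga).
 */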
2357 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2358 {
2359 int r = 0;
2360 int i;
2361 uint32_t smu_version;
2362
2363 if (adev->asic_type >= CHIP_VEGA10) {
2364 for (i = 0; i < adev->num_ip_blocks; i++) {
2365 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2366 continue;
2367
2368 if (!adev->ip_blocks[i].status.sw)
2369 continue;
2370
2371 /* no need to do the fw loading again if already done */
2372 if (adev->ip_blocks[i].status.hw)
2373 break;
2374
2375 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2376 r = adev->ip_blocks[i].version->funcs->resume(adev);
2377 if (r) {
2378 DRM_ERROR("resume of IP block <%s> failed %d\n",
2379 adev->ip_blocks[i].version->funcs->name, r);
2380 return r;
2381 }
2382 } else {
2383 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2384 if (r) {
2385 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2386 adev->ip_blocks[i].version->funcs->name, r);
2387 return r;
2388 }
2389 }
2390
2391 adev->ip_blocks[i].status.hw = true;
2392 break;
2393 }
2394 }
2395
2396 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2397 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2398
2399 return r;
2400 }
2401
2402 /**
2403 * amdgpu_device_ip_init - run init for hardware IPs
2404 *
2405 * @adev: amdgpu_device pointer
2406 *
2407 * Main initialization pass for hardware IPs. The list of all the hardware
2408 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2409 * are run. sw_init initializes the software state associated with each IP
2410 * and hw_init initializes the hardware associated with each IP.
2411 * Returns 0 on success, negative error code on failure.
2412 */
2413 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2414 {
2415 int i, r;
2416
2417 r = amdgpu_ras_init(adev);
2418 if (r)
2419 return r;
2420
2421 for (i = 0; i < adev->num_ip_blocks; i++) {
2422 if (!adev->ip_blocks[i].status.valid)
2423 continue;
2424 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2425 if (r) {
2426 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2427 adev->ip_blocks[i].version->funcs->name, r);
2428 goto init_failed;
2429 }
2430 adev->ip_blocks[i].status.sw = true;
2431
2432 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2433 /* need to do common hw init early so everything is set up for gmc */
2434 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2435 if (r) {
2436 DRM_ERROR("hw_init %d failed %d\n", i, r);
2437 goto init_failed;
2438 }
2439 adev->ip_blocks[i].status.hw = true;
2440 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2441 /* need to do gmc hw init early so we can allocate gpu mem */
2442 /* Try to reserve bad pages early */
2443 if (amdgpu_sriov_vf(adev))
2444 amdgpu_virt_exchange_data(adev);
2445
2446 r = amdgpu_device_vram_scratch_init(adev);
2447 if (r) {
2448 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2449 goto init_failed;
2450 }
2451 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2452 if (r) {
2453 DRM_ERROR("hw_init %d failed %d\n", i, r);
2454 goto init_failed;
2455 }
2456 r = amdgpu_device_wb_init(adev);
2457 if (r) {
2458 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2459 goto init_failed;
2460 }
2461 adev->ip_blocks[i].status.hw = true;
2462
2463 /* right after GMC hw init, we create CSA */
2464 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2465 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2466 AMDGPU_GEM_DOMAIN_VRAM,
2467 AMDGPU_CSA_SIZE);
2468 if (r) {
2469 DRM_ERROR("allocate CSA failed %d\n", r);
2470 goto init_failed;
2471 }
2472 }
2473 }
2474 }
2475
2476 if (amdgpu_sriov_vf(adev))
2477 amdgpu_virt_init_data_exchange(adev);
2478
2479 r = amdgpu_ib_pool_init(adev);
2480 if (r) {
2481 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2482 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2483 goto init_failed;
2484 }
2485
2486 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2487 if (r)
2488 goto init_failed;
2489
2490 r = amdgpu_device_ip_hw_init_phase1(adev);
2491 if (r)
2492 goto init_failed;
2493
2494 r = amdgpu_device_fw_loading(adev);
2495 if (r)
2496 goto init_failed;
2497
2498 r = amdgpu_device_ip_hw_init_phase2(adev);
2499 if (r)
2500 goto init_failed;
2501
2502 /*
2503 * retired pages will be loaded from eeprom and reserved here,
2504 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2505 * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2506 * for I2C communication, which is only true at this point.
2507 *
2508 * amdgpu_ras_recovery_init may fail, but the caller only cares about a
2509 * failure caused by a bad gpu situation and stops the amdgpu init process
2510 * accordingly. For other failure cases, it will still release all
2511 * the resources and print an error message, rather than returning a
2512 * negative value to the upper level.
2513 *
2514 * Note: theoretically, this should be called before all vram allocations
2515 * to protect retired pages from being reused
2516 */
2517 r = amdgpu_ras_recovery_init(adev);
2518 if (r)
2519 goto init_failed;
2520
2521 if (adev->gmc.xgmi.num_physical_nodes > 1)
2522 amdgpu_xgmi_add_device(adev);
2523
2524 /* Don't init kfd if the whole hive needs to be reset during init */
2525 if (!adev->gmc.xgmi.pending_reset)
2526 amdgpu_amdkfd_device_init(adev);
2527
2528 r = amdgpu_amdkfd_resume_iommu(adev);
2529 if (r)
2530 goto init_failed;
2531
2532 amdgpu_fru_get_product_info(adev);
2533
2534 init_failed:
2535
2536 return r;
2537 }
2538
2539 /**
2540 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2541 *
2542 * @adev: amdgpu_device pointer
2543 *
2544 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2545 * this function before a GPU reset. If the value is retained after a
2546 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2547 */
2548 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2549 {
2550 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2551 }
2552
2553 /**
2554 * amdgpu_device_check_vram_lost - check if vram is valid
2555 *
2556 * @adev: amdgpu_device pointer
2557 *
2558 * Checks the reset magic value written to the gart pointer in VRAM.
2559 * The driver calls this after a GPU reset to see if the contents of
2560 * VRAM were lost or not.
2561 * returns true if vram is lost, false if not.
2562 */
2563 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2564 {
2565 if (memcmp(adev->gart.ptr, adev->reset_magic,
2566 AMDGPU_RESET_MAGIC_NUM))
2567 return true;
2568
2569 if (!amdgpu_in_reset(adev))
2570 return false;
2571
2572 /*
2573 * For all ASICs with baco/mode1 reset, the VRAM is
2574 * always assumed to be lost.
2575 */
2576 switch (amdgpu_asic_reset_method(adev)) {
2577 case AMD_RESET_METHOD_BACO:
2578 case AMD_RESET_METHOD_MODE1:
2579 return true;
2580 default:
2581 return false;
2582 }
2583 }
2584
2585 /**
2586 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2587 *
2588 * @adev: amdgpu_device pointer
2589 * @state: clockgating state (gate or ungate)
2590 *
2591 * The list of all the hardware IPs that make up the asic is walked and the
2592 * set_clockgating_state callbacks are run.
2593 * The late init pass enables clockgating for the hardware IPs;
2594 * the fini and suspend passes disable it.
2595 * Returns 0 on success, negative error code on failure.
2596 */
2597
2598 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2599 enum amd_clockgating_state state)
2600 {
2601 int i, j, r;
2602
2603 if (amdgpu_emu_mode == 1)
2604 return 0;
2605
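	/* Gating walks the IP list front to back; ungating walks it in
	 * reverse so blocks are ungated in the opposite order they were gated.
	 */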
2606 for (j = 0; j < adev->num_ip_blocks; j++) {
2607 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2608 if (!adev->ip_blocks[i].status.late_initialized)
2609 continue;
2610 /* skip CG for GFX on S0ix */
2611 if (adev->in_s0ix &&
2612 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2613 continue;
2614 /* skip CG for VCE/UVD, it's handled specially */
2615 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2616 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2617 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2618 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2619 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2620 /* enable clockgating to save power */
2621 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2622 state);
2623 if (r) {
2624 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2625 adev->ip_blocks[i].version->funcs->name, r);
2626 return r;
2627 }
2628 }
2629 }
2630
2631 return 0;
2632 }
2633
2634 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2635 enum amd_powergating_state state)
2636 {
2637 int i, j, r;
2638
2639 if (amdgpu_emu_mode == 1)
2640 return 0;
2641
2642 for (j = 0; j < adev->num_ip_blocks; j++) {
2643 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2644 if (!adev->ip_blocks[i].status.late_initialized)
2645 continue;
2646 /* skip PG for GFX on S0ix */
2647 if (adev->in_s0ix &&
2648 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX)
2649 continue;
2650 /* skip PG for VCE/UVD, it's handled specially */
2651 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2652 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2653 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2654 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2655 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2656 /* enable powergating to save power */
2657 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2658 state);
2659 if (r) {
2660 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2661 adev->ip_blocks[i].version->funcs->name, r);
2662 return r;
2663 }
2664 }
2665 }
2666 return 0;
2667 }
2668
2669 static int amdgpu_device_enable_mgpu_fan_boost(void)
2670 {
2671 struct amdgpu_gpu_instance *gpu_ins;
2672 struct amdgpu_device *adev;
2673 int i, ret = 0;
2674
2675 mutex_lock(&mgpu_info.mutex);
2676
2677 /*
2678 * MGPU fan boost feature should be enabled
2679 * only when there are two or more dGPUs in
2680 * the system
2681 */
2682 if (mgpu_info.num_dgpu < 2)
2683 goto out;
2684
2685 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2686 gpu_ins = &(mgpu_info.gpu_ins[i]);
2687 adev = gpu_ins->adev;
2688 if (!(adev->flags & AMD_IS_APU) &&
2689 !gpu_ins->mgpu_fan_enabled) {
2690 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2691 if (ret)
2692 break;
2693
2694 gpu_ins->mgpu_fan_enabled = 1;
2695 }
2696 }
2697
2698 out:
2699 mutex_unlock(&mgpu_info.mutex);
2700
2701 return ret;
2702 }
2703
2704 /**
2705 * amdgpu_device_ip_late_init - run late init for hardware IPs
2706 *
2707 * @adev: amdgpu_device pointer
2708 *
2709 * Late initialization pass for hardware IPs. The list of all the hardware
2710 * IPs that make up the asic is walked and the late_init callbacks are run.
2711 * late_init covers any special initialization that an IP requires
2712 * after all of the IPs have been initialized or something that needs to happen
2713 * late in the init process.
2714 * Returns 0 on success, negative error code on failure.
2715 */
2716 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2717 {
2718 struct amdgpu_gpu_instance *gpu_instance;
2719 int i = 0, r;
2720
2721 for (i = 0; i < adev->num_ip_blocks; i++) {
2722 if (!adev->ip_blocks[i].status.hw)
2723 continue;
2724 if (adev->ip_blocks[i].version->funcs->late_init) {
2725 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2726 if (r) {
2727 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2728 adev->ip_blocks[i].version->funcs->name, r);
2729 return r;
2730 }
2731 }
2732 adev->ip_blocks[i].status.late_initialized = true;
2733 }
2734
2735 amdgpu_ras_set_error_query_ready(adev, true);
2736
2737 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2738 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2739
2740 amdgpu_device_fill_reset_magic(adev);
2741
2742 r = amdgpu_device_enable_mgpu_fan_boost();
2743 if (r)
2744 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2745
2746 /* For XGMI + passthrough configuration on arcturus, enable light SBR */
2747 if (adev->asic_type == CHIP_ARCTURUS &&
2748 amdgpu_passthrough(adev) &&
2749 adev->gmc.xgmi.num_physical_nodes > 1)
2750 smu_set_light_sbr(&adev->smu, true);
2751
2752 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2753 mutex_lock(&mgpu_info.mutex);
2754
2755 /*
2756 * Reset device p-state to low as this was booted with high.
2757 *
2758 * This should be performed only after all devices from the same
2759 * hive get initialized.
2760 *
2761 * However, the number of devices in the hive is not known in advance;
2762 * it is counted one by one as the devices initialize.
2763 *
2764 * So we wait for all XGMI interlinked devices to be initialized.
2765 * This may bring some delays as those devices may come from
2766 * different hives. But that should be OK.
2767 */
2768 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2769 for (i = 0; i < mgpu_info.num_gpu; i++) {
2770 gpu_instance = &(mgpu_info.gpu_ins[i]);
2771 if (gpu_instance->adev->flags & AMD_IS_APU)
2772 continue;
2773
2774 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2775 AMDGPU_XGMI_PSTATE_MIN);
2776 if (r) {
2777 DRM_ERROR("pstate setting failed (%d).\n", r);
2778 break;
2779 }
2780 }
2781 }
2782
2783 mutex_unlock(&mgpu_info.mutex);
2784 }
2785
2786 return 0;
2787 }
2788
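/*
 * Early teardown: run the early_fini callbacks, put power and clock gating
 * back into the ungated state, then hw_fini the SMC block first and the
 * remaining blocks in reverse init order.
 */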
2789 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2790 {
2791 int i, r;
2792
2793 for (i = 0; i < adev->num_ip_blocks; i++) {
2794 if (!adev->ip_blocks[i].version->funcs->early_fini)
2795 continue;
2796
2797 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2798 if (r) {
2799 DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2800 adev->ip_blocks[i].version->funcs->name, r);
2801 }
2802 }
2803
2804 amdgpu_amdkfd_suspend(adev, false);
2805
2806 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2807 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2808
2809 /* need to disable SMC first */
2810 for (i = 0; i < adev->num_ip_blocks; i++) {
2811 if (!adev->ip_blocks[i].status.hw)
2812 continue;
2813 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2814 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2815 /* XXX handle errors */
2816 if (r) {
2817 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2818 adev->ip_blocks[i].version->funcs->name, r);
2819 }
2820 adev->ip_blocks[i].status.hw = false;
2821 break;
2822 }
2823 }
2824
2825 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2826 if (!adev->ip_blocks[i].status.hw)
2827 continue;
2828
2829 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2830 /* XXX handle errors */
2831 if (r) {
2832 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2833 adev->ip_blocks[i].version->funcs->name, r);
2834 }
2835
2836 adev->ip_blocks[i].status.hw = false;
2837 }
2838
2839 if (amdgpu_sriov_vf(adev)) {
2840 if (amdgpu_virt_release_full_gpu(adev, false))
2841 DRM_ERROR("failed to release exclusive mode on fini\n");
2842 }
2843
2844 return 0;
2845 }
2846
2847 /**
2848 * amdgpu_device_ip_fini - run fini for hardware IPs
2849 *
2850 * @adev: amdgpu_device pointer
2851 *
2852 * Main teardown pass for hardware IPs. The list of all the hardware
2853 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2854 * are run. hw_fini tears down the hardware associated with each IP
2855 * and sw_fini tears down any software state associated with each IP.
2856 * Returns 0 on success, negative error code on failure.
2857 */
2858 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2859 {
2860 int i, r;
2861
2862 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2863 amdgpu_virt_release_ras_err_handler_data(adev);
2864
2865 amdgpu_ras_pre_fini(adev);
2866
2867 if (adev->gmc.xgmi.num_physical_nodes > 1)
2868 amdgpu_xgmi_remove_device(adev);
2869
2870 amdgpu_amdkfd_device_fini_sw(adev);
2871
2872 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2873 if (!adev->ip_blocks[i].status.sw)
2874 continue;
2875
2876 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2877 amdgpu_ucode_free_bo(adev);
2878 amdgpu_free_static_csa(&adev->virt.csa_obj);
2879 amdgpu_device_wb_fini(adev);
2880 amdgpu_device_vram_scratch_fini(adev);
2881 amdgpu_ib_pool_fini(adev);
2882 }
2883
2884 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2885 /* XXX handle errors */
2886 if (r) {
2887 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2888 adev->ip_blocks[i].version->funcs->name, r);
2889 }
2890 adev->ip_blocks[i].status.sw = false;
2891 adev->ip_blocks[i].status.valid = false;
2892 }
2893
2894 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2895 if (!adev->ip_blocks[i].status.late_initialized)
2896 continue;
2897 if (adev->ip_blocks[i].version->funcs->late_fini)
2898 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2899 adev->ip_blocks[i].status.late_initialized = false;
2900 }
2901
2902 amdgpu_ras_fini(adev);
2903
2904 return 0;
2905 }
2906
2907 /**
2908 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2909 *
2910 * @work: work_struct.
2911 */
2912 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2913 {
2914 struct amdgpu_device *adev =
2915 container_of(work, struct amdgpu_device, delayed_init_work.work);
2916 int r;
2917
2918 r = amdgpu_ib_ring_tests(adev);
2919 if (r)
2920 DRM_ERROR("ib ring test failed (%d).\n", r);
2921 }
2922
2923 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2924 {
2925 struct amdgpu_device *adev =
2926 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2927
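	/* By the time this delayed work runs there must be no outstanding
	 * gfxoff disable requests and GFXOFF must not already be enabled.
	 */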
2928 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2929 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2930
2931 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2932 adev->gfx.gfx_off_state = true;
2933 }
2934
2935 /**
2936 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2937 *
2938 * @adev: amdgpu_device pointer
2939 *
2940 * Main suspend function for hardware IPs. The list of all the hardware
2941 * IPs that make up the asic is walked, clockgating is disabled and the
2942 * suspend callbacks are run. suspend puts the hardware and software state
2943 * in each IP into a state suitable for suspend.
2944 * Returns 0 on success, negative error code on failure.
2945 */
2946 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2947 {
2948 int i, r;
2949
2950 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2951 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2952
2953 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2954 if (!adev->ip_blocks[i].status.valid)
2955 continue;
2956
2957 /* displays are handled separately */
2958 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2959 continue;
2960
2961 /* XXX handle errors */
2962 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2963 /* XXX handle errors */
2964 if (r) {
2965 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2966 adev->ip_blocks[i].version->funcs->name, r);
2967 return r;
2968 }
2969
2970 adev->ip_blocks[i].status.hw = false;
2971 }
2972
2973 return 0;
2974 }
2975
2976 /**
2977 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2978 *
2979 * @adev: amdgpu_device pointer
2980 *
2981 * Main suspend function for hardware IPs. The list of all the hardware
2982 * IPs that make up the asic is walked, clockgating is disabled and the
2983 * suspend callbacks are run. suspend puts the hardware and software state
2984 * in each IP into a state suitable for suspend.
2985 * Returns 0 on success, negative error code on failure.
2986 */
2987 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2988 {
2989 int i, r;
2990
2991 if (adev->in_s0ix)
2992 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
2993
2994 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2995 if (!adev->ip_blocks[i].status.valid)
2996 continue;
2997 /* displays are handled in phase1 */
2998 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2999 continue;
3000 /* PSP lost connection when err_event_athub occurs */
3001 if (amdgpu_ras_intr_triggered() &&
3002 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3003 adev->ip_blocks[i].status.hw = false;
3004 continue;
3005 }
3006
3007 /* skip unnecessary suspend if we have not initialized them yet */
3008 if (adev->gmc.xgmi.pending_reset &&
3009 !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3010 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3011 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3012 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3013 adev->ip_blocks[i].status.hw = false;
3014 continue;
3015 }
3016
3017 /* skip suspend of gfx and psp for S0ix
3018 * gfx is in gfxoff state, so on resume it will exit gfxoff just
3019 * like at runtime. PSP is also part of the always on hardware
3020 * so no need to suspend it.
3021 */
3022 if (adev->in_s0ix &&
3023 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3024 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX))
3025 continue;
3026
3027 /* XXX handle errors */
3028 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3029 /* XXX handle errors */
3030 if (r) {
3031 DRM_ERROR("suspend of IP block <%s> failed %d\n",
3032 adev->ip_blocks[i].version->funcs->name, r);
3033 }
3034 adev->ip_blocks[i].status.hw = false;
3035 /* handle putting the SMC in the appropriate state */
3036 if (!amdgpu_sriov_vf(adev)) {
3037 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3038 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3039 if (r) {
3040 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3041 adev->mp1_state, r);
3042 return r;
3043 }
3044 }
3045 }
3046 }
3047
3048 return 0;
3049 }
3050
3051 /**
3052 * amdgpu_device_ip_suspend - run suspend for hardware IPs
3053 *
3054 * @adev: amdgpu_device pointer
3055 *
3056 * Main suspend function for hardware IPs. The list of all the hardware
3057 * IPs that make up the asic is walked, clockgating is disabled and the
3058 * suspend callbacks are run. suspend puts the hardware and software state
3059 * in each IP into a state suitable for suspend.
3060 * Returns 0 on success, negative error code on failure.
3061 */
3062 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3063 {
3064 int r;
3065
3066 if (amdgpu_sriov_vf(adev)) {
3067 amdgpu_virt_fini_data_exchange(adev);
3068 amdgpu_virt_request_full_gpu(adev, false);
3069 }
3070
3071 r = amdgpu_device_ip_suspend_phase1(adev);
3072 if (r)
3073 return r;
3074 r = amdgpu_device_ip_suspend_phase2(adev);
3075
3076 if (amdgpu_sriov_vf(adev))
3077 amdgpu_virt_release_full_gpu(adev, false);
3078
3079 return r;
3080 }
3081
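/*
 * SR-IOV re-init, early phase: after the host has reset the VF, bring back
 * just the COMMON, GMC, PSP and IH blocks, in that order, so the remaining
 * blocks can be re-initialized in the late phase below.
 */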
3082 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3083 {
3084 int i, r;
3085
3086 static enum amd_ip_block_type ip_order[] = {
3087 AMD_IP_BLOCK_TYPE_COMMON,
3088 AMD_IP_BLOCK_TYPE_GMC,
3089 AMD_IP_BLOCK_TYPE_PSP,
3090 AMD_IP_BLOCK_TYPE_IH,
3091 };
3092
3093 for (i = 0; i < adev->num_ip_blocks; i++) {
3094 int j;
3095 struct amdgpu_ip_block *block;
3096
3097 block = &adev->ip_blocks[i];
3098 block->status.hw = false;
3099
3100 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3101
3102 if (block->version->type != ip_order[j] ||
3103 !block->status.valid)
3104 continue;
3105
3106 r = block->version->funcs->hw_init(adev);
3107 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3108 if (r)
3109 return r;
3110 block->status.hw = true;
3111 }
3112 }
3113
3114 return 0;
3115 }
3116
3117 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3118 {
3119 int i, r;
3120
3121 static enum amd_ip_block_type ip_order[] = {
3122 AMD_IP_BLOCK_TYPE_SMC,
3123 AMD_IP_BLOCK_TYPE_DCE,
3124 AMD_IP_BLOCK_TYPE_GFX,
3125 AMD_IP_BLOCK_TYPE_SDMA,
3126 AMD_IP_BLOCK_TYPE_UVD,
3127 AMD_IP_BLOCK_TYPE_VCE,
3128 AMD_IP_BLOCK_TYPE_VCN
3129 };
3130
3131 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3132 int j;
3133 struct amdgpu_ip_block *block;
3134
3135 for (j = 0; j < adev->num_ip_blocks; j++) {
3136 block = &adev->ip_blocks[j];
3137
3138 if (block->version->type != ip_order[i] ||
3139 !block->status.valid ||
3140 block->status.hw)
3141 continue;
3142
3143 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3144 r = block->version->funcs->resume(adev);
3145 else
3146 r = block->version->funcs->hw_init(adev);
3147
3148 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3149 if (r)
3150 return r;
3151 block->status.hw = true;
3152 }
3153 }
3154
3155 return 0;
3156 }
3157
3158 /**
3159 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3160 *
3161 * @adev: amdgpu_device pointer
3162 *
3163 * First resume function for hardware IPs. The list of all the hardware
3164 * IPs that make up the asic is walked and the resume callbacks are run for
3165 * COMMON, GMC, and IH. resume puts the hardware into a functional state
3166 * after a suspend and updates the software state as necessary. This
3167 * function is also used for restoring the GPU after a GPU reset.
3168 * Returns 0 on success, negative error code on failure.
3169 */
3170 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3171 {
3172 int i, r;
3173
3174 for (i = 0; i < adev->num_ip_blocks; i++) {
3175 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3176 continue;
3177 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3178 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3179 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3180 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3181
3182 r = adev->ip_blocks[i].version->funcs->resume(adev);
3183 if (r) {
3184 DRM_ERROR("resume of IP block <%s> failed %d\n",
3185 adev->ip_blocks[i].version->funcs->name, r);
3186 return r;
3187 }
3188 adev->ip_blocks[i].status.hw = true;
3189 }
3190 }
3191
3192 return 0;
3193 }
3194
3195 /**
3196 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3197 *
3198 * @adev: amdgpu_device pointer
3199 *
3200 * Second resume function for hardware IPs. The list of all the hardware
3201 * IPs that make up the asic is walked and the resume callbacks are run for
3202 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
3203 * functional state after a suspend and updates the software state as
3204 * necessary. This function is also used for restoring the GPU after a GPU
3205 * reset.
3206 * Returns 0 on success, negative error code on failure.
3207 */
3208 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3209 {
3210 int i, r;
3211
3212 for (i = 0; i < adev->num_ip_blocks; i++) {
3213 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3214 continue;
3215 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3216 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3217 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3218 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3219 continue;
3220 r = adev->ip_blocks[i].version->funcs->resume(adev);
3221 if (r) {
3222 DRM_ERROR("resume of IP block <%s> failed %d\n",
3223 adev->ip_blocks[i].version->funcs->name, r);
3224 return r;
3225 }
3226 adev->ip_blocks[i].status.hw = true;
3227
3228 if (adev->in_s0ix && adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3229 /* disable gfxoff for IP resume. The gfxoff will be re-enabled in
3230 * amdgpu_device_resume() after IP resume.
3231 */
3232 amdgpu_gfx_off_ctrl(adev, false);
3233 DRM_DEBUG("will disable gfxoff for re-initializing other blocks\n");
3234 }
3235
3236 }
3237
3238 return 0;
3239 }
3240
3241 /**
3242 * amdgpu_device_ip_resume - run resume for hardware IPs
3243 *
3244 * @adev: amdgpu_device pointer
3245 *
3246 * Main resume function for hardware IPs. The hardware IPs
3247 * are split into two resume functions because they are
3248 * also used in recovering from a GPU reset and some additional
3249 * steps need to be taken between them. In this case (S3/S4) they are
3250 * run sequentially.
3251 * Returns 0 on success, negative error code on failure.
3252 */
3253 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3254 {
3255 int r;
3256
3257 r = amdgpu_amdkfd_resume_iommu(adev);
3258 if (r)
3259 return r;
3260
3261 r = amdgpu_device_ip_resume_phase1(adev);
3262 if (r)
3263 return r;
3264
3265 r = amdgpu_device_fw_loading(adev);
3266 if (r)
3267 return r;
3268
3269 r = amdgpu_device_ip_resume_phase2(adev);
3270
3271 return r;
3272 }
3273
3274 /**
3275 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3276 *
3277 * @adev: amdgpu_device pointer
3278 *
3279 * Query the VBIOS data tables to determine if the board supports SR-IOV.
3280 */
3281 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3282 {
3283 if (amdgpu_sriov_vf(adev)) {
3284 if (adev->is_atom_fw) {
3285 if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3286 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3287 } else {
3288 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3289 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3290 }
3291
3292 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3293 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3294 }
3295 }
3296
3297 /**
3298 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3299 *
3300 * @asic_type: AMD asic type
3301 *
3302 * Check if there is DC (new modesetting infrastructure) support for an asic.
3303 * returns true if DC has support, false if not.
3304 */
3305 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3306 {
3307 switch (asic_type) {
3308 #if defined(CONFIG_DRM_AMD_DC)
3309 #if defined(CONFIG_DRM_AMD_DC_SI)
3310 case CHIP_TAHITI:
3311 case CHIP_PITCAIRN:
3312 case CHIP_VERDE:
3313 case CHIP_OLAND:
3314 #endif
3315 case CHIP_BONAIRE:
3316 case CHIP_KAVERI:
3317 case CHIP_KABINI:
3318 case CHIP_MULLINS:
3319 /*
3320 * We have systems in the wild with these ASICs that require
3321 * LVDS and VGA support which is not supported with DC.
3322 *
3323 * Fallback to the non-DC driver here by default so as not to
3324 * cause regressions.
3325 */
3326 return amdgpu_dc > 0;
3327 case CHIP_HAWAII:
3328 case CHIP_CARRIZO:
3329 case CHIP_STONEY:
3330 case CHIP_POLARIS10:
3331 case CHIP_POLARIS11:
3332 case CHIP_POLARIS12:
3333 case CHIP_VEGAM:
3334 case CHIP_TONGA:
3335 case CHIP_FIJI:
3336 case CHIP_VEGA10:
3337 case CHIP_VEGA12:
3338 case CHIP_VEGA20:
3339 #if defined(CONFIG_DRM_AMD_DC_DCN)
3340 case CHIP_RAVEN:
3341 case CHIP_NAVI10:
3342 case CHIP_NAVI14:
3343 case CHIP_NAVI12:
3344 case CHIP_RENOIR:
3345 case CHIP_SIENNA_CICHLID:
3346 case CHIP_NAVY_FLOUNDER:
3347 case CHIP_DIMGREY_CAVEFISH:
3348 case CHIP_BEIGE_GOBY:
3349 case CHIP_VANGOGH:
3350 case CHIP_YELLOW_CARP:
3351 #endif
3352 return amdgpu_dc != 0;
3353 #endif
3354 default:
3355 if (amdgpu_dc > 0)
3356 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3357 "but isn't supported by ASIC, ignoring\n");
3358 return false;
3359 }
3360 }
3361
3362 /**
3363 * amdgpu_device_has_dc_support - check if dc is supported
3364 *
3365 * @adev: amdgpu_device pointer
3366 *
3367 * Returns true for supported, false for not supported
3368 */
3369 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3370 {
3371 if (amdgpu_sriov_vf(adev) ||
3372 adev->enable_virtual_display ||
3373 (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3374 return false;
3375
3376 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3377 }
3378
3379 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3380 {
3381 struct amdgpu_device *adev =
3382 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3383 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3384
3385 /* It's a bug to not have a hive within this function */
3386 if (WARN_ON(!hive))
3387 return;
3388
3389 /*
3390 * Use task barrier to synchronize all xgmi reset works across the
3391 * hive. task_barrier_enter and task_barrier_exit will block
3392 * until all the threads running the xgmi reset works reach
3393 * those points. task_barrier_full will do both blocks.
3394 */
3395 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3396
3397 task_barrier_enter(&hive->tb);
3398 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3399
3400 if (adev->asic_reset_res)
3401 goto fail;
3402
3403 task_barrier_exit(&hive->tb);
3404 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3405
3406 if (adev->asic_reset_res)
3407 goto fail;
3408
3409 if (adev->mmhub.ras_funcs &&
3410 adev->mmhub.ras_funcs->reset_ras_error_count)
3411 adev->mmhub.ras_funcs->reset_ras_error_count(adev);
3412 } else {
3413
3414 task_barrier_full(&hive->tb);
3415 adev->asic_reset_res = amdgpu_asic_reset(adev);
3416 }
3417
3418 fail:
3419 if (adev->asic_reset_res)
3420 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3421 adev->asic_reset_res, adev_to_drm(adev)->unique);
3422 amdgpu_put_xgmi_hive(hive);
3423 }
3424
3425 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3426 {
3427 char *input = amdgpu_lockup_timeout;
3428 char *timeout_setting = NULL;
3429 int index = 0;
3430 long timeout;
3431 int ret = 0;
3432
3433 /*
3434 * By default the timeout for non-compute jobs is 10000 ms
3435 * and 60000 ms for compute jobs.
3436 * In SR-IOV or passthrough mode, the timeout for compute
3437 * jobs is 60000 ms by default.
3438 */
3439 adev->gfx_timeout = msecs_to_jiffies(10000);
3440 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3441 if (amdgpu_sriov_vf(adev))
3442 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3443 msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3444 else
3445 adev->compute_timeout = msecs_to_jiffies(60000);
3446
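	/* The lockup_timeout parameter is a comma separated list of up to four
	 * values applied in the order gfx, compute, sdma, video (in ms); 0
	 * keeps the default and a negative value selects MAX_SCHEDULE_TIMEOUT.
	 * A single value applies to all non-compute jobs.
	 */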
3447 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3448 while ((timeout_setting = strsep(&input, ",")) &&
3449 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3450 ret = kstrtol(timeout_setting, 0, &timeout);
3451 if (ret)
3452 return ret;
3453
3454 if (timeout == 0) {
3455 index++;
3456 continue;
3457 } else if (timeout < 0) {
3458 timeout = MAX_SCHEDULE_TIMEOUT;
3459 } else {
3460 timeout = msecs_to_jiffies(timeout);
3461 }
3462
3463 switch (index++) {
3464 case 0:
3465 adev->gfx_timeout = timeout;
3466 break;
3467 case 1:
3468 adev->compute_timeout = timeout;
3469 break;
3470 case 2:
3471 adev->sdma_timeout = timeout;
3472 break;
3473 case 3:
3474 adev->video_timeout = timeout;
3475 break;
3476 default:
3477 break;
3478 }
3479 }
3480 /*
3481 * There is only one value specified and
3482 * it should apply to all non-compute jobs.
3483 */
3484 if (index == 1) {
3485 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3486 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3487 adev->compute_timeout = adev->gfx_timeout;
3488 }
3489 }
3490
3491 return ret;
3492 }
3493
3494 static const struct attribute *amdgpu_dev_attributes[] = {
3495 &dev_attr_product_name.attr,
3496 &dev_attr_product_number.attr,
3497 &dev_attr_serial_number.attr,
3498 &dev_attr_pcie_replay_count.attr,
3499 NULL
3500 };
3501
3502 /**
3503 * amdgpu_device_init - initialize the driver
3504 *
3505 * @adev: amdgpu_device pointer
3506 * @flags: driver flags
3507 *
3508 * Initializes the driver info and hw (all asics).
3509 * Returns 0 for success or an error on failure.
3510 * Called at driver startup.
3511 */
3512 int amdgpu_device_init(struct amdgpu_device *adev,
3513 uint32_t flags)
3514 {
3515 struct drm_device *ddev = adev_to_drm(adev);
3516 struct pci_dev *pdev = adev->pdev;
3517 int r, i;
3518 bool px = false;
3519 u32 max_MBps;
3520 int tmp;
3521
3522 adev->shutdown = false;
3523 adev->flags = flags;
3524
3525 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3526 adev->asic_type = amdgpu_force_asic_type;
3527 else
3528 adev->asic_type = flags & AMD_ASIC_MASK;
3529
3530 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3531 if (amdgpu_emu_mode == 1)
3532 adev->usec_timeout *= 10;
3533 adev->gmc.gart_size = 512 * 1024 * 1024;
3534 adev->accel_working = false;
3535 adev->num_rings = 0;
3536 adev->mman.buffer_funcs = NULL;
3537 adev->mman.buffer_funcs_ring = NULL;
3538 adev->vm_manager.vm_pte_funcs = NULL;
3539 adev->vm_manager.vm_pte_num_scheds = 0;
3540 adev->gmc.gmc_funcs = NULL;
3541 adev->harvest_ip_mask = 0x0;
3542 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3543 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3544
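	/* Start with placeholder register accessors that flag any use as an
	 * error; the ASIC-specific setup code is expected to install the real
	 * implementations later.
	 */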
3545 adev->smc_rreg = &amdgpu_invalid_rreg;
3546 adev->smc_wreg = &amdgpu_invalid_wreg;
3547 adev->pcie_rreg = &amdgpu_invalid_rreg;
3548 adev->pcie_wreg = &amdgpu_invalid_wreg;
3549 adev->pciep_rreg = &amdgpu_invalid_rreg;
3550 adev->pciep_wreg = &amdgpu_invalid_wreg;
3551 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3552 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3553 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3554 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3555 adev->didt_rreg = &amdgpu_invalid_rreg;
3556 adev->didt_wreg = &amdgpu_invalid_wreg;
3557 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3558 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3559 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3560 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3561
3562 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3563 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3564 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3565
3566 /* mutex initializations are all done here so we
3567 * can recall functions without locking issues */
3568 mutex_init(&adev->firmware.mutex);
3569 mutex_init(&adev->pm.mutex);
3570 mutex_init(&adev->gfx.gpu_clock_mutex);
3571 mutex_init(&adev->srbm_mutex);
3572 mutex_init(&adev->gfx.pipe_reserve_mutex);
3573 mutex_init(&adev->gfx.gfx_off_mutex);
3574 mutex_init(&adev->grbm_idx_mutex);
3575 mutex_init(&adev->mn_lock);
3576 mutex_init(&adev->virt.vf_errors.lock);
3577 hash_init(adev->mn_hash);
3578 atomic_set(&adev->in_gpu_reset, 0);
3579 init_rwsem(&adev->reset_sem);
3580 mutex_init(&adev->psp.mutex);
3581 mutex_init(&adev->notifier_lock);
3582
3583 r = amdgpu_device_init_apu_flags(adev);
3584 if (r)
3585 return r;
3586
3587 r = amdgpu_device_check_arguments(adev);
3588 if (r)
3589 return r;
3590
3591 spin_lock_init(&adev->mmio_idx_lock);
3592 spin_lock_init(&adev->smc_idx_lock);
3593 spin_lock_init(&adev->pcie_idx_lock);
3594 spin_lock_init(&adev->uvd_ctx_idx_lock);
3595 spin_lock_init(&adev->didt_idx_lock);
3596 spin_lock_init(&adev->gc_cac_idx_lock);
3597 spin_lock_init(&adev->se_cac_idx_lock);
3598 spin_lock_init(&adev->audio_endpt_idx_lock);
3599 spin_lock_init(&adev->mm_stats.lock);
3600
3601 INIT_LIST_HEAD(&adev->shadow_list);
3602 mutex_init(&adev->shadow_list_lock);
3603
3604 INIT_LIST_HEAD(&adev->reset_list);
3605
3606 INIT_DELAYED_WORK(&adev->delayed_init_work,
3607 amdgpu_device_delayed_init_work_handler);
3608 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3609 amdgpu_device_delay_enable_gfx_off);
3610
3611 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3612
3613 adev->gfx.gfx_off_req_count = 1;
3614 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3615
3616 atomic_set(&adev->throttling_logging_enabled, 1);
3617 /*
3618 * If throttling continues, logging will be performed every minute
3619 * to avoid log flooding. "-1" is subtracted since the thermal
3620 * throttling interrupt comes every second. Thus, the total logging
3621 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3622 * for throttling interrupt) = 60 seconds.
3623 */
3624 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3625 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
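/*
 * Sketch of the resulting behaviour (based on generic ratelimit semantics
 * rather than anything amdgpu-specific): with an interval of 59 * HZ and a
 * burst of 1, __ratelimit() lets one throttling message through and then
 * suppresses further ones until the 59 second window expires, which gives
 * the roughly one-message-per-minute cadence described above.
 */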
3626
3627 /* Registers mapping */
3628 /* TODO: block userspace mapping of io register */
3629 if (adev->asic_type >= CHIP_BONAIRE) {
3630 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3631 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3632 } else {
3633 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3634 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3635 }
3636
3637 for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3638 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3639
3640 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3641 if (adev->rmmio == NULL) {
3642 return -ENOMEM;
3643 }
3644 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3645 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3646
3647 /* enable PCIE atomic ops */
3648 r = pci_enable_atomic_ops_to_root(adev->pdev,
3649 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3650 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3651 if (r) {
3652 adev->have_atomics_support = false;
3653 DRM_INFO("PCIE atomic ops is not supported\n");
3654 } else {
3655 adev->have_atomics_support = true;
3656 }
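/*
 * Note (a sketch of the PCI core helper's behaviour, not amdgpu code):
 * pci_enable_atomic_ops_to_root() is expected to return 0 only when the
 * root port advertises the requested 32-bit and 64-bit AtomicOp completer
 * capabilities and the switches on the path support AtomicOp routing, so
 * have_atomics_support reflects whole-path support rather than just the
 * GPU's own capability.
 */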
3657
3658 amdgpu_device_get_pcie_info(adev);
3659
3660 if (amdgpu_mcbp)
3661 DRM_INFO("MCBP is enabled\n");
3662
3663 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3664 adev->enable_mes = true;
3665
3666 /* detect hw virtualization here */
3667 amdgpu_detect_virtualization(adev);
3668
3669 r = amdgpu_device_get_job_timeout_settings(adev);
3670 if (r) {
3671 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3672 return r;
3673 }
3674
3675 /* early init functions */
3676 r = amdgpu_device_ip_early_init(adev);
3677 if (r)
3678 return r;
3679
3680 /* Get rid of things like offb */
3681 r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3682 if (r)
3683 return r;
3684
3685 /* doorbell bar mapping and doorbell index init*/
3686 amdgpu_device_doorbell_init(adev);
3687
3688 if (amdgpu_emu_mode == 1) {
3689 /* post the asic on emulation mode */
3690 emu_soc_asic_init(adev);
3691 goto fence_driver_init;
3692 }
3693
3694 amdgpu_reset_init(adev);
3695
3696 /* detect if we are with an SRIOV vbios */
3697 amdgpu_device_detect_sriov_bios(adev);
3698
3699 /* check if we need to reset the asic
3700 * E.g., driver was not cleanly unloaded previously, etc.
3701 */
3702 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3703 if (adev->gmc.xgmi.num_physical_nodes) {
3704 dev_info(adev->dev, "Pending hive reset.\n");
3705 adev->gmc.xgmi.pending_reset = true;
3706 /* Only need to init the necessary blocks for SMU to handle the reset */
3707 for (i = 0; i < adev->num_ip_blocks; i++) {
3708 if (!adev->ip_blocks[i].status.valid)
3709 continue;
3710 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3711 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3712 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3713 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3714 DRM_DEBUG("IP %s disabled for hw_init.\n",
3715 adev->ip_blocks[i].version->funcs->name);
3716 adev->ip_blocks[i].status.hw = true;
3717 }
3718 }
3719 } else {
3720 tmp = amdgpu_reset_method;
3721 /* It should do a default reset when loading or reloading the driver,
3722 * regardless of the module parameter reset_method.
3723 */
3724 amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3725 r = amdgpu_asic_reset(adev);
3726 amdgpu_reset_method = tmp;
3727 if (r) {
3728 dev_err(adev->dev, "asic reset on init failed\n");
3729 goto failed;
3730 }
3731 }
3732 }
3733
3734 pci_enable_pcie_error_reporting(adev->pdev);
3735
3736 /* Post card if necessary */
3737 if (amdgpu_device_need_post(adev)) {
3738 if (!adev->bios) {
3739 dev_err(adev->dev, "no vBIOS found\n");
3740 r = -EINVAL;
3741 goto failed;
3742 }
3743 DRM_INFO("GPU posting now...\n");
3744 r = amdgpu_device_asic_init(adev);
3745 if (r) {
3746 dev_err(adev->dev, "gpu post error!\n");
3747 goto failed;
3748 }
3749 }
3750
3751 if (adev->is_atom_fw) {
3752 /* Initialize clocks */
3753 r = amdgpu_atomfirmware_get_clock_info(adev);
3754 if (r) {
3755 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3756 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3757 goto failed;
3758 }
3759 } else {
3760 /* Initialize clocks */
3761 r = amdgpu_atombios_get_clock_info(adev);
3762 if (r) {
3763 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3764 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3765 goto failed;
3766 }
3767 /* init i2c buses */
3768 if (!amdgpu_device_has_dc_support(adev))
3769 amdgpu_atombios_i2c_init(adev);
3770 }
3771
3772 fence_driver_init:
3773 /* Fence driver */
3774 r = amdgpu_fence_driver_sw_init(adev);
3775 if (r) {
3776 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3777 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3778 goto failed;
3779 }
3780
3781 /* init the mode config */
3782 drm_mode_config_init(adev_to_drm(adev));
3783
3784 r = amdgpu_device_ip_init(adev);
3785 if (r) {
3786 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3787 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3788 goto release_ras_con;
3789 }
3790
3791 amdgpu_fence_driver_hw_init(adev);
3792
3793 dev_info(adev->dev,
3794 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3795 adev->gfx.config.max_shader_engines,
3796 adev->gfx.config.max_sh_per_se,
3797 adev->gfx.config.max_cu_per_sh,
3798 adev->gfx.cu_info.number);
3799
3800 adev->accel_working = true;
3801
3802 amdgpu_vm_check_compute_bug(adev);
3803
3804 /* Initialize the buffer migration limit. */
3805 if (amdgpu_moverate >= 0)
3806 max_MBps = amdgpu_moverate;
3807 else
3808 max_MBps = 8; /* Allow 8 MB/s. */
3809 /* Get a log2 for easy divisions. */
3810 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
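/*
 * Worked example (numbers are only illustrative): with the default
 * max_MBps of 8, log2_max_MBps = ilog2(8) = 3, so the throttling math can
 * shift by log2_max_MBps instead of dividing by the MB/s rate; a rate that
 * is not a power of two is simply rounded down by ilog2().
 */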
3811
3812 amdgpu_fbdev_init(adev);
3813
3814 r = amdgpu_pm_sysfs_init(adev);
3815 if (r) {
3816 adev->pm_sysfs_en = false;
3817 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3818 } else
3819 adev->pm_sysfs_en = true;
3820
3821 r = amdgpu_ucode_sysfs_init(adev);
3822 if (r) {
3823 adev->ucode_sysfs_en = false;
3824 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3825 } else
3826 adev->ucode_sysfs_en = true;
3827
3828 if ((amdgpu_testing & 1)) {
3829 if (adev->accel_working)
3830 amdgpu_test_moves(adev);
3831 else
3832 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3833 }
3834 if (amdgpu_benchmarking) {
3835 if (adev->accel_working)
3836 amdgpu_benchmark(adev, amdgpu_benchmarking);
3837 else
3838 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3839 }
3840
3841 /*
3842 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3843 * Otherwise the mgpu fan boost feature will be skipped because the
3844 * gpu instance count would be too low.
3845 */
3846 amdgpu_register_gpu_instance(adev);
3847
3848 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3849 * explicit gating rather than handling it automatically.
3850 */
3851 if (!adev->gmc.xgmi.pending_reset) {
3852 r = amdgpu_device_ip_late_init(adev);
3853 if (r) {
3854 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3855 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3856 goto release_ras_con;
3857 }
3858 /* must succeed. */
3859 amdgpu_ras_resume(adev);
3860 queue_delayed_work(system_wq, &adev->delayed_init_work,
3861 msecs_to_jiffies(AMDGPU_RESUME_MS));
3862 }
3863
3864 if (amdgpu_sriov_vf(adev)) {
3865 amdgpu_virt_release_full_gpu(adev, true);
3866 flush_delayed_work(&adev->delayed_init_work);
3867 }
3868
3869 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3870 if (r)
3871 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3872
3873 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3874 r = amdgpu_pmu_init(adev);
3875 if (r)
3876 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3877 }
3878 /* Keep the stored PCI config space at hand for restore on sudden PCI error */
3879 if (amdgpu_device_cache_pci_state(adev->pdev))
3880 pci_restore_state(pdev);
3881
3882 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3883 /* this will fail for cards that aren't VGA class devices, just
3884 * ignore it */
3885 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3886 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3887
3888 if (amdgpu_device_supports_px(ddev)) {
3889 px = true;
3890 vga_switcheroo_register_client(adev->pdev,
3891 &amdgpu_switcheroo_ops, px);
3892 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3893 }
3894
3895 if (adev->gmc.xgmi.pending_reset)
3896 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3897 msecs_to_jiffies(AMDGPU_RESUME_MS));
3898
3899 return 0;
3900
3901 release_ras_con:
3902 if (amdgpu_sriov_vf(adev))
3903 amdgpu_virt_release_full_gpu(adev, true);
3904
3905 /* failed in exclusive mode due to timeout */
3906 if (amdgpu_sriov_vf(adev) &&
3907 !amdgpu_sriov_runtime(adev) &&
3908 amdgpu_virt_mmio_blocked(adev) &&
3909 !amdgpu_virt_wait_reset(adev)) {
3910 dev_err(adev->dev, "VF exclusive mode timeout\n");
3911 /* Don't send request since VF is inactive. */
3912 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3913 adev->virt.ops = NULL;
3914 r = -EAGAIN;
3915 }
3916 amdgpu_release_ras_context(adev);
3917
3918 failed:
3919 amdgpu_vf_error_trans_all(adev);
3920
3921 return r;
3922 }
3923
3924 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
3925 {
3926 /* Clear all CPU mappings pointing to this device */
3927 unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
3928
3929 /* Unmap all mapped bars - Doorbell, registers and VRAM */
3930 amdgpu_device_doorbell_fini(adev);
3931
3932 iounmap(adev->rmmio);
3933 adev->rmmio = NULL;
3934 if (adev->mman.aper_base_kaddr)
3935 iounmap(adev->mman.aper_base_kaddr);
3936 adev->mman.aper_base_kaddr = NULL;
3937
3938 /* Memory manager related */
3939 if (!adev->gmc.xgmi.connected_to_cpu) {
3940 arch_phys_wc_del(adev->gmc.vram_mtrr);
3941 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
3942 }
3943 }
3944
3945 /**
3946 * amdgpu_device_fini_hw - tear down the driver
3947 *
3948 * @adev: amdgpu_device pointer
3949 *
3950 * Tear down the driver info (all asics).
3951 * Called at driver shutdown.
3952 */
3953 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
3954 {
3955 dev_info(adev->dev, "amdgpu: finishing device.\n");
3956 flush_delayed_work(&adev->delayed_init_work);
3957 if (adev->mman.initialized) {
3958 flush_delayed_work(&adev->mman.bdev.wq);
3959 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3960 }
3961 adev->shutdown = true;
3962
3963 /* make sure the IB test has finished before entering exclusive mode
3964 * to avoid preemption during the IB test
3965 */
3966 if (amdgpu_sriov_vf(adev)) {
3967 amdgpu_virt_request_full_gpu(adev, false);
3968 amdgpu_virt_fini_data_exchange(adev);
3969 }
3970
3971 /* disable all interrupts */
3972 amdgpu_irq_disable_all(adev);
3973 if (adev->mode_info.mode_config_initialized){
3974 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
3975 drm_helper_force_disable_all(adev_to_drm(adev));
3976 else
3977 drm_atomic_helper_shutdown(adev_to_drm(adev));
3978 }
3979 amdgpu_fence_driver_hw_fini(adev);
3980
3981 if (adev->pm_sysfs_en)
3982 amdgpu_pm_sysfs_fini(adev);
3983 if (adev->ucode_sysfs_en)
3984 amdgpu_ucode_sysfs_fini(adev);
3985 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3986
3987 amdgpu_fbdev_fini(adev);
3988
3989 amdgpu_irq_fini_hw(adev);
3990
3991 amdgpu_device_ip_fini_early(adev);
3992
3993 amdgpu_gart_dummy_page_fini(adev);
3994
3995 amdgpu_device_unmap_mmio(adev);
3996 }
3997
3998 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
3999 {
4000 amdgpu_fence_driver_sw_fini(adev);
4001 amdgpu_device_ip_fini(adev);
4002 release_firmware(adev->firmware.gpu_info_fw);
4003 adev->firmware.gpu_info_fw = NULL;
4004 adev->accel_working = false;
4005
4006 amdgpu_reset_fini(adev);
4007
4008 /* free i2c buses */
4009 if (!amdgpu_device_has_dc_support(adev))
4010 amdgpu_i2c_fini(adev);
4011
4012 if (amdgpu_emu_mode != 1)
4013 amdgpu_atombios_fini(adev);
4014
4015 kfree(adev->bios);
4016 adev->bios = NULL;
4017 if (amdgpu_device_supports_px(adev_to_drm(adev))) {
4018 vga_switcheroo_unregister_client(adev->pdev);
4019 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4020 }
4021 if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4022 vga_client_unregister(adev->pdev);
4023
4024 if (IS_ENABLED(CONFIG_PERF_EVENTS))
4025 amdgpu_pmu_fini(adev);
4026 if (adev->mman.discovery_bin)
4027 amdgpu_discovery_fini(adev);
4028
4029 kfree(adev->pci_state);
4030
4031 }
4032
4033 /**
4034 * amdgpu_device_evict_resources - evict device resources
4035 * @adev: amdgpu device object
4036 *
4037 * Evicts all ttm device resources (vram BOs, gart table) from the lru list
4038 * of the vram memory type. Mainly used for evicting device resources
4039 * at suspend time.
4040 *
4041 */
4042 static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
4043 {
4044 /* No need to evict vram on APUs for suspend to ram or s2idle */
4045 if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4046 return;
4047
4048 if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
4049 DRM_WARN("evicting device resources failed\n");
4050
4051 }
4052
4053 /*
4054 * Suspend & resume.
4055 */
4056 /**
4057 * amdgpu_device_suspend - initiate device suspend
4058 *
4059 * @dev: drm dev pointer
4060 * @fbcon : notify the fbdev of suspend
4061 *
4062 * Puts the hw in the suspend state (all asics).
4063 * Returns 0 for success or an error on failure.
4064 * Called at driver suspend.
4065 */
4066 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4067 {
4068 struct amdgpu_device *adev = drm_to_adev(dev);
4069 int r = 0;
4070
4071 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4072 return 0;
4073
4074 adev->in_suspend = true;
4075
4076 if (amdgpu_sriov_vf(adev)) {
4077 amdgpu_virt_fini_data_exchange(adev);
4078 r = amdgpu_virt_request_full_gpu(adev, false);
4079 if (r)
4080 return r;
4081 }
4082
4083 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4084 DRM_WARN("smart shift update failed\n");
4085
4086 drm_kms_helper_poll_disable(dev);
4087
4088 if (fbcon)
4089 amdgpu_fbdev_set_suspend(adev, 1);
4090
4091 cancel_delayed_work_sync(&adev->delayed_init_work);
4092
4093 amdgpu_ras_suspend(adev);
4094
4095 amdgpu_device_ip_suspend_phase1(adev);
4096
4097 if (!adev->in_s0ix)
4098 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4099
4100 /* First evict vram memory */
4101 amdgpu_device_evict_resources(adev);
4102
4103 amdgpu_fence_driver_hw_fini(adev);
4104
4105 amdgpu_device_ip_suspend_phase2(adev);
4106 /* This second call to evict device resources is to evict
4107 * the gart page table using the CPU.
4108 */
4109 amdgpu_device_evict_resources(adev);
4110
4111 if (amdgpu_sriov_vf(adev))
4112 amdgpu_virt_release_full_gpu(adev, false);
4113
4114 return 0;
4115 }
4116
4117 /**
4118 * amdgpu_device_resume - initiate device resume
4119 *
4120 * @dev: drm dev pointer
4121 * @fbcon : notify the fbdev of resume
4122 *
4123 * Bring the hw back to operating state (all asics).
4124 * Returns 0 for success or an error on failure.
4125 * Called at driver resume.
4126 */
4127 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
4128 {
4129 struct amdgpu_device *adev = drm_to_adev(dev);
4130 int r = 0;
4131
4132 if (amdgpu_sriov_vf(adev)) {
4133 r = amdgpu_virt_request_full_gpu(adev, true);
4134 if (r)
4135 return r;
4136 }
4137
4138 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4139 return 0;
4140
4141 if (adev->in_s0ix)
4142 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
4143
4144 /* post card */
4145 if (amdgpu_device_need_post(adev)) {
4146 r = amdgpu_device_asic_init(adev);
4147 if (r)
4148 dev_err(adev->dev, "amdgpu asic init failed\n");
4149 }
4150
4151 r = amdgpu_device_ip_resume(adev);
4152
4153 /* no matter what r is, always need to properly release full GPU */
4154 if (amdgpu_sriov_vf(adev)) {
4155 amdgpu_virt_init_data_exchange(adev);
4156 amdgpu_virt_release_full_gpu(adev, true);
4157 }
4158
4159 if (r) {
4160 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
4161 return r;
4162 }
4163 amdgpu_fence_driver_hw_init(adev);
4164
4165 r = amdgpu_device_ip_late_init(adev);
4166 if (r)
4167 return r;
4168
4169 queue_delayed_work(system_wq, &adev->delayed_init_work,
4170 msecs_to_jiffies(AMDGPU_RESUME_MS));
4171
4172 if (!adev->in_s0ix) {
4173 r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
4174 if (r)
4175 return r;
4176 }
4177
4178 /* Make sure IB tests flushed */
4179 flush_delayed_work(&adev->delayed_init_work);
4180
4181 if (adev->in_s0ix) {
4182 /* re-enable gfxoff after IP resume. This re-enables gfxoff after
4183 * it was disabled for IP resume in amdgpu_device_ip_resume_phase2().
4184 */
4185 amdgpu_gfx_off_ctrl(adev, true);
4186 DRM_DEBUG("will enable gfxoff for the mission mode\n");
4187 }
4188 if (fbcon)
4189 amdgpu_fbdev_set_suspend(adev, 0);
4190
4191 drm_kms_helper_poll_enable(dev);
4192
4193 amdgpu_ras_resume(adev);
4194
4195 /*
4196 * Most of the connector probing functions try to acquire runtime pm
4197 * refs to ensure that the GPU is powered on when connector polling is
4198 * performed. Since we're calling this from a runtime PM callback,
4199 * trying to acquire rpm refs will cause us to deadlock.
4200 *
4201 * Since we're guaranteed to be holding the rpm lock, it's safe to
4202 * temporarily disable the rpm helpers so this doesn't deadlock us.
4203 */
4204 #ifdef CONFIG_PM
4205 dev->dev->power.disable_depth++;
4206 #endif
4207 if (!amdgpu_device_has_dc_support(adev))
4208 drm_helper_hpd_irq_event(dev);
4209 else
4210 drm_kms_helper_hotplug_event(dev);
4211 #ifdef CONFIG_PM
4212 dev->dev->power.disable_depth--;
4213 #endif
4214 adev->in_suspend = false;
4215
4216 if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
4217 DRM_WARN("smart shift update failed\n");
4218
4219 return 0;
4220 }
4221
4222 /**
4223 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4224 *
4225 * @adev: amdgpu_device pointer
4226 *
4227 * The list of all the hardware IPs that make up the asic is walked and
4228 * the check_soft_reset callbacks are run. check_soft_reset determines
4229 * if the asic is still hung or not.
4230 * Returns true if any of the IPs are still in a hung state, false if not.
4231 */
4232 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4233 {
4234 int i;
4235 bool asic_hang = false;
4236
4237 if (amdgpu_sriov_vf(adev))
4238 return true;
4239
4240 if (amdgpu_asic_need_full_reset(adev))
4241 return true;
4242
4243 for (i = 0; i < adev->num_ip_blocks; i++) {
4244 if (!adev->ip_blocks[i].status.valid)
4245 continue;
4246 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4247 adev->ip_blocks[i].status.hang =
4248 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4249 if (adev->ip_blocks[i].status.hang) {
4250 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4251 asic_hang = true;
4252 }
4253 }
4254 return asic_hang;
4255 }
4256
4257 /**
4258 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4259 *
4260 * @adev: amdgpu_device pointer
4261 *
4262 * The list of all the hardware IPs that make up the asic is walked and the
4263 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
4264 * handles any IP specific hardware or software state changes that are
4265 * necessary for a soft reset to succeed.
4266 * Returns 0 on success, negative error code on failure.
4267 */
4268 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4269 {
4270 int i, r = 0;
4271
4272 for (i = 0; i < adev->num_ip_blocks; i++) {
4273 if (!adev->ip_blocks[i].status.valid)
4274 continue;
4275 if (adev->ip_blocks[i].status.hang &&
4276 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4277 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4278 if (r)
4279 return r;
4280 }
4281 }
4282
4283 return 0;
4284 }
4285
4286 /**
4287 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4288 *
4289 * @adev: amdgpu_device pointer
4290 *
4291 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
4292 * reset is necessary to recover.
4293 * Returns true if a full asic reset is required, false if not.
4294 */
4295 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4296 {
4297 int i;
4298
4299 if (amdgpu_asic_need_full_reset(adev))
4300 return true;
4301
4302 for (i = 0; i < adev->num_ip_blocks; i++) {
4303 if (!adev->ip_blocks[i].status.valid)
4304 continue;
4305 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4306 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4307 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4308 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4309 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4310 if (adev->ip_blocks[i].status.hang) {
4311 dev_info(adev->dev, "Some block need full reset!\n");
4312 return true;
4313 }
4314 }
4315 }
4316 return false;
4317 }
4318
4319 /**
4320 * amdgpu_device_ip_soft_reset - do a soft reset
4321 *
4322 * @adev: amdgpu_device pointer
4323 *
4324 * The list of all the hardware IPs that make up the asic is walked and the
4325 * soft_reset callbacks are run if the block is hung. soft_reset handles any
4326 * IP specific hardware or software state changes that are necessary to soft
4327 * reset the IP.
4328 * Returns 0 on success, negative error code on failure.
4329 */
4330 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4331 {
4332 int i, r = 0;
4333
4334 for (i = 0; i < adev->num_ip_blocks; i++) {
4335 if (!adev->ip_blocks[i].status.valid)
4336 continue;
4337 if (adev->ip_blocks[i].status.hang &&
4338 adev->ip_blocks[i].version->funcs->soft_reset) {
4339 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4340 if (r)
4341 return r;
4342 }
4343 }
4344
4345 return 0;
4346 }
4347
4348 /**
4349 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4350 *
4351 * @adev: amdgpu_device pointer
4352 *
4353 * The list of all the hardware IPs that make up the asic is walked and the
4354 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
4355 * handles any IP specific hardware or software state changes that are
4356 * necessary after the IP has been soft reset.
4357 * Returns 0 on success, negative error code on failure.
4358 */
4359 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4360 {
4361 int i, r = 0;
4362
4363 for (i = 0; i < adev->num_ip_blocks; i++) {
4364 if (!adev->ip_blocks[i].status.valid)
4365 continue;
4366 if (adev->ip_blocks[i].status.hang &&
4367 adev->ip_blocks[i].version->funcs->post_soft_reset)
4368 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4369 if (r)
4370 return r;
4371 }
4372
4373 return 0;
4374 }
4375
4376 /**
4377 * amdgpu_device_recover_vram - Recover some VRAM contents
4378 *
4379 * @adev: amdgpu_device pointer
4380 *
4381 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4382 * restore things like GPUVM page tables after a GPU reset where
4383 * the contents of VRAM might be lost.
4384 *
4385 * Returns:
4386 * 0 on success, negative error code on failure.
4387 */
4388 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4389 {
4390 struct dma_fence *fence = NULL, *next = NULL;
4391 struct amdgpu_bo *shadow;
4392 struct amdgpu_bo_vm *vmbo;
4393 long r = 1, tmo;
4394
4395 if (amdgpu_sriov_runtime(adev))
4396 tmo = msecs_to_jiffies(8000);
4397 else
4398 tmo = msecs_to_jiffies(100);
4399
4400 dev_info(adev->dev, "recover vram bo from shadow start\n");
4401 mutex_lock(&adev->shadow_list_lock);
4402 list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
4403 /* If vm is compute context or adev is APU, shadow will be NULL */
4404 if (!vmbo->shadow)
4405 continue;
4406 shadow = vmbo->shadow;
4407
4408 /* No need to recover an evicted BO */
4409 if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
4410 shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
4411 shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
4412 continue;
4413
4414 r = amdgpu_bo_restore_shadow(shadow, &next);
4415 if (r)
4416 break;
4417
4418 if (fence) {
4419 tmo = dma_fence_wait_timeout(fence, false, tmo);
4420 dma_fence_put(fence);
4421 fence = next;
4422 if (tmo == 0) {
4423 r = -ETIMEDOUT;
4424 break;
4425 } else if (tmo < 0) {
4426 r = tmo;
4427 break;
4428 }
4429 } else {
4430 fence = next;
4431 }
4432 }
4433 mutex_unlock(&adev->shadow_list_lock);
4434
4435 if (fence)
4436 tmo = dma_fence_wait_timeout(fence, false, tmo);
4437 dma_fence_put(fence);
4438
4439 if (r < 0 || tmo <= 0) {
4440 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4441 return -EIO;
4442 }
4443
4444 dev_info(adev->dev, "recover vram bo from shadow done\n");
4445 return 0;
4446 }
4447
4448
4449 /**
4450 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4451 *
4452 * @adev: amdgpu_device pointer
4453 * @from_hypervisor: request from hypervisor
4454 *
4455 * Do VF FLR and reinitialize the ASIC.
4456 * Returns 0 on success, a negative error code on failure.
4457 */
4458 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4459 bool from_hypervisor)
4460 {
4461 int r;
4462
4463 if (from_hypervisor)
4464 r = amdgpu_virt_request_full_gpu(adev, true);
4465 else
4466 r = amdgpu_virt_reset_gpu(adev);
4467 if (r)
4468 return r;
4469
4470 amdgpu_amdkfd_pre_reset(adev);
4471
4472 /* Resume IP prior to SMC */
4473 r = amdgpu_device_ip_reinit_early_sriov(adev);
4474 if (r)
4475 goto error;
4476
4477 amdgpu_virt_init_data_exchange(adev);
4478 /* we need to recover the gart prior to running SMC/CP/SDMA resume */
4479 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4480
4481 r = amdgpu_device_fw_loading(adev);
4482 if (r)
4483 return r;
4484
4485 /* now we are okay to resume SMC/CP/SDMA */
4486 r = amdgpu_device_ip_reinit_late_sriov(adev);
4487 if (r)
4488 goto error;
4489
4490 amdgpu_irq_gpu_reset_resume_helper(adev);
4491 r = amdgpu_ib_ring_tests(adev);
4492 amdgpu_amdkfd_post_reset(adev);
4493
4494 error:
4495 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4496 amdgpu_inc_vram_lost(adev);
4497 r = amdgpu_device_recover_vram(adev);
4498 }
4499 amdgpu_virt_release_full_gpu(adev, true);
4500
4501 return r;
4502 }
4503
4504 /**
4505 * amdgpu_device_has_job_running - check if there is any job in the pending list
4506 *
4507 * @adev: amdgpu_device pointer
4508 *
4509 * check if there is any job in the pending list
4510 */
4511 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4512 {
4513 int i;
4514 struct drm_sched_job *job;
4515
4516 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4517 struct amdgpu_ring *ring = adev->rings[i];
4518
4519 if (!ring || !ring->sched.thread)
4520 continue;
4521
4522 spin_lock(&ring->sched.job_list_lock);
4523 job = list_first_entry_or_null(&ring->sched.pending_list,
4524 struct drm_sched_job, list);
4525 spin_unlock(&ring->sched.job_list_lock);
4526 if (job)
4527 return true;
4528 }
4529 return false;
4530 }
4531
4532 /**
4533 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4534 *
4535 * @adev: amdgpu_device pointer
4536 *
4537 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4538 * a hung GPU.
4539 */
4540 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4541 {
4542 if (!amdgpu_device_ip_check_soft_reset(adev)) {
4543 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4544 return false;
4545 }
4546
4547 if (amdgpu_gpu_recovery == 0)
4548 goto disabled;
4549
4550 if (amdgpu_sriov_vf(adev))
4551 return true;
4552
4553 if (amdgpu_gpu_recovery == -1) {
4554 switch (adev->asic_type) {
4555 case CHIP_BONAIRE:
4556 case CHIP_HAWAII:
4557 case CHIP_TOPAZ:
4558 case CHIP_TONGA:
4559 case CHIP_FIJI:
4560 case CHIP_POLARIS10:
4561 case CHIP_POLARIS11:
4562 case CHIP_POLARIS12:
4563 case CHIP_VEGAM:
4564 case CHIP_VEGA20:
4565 case CHIP_VEGA10:
4566 case CHIP_VEGA12:
4567 case CHIP_RAVEN:
4568 case CHIP_ARCTURUS:
4569 case CHIP_RENOIR:
4570 case CHIP_NAVI10:
4571 case CHIP_NAVI14:
4572 case CHIP_NAVI12:
4573 case CHIP_SIENNA_CICHLID:
4574 case CHIP_NAVY_FLOUNDER:
4575 case CHIP_DIMGREY_CAVEFISH:
4576 case CHIP_BEIGE_GOBY:
4577 case CHIP_VANGOGH:
4578 case CHIP_ALDEBARAN:
4579 break;
4580 default:
4581 goto disabled;
4582 }
4583 }
4584
4585 return true;
4586
4587 disabled:
4588 dev_info(adev->dev, "GPU recovery disabled.\n");
4589 return false;
4590 }
4591
4592 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4593 {
4594 u32 i;
4595 int ret = 0;
4596
4597 amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4598
4599 dev_info(adev->dev, "GPU mode1 reset\n");
4600
4601 /* disable BM */
4602 pci_clear_master(adev->pdev);
4603
4604 amdgpu_device_cache_pci_state(adev->pdev);
4605
4606 if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4607 dev_info(adev->dev, "GPU smu mode1 reset\n");
4608 ret = amdgpu_dpm_mode1_reset(adev);
4609 } else {
4610 dev_info(adev->dev, "GPU psp mode1 reset\n");
4611 ret = psp_gpu_reset(adev);
4612 }
4613
4614 if (ret)
4615 dev_err(adev->dev, "GPU mode1 reset failed\n");
4616
4617 amdgpu_device_load_pci_state(adev->pdev);
4618
4619 /* wait for asic to come out of reset */
4620 for (i = 0; i < adev->usec_timeout; i++) {
4621 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4622
4623 if (memsize != 0xffffffff)
4624 break;
4625 udelay(1);
4626 }
4627
4628 amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4629 return ret;
4630 }
4631
4632 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4633 struct amdgpu_reset_context *reset_context)
4634 {
4635 int i, j, r = 0;
4636 struct amdgpu_job *job = NULL;
4637 bool need_full_reset =
4638 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4639
4640 if (reset_context->reset_req_dev == adev)
4641 job = reset_context->job;
4642
4643 if (amdgpu_sriov_vf(adev)) {
4644 /* stop the data exchange thread */
4645 amdgpu_virt_fini_data_exchange(adev);
4646 }
4647
4648 /* block all schedulers and reset given job's ring */
4649 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4650 struct amdgpu_ring *ring = adev->rings[i];
4651
4652 if (!ring || !ring->sched.thread)
4653 continue;
4654
4655 /* Clear job fences from fence_drv to avoid force_completion;
4656 * leave the NULL and vm flush fences in fence_drv */
4657 for (j = 0; j <= ring->fence_drv.num_fences_mask; j++) {
4658 struct dma_fence *old, **ptr;
4659
4660 ptr = &ring->fence_drv.fences[j];
4661 old = rcu_dereference_protected(*ptr, 1);
4662 if (old && test_bit(AMDGPU_FENCE_FLAG_EMBED_IN_JOB_BIT, &old->flags)) {
4663 RCU_INIT_POINTER(*ptr, NULL);
4664 }
4665 }
4666 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4667 amdgpu_fence_driver_force_completion(ring);
4668 }
4669
4670 if (job && job->vm)
4671 drm_sched_increase_karma(&job->base);
4672
4673 r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4674 /* If reset handler not implemented, continue; otherwise return */
4675 if (r == -ENOSYS)
4676 r = 0;
4677 else
4678 return r;
4679
4680 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4681 if (!amdgpu_sriov_vf(adev)) {
4682
4683 if (!need_full_reset)
4684 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4685
4686 if (!need_full_reset) {
4687 amdgpu_device_ip_pre_soft_reset(adev);
4688 r = amdgpu_device_ip_soft_reset(adev);
4689 amdgpu_device_ip_post_soft_reset(adev);
4690 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4691 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4692 need_full_reset = true;
4693 }
4694 }
4695
4696 if (need_full_reset)
4697 r = amdgpu_device_ip_suspend(adev);
4698 if (need_full_reset)
4699 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4700 else
4701 clear_bit(AMDGPU_NEED_FULL_RESET,
4702 &reset_context->flags);
4703 }
4704
4705 return r;
4706 }
4707
4708 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4709 struct amdgpu_reset_context *reset_context)
4710 {
4711 struct amdgpu_device *tmp_adev = NULL;
4712 bool need_full_reset, skip_hw_reset, vram_lost = false;
4713 int r = 0;
4714
4715 /* Try reset handler method first */
4716 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4717 reset_list);
4718 r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4719 /* If reset handler not implemented, continue; otherwise return */
4720 if (r == -ENOSYS)
4721 r = 0;
4722 else
4723 return r;
4724
4725 /* Reset handler not implemented, use the default method */
4726 need_full_reset =
4727 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4728 skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4729
4730 /*
4731 * ASIC reset has to be done on all XGMI hive nodes ASAP
4732 * to allow proper links negotiation in FW (within 1 sec)
4733 */
4734 if (!skip_hw_reset && need_full_reset) {
4735 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4736 /* For XGMI run all resets in parallel to speed up the process */
4737 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4738 tmp_adev->gmc.xgmi.pending_reset = false;
4739 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4740 r = -EALREADY;
4741 } else
4742 r = amdgpu_asic_reset(tmp_adev);
4743
4744 if (r) {
4745 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4746 r, adev_to_drm(tmp_adev)->unique);
4747 break;
4748 }
4749 }
4750
4751 /* For XGMI wait for all resets to complete before proceed */
4752 if (!r) {
4753 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4754 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4755 flush_work(&tmp_adev->xgmi_reset_work);
4756 r = tmp_adev->asic_reset_res;
4757 if (r)
4758 break;
4759 }
4760 }
4761 }
4762 }
4763
4764 if (!r && amdgpu_ras_intr_triggered()) {
4765 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4766 if (tmp_adev->mmhub.ras_funcs &&
4767 tmp_adev->mmhub.ras_funcs->reset_ras_error_count)
4768 tmp_adev->mmhub.ras_funcs->reset_ras_error_count(tmp_adev);
4769 }
4770
4771 amdgpu_ras_intr_cleared();
4772 }
4773
4774 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4775 if (need_full_reset) {
4776 /* post card */
4777 r = amdgpu_device_asic_init(tmp_adev);
4778 if (r) {
4779 dev_warn(tmp_adev->dev, "asic atom init failed!");
4780 } else {
4781 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4782 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4783 if (r)
4784 goto out;
4785
4786 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4787 if (r)
4788 goto out;
4789
4790 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4791 if (vram_lost) {
4792 DRM_INFO("VRAM is lost due to GPU reset!\n");
4793 amdgpu_inc_vram_lost(tmp_adev);
4794 }
4795
4796 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4797 if (r)
4798 goto out;
4799
4800 r = amdgpu_device_fw_loading(tmp_adev);
4801 if (r)
4802 return r;
4803
4804 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4805 if (r)
4806 goto out;
4807
4808 if (vram_lost)
4809 amdgpu_device_fill_reset_magic(tmp_adev);
4810
4811 /*
4812 * Add this ASIC as tracked as reset was already
4813 * complete successfully.
4814 */
4815 amdgpu_register_gpu_instance(tmp_adev);
4816
4817 if (!reset_context->hive &&
4818 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4819 amdgpu_xgmi_add_device(tmp_adev);
4820
4821 r = amdgpu_device_ip_late_init(tmp_adev);
4822 if (r)
4823 goto out;
4824
4825 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4826
4827 /*
4828 * The GPU enters a bad state once the number of faulty pages
4829 * reported by ECC reaches the threshold, and RAS
4830 * recovery is scheduled next. So add a check
4831 * here to break recovery if the bad page threshold is
4832 * indeed exceeded, and remind the user to either
4833 * retire this GPU or set a bigger
4834 * bad_page_threshold value to fix this the next time
4835 * the driver is probed.
4836 */
4837 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4838 /* must succeed. */
4839 amdgpu_ras_resume(tmp_adev);
4840 } else {
4841 r = -EINVAL;
4842 goto out;
4843 }
4844
4845 /* Update PSP FW topology after reset */
4846 if (reset_context->hive &&
4847 tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4848 r = amdgpu_xgmi_update_topology(
4849 reset_context->hive, tmp_adev);
4850 }
4851 }
4852
4853 out:
4854 if (!r) {
4855 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4856 r = amdgpu_ib_ring_tests(tmp_adev);
4857 if (r) {
4858 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4859 need_full_reset = true;
4860 r = -EAGAIN;
4861 goto end;
4862 }
4863 }
4864
4865 if (!r)
4866 r = amdgpu_device_recover_vram(tmp_adev);
4867 else
4868 tmp_adev->asic_reset_res = r;
4869 }
4870
4871 end:
4872 if (need_full_reset)
4873 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4874 else
4875 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4876 return r;
4877 }
4878
4879 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4880 struct amdgpu_hive_info *hive)
4881 {
4882 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4883 return false;
4884
4885 if (hive) {
4886 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4887 } else {
4888 down_write(&adev->reset_sem);
4889 }
4890
4891 switch (amdgpu_asic_reset_method(adev)) {
4892 case AMD_RESET_METHOD_MODE1:
4893 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4894 break;
4895 case AMD_RESET_METHOD_MODE2:
4896 adev->mp1_state = PP_MP1_STATE_RESET;
4897 break;
4898 default:
4899 adev->mp1_state = PP_MP1_STATE_NONE;
4900 break;
4901 }
4902
4903 return true;
4904 }
4905
4906 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4907 {
4908 amdgpu_vf_error_trans_all(adev);
4909 adev->mp1_state = PP_MP1_STATE_NONE;
4910 atomic_set(&adev->in_gpu_reset, 0);
4911 up_write(&adev->reset_sem);
4912 }
4913
4914 /*
4915 * To lock a list of amdgpu devices in a hive safely; if this is not a hive
4916 * with multiple nodes, it behaves like amdgpu_device_lock_adev.
4917 *
4918 * unlock won't require roll back.
4919 */
4920 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4921 {
4922 struct amdgpu_device *tmp_adev = NULL;
4923
4924 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4925 if (!hive) {
4926 dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4927 return -ENODEV;
4928 }
4929 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4930 if (!amdgpu_device_lock_adev(tmp_adev, hive))
4931 goto roll_back;
4932 }
4933 } else if (!amdgpu_device_lock_adev(adev, hive))
4934 return -EAGAIN;
4935
4936 return 0;
4937 roll_back:
4938 if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4939 /*
4940 * If the locking iteration breaks in the middle of a hive,
4941 * it may mean there is a race issue,
4942 * or that a hive device locked up independently.
4943 * We may or may not be in trouble, so try to roll back
4944 * the locks and give out a warning.
4945 */
4946 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4947 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4948 amdgpu_device_unlock_adev(tmp_adev);
4949 }
4950 }
4951 return -EAGAIN;
4952 }
4953
4954 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4955 {
4956 struct pci_dev *p = NULL;
4957
4958 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4959 adev->pdev->bus->number, 1);
4960 if (p) {
4961 pm_runtime_enable(&(p->dev));
4962 pm_runtime_resume(&(p->dev));
4963 }
4964
4965 pci_dev_put(p);
4966 }
4967
4968 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4969 {
4970 enum amd_reset_method reset_method;
4971 struct pci_dev *p = NULL;
4972 u64 expires;
4973
4974 /*
4975 * For now, only BACO and mode1 reset are confirmed
4976 * to suffer the audio issue when not properly suspended.
4977 */
4978 reset_method = amdgpu_asic_reset_method(adev);
4979 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4980 (reset_method != AMD_RESET_METHOD_MODE1))
4981 return -EINVAL;
4982
4983 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4984 adev->pdev->bus->number, 1);
4985 if (!p)
4986 return -ENODEV;
4987
4988 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4989 if (!expires)
4990 /*
4991 * If we cannot get the audio device autosuspend delay,
4992 * a fixed 4s interval will be used. Since 3s is
4993 * the audio controller's default autosuspend delay setting,
4994 * the 4s used here is guaranteed to cover it.
4995 */
4996 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4997
4998 while (!pm_runtime_status_suspended(&(p->dev))) {
4999 if (!pm_runtime_suspend(&(p->dev)))
5000 break;
5001
5002 if (expires < ktime_get_mono_fast_ns()) {
5003 dev_warn(adev->dev, "failed to suspend display audio\n");
5004 pci_dev_put(p);
5005 /* TODO: abort the succeeding gpu reset? */
5006 return -ETIMEDOUT;
5007 }
5008 }
5009
5010 pm_runtime_disable(&(p->dev));
5011
5012 pci_dev_put(p);
5013 return 0;
5014 }
5015
5016 static void amdgpu_device_recheck_guilty_jobs(
5017 struct amdgpu_device *adev, struct list_head *device_list_handle,
5018 struct amdgpu_reset_context *reset_context)
5019 {
5020 int i, r = 0;
5021
5022 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5023 struct amdgpu_ring *ring = adev->rings[i];
5024 int ret = 0;
5025 struct drm_sched_job *s_job;
5026
5027 if (!ring || !ring->sched.thread)
5028 continue;
5029
5030 s_job = list_first_entry_or_null(&ring->sched.pending_list,
5031 struct drm_sched_job, list);
5032 if (s_job == NULL)
5033 continue;
5034
5035 /* clear the job's guilty flag and rely on the following step to decide the real one */
5036 drm_sched_reset_karma(s_job);
5037 drm_sched_resubmit_jobs_ext(&ring->sched, 1);
5038
5039 ret = dma_fence_wait_timeout(s_job->s_fence->parent, false, ring->sched.timeout);
5040 if (ret == 0) { /* timeout */
5041 DRM_ERROR("Found the real bad job! ring:%s, job_id:%llx\n",
5042 ring->sched.name, s_job->id);
5043
5044 /* set guilty */
5045 drm_sched_increase_karma(s_job);
5046 retry:
5047 /* do hw reset */
5048 if (amdgpu_sriov_vf(adev)) {
5049 amdgpu_virt_fini_data_exchange(adev);
5050 r = amdgpu_device_reset_sriov(adev, false);
5051 if (r)
5052 adev->asic_reset_res = r;
5053 } else {
5054 clear_bit(AMDGPU_SKIP_HW_RESET,
5055 &reset_context->flags);
5056 r = amdgpu_do_asic_reset(device_list_handle,
5057 reset_context);
5058 if (r && r == -EAGAIN)
5059 goto retry;
5060 }
5061
5062 /*
5063 * add reset counter so that the following
5064 * resubmitted job could flush vmid
5065 */
5066 atomic_inc(&adev->gpu_reset_counter);
5067 continue;
5068 }
5069
5070 /* got the hw fence, signal finished fence */
5071 atomic_dec(ring->sched.score);
5072 dma_fence_get(&s_job->s_fence->finished);
5073 dma_fence_signal(&s_job->s_fence->finished);
5074 dma_fence_put(&s_job->s_fence->finished);
5075
5076 /* remove node from list and free the job */
5077 spin_lock(&ring->sched.job_list_lock);
5078 list_del_init(&s_job->list);
5079 spin_unlock(&ring->sched.job_list_lock);
5080 ring->sched.ops->free_job(s_job);
5081 }
5082 }
5083
5084 /**
5085 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5086 *
5087 * @adev: amdgpu_device pointer
5088 * @job: which job trigger hang
5089 *
5090 * Attempt to reset the GPU if it has hung (all asics).
5091 * Attempt to do soft-reset or full-reset and reinitialize Asic
5092 * Returns 0 for success or an error on failure.
5093 */
5094
5095 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5096 struct amdgpu_job *job)
5097 {
5098 struct list_head device_list, *device_list_handle = NULL;
5099 bool job_signaled = false;
5100 struct amdgpu_hive_info *hive = NULL;
5101 struct amdgpu_device *tmp_adev = NULL;
5102 int i, r = 0;
5103 bool need_emergency_restart = false;
5104 bool audio_suspended = false;
5105 int tmp_vram_lost_counter;
5106 struct amdgpu_reset_context reset_context;
5107
5108 memset(&reset_context, 0, sizeof(reset_context));
5109
5110 /*
5111 * Special case: RAS triggered and full reset isn't supported
5112 */
5113 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5114
5115 /*
5116 * Flush RAM to disk so that after reboot
5117 * the user can read log and see why the system rebooted.
5118 */
5119 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
5120 amdgpu_ras_get_context(adev)->reboot) {
5121 DRM_WARN("Emergency reboot.");
5122
5123 ksys_sync_helper();
5124 emergency_restart();
5125 }
5126
5127 dev_info(adev->dev, "GPU %s begin!\n",
5128 need_emergency_restart ? "jobs stop":"reset");
5129
5130 /*
5131 * Here we trylock to avoid chain of resets executing from
5132 * either trigger by jobs on different adevs in XGMI hive or jobs on
5133 * different schedulers for same device while this TO handler is running.
5134 * We always reset all schedulers for device and all devices for XGMI
5135 * hive so that should take care of them too.
5136 */
5137 hive = amdgpu_get_xgmi_hive(adev);
5138 if (hive) {
5139 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
5140 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
5141 job ? job->base.id : -1, hive->hive_id);
5142 amdgpu_put_xgmi_hive(hive);
5143 if (job && job->vm)
5144 drm_sched_increase_karma(&job->base);
5145 return 0;
5146 }
5147 mutex_lock(&hive->hive_lock);
5148 }
5149
5150 reset_context.method = AMD_RESET_METHOD_NONE;
5151 reset_context.reset_req_dev = adev;
5152 reset_context.job = job;
5153 reset_context.hive = hive;
5154 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5155
5156 /*
5157 * Lock the device before we try to operate on the linked list;
5158 * if we didn't get the device lock, don't touch the linked list since
5159 * others may be iterating over it.
5160 */
5161 r = amdgpu_device_lock_hive_adev(adev, hive);
5162 if (r) {
5163 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
5164 job ? job->base.id : -1);
5165
5166 /* even we skipped this reset, still need to set the job to guilty */
5167 if (job && job->vm)
5168 drm_sched_increase_karma(&job->base);
5169 goto skip_recovery;
5170 }
5171
5172 /*
5173 * Build list of devices to reset.
5174 * In case we are in XGMI hive mode, resort the device list
5175 * to put adev in the 1st position.
5176 */
5177 INIT_LIST_HEAD(&device_list);
5178 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5179 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
5180 list_add_tail(&tmp_adev->reset_list, &device_list);
5181 if (!list_is_first(&adev->reset_list, &device_list))
5182 list_rotate_to_front(&adev->reset_list, &device_list);
5183 device_list_handle = &device_list;
5184 } else {
5185 list_add_tail(&adev->reset_list, &device_list);
5186 device_list_handle = &device_list;
5187 }
5188
5189 /* block all schedulers and reset given job's ring */
5190 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5191 /*
5192 * Try to put the audio codec into suspend state
5193 * before the gpu reset starts.
5194 *
5195 * The power domain of the graphics device
5196 * is shared with the AZ power domain. Without this,
5197 * we may change the audio hardware from behind
5198 * the audio driver's back. That will trigger
5199 * some audio codec errors.
5200 */
5201 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5202 audio_suspended = true;
5203
5204 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5205
5206 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5207
5208 if (!amdgpu_sriov_vf(tmp_adev))
5209 amdgpu_amdkfd_pre_reset(tmp_adev);
5210
5211 /*
5212 * Mark these ASICs to be reset as untracked first,
5213 * and add them back after the reset completes.
5214 */
5215 amdgpu_unregister_gpu_instance(tmp_adev);
5216
5217 amdgpu_fbdev_set_suspend(tmp_adev, 1);
5218
5219 /* disable ras on ALL IPs */
5220 if (!need_emergency_restart &&
5221 amdgpu_device_ip_need_full_reset(tmp_adev))
5222 amdgpu_ras_suspend(tmp_adev);
5223
5224 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5225 struct amdgpu_ring *ring = tmp_adev->rings[i];
5226
5227 if (!ring || !ring->sched.thread)
5228 continue;
5229
5230 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5231
5232 if (need_emergency_restart)
5233 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5234 }
5235 atomic_inc(&tmp_adev->gpu_reset_counter);
5236 }
5237
5238 if (need_emergency_restart)
5239 goto skip_sched_resume;
5240
5241 /*
5242 * Must check guilty signal here since after this point all old
5243 * HW fences are force signaled.
5244 *
5245 * job->base holds a reference to parent fence
5246 */
5247 if (job && job->base.s_fence->parent &&
5248 dma_fence_is_signaled(job->base.s_fence->parent)) {
5249 job_signaled = true;
5250 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5251 goto skip_hw_reset;
5252 }
5253
5254 retry: /* Rest of adevs pre asic reset from XGMI hive. */
5255 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5256 r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
5257 /*TODO Should we stop ?*/
5258 if (r) {
5259 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5260 r, adev_to_drm(tmp_adev)->unique);
5261 tmp_adev->asic_reset_res = r;
5262 }
5263 }
5264
5265 tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter));
5266 /* Actual ASIC resets if needed.*/
5267 /* TODO Implement XGMI hive reset logic for SRIOV */
5268 if (amdgpu_sriov_vf(adev)) {
5269 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5270 if (r)
5271 adev->asic_reset_res = r;
5272 } else {
5273 r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
5274 if (r && r == -EAGAIN)
5275 goto retry;
5276 }
5277
5278 skip_hw_reset:
5279
5280 /* Post ASIC reset for all devs .*/
5281 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5282
5283 /*
5284 * Sometimes a later bad compute job can block a good gfx job, since the gfx
5285 * and compute rings share internal GC HW. We add an additional
5286 * guilty-job recheck step to find the real guilty job: it synchronously
5287 * resubmits and waits for the first job to be signaled. If that times out,
5288 * we identify it as the real guilty job.
5289 */
5290 if (amdgpu_gpu_recovery == 2 &&
5291 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
5292 amdgpu_device_recheck_guilty_jobs(
5293 tmp_adev, device_list_handle, &reset_context);
5294
5295 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5296 struct amdgpu_ring *ring = tmp_adev->rings[i];
5297
5298 if (!ring || !ring->sched.thread)
5299 continue;
5300
5301 /* No point in resubmitting jobs if we didn't HW reset */
5302 if (!tmp_adev->asic_reset_res && !job_signaled)
5303 drm_sched_resubmit_jobs(&ring->sched);
5304
5305 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
5306 }
5307
5308 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5309 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5310 }
5311
5312 tmp_adev->asic_reset_res = 0;
5313
5314 if (r) {
5315 /* bad news, how to tell it to userspace ? */
5316 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5317 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5318 } else {
5319 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5320 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5321 DRM_WARN("smart shift update failed\n");
5322 }
5323 }
5324
5325 skip_sched_resume:
5326 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5327 /* unlock kfd: SRIOV would do it separately */
5328 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5329 amdgpu_amdkfd_post_reset(tmp_adev);
5330
5331 /* kfd_post_reset will do nothing if the kfd device is not initialized,
5332 * so bring up kfd here if it has not been initialized before.
5333 */
5334 if (!adev->kfd.init_complete)
5335 amdgpu_amdkfd_device_init(adev);
5336
5337 if (audio_suspended)
5338 amdgpu_device_resume_display_audio(tmp_adev);
5339 amdgpu_device_unlock_adev(tmp_adev);
5340 }
5341
5342 skip_recovery:
5343 if (hive) {
5344 atomic_set(&hive->in_reset, 0);
5345 mutex_unlock(&hive->hive_lock);
5346 amdgpu_put_xgmi_hive(hive);
5347 }
5348
5349 if (r && r != -EAGAIN)
5350 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5351 return r;
5352 }
5353
5354 /**
5355 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
5356 *
5357 * @adev: amdgpu_device pointer
5358 *
5359 * Fetches and stores in the driver the PCIE capabilities (gen speed
5360 * and lanes) of the slot the device is in. Handles APUs and
5361 * virtualized environments where PCIE config space may not be available.
5362 */
5363 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5364 {
5365 struct pci_dev *pdev;
5366 enum pci_bus_speed speed_cap, platform_speed_cap;
5367 enum pcie_link_width platform_link_width;
5368
5369 if (amdgpu_pcie_gen_cap)
5370 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5371
5372 if (amdgpu_pcie_lane_cap)
5373 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5374
5375 /* covers APUs as well */
5376 if (pci_is_root_bus(adev->pdev->bus)) {
5377 if (adev->pm.pcie_gen_mask == 0)
5378 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5379 if (adev->pm.pcie_mlw_mask == 0)
5380 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5381 return;
5382 }
5383
5384 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5385 return;
5386
5387 pcie_bandwidth_available(adev->pdev, NULL,
5388 &platform_speed_cap, &platform_link_width);
5389
5390 if (adev->pm.pcie_gen_mask == 0) {
5391 /* asic caps */
5392 pdev = adev->pdev;
5393 speed_cap = pcie_get_speed_cap(pdev);
5394 if (speed_cap == PCI_SPEED_UNKNOWN) {
5395 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5396 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5397 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5398 } else {
5399 if (speed_cap == PCIE_SPEED_32_0GT)
5400 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5401 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5402 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5403 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5404 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5405 else if (speed_cap == PCIE_SPEED_16_0GT)
5406 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5407 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5408 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5409 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5410 else if (speed_cap == PCIE_SPEED_8_0GT)
5411 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5412 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5413 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5414 else if (speed_cap == PCIE_SPEED_5_0GT)
5415 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5416 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5417 else
5418 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5419 }
5420 /* platform caps */
5421 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5422 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5423 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5424 } else {
5425 if (platform_speed_cap == PCIE_SPEED_32_0GT)
5426 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5427 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5428 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5429 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5430 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5431 else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5432 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5433 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5434 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5435 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5436 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5437 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5438 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5439 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5440 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5441 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5442 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5443 else
5444 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5445
5446 }
5447 }
5448 if (adev->pm.pcie_mlw_mask == 0) {
5449 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5450 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5451 } else {
5452 switch (platform_link_width) {
5453 case PCIE_LNK_X32:
5454 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5455 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5456 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5457 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5458 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5459 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5460 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5461 break;
5462 case PCIE_LNK_X16:
5463 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5464 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5465 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5466 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5467 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5468 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5469 break;
5470 case PCIE_LNK_X12:
5471 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5472 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5473 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5474 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5475 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5476 break;
5477 case PCIE_LNK_X8:
5478 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5479 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5480 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5481 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5482 break;
5483 case PCIE_LNK_X4:
5484 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5485 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5486 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5487 break;
5488 case PCIE_LNK_X2:
5489 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5490 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5491 break;
5492 case PCIE_LNK_X1:
5493 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5494 break;
5495 default:
5496 break;
5497 }
5498 }
5499 }
5500 }
5501
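/**
 * amdgpu_device_baco_enter - enter BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Disables the doorbell interrupt when RAS is enabled and requests BACO
 * entry through the DPM interface.
 *
 * Return: 0 on success, -ENOTSUPP if the device does not support BACO,
 * or a negative error code otherwise.
 */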
5502 int amdgpu_device_baco_enter(struct drm_device *dev)
5503 {
5504 struct amdgpu_device *adev = drm_to_adev(dev);
5505 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5506
5507 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5508 return -ENOTSUPP;
5509
5510 if (ras && adev->ras_enabled &&
5511 adev->nbio.funcs->enable_doorbell_interrupt)
5512 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5513
5514 return amdgpu_dpm_baco_enter(adev);
5515 }
5516
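/**
 * amdgpu_device_baco_exit - exit BACO (Bus Active, Chip Off) state
 *
 * @dev: drm_device pointer
 *
 * Requests BACO exit through the DPM interface, re-enables the doorbell
 * interrupt when RAS is enabled and, for passthrough setups, clears any
 * pending doorbell interrupt.
 *
 * Return: 0 on success, -ENOTSUPP if the device does not support BACO,
 * or a negative error code otherwise.
 */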
5517 int amdgpu_device_baco_exit(struct drm_device *dev)
5518 {
5519 struct amdgpu_device *adev = drm_to_adev(dev);
5520 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5521 int ret = 0;
5522
5523 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5524 return -ENOTSUPP;
5525
5526 ret = amdgpu_dpm_baco_exit(adev);
5527 if (ret)
5528 return ret;
5529
5530 if (ras && adev->ras_enabled &&
5531 adev->nbio.funcs->enable_doorbell_interrupt)
5532 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5533
5534 if (amdgpu_passthrough(adev) &&
5535 adev->nbio.funcs->clear_doorbell_interrupt)
5536 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5537
5538 return 0;
5539 }
5540
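/**
 * amdgpu_cancel_all_tdr - cancel all pending scheduler timeout work
 *
 * @adev: amdgpu_device pointer
 *
 * Cancels and waits for the timeout (TDR) work of every ring's scheduler
 * so that no timeout handler can race with PCI error recovery.
 */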
5541 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5542 {
5543 int i;
5544
5545 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5546 struct amdgpu_ring *ring = adev->rings[i];
5547
5548 if (!ring || !ring->sched.thread)
5549 continue;
5550
5551 cancel_delayed_work_sync(&ring->sched.work_tdr);
5552 }
5553 }
5554
5555 /**
5556 * amdgpu_pci_error_detected - Called when a PCI error is detected.
5557 * @pdev: PCI device struct
5558 * @state: PCI channel state
5559 *
5560 * Description: Called when a PCI error is detected.
5561 *
5562 * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or
5562 * PCI_ERS_RESULT_DISCONNECT.
5563 */
5564 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5565 {
5566 struct drm_device *dev = pci_get_drvdata(pdev);
5567 struct amdgpu_device *adev = drm_to_adev(dev);
5568 int i;
5569
5570 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5571
5572 if (adev->gmc.xgmi.num_physical_nodes > 1) {
5573 DRM_WARN("No support for XGMI hive yet...");
5574 return PCI_ERS_RESULT_DISCONNECT;
5575 }
5576
5577 adev->pci_channel_state = state;
5578
5579 switch (state) {
5580 case pci_channel_io_normal:
5581 return PCI_ERS_RESULT_CAN_RECOVER;
5582 /* Fatal error, prepare for slot reset */
5583 case pci_channel_io_frozen:
5584 /*
5585 * Cancel and wait for all TDRs in progress if we fail to
5586 * set adev->in_gpu_reset in amdgpu_device_lock_adev().
5587 *
5588 * Holding adev->reset_sem prevents any external access to
5589 * the GPU during PCI error recovery.
5590 */
5591 while (!amdgpu_device_lock_adev(adev, NULL))
5592 amdgpu_cancel_all_tdr(adev);
5593
5594 /*
5595 * Block any work scheduling, as we do for a regular GPU reset,
5596 * for the duration of the recovery.
5597 */
5598 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5599 struct amdgpu_ring *ring = adev->rings[i];
5600
5601 if (!ring || !ring->sched.thread)
5602 continue;
5603
5604 drm_sched_stop(&ring->sched, NULL);
5605 }
5606 atomic_inc(&adev->gpu_reset_counter);
5607 return PCI_ERS_RESULT_NEED_RESET;
5608 case pci_channel_io_perm_failure:
5609 /* Permanent error, prepare for device removal */
5610 return PCI_ERS_RESULT_DISCONNECT;
5611 }
5612
5613 return PCI_ERS_RESULT_NEED_RESET;
5614 }
5615
5616 /**
5617 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5618 * @pdev: pointer to PCI device
5619 */
5620 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5621 {
5622
5623 DRM_INFO("PCI error: mmio enabled callback!!\n");
5624
5625 /* TODO - dump whatever for debugging purposes */
5626
5627 /* This is called only if amdgpu_pci_error_detected() returns
5628 * PCI_ERS_RESULT_CAN_RECOVER. Reads/writes to the device still
5629 * work, so there is no need to reset the slot.
5630 */
5631
5632 return PCI_ERS_RESULT_RECOVERED;
5633 }
5634
5635 /**
5636 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5637 * @pdev: PCI device struct
5638 *
5639 * Description: This routine is called by the pci error recovery
5640 * code after the PCI slot has been reset, just before we
5641 * should resume normal operations.
5642 */
5643 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5644 {
5645 struct drm_device *dev = pci_get_drvdata(pdev);
5646 struct amdgpu_device *adev = drm_to_adev(dev);
5647 int r, i;
5648 struct amdgpu_reset_context reset_context;
5649 u32 memsize;
5650 struct list_head device_list;
5651
5652 DRM_INFO("PCI error: slot reset callback!!\n");
5653
5654 memset(&reset_context, 0, sizeof(reset_context));
5655
5656 INIT_LIST_HEAD(&device_list);
5657 list_add_tail(&adev->reset_list, &device_list);
5658
5659 /* wait for asic to come out of reset */
5660 msleep(500);
5661
5662 /* Restore PCI config space */
5663 amdgpu_device_load_pci_state(pdev);
5664
5665 /* confirm ASIC came out of reset */
5666 for (i = 0; i < adev->usec_timeout; i++) {
5667 memsize = amdgpu_asic_get_config_memsize(adev);
5668
5669 if (memsize != 0xffffffff)
5670 break;
5671 udelay(1);
5672 }
5673 if (memsize == 0xffffffff) {
5674 r = -ETIME;
5675 goto out;
5676 }
5677
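/* The PCI core has already reset the slot at this point, so skip the
 * HW reset itself and only run the pre/post reset handling needed to
 * bring the IP blocks back up.
 */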
5678 reset_context.method = AMD_RESET_METHOD_NONE;
5679 reset_context.reset_req_dev = adev;
5680 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5681 set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5682
5683 adev->no_hw_access = true;
5684 r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5685 adev->no_hw_access = false;
5686 if (r)
5687 goto out;
5688
5689 r = amdgpu_do_asic_reset(&device_list, &reset_context);
5690
5691 out:
5692 if (!r) {
5693 if (amdgpu_device_cache_pci_state(adev->pdev))
5694 pci_restore_state(adev->pdev);
5695
5696 DRM_INFO("PCIe error recovery succeeded\n");
5697 } else {
5698 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5699 amdgpu_device_unlock_adev(adev);
5700 }
5701
5702 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5703 }
5704
5705 /**
5706 * amdgpu_pci_resume() - resume normal ops after PCI reset
5707 * @pdev: pointer to PCI device
5708 *
5709 * Called when the error recovery driver tells us that it's
5710 * OK to resume normal operation.
5711 */
5712 void amdgpu_pci_resume(struct pci_dev *pdev)
5713 {
5714 struct drm_device *dev = pci_get_drvdata(pdev);
5715 struct amdgpu_device *adev = drm_to_adev(dev);
5716 int i;
5717
5718
5719 DRM_INFO("PCI error: resume callback!!\n");
5720
5721 /* Only continue execution for the case of pci_channel_io_frozen */
5722 if (adev->pci_channel_state != pci_channel_io_frozen)
5723 return;
5724
5725 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5726 struct amdgpu_ring *ring = adev->rings[i];
5727
5728 if (!ring || !ring->sched.thread)
5729 continue;
5730
5731
5732 drm_sched_resubmit_jobs(&ring->sched);
5733 drm_sched_start(&ring->sched, true);
5734 }
5735
5736 amdgpu_device_unlock_adev(adev);
5737 }
5738
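/**
 * amdgpu_device_cache_pci_state - cache the PCI configuration space
 *
 * @pdev: PCI device struct
 *
 * Saves the PCI configuration space and stores a copy in adev->pci_state
 * so that it can be restored after a reset.
 *
 * Return: true on success, false otherwise.
 */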
5739 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5740 {
5741 struct drm_device *dev = pci_get_drvdata(pdev);
5742 struct amdgpu_device *adev = drm_to_adev(dev);
5743 int r;
5744
5745 r = pci_save_state(pdev);
5746 if (!r) {
5747 kfree(adev->pci_state);
5748
5749 adev->pci_state = pci_store_saved_state(pdev);
5750
5751 if (!adev->pci_state) {
5752 DRM_ERROR("Failed to store PCI saved state");
5753 return false;
5754 }
5755 } else {
5756 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5757 return false;
5758 }
5759
5760 return true;
5761 }
5762
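/**
 * amdgpu_device_load_pci_state - restore the cached PCI configuration space
 *
 * @pdev: PCI device struct
 *
 * Loads the configuration space previously cached by
 * amdgpu_device_cache_pci_state() and restores it to the device.
 *
 * Return: true on success, false otherwise.
 */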
5763 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5764 {
5765 struct drm_device *dev = pci_get_drvdata(pdev);
5766 struct amdgpu_device *adev = drm_to_adev(dev);
5767 int r;
5768
5769 if (!adev->pci_state)
5770 return false;
5771
5772 r = pci_load_saved_state(pdev, adev->pci_state);
5773
5774 if (!r) {
5775 pci_restore_state(pdev);
5776 } else {
5777 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5778 return false;
5779 }
5780
5781 return true;
5782 }
5783
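/**
 * amdgpu_device_flush_hdp - flush the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: optional ring to emit the flush on
 *
 * Skipped on bare-metal APUs and on devices connected to the CPU via
 * XGMI, where no HDP flush is needed. Otherwise the flush is emitted on
 * @ring if it supports it, or done through the ASIC-level callback.
 */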
5784 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5785 struct amdgpu_ring *ring)
5786 {
5787 #ifdef CONFIG_X86_64
5788 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5789 return;
5790 #endif
5791 if (adev->gmc.xgmi.connected_to_cpu)
5792 return;
5793
5794 if (ring && ring->funcs->emit_hdp_flush)
5795 amdgpu_ring_emit_hdp_flush(ring);
5796 else
5797 amdgpu_asic_flush_hdp(adev, ring);
5798 }
5799
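/**
 * amdgpu_device_invalidate_hdp - invalidate the HDP (Host Data Path) cache
 *
 * @adev: amdgpu_device pointer
 * @ring: ring the invalidation is associated with, may be NULL
 *
 * Skipped on bare-metal APUs and on devices connected to the CPU via
 * XGMI, where no HDP invalidation is needed.
 */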
5800 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5801 struct amdgpu_ring *ring)
5802 {
5803 #ifdef CONFIG_X86_64
5804 if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5805 return;
5806 #endif
5807 if (adev->gmc.xgmi.connected_to_cpu)
5808 return;
5809
5810 amdgpu_asic_invalidate_hdp(adev, ring);
5811 }
5812