1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83
84 #define AMDGPU_RESUME_MS 2000
85
86 const char *amdgpu_asic_name[] = {
87 "TAHITI",
88 "PITCAIRN",
89 "VERDE",
90 "OLAND",
91 "HAINAN",
92 "BONAIRE",
93 "KAVERI",
94 "KABINI",
95 "HAWAII",
96 "MULLINS",
97 "TOPAZ",
98 "TONGA",
99 "FIJI",
100 "CARRIZO",
101 "STONEY",
102 "POLARIS10",
103 "POLARIS11",
104 "POLARIS12",
105 "VEGAM",
106 "VEGA10",
107 "VEGA12",
108 "VEGA20",
109 "RAVEN",
110 "ARCTURUS",
111 "RENOIR",
112 "NAVI10",
113 "NAVI14",
114 "NAVI12",
115 "SIENNA_CICHLID",
116 "NAVY_FLOUNDER",
117 "LAST",
118 };
119
120 /**
121 * DOC: pcie_replay_count
122 *
123 * The amdgpu driver provides a sysfs API for reporting the total number
124 * of PCIe replays (NAKs)
125 * The file pcie_replay_count is used for this and returns the total
126 * number of replays as a sum of the NAKs generated and NAKs received
127 */
128
129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130 struct device_attribute *attr, char *buf)
131 {
132 struct drm_device *ddev = dev_get_drvdata(dev);
133 struct amdgpu_device *adev = drm_to_adev(ddev);
134 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135
136 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140 amdgpu_device_get_pcie_replay_count, NULL);
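/*
 * Illustrative usage, not part of the driver: the attribute appears in the
 * PCI device's sysfs directory; the exact path below is an assumption for a
 * typical single-GPU system.
 *
 *   $ cat /sys/class/drm/card0/device/pcie_replay_count
 *   0
 */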
141
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143
144 /**
145 * DOC: product_name
146 *
147 * The amdgpu driver provides a sysfs API for reporting the product name
148 * for the device
149 * The file product_name is used for this and returns the product name
150 * as returned from the FRU.
151 * NOTE: This is only available for certain server cards
152 */
153
154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155 struct device_attribute *attr, char *buf)
156 {
157 struct drm_device *ddev = dev_get_drvdata(dev);
158 struct amdgpu_device *adev = drm_to_adev(ddev);
159
160 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162
163 static DEVICE_ATTR(product_name, S_IRUGO,
164 amdgpu_device_get_product_name, NULL);
165
166 /**
167 * DOC: product_number
168 *
169 * The amdgpu driver provides a sysfs API for reporting the part number
170 * for the device
171 * The file product_number is used for this and returns the part number
172 * as returned from the FRU.
173 * NOTE: This is only available for certain server cards
174 */
175
176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177 struct device_attribute *attr, char *buf)
178 {
179 struct drm_device *ddev = dev_get_drvdata(dev);
180 struct amdgpu_device *adev = drm_to_adev(ddev);
181
182 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184
185 static DEVICE_ATTR(product_number, S_IRUGO,
186 amdgpu_device_get_product_number, NULL);
187
188 /**
189 * DOC: serial_number
190 *
191 * The amdgpu driver provides a sysfs API for reporting the serial number
192 * for the device
193 * The file serial_number is used for this and returns the serial number
194 * as returned from the FRU.
195 * NOTE: This is only available for certain server cards
196 */
197
198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199 struct device_attribute *attr, char *buf)
200 {
201 struct drm_device *ddev = dev_get_drvdata(dev);
202 struct amdgpu_device *adev = drm_to_adev(ddev);
203
204 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208 amdgpu_device_get_serial_number, NULL);
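/*
 * Illustrative usage, not part of the driver: on server cards with FRU data,
 * the FRU-backed attributes can be read the same way (paths assumed):
 *
 *   $ cat /sys/class/drm/card0/device/product_name
 *   $ cat /sys/class/drm/card0/device/serial_number
 */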
209
210 /**
211 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212 *
213 * @dev: drm_device pointer
214 *
215 * Returns true if the device is a dGPU with HG/PX power control,
216 * otherwise return false.
217 */
218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220 struct amdgpu_device *adev = drm_to_adev(dev);
221
222 if (adev->flags & AMD_IS_PX)
223 return true;
224 return false;
225 }
226
227 /**
228 * amdgpu_device_supports_baco - Does the device support BACO
229 *
230 * @dev: drm_device pointer
231 *
232 * Returns true if the device supports BACO,
233 * otherwise return false.
234 */
235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237 struct amdgpu_device *adev = drm_to_adev(dev);
238
239 return amdgpu_asic_supports_baco(adev);
240 }
241
242 /*
243 * VRAM access helper functions
244 */
245
246 /**
247 * amdgpu_device_vram_access - read/write a buffer in vram
248 *
249 * @adev: amdgpu_device pointer
250 * @pos: offset of the buffer in vram
251 * @buf: virtual address of the buffer in system memory
252 * @size: read/write size in bytes; the buffer at @buf must be at least @size bytes
253 * @write: true - write to vram, otherwise - read from vram
254 */
255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256 uint32_t *buf, size_t size, bool write)
257 {
258 unsigned long flags;
259 uint32_t hi = ~0;
260 uint64_t last;
261
262
263 #ifdef CONFIG_64BIT
264 last = min(pos + size, adev->gmc.visible_vram_size);
265 if (last > pos) {
266 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267 size_t count = last - pos;
268
269 if (write) {
270 memcpy_toio(addr, buf, count);
271 mb();
272 amdgpu_asic_flush_hdp(adev, NULL);
273 } else {
274 amdgpu_asic_invalidate_hdp(adev, NULL);
275 mb();
276 memcpy_fromio(buf, addr, count);
277 }
278
279 if (count == size)
280 return;
281
282 pos += count;
283 buf += count / 4;
284 size -= count;
285 }
286 #endif
287
288 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289 for (last = pos + size; pos < last; pos += 4) {
290 uint32_t tmp = pos >> 31;
291
292 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293 if (tmp != hi) {
294 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295 hi = tmp;
296 }
297 if (write)
298 WREG32_NO_KIQ(mmMM_DATA, *buf++);
299 else
300 *buf++ = RREG32_NO_KIQ(mmMM_DATA);
301 }
302 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
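/*
 * Illustrative sketch, not part of the driver: reading and writing a few
 * dwords of VRAM through the helper above. The VRAM offset is made up.
 *
 *   uint32_t data[4];
 *
 *   amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), false);
 *   data[0] |= 0x1;
 *   amdgpu_device_vram_access(adev, 0x1000, data, sizeof(data), true);
 */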
304
305 /*
306 * register access helper functions.
307 */
308 /**
309 * amdgpu_device_rreg - read a memory mapped IO or indirect register
310 *
311 * @adev: amdgpu_device pointer
312 * @reg: dword aligned register offset
313 * @acc_flags: access flags which require special behavior
314 *
315 * Returns the 32 bit value from the offset specified.
316 */
317 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
318 uint32_t reg, uint32_t acc_flags)
319 {
320 uint32_t ret;
321
322 if (adev->in_pci_err_recovery)
323 return 0;
324
325 if ((reg * 4) < adev->rmmio_size) {
326 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
327 amdgpu_sriov_runtime(adev) &&
328 down_read_trylock(&adev->reset_sem)) {
329 ret = amdgpu_kiq_rreg(adev, reg);
330 up_read(&adev->reset_sem);
331 } else {
332 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
333 }
334 } else {
335 ret = adev->pcie_rreg(adev, reg * 4);
336 }
337
338 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
339
340 return ret;
341 }
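/*
 * Note: code in the driver typically does not call amdgpu_device_rreg()/
 * amdgpu_device_wreg() directly but goes through the RREG32()/WREG32()
 * style macros (used further below in this file), e.g.:
 *
 *   tmp = RREG32(reg);
 *   tmp |= mask;
 *   WREG32(reg, tmp);
 */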
342
343 /*
344 * MMIO register read with bytes helper functions
345 * @offset: byte offset from MMIO start
346 *
347 */
348
349 /**
350 * amdgpu_mm_rreg8 - read a memory mapped IO register
351 *
352 * @adev: amdgpu_device pointer
353 * @offset: byte aligned register offset
354 *
355 * Returns the 8 bit value from the offset specified.
356 */
357 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
358 {
359 if (adev->in_pci_err_recovery)
360 return 0;
361
362 if (offset < adev->rmmio_size)
363 return (readb(adev->rmmio + offset));
364 BUG();
365 }
366
367 /*
368 * MMIO register write with bytes helper functions
369 * @offset: byte offset from MMIO start
370 * @value: the value to be written to the register
371 *
372 */
373 /**
374 * amdgpu_mm_wreg8 - write a memory mapped IO register
375 *
376 * @adev: amdgpu_device pointer
377 * @offset: byte aligned register offset
378 * @value: 8 bit value to write
379 *
380 * Writes the value specified to the offset specified.
381 */
382 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
383 {
384 if (adev->in_pci_err_recovery)
385 return;
386
387 if (offset < adev->rmmio_size)
388 writeb(value, adev->rmmio + offset);
389 else
390 BUG();
391 }
392
393 /**
394 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
395 *
396 * @adev: amdgpu_device pointer
397 * @reg: dword aligned register offset
398 * @v: 32 bit value to write to the register
399 * @acc_flags: access flags which require special behavior
400 *
401 * Writes the value specified to the offset specified.
402 */
403 void amdgpu_device_wreg(struct amdgpu_device *adev,
404 uint32_t reg, uint32_t v,
405 uint32_t acc_flags)
406 {
407 if (adev->in_pci_err_recovery)
408 return;
409
410 if ((reg * 4) < adev->rmmio_size) {
411 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
412 amdgpu_sriov_runtime(adev) &&
413 down_read_trylock(&adev->reset_sem)) {
414 amdgpu_kiq_wreg(adev, reg, v);
415 up_read(&adev->reset_sem);
416 } else {
417 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
418 }
419 } else {
420 adev->pcie_wreg(adev, reg * 4, v);
421 }
422
423 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
424 }
425
426 /*
427 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
428 *
429 * this function is invoked only for debugfs register access
430 */
431 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
432 uint32_t reg, uint32_t v)
433 {
434 if (adev->in_pci_err_recovery)
435 return;
436
437 if (amdgpu_sriov_fullaccess(adev) &&
438 adev->gfx.rlc.funcs &&
439 adev->gfx.rlc.funcs->is_rlcg_access_range) {
440 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
441 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
442 } else {
443 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
444 }
445 }
446
447 /**
448 * amdgpu_io_rreg - read an IO register
449 *
450 * @adev: amdgpu_device pointer
451 * @reg: dword aligned register offset
452 *
453 * Returns the 32 bit value from the offset specified.
454 */
455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
456 {
457 if (adev->in_pci_err_recovery)
458 return 0;
459
460 if ((reg * 4) < adev->rio_mem_size)
461 return ioread32(adev->rio_mem + (reg * 4));
462 else {
463 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
464 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
465 }
466 }
467
468 /**
469 * amdgpu_io_wreg - write to an IO register
470 *
471 * @adev: amdgpu_device pointer
472 * @reg: dword aligned register offset
473 * @v: 32 bit value to write to the register
474 *
475 * Writes the value specified to the offset specified.
476 */
477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
478 {
479 if (adev->in_pci_err_recovery)
480 return;
481
482 if ((reg * 4) < adev->rio_mem_size)
483 iowrite32(v, adev->rio_mem + (reg * 4));
484 else {
485 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
486 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
487 }
488 }
489
490 /**
491 * amdgpu_mm_rdoorbell - read a doorbell dword
492 *
493 * @adev: amdgpu_device pointer
494 * @index: doorbell index
495 *
496 * Returns the value in the doorbell aperture at the
497 * requested doorbell index (CIK).
498 */
499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
500 {
501 if (adev->in_pci_err_recovery)
502 return 0;
503
504 if (index < adev->doorbell.num_doorbells) {
505 return readl(adev->doorbell.ptr + index);
506 } else {
507 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
508 return 0;
509 }
510 }
511
512 /**
513 * amdgpu_mm_wdoorbell - write a doorbell dword
514 *
515 * @adev: amdgpu_device pointer
516 * @index: doorbell index
517 * @v: value to write
518 *
519 * Writes @v to the doorbell aperture at the
520 * requested doorbell index (CIK).
521 */
522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
523 {
524 if (adev->in_pci_err_recovery)
525 return;
526
527 if (index < adev->doorbell.num_doorbells) {
528 writel(v, adev->doorbell.ptr + index);
529 } else {
530 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
531 }
532 }
533
534 /**
535 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
536 *
537 * @adev: amdgpu_device pointer
538 * @index: doorbell index
539 *
540 * Returns the value in the doorbell aperture at the
541 * requested doorbell index (VEGA10+).
542 */
543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
544 {
545 if (adev->in_pci_err_recovery)
546 return 0;
547
548 if (index < adev->doorbell.num_doorbells) {
549 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
550 } else {
551 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
552 return 0;
553 }
554 }
555
556 /**
557 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
558 *
559 * @adev: amdgpu_device pointer
560 * @index: doorbell index
561 * @v: value to write
562 *
563 * Writes @v to the doorbell aperture at the
564 * requested doorbell index (VEGA10+).
565 */
566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
567 {
568 if (adev->in_pci_err_recovery)
569 return;
570
571 if (index < adev->doorbell.num_doorbells) {
572 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
573 } else {
574 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
575 }
576 }
577
578 /**
579 * amdgpu_device_indirect_rreg - read an indirect register
580 *
581 * @adev: amdgpu_device pointer
582 * @pcie_index: mmio register offset
583 * @pcie_data: mmio register offset
584 *
585 * Returns the value of indirect register @reg_addr
586 */
587 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
588 u32 pcie_index, u32 pcie_data,
589 u32 reg_addr)
590 {
591 unsigned long flags;
592 u32 r;
593 void __iomem *pcie_index_offset;
594 void __iomem *pcie_data_offset;
595
596 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
597 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
598 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
599
600 writel(reg_addr, pcie_index_offset);
601 readl(pcie_index_offset);
602 r = readl(pcie_data_offset);
603 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
604
605 return r;
606 }
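/*
 * Illustrative sketch of how an ASIC-level pcie_rreg callback might wrap
 * this helper; the NBIO callbacks used to obtain the index/data offsets are
 * assumptions and the function name is made up.
 *
 *   static u32 example_pcie_rreg(struct amdgpu_device *adev, u32 reg)
 *   {
 *           u32 index = adev->nbio.funcs->get_pcie_index_offset(adev);
 *           u32 data = adev->nbio.funcs->get_pcie_data_offset(adev);
 *
 *           return amdgpu_device_indirect_rreg(adev, index, data, reg);
 *   }
 */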
607
608 /**
609 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
610 *
611 * @adev: amdgpu_device pointer
612 * @pcie_index: mmio register offset
613 * @pcie_data: mmio register offset
614 *
615 * Returns the value of indirect register @reg_addr
616 */
617 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
618 u32 pcie_index, u32 pcie_data,
619 u32 reg_addr)
620 {
621 unsigned long flags;
622 u64 r;
623 void __iomem *pcie_index_offset;
624 void __iomem *pcie_data_offset;
625
626 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
627 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
628 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
629
630 /* read low 32 bits */
631 writel(reg_addr, pcie_index_offset);
632 readl(pcie_index_offset);
633 r = readl(pcie_data_offset);
634 /* read high 32 bits */
635 writel(reg_addr + 4, pcie_index_offset);
636 readl(pcie_index_offset);
637 r |= ((u64)readl(pcie_data_offset) << 32);
638 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
639
640 return r;
641 }
642
643 /**
644 * amdgpu_device_indirect_wreg - write an indirect register address
645 *
646 * @adev: amdgpu_device pointer
647 * @pcie_index: mmio register offset
648 * @pcie_data: mmio register offset
649 * @reg_addr: indirect register offset
650 * @reg_data: indirect register data
651 *
652 */
653 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
654 u32 pcie_index, u32 pcie_data,
655 u32 reg_addr, u32 reg_data)
656 {
657 unsigned long flags;
658 void __iomem *pcie_index_offset;
659 void __iomem *pcie_data_offset;
660
661 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
662 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
663 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
664
665 writel(reg_addr, pcie_index_offset);
666 readl(pcie_index_offset);
667 writel(reg_data, pcie_data_offset);
668 readl(pcie_data_offset);
669 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
670 }
671
672 /**
673 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
674 *
675 * @adev: amdgpu_device pointer
676 * @pcie_index: mmio register offset
677 * @pcie_data: mmio register offset
678 * @reg_addr: indirect register offset
679 * @reg_data: indirect register data
680 *
681 */
682 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
683 u32 pcie_index, u32 pcie_data,
684 u32 reg_addr, u64 reg_data)
685 {
686 unsigned long flags;
687 void __iomem *pcie_index_offset;
688 void __iomem *pcie_data_offset;
689
690 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693
694 /* write low 32 bits */
695 writel(reg_addr, pcie_index_offset);
696 readl(pcie_index_offset);
697 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
698 readl(pcie_data_offset);
699 /* write high 32 bits */
700 writel(reg_addr + 4, pcie_index_offset);
701 readl(pcie_index_offset);
702 writel((u32)(reg_data >> 32), pcie_data_offset);
703 readl(pcie_data_offset);
704 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705 }
706
707 /**
708 * amdgpu_invalid_rreg - dummy reg read function
709 *
710 * @adev: amdgpu_device pointer
711 * @reg: offset of register
712 *
713 * Dummy register read function. Used for register blocks
714 * that certain asics don't have (all asics).
715 * Returns the value in the register.
716 */
717 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
718 {
719 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
720 BUG();
721 return 0;
722 }
723
724 /**
725 * amdgpu_invalid_wreg - dummy reg write function
726 *
727 * @adev: amdgpu_device pointer
728 * @reg: offset of register
729 * @v: value to write to the register
730 *
731 * Dummy register write function. Used for register blocks
732 * that certain asics don't have (all asics).
733 */
734 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
735 {
736 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
737 reg, v);
738 BUG();
739 }
740
741 /**
742 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
743 *
744 * @adev: amdgpu_device pointer
745 * @reg: offset of register
746 *
747 * Dummy register read function. Used for register blocks
748 * that certain asics don't have (all asics).
749 * Returns the value in the register.
750 */
751 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
752 {
753 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
754 BUG();
755 return 0;
756 }
757
758 /**
759 * amdgpu_invalid_wreg64 - dummy reg write function
760 *
761 * @adev: amdgpu_device pointer
762 * @reg: offset of register
763 * @v: value to write to the register
764 *
765 * Dummy register write function. Used for register blocks
766 * that certain asics don't have (all asics).
767 */
768 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
769 {
770 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
771 reg, v);
772 BUG();
773 }
774
775 /**
776 * amdgpu_block_invalid_rreg - dummy reg read function
777 *
778 * @adev: amdgpu_device pointer
779 * @block: offset of instance
780 * @reg: offset of register
781 *
782 * Dummy register read function. Used for register blocks
783 * that certain asics don't have (all asics).
784 * Returns the value in the register.
785 */
786 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
787 uint32_t block, uint32_t reg)
788 {
789 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
790 reg, block);
791 BUG();
792 return 0;
793 }
794
795 /**
796 * amdgpu_block_invalid_wreg - dummy reg write function
797 *
798 * @adev: amdgpu_device pointer
799 * @block: offset of instance
800 * @reg: offset of register
801 * @v: value to write to the register
802 *
803 * Dummy register write function. Used for register blocks
804 * that certain asics don't have (all asics).
805 */
806 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
807 uint32_t block,
808 uint32_t reg, uint32_t v)
809 {
810 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
811 reg, block, v);
812 BUG();
813 }
814
815 /**
816 * amdgpu_device_asic_init - Wrapper for atom asic_init
817 *
818 * @adev: amdgpu_device pointer
819 *
820 * Does any asic specific work and then calls atom asic init.
821 */
822 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
823 {
824 amdgpu_asic_pre_asic_init(adev);
825
826 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
827 }
828
829 /**
830 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
831 *
832 * @adev: amdgpu_device pointer
833 *
834 * Allocates a scratch page of VRAM for use by various things in the
835 * driver.
836 */
837 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
838 {
839 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
840 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
841 &adev->vram_scratch.robj,
842 &adev->vram_scratch.gpu_addr,
843 (void **)&adev->vram_scratch.ptr);
844 }
845
846 /**
847 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
848 *
849 * @adev: amdgpu_device pointer
850 *
851 * Frees the VRAM scratch page.
852 */
853 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
854 {
855 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
856 }
857
858 /**
859 * amdgpu_device_program_register_sequence - program an array of registers.
860 *
861 * @adev: amdgpu_device pointer
862 * @registers: pointer to the register array
863 * @array_size: size of the register array
864 *
865 * Programs an array of registers with AND and OR masks.
866 * This is a helper for setting golden registers.
867 */
868 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
869 const u32 *registers,
870 const u32 array_size)
871 {
872 u32 tmp, reg, and_mask, or_mask;
873 int i;
874
875 if (array_size % 3)
876 return;
877
878 for (i = 0; i < array_size; i += 3) {
879 reg = registers[i + 0];
880 and_mask = registers[i + 1];
881 or_mask = registers[i + 2];
882
883 if (and_mask == 0xffffffff) {
884 tmp = or_mask;
885 } else {
886 tmp = RREG32(reg);
887 tmp &= ~and_mask;
888 if (adev->family >= AMDGPU_FAMILY_AI)
889 tmp |= (or_mask & and_mask);
890 else
891 tmp |= or_mask;
892 }
893 WREG32(reg, tmp);
894 }
895 }
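/*
 * Illustrative sketch, register names and values made up: the @registers
 * array is consumed as {offset, and_mask, or_mask} triplets, so a golden
 * register list and its application look roughly like:
 *
 *   static const u32 example_golden_settings[] = {
 *           mmEXAMPLE_REG_A, 0xffffffff, 0x00000010,    (full overwrite)
 *           mmEXAMPLE_REG_B, 0x0000ff00, 0x00002000,    (read-modify-write)
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */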
896
897 /**
898 * amdgpu_device_pci_config_reset - reset the GPU
899 *
900 * @adev: amdgpu_device pointer
901 *
902 * Resets the GPU using the pci config reset sequence.
903 * Only applicable to asics prior to vega10.
904 */
905 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
906 {
907 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
908 }
909
910 /*
911 * GPU doorbell aperture helper functions.
912 */
913 /**
914 * amdgpu_device_doorbell_init - Init doorbell driver information.
915 *
916 * @adev: amdgpu_device pointer
917 *
918 * Init doorbell driver information (CIK)
919 * Returns 0 on success, error on failure.
920 */
921 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
922 {
923
924 /* No doorbell on SI hardware generation */
925 if (adev->asic_type < CHIP_BONAIRE) {
926 adev->doorbell.base = 0;
927 adev->doorbell.size = 0;
928 adev->doorbell.num_doorbells = 0;
929 adev->doorbell.ptr = NULL;
930 return 0;
931 }
932
933 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
934 return -EINVAL;
935
936 amdgpu_asic_init_doorbell_index(adev);
937
938 /* doorbell bar mapping */
939 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
940 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
941
942 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
943 adev->doorbell_index.max_assignment+1);
944 if (adev->doorbell.num_doorbells == 0)
945 return -EINVAL;
946
947 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
948 * paging queue doorbell uses the second page. The
949 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
950 * doorbells are in the first page. So with paging queue enabled,
951 * the max num_doorbells should be increased by one extra page (0x400 dwords).
952 */
953 if (adev->asic_type >= CHIP_VEGA10)
954 adev->doorbell.num_doorbells += 0x400;
955
956 adev->doorbell.ptr = ioremap(adev->doorbell.base,
957 adev->doorbell.num_doorbells *
958 sizeof(u32));
959 if (adev->doorbell.ptr == NULL)
960 return -ENOMEM;
961
962 return 0;
963 }
964
965 /**
966 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
967 *
968 * @adev: amdgpu_device pointer
969 *
970 * Tear down doorbell driver information (CIK)
971 */
972 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
973 {
974 iounmap(adev->doorbell.ptr);
975 adev->doorbell.ptr = NULL;
976 }
977
978
979
980 /*
981 * amdgpu_device_wb_*()
982 * Writeback is the method by which the GPU updates special pages in memory
983 * with the status of certain GPU events (fences, ring pointers, etc.).
984 */
985
986 /**
987 * amdgpu_device_wb_fini - Disable Writeback and free memory
988 *
989 * @adev: amdgpu_device pointer
990 *
991 * Disables Writeback and frees the Writeback memory (all asics).
992 * Used at driver shutdown.
993 */
994 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
995 {
996 if (adev->wb.wb_obj) {
997 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
998 &adev->wb.gpu_addr,
999 (void **)&adev->wb.wb);
1000 adev->wb.wb_obj = NULL;
1001 }
1002 }
1003
1004 /**
1005 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1006 *
1007 * @adev: amdgpu_device pointer
1008 *
1009 * Initializes writeback and allocates writeback memory (all asics).
1010 * Used at driver startup.
1011 * Returns 0 on success or a negative error code on failure.
1012 */
1013 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1014 {
1015 int r;
1016
1017 if (adev->wb.wb_obj == NULL) {
1018 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1019 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1020 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1021 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1022 (void **)&adev->wb.wb);
1023 if (r) {
1024 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1025 return r;
1026 }
1027
1028 adev->wb.num_wb = AMDGPU_MAX_WB;
1029 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1030
1031 /* clear wb memory */
1032 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1033 }
1034
1035 return 0;
1036 }
1037
1038 /**
1039 * amdgpu_device_wb_get - Allocate a wb entry
1040 *
1041 * @adev: amdgpu_device pointer
1042 * @wb: wb index
1043 *
1044 * Allocate a wb slot for use by the driver (all asics).
1045 * Returns 0 on success or -EINVAL on failure.
1046 */
1047 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1048 {
1049 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1050
1051 if (offset < adev->wb.num_wb) {
1052 __set_bit(offset, adev->wb.used);
1053 *wb = offset << 3; /* convert to dw offset */
1054 return 0;
1055 } else {
1056 return -EINVAL;
1057 }
1058 }
1059
1060 /**
1061 * amdgpu_device_wb_free - Free a wb entry
1062 *
1063 * @adev: amdgpu_device pointer
1064 * @wb: wb index
1065 *
1066 * Free a wb slot allocated for use by the driver (all asics)
1067 */
1068 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1069 {
1070 wb >>= 3;
1071 if (wb < adev->wb.num_wb)
1072 __clear_bit(wb, adev->wb.used);
1073 }
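/*
 * Illustrative sketch, not part of the driver: typical lifecycle of a
 * writeback slot. The returned index is a dword offset into both the CPU
 * mapping adev->wb.wb and the GPU address adev->wb.gpu_addr.
 *
 *   u32 wb;
 *   u64 gpu_addr;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb)) {
 *           gpu_addr = adev->wb.gpu_addr + wb * 4;
 *           adev->wb.wb[wb] = 0;
 *           ... point the hardware at gpu_addr, poll adev->wb.wb[wb] ...
 *           amdgpu_device_wb_free(adev, wb);
 *   }
 */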
1074
1075 /**
1076 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1077 *
1078 * @adev: amdgpu_device pointer
1079 *
1080 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1081 * to fail, but if any of the BARs is not accessible after the size we abort
1082 * driver loading by returning -ENODEV.
1083 */
1084 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1085 {
1086 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1087 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1088 struct pci_bus *root;
1089 struct resource *res;
1090 unsigned i;
1091 u16 cmd;
1092 int r;
1093
1094 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1095 return 0;
1096
1097 /* Bypass for VF */
1098 if (amdgpu_sriov_vf(adev))
1099 return 0;
1100
1101 /* skip if the bios has already enabled large BAR */
1102 if (adev->gmc.real_vram_size &&
1103 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1104 return 0;
1105
1106 /* Check if the root BUS has 64bit memory resources */
1107 root = adev->pdev->bus;
1108 while (root->parent)
1109 root = root->parent;
1110
1111 pci_bus_for_each_resource(root, res, i) {
1112 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1113 res->start > 0x100000000ull)
1114 break;
1115 }
1116
1117 /* Trying to resize is pointless without a root hub window above 4GB */
1118 if (!res)
1119 return 0;
1120
1121 /* Disable memory decoding while we change the BAR addresses and size */
1122 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1123 pci_write_config_word(adev->pdev, PCI_COMMAND,
1124 cmd & ~PCI_COMMAND_MEMORY);
1125
1126 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1127 amdgpu_device_doorbell_fini(adev);
1128 if (adev->asic_type >= CHIP_BONAIRE)
1129 pci_release_resource(adev->pdev, 2);
1130
1131 pci_release_resource(adev->pdev, 0);
1132
1133 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1134 if (r == -ENOSPC)
1135 DRM_INFO("Not enough PCI address space for a large BAR.");
1136 else if (r && r != -ENOTSUPP)
1137 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1138
1139 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1140
1141 /* When the doorbell or fb BAR isn't available we have no chance of
1142 * using the device.
1143 */
1144 r = amdgpu_device_doorbell_init(adev);
1145 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1146 return -ENODEV;
1147
1148 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1149
1150 return 0;
1151 }
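/*
 * Worked example for the rbar_size computation above (illustrative): for an
 * 8 GiB VRAM part, space_needed is 2^33 bytes, (space_needed >> 20) | 1 is
 * 8193, and order_base_2(8193) - 1 is 13, which is the resizable-BAR
 * capability encoding for an 8 GB window (2^(13 + 20) bytes).
 */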
1152
1153 /*
1154 * GPU helper functions.
1155 */
1156 /**
1157 * amdgpu_device_need_post - check if the hw need post or not
1158 *
1159 * @adev: amdgpu_device pointer
1160 *
1161 * Check whether the asic has been initialized (all asics) at driver startup,
1162 * or whether a post is needed because a hw reset was performed.
1163 * Returns true if need or false if not.
1164 */
1165 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1166 {
1167 uint32_t reg;
1168
1169 if (amdgpu_sriov_vf(adev))
1170 return false;
1171
1172 if (amdgpu_passthrough(adev)) {
1173 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1174 * some old smc fw still need driver do vPost otherwise gpu hang, while
1175 * those smc fw version above 22.15 doesn't have this flaw, so we force
1176 * vpost executed for smc version below 22.15
1177 */
1178 if (adev->asic_type == CHIP_FIJI) {
1179 int err;
1180 uint32_t fw_ver;
1181 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1182 /* force vPost if error occurred */
1183 if (err)
1184 return true;
1185
1186 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1187 if (fw_ver < 0x00160e00)
1188 return true;
1189 }
1190 }
1191
1192 if (adev->has_hw_reset) {
1193 adev->has_hw_reset = false;
1194 return true;
1195 }
1196
1197 /* bios scratch used on CIK+ */
1198 if (adev->asic_type >= CHIP_BONAIRE)
1199 return amdgpu_atombios_scratch_need_asic_init(adev);
1200
1201 /* check MEM_SIZE for older asics */
1202 reg = amdgpu_asic_get_config_memsize(adev);
1203
1204 if ((reg != 0) && (reg != 0xffffffff))
1205 return false;
1206
1207 return true;
1208 }
1209
1210 /* if we get transitioned to only one device, take VGA back */
1211 /**
1212 * amdgpu_device_vga_set_decode - enable/disable vga decode
1213 *
1214 * @cookie: amdgpu_device pointer
1215 * @state: enable/disable vga decode
1216 *
1217 * Enable/disable vga decode (all asics).
1218 * Returns VGA resource flags.
1219 */
1220 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1221 {
1222 struct amdgpu_device *adev = cookie;
1223 amdgpu_asic_set_vga_state(adev, state);
1224 if (state)
1225 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1226 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1227 else
1228 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1229 }
1230
1231 /**
1232 * amdgpu_device_check_block_size - validate the vm block size
1233 *
1234 * @adev: amdgpu_device pointer
1235 *
1236 * Validates the vm block size specified via module parameter.
1237 * The vm block size defines number of bits in page table versus page directory,
1238 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1239 * page table and the remaining bits are in the page directory.
1240 */
1241 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1242 {
1243 /* defines number of bits in page table versus page directory,
1244 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1245 * page table and the remaining bits are in the page directory */
1246 if (amdgpu_vm_block_size == -1)
1247 return;
1248
1249 if (amdgpu_vm_block_size < 9) {
1250 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1251 amdgpu_vm_block_size);
1252 amdgpu_vm_block_size = -1;
1253 }
1254 }
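/*
 * Worked example (illustrative): with the minimum block size of 9 bits,
 * a page table holds 2^9 = 512 entries of 4 KiB pages, i.e. each page
 * directory entry covers 2 MiB; every additional bit doubles that coverage.
 */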
1255
1256 /**
1257 * amdgpu_device_check_vm_size - validate the vm size
1258 *
1259 * @adev: amdgpu_device pointer
1260 *
1261 * Validates the vm size in GB specified via module parameter.
1262 * The VM size is the size of the GPU virtual memory space in GB.
1263 */
1264 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1265 {
1266 /* no need to check the default value */
1267 if (amdgpu_vm_size == -1)
1268 return;
1269
1270 if (amdgpu_vm_size < 1) {
1271 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1272 amdgpu_vm_size);
1273 amdgpu_vm_size = -1;
1274 }
1275 }
1276
1277 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1278 {
1279 struct sysinfo si;
1280 bool is_os_64 = (sizeof(void *) == 8);
1281 uint64_t total_memory;
1282 uint64_t dram_size_seven_GB = 0x1B8000000;
1283 uint64_t dram_size_three_GB = 0xB8000000;
1284
1285 if (amdgpu_smu_memory_pool_size == 0)
1286 return;
1287
1288 if (!is_os_64) {
1289 DRM_WARN("Not 64-bit OS, feature not supported\n");
1290 goto def_value;
1291 }
1292 si_meminfo(&si);
1293 total_memory = (uint64_t)si.totalram * si.mem_unit;
1294
1295 if ((amdgpu_smu_memory_pool_size == 1) ||
1296 (amdgpu_smu_memory_pool_size == 2)) {
1297 if (total_memory < dram_size_three_GB)
1298 goto def_value1;
1299 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1300 (amdgpu_smu_memory_pool_size == 8)) {
1301 if (total_memory < dram_size_seven_GB)
1302 goto def_value1;
1303 } else {
1304 DRM_WARN("Smu memory pool size not supported\n");
1305 goto def_value;
1306 }
1307 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1308
1309 return;
1310
1311 def_value1:
1312 DRM_WARN("No enough system memory\n");
1313 def_value:
1314 adev->pm.smu_prv_buffer_size = 0;
1315 }
1316
1317 /**
1318 * amdgpu_device_check_arguments - validate module params
1319 *
1320 * @adev: amdgpu_device pointer
1321 *
1322 * Validates certain module parameters and updates
1323 * the associated values used by the driver (all asics).
1324 */
1325 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1326 {
1327 if (amdgpu_sched_jobs < 4) {
1328 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1329 amdgpu_sched_jobs);
1330 amdgpu_sched_jobs = 4;
1331 } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1332 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1333 amdgpu_sched_jobs);
1334 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1335 }
1336
1337 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1338 /* gart size must be greater or equal to 32M */
1339 dev_warn(adev->dev, "gart size (%d) too small\n",
1340 amdgpu_gart_size);
1341 amdgpu_gart_size = -1;
1342 }
1343
1344 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1345 /* gtt size must be greater or equal to 32M */
1346 dev_warn(adev->dev, "gtt size (%d) too small\n",
1347 amdgpu_gtt_size);
1348 amdgpu_gtt_size = -1;
1349 }
1350
1351 /* valid range is between 4 and 9 inclusive */
1352 if (amdgpu_vm_fragment_size != -1 &&
1353 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1354 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1355 amdgpu_vm_fragment_size = -1;
1356 }
1357
1358 if (amdgpu_sched_hw_submission < 2) {
1359 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1360 amdgpu_sched_hw_submission);
1361 amdgpu_sched_hw_submission = 2;
1362 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1363 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1364 amdgpu_sched_hw_submission);
1365 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1366 }
1367
1368 amdgpu_device_check_smu_prv_buffer_size(adev);
1369
1370 amdgpu_device_check_vm_size(adev);
1371
1372 amdgpu_device_check_block_size(adev);
1373
1374 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1375
1376 amdgpu_gmc_tmz_set(adev);
1377
1378 if (amdgpu_num_kcq == -1) {
1379 amdgpu_num_kcq = 8;
1380 } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1381 amdgpu_num_kcq = 8;
1382 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1383 }
1384
1385 amdgpu_gmc_noretry_set(adev);
1386
1387 return 0;
1388 }
1389
1390 /**
1391 * amdgpu_switcheroo_set_state - set switcheroo state
1392 *
1393 * @pdev: pci dev pointer
1394 * @state: vga_switcheroo state
1395 *
1396 * Callback for the switcheroo driver. Suspends or resumes the
1397 * asic before or after it is powered up using ACPI methods.
1398 */
1399 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1400 enum vga_switcheroo_state state)
1401 {
1402 struct drm_device *dev = pci_get_drvdata(pdev);
1403 int r;
1404
1405 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1406 return;
1407
1408 if (state == VGA_SWITCHEROO_ON) {
1409 pr_info("switched on\n");
1410 /* don't suspend or resume card normally */
1411 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1412
1413 pci_set_power_state(dev->pdev, PCI_D0);
1414 amdgpu_device_load_pci_state(dev->pdev);
1415 r = pci_enable_device(dev->pdev);
1416 if (r)
1417 DRM_WARN("pci_enable_device failed (%d)\n", r);
1418 amdgpu_device_resume(dev, true);
1419
1420 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1421 drm_kms_helper_poll_enable(dev);
1422 } else {
1423 pr_info("switched off\n");
1424 drm_kms_helper_poll_disable(dev);
1425 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1426 amdgpu_device_suspend(dev, true);
1427 amdgpu_device_cache_pci_state(dev->pdev);
1428 /* Shut down the device */
1429 pci_disable_device(dev->pdev);
1430 pci_set_power_state(dev->pdev, PCI_D3cold);
1431 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1432 }
1433 }
1434
1435 /**
1436 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1437 *
1438 * @pdev: pci dev pointer
1439 *
1440 * Callback for the switcheroo driver. Check if the switcheroo
1441 * state can be changed.
1442 * Returns true if the state can be changed, false if not.
1443 */
1444 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1445 {
1446 struct drm_device *dev = pci_get_drvdata(pdev);
1447
1448 /*
1449 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1450 * locking inversion with the driver load path. And the access here is
1451 * completely racy anyway. So don't bother with locking for now.
1452 */
1453 return atomic_read(&dev->open_count) == 0;
1454 }
1455
1456 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1457 .set_gpu_state = amdgpu_switcheroo_set_state,
1458 .reprobe = NULL,
1459 .can_switch = amdgpu_switcheroo_can_switch,
1460 };
1461
1462 /**
1463 * amdgpu_device_ip_set_clockgating_state - set the CG state
1464 *
1465 * @dev: amdgpu_device pointer
1466 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1467 * @state: clockgating state (gate or ungate)
1468 *
1469 * Sets the requested clockgating state for all instances of
1470 * the hardware IP specified.
1471 * Returns the error code from the last instance.
1472 */
1473 int amdgpu_device_ip_set_clockgating_state(void *dev,
1474 enum amd_ip_block_type block_type,
1475 enum amd_clockgating_state state)
1476 {
1477 struct amdgpu_device *adev = dev;
1478 int i, r = 0;
1479
1480 for (i = 0; i < adev->num_ip_blocks; i++) {
1481 if (!adev->ip_blocks[i].status.valid)
1482 continue;
1483 if (adev->ip_blocks[i].version->type != block_type)
1484 continue;
1485 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1486 continue;
1487 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1488 (void *)adev, state);
1489 if (r)
1490 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1491 adev->ip_blocks[i].version->funcs->name, r);
1492 }
1493 return r;
1494 }
1495
1496 /**
1497 * amdgpu_device_ip_set_powergating_state - set the PG state
1498 *
1499 * @dev: amdgpu_device pointer
1500 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1501 * @state: powergating state (gate or ungate)
1502 *
1503 * Sets the requested powergating state for all instances of
1504 * the hardware IP specified.
1505 * Returns the error code from the last instance.
1506 */
1507 int amdgpu_device_ip_set_powergating_state(void *dev,
1508 enum amd_ip_block_type block_type,
1509 enum amd_powergating_state state)
1510 {
1511 struct amdgpu_device *adev = dev;
1512 int i, r = 0;
1513
1514 for (i = 0; i < adev->num_ip_blocks; i++) {
1515 if (!adev->ip_blocks[i].status.valid)
1516 continue;
1517 if (adev->ip_blocks[i].version->type != block_type)
1518 continue;
1519 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1520 continue;
1521 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1522 (void *)adev, state);
1523 if (r)
1524 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1525 adev->ip_blocks[i].version->funcs->name, r);
1526 }
1527 return r;
1528 }
1529
1530 /**
1531 * amdgpu_device_ip_get_clockgating_state - get the CG state
1532 *
1533 * @adev: amdgpu_device pointer
1534 * @flags: clockgating feature flags
1535 *
1536 * Walks the list of IPs on the device and updates the clockgating
1537 * flags for each IP.
1538 * Updates @flags with the feature flags for each hardware IP where
1539 * clockgating is enabled.
1540 */
1541 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1542 u32 *flags)
1543 {
1544 int i;
1545
1546 for (i = 0; i < adev->num_ip_blocks; i++) {
1547 if (!adev->ip_blocks[i].status.valid)
1548 continue;
1549 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1550 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1551 }
1552 }
1553
1554 /**
1555 * amdgpu_device_ip_wait_for_idle - wait for idle
1556 *
1557 * @adev: amdgpu_device pointer
1558 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1559 *
1560 * Waits for the requested hardware IP to be idle.
1561 * Returns 0 for success or a negative error code on failure.
1562 */
1563 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1564 enum amd_ip_block_type block_type)
1565 {
1566 int i, r;
1567
1568 for (i = 0; i < adev->num_ip_blocks; i++) {
1569 if (!adev->ip_blocks[i].status.valid)
1570 continue;
1571 if (adev->ip_blocks[i].version->type == block_type) {
1572 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1573 if (r)
1574 return r;
1575 break;
1576 }
1577 }
1578 return 0;
1579
1580 }
1581
1582 /**
1583 * amdgpu_device_ip_is_idle - is the hardware IP idle
1584 *
1585 * @adev: amdgpu_device pointer
1586 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1587 *
1588 * Check if the hardware IP is idle or not.
1589 * Returns true if the IP is idle, false if not.
1590 */
1591 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1592 enum amd_ip_block_type block_type)
1593 {
1594 int i;
1595
1596 for (i = 0; i < adev->num_ip_blocks; i++) {
1597 if (!adev->ip_blocks[i].status.valid)
1598 continue;
1599 if (adev->ip_blocks[i].version->type == block_type)
1600 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1601 }
1602 return true;
1603
1604 }
1605
1606 /**
1607 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1608 *
1609 * @adev: amdgpu_device pointer
1610 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1611 *
1612 * Returns a pointer to the hardware IP block structure
1613 * if it exists for the asic, otherwise NULL.
1614 */
1615 struct amdgpu_ip_block *
1616 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1617 enum amd_ip_block_type type)
1618 {
1619 int i;
1620
1621 for (i = 0; i < adev->num_ip_blocks; i++)
1622 if (adev->ip_blocks[i].version->type == type)
1623 return &adev->ip_blocks[i];
1624
1625 return NULL;
1626 }
1627
1628 /**
1629 * amdgpu_device_ip_block_version_cmp
1630 *
1631 * @adev: amdgpu_device pointer
1632 * @type: enum amd_ip_block_type
1633 * @major: major version
1634 * @minor: minor version
1635 *
1636 * return 0 if equal or greater
1637 * return 1 if smaller or the ip_block doesn't exist
1638 */
1639 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1640 enum amd_ip_block_type type,
1641 u32 major, u32 minor)
1642 {
1643 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1644
1645 if (ip_block && ((ip_block->version->major > major) ||
1646 ((ip_block->version->major == major) &&
1647 (ip_block->version->minor >= minor))))
1648 return 0;
1649
1650 return 1;
1651 }
1652
1653 /**
1654 * amdgpu_device_ip_block_add
1655 *
1656 * @adev: amdgpu_device pointer
1657 * @ip_block_version: pointer to the IP to add
1658 *
1659 * Adds the IP block driver information to the collection of IPs
1660 * on the asic.
1661 */
1662 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1663 const struct amdgpu_ip_block_version *ip_block_version)
1664 {
1665 if (!ip_block_version)
1666 return -EINVAL;
1667
1668 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1669 ip_block_version->funcs->name);
1670
1671 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1672
1673 return 0;
1674 }
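/*
 * Illustrative sketch, IP block names made up: ASIC setup code registers its
 * IP blocks in initialization order with this helper, roughly:
 *
 *   r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
 *   if (r)
 *           return r;
 *   r = amdgpu_device_ip_block_add(adev, &example_gmc_ip_block);
 *   if (r)
 *           return r;
 */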
1675
1676 /**
1677 * amdgpu_device_enable_virtual_display - enable virtual display feature
1678 *
1679 * @adev: amdgpu_device pointer
1680 *
1681 * Enables the virtual display feature if the user has enabled it via
1682 * the module parameter virtual_display. This feature provides virtual
1683 * display hardware on headless boards or in virtualized environments.
1684 * This function parses and validates the configuration string specified by
1685 * the user and configures the virtual display configuration (number of
1686 * virtual connectors, crtcs, etc.) specified.
1687 */
1688 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1689 {
1690 adev->enable_virtual_display = false;
1691
1692 if (amdgpu_virtual_display) {
1693 struct drm_device *ddev = adev_to_drm(adev);
1694 const char *pci_address_name = pci_name(ddev->pdev);
1695 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1696
1697 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1698 pciaddstr_tmp = pciaddstr;
1699 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1700 pciaddname = strsep(&pciaddname_tmp, ",");
1701 if (!strcmp("all", pciaddname)
1702 || !strcmp(pci_address_name, pciaddname)) {
1703 long num_crtc;
1704 int res = -1;
1705
1706 adev->enable_virtual_display = true;
1707
1708 if (pciaddname_tmp)
1709 res = kstrtol(pciaddname_tmp, 10,
1710 &num_crtc);
1711
1712 if (!res) {
1713 if (num_crtc < 1)
1714 num_crtc = 1;
1715 if (num_crtc > 6)
1716 num_crtc = 6;
1717 adev->mode_info.num_crtc = num_crtc;
1718 } else {
1719 adev->mode_info.num_crtc = 1;
1720 }
1721 break;
1722 }
1723 }
1724
1725 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1726 amdgpu_virtual_display, pci_address_name,
1727 adev->enable_virtual_display, adev->mode_info.num_crtc);
1728
1729 kfree(pciaddstr);
1730 }
1731 }
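/*
 * Illustrative usage, not part of the driver: based on the parsing above,
 * the virtual_display parameter is a ';' separated list of PCI addresses
 * (or "all"), each optionally followed by ",<num_crtc>". The address below
 * is an assumption:
 *
 *   modprobe amdgpu virtual_display=0000:01:00.0,2
 */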
1732
1733 /**
1734 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1735 *
1736 * @adev: amdgpu_device pointer
1737 *
1738 * Parses the asic configuration parameters specified in the gpu info
1739 * firmware and makes them available to the driver for use in configuring
1740 * the asic.
1741 * Returns 0 on success, -EINVAL on failure.
1742 */
1743 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1744 {
1745 const char *chip_name;
1746 char fw_name[40];
1747 int err;
1748 const struct gpu_info_firmware_header_v1_0 *hdr;
1749
1750 adev->firmware.gpu_info_fw = NULL;
1751
1752 if (adev->mman.discovery_bin) {
1753 amdgpu_discovery_get_gfx_info(adev);
1754
1755 /*
1756 * FIXME: The bounding box is still needed by Navi12, so
1757 * temporarily read it from gpu_info firmware. Should be dropped
1758 * when DAL no longer needs it.
1759 */
1760 if (adev->asic_type != CHIP_NAVI12)
1761 return 0;
1762 }
1763
1764 switch (adev->asic_type) {
1765 #ifdef CONFIG_DRM_AMDGPU_SI
1766 case CHIP_VERDE:
1767 case CHIP_TAHITI:
1768 case CHIP_PITCAIRN:
1769 case CHIP_OLAND:
1770 case CHIP_HAINAN:
1771 #endif
1772 #ifdef CONFIG_DRM_AMDGPU_CIK
1773 case CHIP_BONAIRE:
1774 case CHIP_HAWAII:
1775 case CHIP_KAVERI:
1776 case CHIP_KABINI:
1777 case CHIP_MULLINS:
1778 #endif
1779 case CHIP_TOPAZ:
1780 case CHIP_TONGA:
1781 case CHIP_FIJI:
1782 case CHIP_POLARIS10:
1783 case CHIP_POLARIS11:
1784 case CHIP_POLARIS12:
1785 case CHIP_VEGAM:
1786 case CHIP_CARRIZO:
1787 case CHIP_STONEY:
1788 case CHIP_VEGA20:
1789 case CHIP_SIENNA_CICHLID:
1790 case CHIP_NAVY_FLOUNDER:
1791 default:
1792 return 0;
1793 case CHIP_VEGA10:
1794 chip_name = "vega10";
1795 break;
1796 case CHIP_VEGA12:
1797 chip_name = "vega12";
1798 break;
1799 case CHIP_RAVEN:
1800 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1801 chip_name = "raven2";
1802 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1803 chip_name = "picasso";
1804 else
1805 chip_name = "raven";
1806 break;
1807 case CHIP_ARCTURUS:
1808 chip_name = "arcturus";
1809 break;
1810 case CHIP_RENOIR:
1811 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1812 chip_name = "renoir";
1813 else
1814 chip_name = "green_sardine";
1815 break;
1816 case CHIP_NAVI10:
1817 chip_name = "navi10";
1818 break;
1819 case CHIP_NAVI14:
1820 chip_name = "navi14";
1821 break;
1822 case CHIP_NAVI12:
1823 chip_name = "navi12";
1824 break;
1825 }
1826
1827 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1828 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1829 if (err) {
1830 dev_err(adev->dev,
1831 "Failed to load gpu_info firmware \"%s\"\n",
1832 fw_name);
1833 goto out;
1834 }
1835 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1836 if (err) {
1837 dev_err(adev->dev,
1838 "Failed to validate gpu_info firmware \"%s\"\n",
1839 fw_name);
1840 goto out;
1841 }
1842
1843 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1844 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1845
1846 switch (hdr->version_major) {
1847 case 1:
1848 {
1849 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1850 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1851 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1852
1853 /*
1854 * Should be dropped when DAL no longer needs it.
1855 */
1856 if (adev->asic_type == CHIP_NAVI12)
1857 goto parse_soc_bounding_box;
1858
1859 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1860 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1861 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1862 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1863 adev->gfx.config.max_texture_channel_caches =
1864 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1865 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1866 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1867 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1868 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1869 adev->gfx.config.double_offchip_lds_buf =
1870 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1871 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1872 adev->gfx.cu_info.max_waves_per_simd =
1873 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1874 adev->gfx.cu_info.max_scratch_slots_per_cu =
1875 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1876 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1877 if (hdr->version_minor >= 1) {
1878 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1879 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1880 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1881 adev->gfx.config.num_sc_per_sh =
1882 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1883 adev->gfx.config.num_packer_per_sc =
1884 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1885 }
1886
1887 parse_soc_bounding_box:
1888 /*
1889 * SOC bounding box info is not integrated into the discovery table,
1890 * so it always needs to be parsed from the gpu_info firmware when required.
1891 */
1892 if (hdr->version_minor == 2) {
1893 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1894 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1895 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1896 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1897 }
1898 break;
1899 }
1900 default:
1901 dev_err(adev->dev,
1902 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1903 err = -EINVAL;
1904 goto out;
1905 }
1906 out:
1907 return err;
1908 }
1909
1910 /**
1911 * amdgpu_device_ip_early_init - run early init for hardware IPs
1912 *
1913 * @adev: amdgpu_device pointer
1914 *
1915 * Early initialization pass for hardware IPs. The hardware IPs that make
1916 * up each asic are discovered and each IP's early_init callback is run. This
1917 * is the first stage in initializing the asic.
1918 * Returns 0 on success, negative error code on failure.
1919 */
1920 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1921 {
1922 int i, r;
1923
1924 amdgpu_device_enable_virtual_display(adev);
1925
1926 if (amdgpu_sriov_vf(adev)) {
1927 r = amdgpu_virt_request_full_gpu(adev, true);
1928 if (r)
1929 return r;
1930 }
1931
1932 switch (adev->asic_type) {
1933 #ifdef CONFIG_DRM_AMDGPU_SI
1934 case CHIP_VERDE:
1935 case CHIP_TAHITI:
1936 case CHIP_PITCAIRN:
1937 case CHIP_OLAND:
1938 case CHIP_HAINAN:
1939 adev->family = AMDGPU_FAMILY_SI;
1940 r = si_set_ip_blocks(adev);
1941 if (r)
1942 return r;
1943 break;
1944 #endif
1945 #ifdef CONFIG_DRM_AMDGPU_CIK
1946 case CHIP_BONAIRE:
1947 case CHIP_HAWAII:
1948 case CHIP_KAVERI:
1949 case CHIP_KABINI:
1950 case CHIP_MULLINS:
1951 if (adev->flags & AMD_IS_APU)
1952 adev->family = AMDGPU_FAMILY_KV;
1953 else
1954 adev->family = AMDGPU_FAMILY_CI;
1955
1956 r = cik_set_ip_blocks(adev);
1957 if (r)
1958 return r;
1959 break;
1960 #endif
1961 case CHIP_TOPAZ:
1962 case CHIP_TONGA:
1963 case CHIP_FIJI:
1964 case CHIP_POLARIS10:
1965 case CHIP_POLARIS11:
1966 case CHIP_POLARIS12:
1967 case CHIP_VEGAM:
1968 case CHIP_CARRIZO:
1969 case CHIP_STONEY:
1970 if (adev->flags & AMD_IS_APU)
1971 adev->family = AMDGPU_FAMILY_CZ;
1972 else
1973 adev->family = AMDGPU_FAMILY_VI;
1974
1975 r = vi_set_ip_blocks(adev);
1976 if (r)
1977 return r;
1978 break;
1979 case CHIP_VEGA10:
1980 case CHIP_VEGA12:
1981 case CHIP_VEGA20:
1982 case CHIP_RAVEN:
1983 case CHIP_ARCTURUS:
1984 case CHIP_RENOIR:
1985 if (adev->flags & AMD_IS_APU)
1986 adev->family = AMDGPU_FAMILY_RV;
1987 else
1988 adev->family = AMDGPU_FAMILY_AI;
1989
1990 r = soc15_set_ip_blocks(adev);
1991 if (r)
1992 return r;
1993 break;
1994 case CHIP_NAVI10:
1995 case CHIP_NAVI14:
1996 case CHIP_NAVI12:
1997 case CHIP_SIENNA_CICHLID:
1998 case CHIP_NAVY_FLOUNDER:
1999 adev->family = AMDGPU_FAMILY_NV;
2000
2001 r = nv_set_ip_blocks(adev);
2002 if (r)
2003 return r;
2004 break;
2005 default:
2006 /* FIXME: not supported yet */
2007 return -EINVAL;
2008 }
2009
2010 amdgpu_amdkfd_device_probe(adev);
2011
2012 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2013 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2014 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2015
2016 for (i = 0; i < adev->num_ip_blocks; i++) {
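/* a cleared bit in the ip_block_mask module parameter disables the
 * corresponding IP block
 */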
2017 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2018 DRM_ERROR("disabled ip block: %d <%s>\n",
2019 i, adev->ip_blocks[i].version->funcs->name);
2020 adev->ip_blocks[i].status.valid = false;
2021 } else {
2022 if (adev->ip_blocks[i].version->funcs->early_init) {
2023 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2024 if (r == -ENOENT) {
2025 adev->ip_blocks[i].status.valid = false;
2026 } else if (r) {
2027 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2028 adev->ip_blocks[i].version->funcs->name, r);
2029 return r;
2030 } else {
2031 adev->ip_blocks[i].status.valid = true;
2032 }
2033 } else {
2034 adev->ip_blocks[i].status.valid = true;
2035 }
2036 }
2037 /* get the vbios after the asic_funcs are set up */
2038 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2039 r = amdgpu_device_parse_gpu_info_fw(adev);
2040 if (r)
2041 return r;
2042
2043 /* Read BIOS */
2044 if (!amdgpu_get_bios(adev))
2045 return -EINVAL;
2046
2047 r = amdgpu_atombios_init(adev);
2048 if (r) {
2049 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2050 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2051 return r;
2052 }
2053
2054 /* get pf2vf msg info at its earliest time */
2055 if (amdgpu_sriov_vf(adev))
2056 amdgpu_virt_init_data_exchange(adev);
2057
2058 }
2059 }
2060
2061 adev->cg_flags &= amdgpu_cg_mask;
2062 adev->pg_flags &= amdgpu_pg_mask;
2063
2064 return 0;
2065 }
2066
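/* hw_init phase 1: bring up the COMMON and IH blocks (plus PSP when running
 * as an SR-IOV VF) before firmware loading; phase 2 handles the rest.
 */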
2067 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2068 {
2069 int i, r;
2070
2071 for (i = 0; i < adev->num_ip_blocks; i++) {
2072 if (!adev->ip_blocks[i].status.sw)
2073 continue;
2074 if (adev->ip_blocks[i].status.hw)
2075 continue;
2076 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2077 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2078 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2079 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2080 if (r) {
2081 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2082 adev->ip_blocks[i].version->funcs->name, r);
2083 return r;
2084 }
2085 adev->ip_blocks[i].status.hw = true;
2086 }
2087 }
2088
2089 return 0;
2090 }
2091
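/* hw_init phase 2: bring up the remaining IP blocks after firmware loading */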
2092 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2093 {
2094 int i, r;
2095
2096 for (i = 0; i < adev->num_ip_blocks; i++) {
2097 if (!adev->ip_blocks[i].status.sw)
2098 continue;
2099 if (adev->ip_blocks[i].status.hw)
2100 continue;
2101 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2102 if (r) {
2103 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2104 adev->ip_blocks[i].version->funcs->name, r);
2105 return r;
2106 }
2107 adev->ip_blocks[i].status.hw = true;
2108 }
2109
2110 return 0;
2111 }
2112
2113 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2114 {
2115 int r = 0;
2116 int i;
2117 uint32_t smu_version;
2118
2119 if (adev->asic_type >= CHIP_VEGA10) {
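/* on VEGA10 and newer the PSP block handles firmware loading,
 * so bring it up (or resume it) before the other blocks
 */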
2120 for (i = 0; i < adev->num_ip_blocks; i++) {
2121 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2122 continue;
2123
2124 /* no need to do the fw loading again if already done*/
2125 if (adev->ip_blocks[i].status.hw == true)
2126 break;
2127
2128 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2129 r = adev->ip_blocks[i].version->funcs->resume(adev);
2130 if (r) {
2131 DRM_ERROR("resume of IP block <%s> failed %d\n",
2132 adev->ip_blocks[i].version->funcs->name, r);
2133 return r;
2134 }
2135 } else {
2136 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2137 if (r) {
2138 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2139 adev->ip_blocks[i].version->funcs->name, r);
2140 return r;
2141 }
2142 }
2143
2144 adev->ip_blocks[i].status.hw = true;
2145 break;
2146 }
2147 }
2148
2149 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2150 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2151
2152 return r;
2153 }
2154
2155 /**
2156 * amdgpu_device_ip_init - run init for hardware IPs
2157 *
2158 * @adev: amdgpu_device pointer
2159 *
2160 * Main initialization pass for hardware IPs. The list of all the hardware
2161 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2162 * are run. sw_init initializes the software state associated with each IP
2163 * and hw_init initializes the hardware associated with each IP.
2164 * Returns 0 on success, negative error code on failure.
2165 */
2166 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2167 {
2168 int i, r;
2169
2170 r = amdgpu_ras_init(adev);
2171 if (r)
2172 return r;
2173
2174 for (i = 0; i < adev->num_ip_blocks; i++) {
2175 if (!adev->ip_blocks[i].status.valid)
2176 continue;
2177 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2178 if (r) {
2179 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2180 adev->ip_blocks[i].version->funcs->name, r);
2181 goto init_failed;
2182 }
2183 adev->ip_blocks[i].status.sw = true;
2184
2185 /* need to do gmc hw init early so we can allocate gpu mem */
2186 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2187 /* Try to reserve bad pages early */
2188 if (amdgpu_sriov_vf(adev))
2189 amdgpu_virt_exchange_data(adev);
2190
2191 r = amdgpu_device_vram_scratch_init(adev);
2192 if (r) {
2193 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2194 goto init_failed;
2195 }
2196 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2197 if (r) {
2198 DRM_ERROR("hw_init %d failed %d\n", i, r);
2199 goto init_failed;
2200 }
2201 r = amdgpu_device_wb_init(adev);
2202 if (r) {
2203 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2204 goto init_failed;
2205 }
2206 adev->ip_blocks[i].status.hw = true;
2207
2208 /* right after GMC hw init, we create CSA */
2209 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2210 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2211 AMDGPU_GEM_DOMAIN_VRAM,
2212 AMDGPU_CSA_SIZE);
2213 if (r) {
2214 DRM_ERROR("allocate CSA failed %d\n", r);
2215 goto init_failed;
2216 }
2217 }
2218 }
2219 }
2220
2221 if (amdgpu_sriov_vf(adev))
2222 amdgpu_virt_init_data_exchange(adev);
2223
2224 r = amdgpu_ib_pool_init(adev);
2225 if (r) {
2226 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2227 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2228 goto init_failed;
2229 }
2230
2231 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2232 if (r)
2233 goto init_failed;
2234
2235 r = amdgpu_device_ip_hw_init_phase1(adev);
2236 if (r)
2237 goto init_failed;
2238
2239 r = amdgpu_device_fw_loading(adev);
2240 if (r)
2241 goto init_failed;
2242
2243 r = amdgpu_device_ip_hw_init_phase2(adev);
2244 if (r)
2245 goto init_failed;
2246
2247 /*
2248 * Retired pages will be loaded from the EEPROM and reserved here.
2249 * This must be called after amdgpu_device_ip_hw_init_phase2, since
2250 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2251 * functional for I2C communication, which is only true at this point.
2252 *
2253 * amdgpu_ras_recovery_init may fail, but the upper layers only care
2254 * about failures caused by a bad GPU state and stop the amdgpu init
2255 * process accordingly. For other failure cases it still releases all
2256 * resources and prints an error message rather than returning a
2257 * negative value to the upper level.
2258 *
2259 * Note: theoretically, this should be called before all VRAM allocations
2260 * to protect retired pages from being reused.
2261 */
2262 r = amdgpu_ras_recovery_init(adev);
2263 if (r)
2264 goto init_failed;
2265
2266 if (adev->gmc.xgmi.num_physical_nodes > 1)
2267 amdgpu_xgmi_add_device(adev);
2268 amdgpu_amdkfd_device_init(adev);
2269
2270 amdgpu_fru_get_product_info(adev);
2271
2272 init_failed:
2273 if (amdgpu_sriov_vf(adev))
2274 amdgpu_virt_release_full_gpu(adev, true);
2275
2276 return r;
2277 }
2278
2279 /**
2280 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2281 *
2282 * @adev: amdgpu_device pointer
2283 *
2284 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2285 * this function before a GPU reset. If the value is retained after a
2286 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2287 */
2288 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2289 {
2290 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2291 }
2292
2293 /**
2294 * amdgpu_device_check_vram_lost - check if vram is valid
2295 *
2296 * @adev: amdgpu_device pointer
2297 *
2298 * Checks the reset magic value written to the gart pointer in VRAM.
2299 * The driver calls this after a GPU reset to see if the contents of
2300 * VRAM have been lost or not.
2301 * Returns true if VRAM is lost, false if not.
2302 */
2303 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2304 {
2305 if (memcmp(adev->gart.ptr, adev->reset_magic,
2306 AMDGPU_RESET_MAGIC_NUM))
2307 return true;
2308
2309 if (!amdgpu_in_reset(adev))
2310 return false;
2311
2312 /*
2313 * For all ASICs with baco/mode1 reset, the VRAM is
2314 * always assumed to be lost.
2315 */
2316 switch (amdgpu_asic_reset_method(adev)) {
2317 case AMD_RESET_METHOD_BACO:
2318 case AMD_RESET_METHOD_MODE1:
2319 return true;
2320 default:
2321 return false;
2322 }
2323 }
2324
2325 /**
2326 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2327 *
2328 * @adev: amdgpu_device pointer
2329 * @state: clockgating state (gate or ungate)
2330 *
2331 * The list of all the hardware IPs that make up the asic is walked and the
2332 * set_clockgating_state callbacks are run.
2333 * During the late init pass this enables clockgating for the hardware IPs;
2334 * during the fini or suspend pass it disables clockgating.
2335 * Returns 0 on success, negative error code on failure.
2336 */
2337
2338 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2339 enum amd_clockgating_state state)
2340 {
2341 int i, j, r;
2342
2343 if (amdgpu_emu_mode == 1)
2344 return 0;
2345
2346 for (j = 0; j < adev->num_ip_blocks; j++) {
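/* gate in IP-block order, ungate in reverse order */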
2347 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2348 if (!adev->ip_blocks[i].status.late_initialized)
2349 continue;
2350 /* skip CG for UVD/VCE/VCN/JPEG, it's handled specially */
2351 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2352 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2353 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2354 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2355 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2356 /* enable clockgating to save power */
2357 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2358 state);
2359 if (r) {
2360 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2361 adev->ip_blocks[i].version->funcs->name, r);
2362 return r;
2363 }
2364 }
2365 }
2366
2367 return 0;
2368 }
2369
2370 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2371 {
2372 int i, j, r;
2373
2374 if (amdgpu_emu_mode == 1)
2375 return 0;
2376
2377 for (j = 0; j < adev->num_ip_blocks; j++) {
2378 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2379 if (!adev->ip_blocks[i].status.late_initialized)
2380 continue;
2381 /* skip PG for UVD/VCE/VCN/JPEG, it's handled specially */
2382 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2383 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2384 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2385 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2386 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2387 /* enable powergating to save power */
2388 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2389 state);
2390 if (r) {
2391 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2392 adev->ip_blocks[i].version->funcs->name, r);
2393 return r;
2394 }
2395 }
2396 }
2397 return 0;
2398 }
2399
2400 static int amdgpu_device_enable_mgpu_fan_boost(void)
2401 {
2402 struct amdgpu_gpu_instance *gpu_ins;
2403 struct amdgpu_device *adev;
2404 int i, ret = 0;
2405
2406 mutex_lock(&mgpu_info.mutex);
2407
2408 /*
2409 * MGPU fan boost feature should be enabled
2410 * only when there are two or more dGPUs in
2411 * the system
2412 */
2413 if (mgpu_info.num_dgpu < 2)
2414 goto out;
2415
2416 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2417 gpu_ins = &(mgpu_info.gpu_ins[i]);
2418 adev = gpu_ins->adev;
2419 if (!(adev->flags & AMD_IS_APU) &&
2420 !gpu_ins->mgpu_fan_enabled) {
2421 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2422 if (ret)
2423 break;
2424
2425 gpu_ins->mgpu_fan_enabled = 1;
2426 }
2427 }
2428
2429 out:
2430 mutex_unlock(&mgpu_info.mutex);
2431
2432 return ret;
2433 }
2434
2435 /**
2436 * amdgpu_device_ip_late_init - run late init for hardware IPs
2437 *
2438 * @adev: amdgpu_device pointer
2439 *
2440 * Late initialization pass for hardware IPs. The list of all the hardware
2441 * IPs that make up the asic is walked and the late_init callbacks are run.
2442 * late_init covers any special initialization that an IP requires
2443 * after all of the IPs have been initialized or something that needs to happen
2444 * late in the init process.
2445 * Returns 0 on success, negative error code on failure.
2446 */
2447 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2448 {
2449 struct amdgpu_gpu_instance *gpu_instance;
2450 int i = 0, r;
2451
2452 for (i = 0; i < adev->num_ip_blocks; i++) {
2453 if (!adev->ip_blocks[i].status.hw)
2454 continue;
2455 if (adev->ip_blocks[i].version->funcs->late_init) {
2456 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2457 if (r) {
2458 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2459 adev->ip_blocks[i].version->funcs->name, r);
2460 return r;
2461 }
2462 }
2463 adev->ip_blocks[i].status.late_initialized = true;
2464 }
2465
2466 amdgpu_ras_set_error_query_ready(adev, true);
2467
2468 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2469 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2470
2471 amdgpu_device_fill_reset_magic(adev);
2472
2473 r = amdgpu_device_enable_mgpu_fan_boost();
2474 if (r)
2475 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2476
2477
2478 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2479 mutex_lock(&mgpu_info.mutex);
2480
2481 /*
2482 * Reset the device p-state to low, as it was booted with the high p-state.
2483 *
2484 * This should be performed only after all devices from the same
2485 * hive have been initialized.
2486 *
2487 * However, the number of devices in the hive is not known in advance;
2488 * it is counted one by one as the devices are initialized.
2489 *
2490 * So we wait until all XGMI-interlinked devices have been initialized.
2491 * This may add some delay, as those devices may come from
2492 * different hives, but that should be OK.
2493 */
2494 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2495 for (i = 0; i < mgpu_info.num_gpu; i++) {
2496 gpu_instance = &(mgpu_info.gpu_ins[i]);
2497 if (gpu_instance->adev->flags & AMD_IS_APU)
2498 continue;
2499
2500 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2501 AMDGPU_XGMI_PSTATE_MIN);
2502 if (r) {
2503 DRM_ERROR("pstate setting failed (%d).\n", r);
2504 break;
2505 }
2506 }
2507 }
2508
2509 mutex_unlock(&mgpu_info.mutex);
2510 }
2511
2512 return 0;
2513 }
2514
2515 /**
2516 * amdgpu_device_ip_fini - run fini for hardware IPs
2517 *
2518 * @adev: amdgpu_device pointer
2519 *
2520 * Main teardown pass for hardware IPs. The list of all the hardware
2521 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2522 * are run. hw_fini tears down the hardware associated with each IP
2523 * and sw_fini tears down any software state associated with each IP.
2524 * Returns 0 on success, negative error code on failure.
2525 */
2526 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2527 {
2528 int i, r;
2529
2530 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2531 amdgpu_virt_release_ras_err_handler_data(adev);
2532
2533 amdgpu_ras_pre_fini(adev);
2534
2535 if (adev->gmc.xgmi.num_physical_nodes > 1)
2536 amdgpu_xgmi_remove_device(adev);
2537
2538 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2539 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2540
2541 amdgpu_amdkfd_device_fini(adev);
2542
2543 /* need to disable SMC first */
2544 for (i = 0; i < adev->num_ip_blocks; i++) {
2545 if (!adev->ip_blocks[i].status.hw)
2546 continue;
2547 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2548 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2549 /* XXX handle errors */
2550 if (r) {
2551 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2552 adev->ip_blocks[i].version->funcs->name, r);
2553 }
2554 adev->ip_blocks[i].status.hw = false;
2555 break;
2556 }
2557 }
2558
2559 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2560 if (!adev->ip_blocks[i].status.hw)
2561 continue;
2562
2563 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2564 /* XXX handle errors */
2565 if (r) {
2566 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2567 adev->ip_blocks[i].version->funcs->name, r);
2568 }
2569
2570 adev->ip_blocks[i].status.hw = false;
2571 }
2572
2573
2574 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2575 if (!adev->ip_blocks[i].status.sw)
2576 continue;
2577
2578 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2579 amdgpu_ucode_free_bo(adev);
2580 amdgpu_free_static_csa(&adev->virt.csa_obj);
2581 amdgpu_device_wb_fini(adev);
2582 amdgpu_device_vram_scratch_fini(adev);
2583 amdgpu_ib_pool_fini(adev);
2584 }
2585
2586 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2587 /* XXX handle errors */
2588 if (r) {
2589 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2590 adev->ip_blocks[i].version->funcs->name, r);
2591 }
2592 adev->ip_blocks[i].status.sw = false;
2593 adev->ip_blocks[i].status.valid = false;
2594 }
2595
2596 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2597 if (!adev->ip_blocks[i].status.late_initialized)
2598 continue;
2599 if (adev->ip_blocks[i].version->funcs->late_fini)
2600 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2601 adev->ip_blocks[i].status.late_initialized = false;
2602 }
2603
2604 amdgpu_ras_fini(adev);
2605
2606 if (amdgpu_sriov_vf(adev))
2607 if (amdgpu_virt_release_full_gpu(adev, false))
2608 DRM_ERROR("failed to release exclusive mode on fini\n");
2609
2610 return 0;
2611 }
2612
2613 /**
2614 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2615 *
2616 * @work: work_struct.
2617 */
2618 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2619 {
2620 struct amdgpu_device *adev =
2621 container_of(work, struct amdgpu_device, delayed_init_work.work);
2622 int r;
2623
2624 r = amdgpu_ib_ring_tests(adev);
2625 if (r)
2626 DRM_ERROR("ib ring test failed (%d).\n", r);
2627 }
2628
2629 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2630 {
2631 struct amdgpu_device *adev =
2632 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2633
2634 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2635 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2636
2637 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2638 adev->gfx.gfx_off_state = true;
2639 }
2640
2641 /**
2642 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2643 *
2644 * @adev: amdgpu_device pointer
2645 *
2646 * Main suspend function for hardware IPs. The list of all the hardware
2647 * IPs that make up the asic is walked, clockgating is disabled and the
2648 * suspend callbacks are run. suspend puts the hardware and software state
2649 * in each IP into a state suitable for suspend.
2650 * Returns 0 on success, negative error code on failure.
2651 */
2652 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2653 {
2654 int i, r;
2655
2656 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2657 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2658
2659 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2660 if (!adev->ip_blocks[i].status.valid)
2661 continue;
2662
2663 /* displays are handled separately */
2664 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2665 continue;
2666
2667 /* XXX handle errors */
2668 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2669 /* XXX handle errors */
2670 if (r) {
2671 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2672 adev->ip_blocks[i].version->funcs->name, r);
2673 return r;
2674 }
2675
2676 adev->ip_blocks[i].status.hw = false;
2677 }
2678
2679 return 0;
2680 }
2681
2682 /**
2683 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2684 *
2685 * @adev: amdgpu_device pointer
2686 *
2687 * Main suspend function for hardware IPs. The list of all the hardware
2688 * IPs that make up the asic is walked, clockgating is disabled and the
2689 * suspend callbacks are run. suspend puts the hardware and software state
2690 * in each IP into a state suitable for suspend.
2691 * Returns 0 on success, negative error code on failure.
2692 */
2693 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2694 {
2695 int i, r;
2696
2697 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2698 if (!adev->ip_blocks[i].status.valid)
2699 continue;
2700 /* displays are handled in phase1 */
2701 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2702 continue;
2703 /* PSP lost connection when err_event_athub occurs */
2704 if (amdgpu_ras_intr_triggered() &&
2705 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2706 adev->ip_blocks[i].status.hw = false;
2707 continue;
2708 }
2709 /* XXX handle errors */
2710 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2711 /* XXX handle errors */
2712 if (r) {
2713 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2714 adev->ip_blocks[i].version->funcs->name, r);
2715 }
2716 adev->ip_blocks[i].status.hw = false;
2717 /* handle putting the SMC in the appropriate state */
2718 if (!amdgpu_sriov_vf(adev)) {
2719 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2720 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2721 if (r) {
2722 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2723 adev->mp1_state, r);
2724 return r;
2725 }
2726 }
2727 }
2728 adev->ip_blocks[i].status.hw = false;
2729 }
2730
2731 return 0;
2732 }
2733
2734 /**
2735 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2736 *
2737 * @adev: amdgpu_device pointer
2738 *
2739 * Main suspend function for hardware IPs. The list of all the hardware
2740 * IPs that make up the asic is walked, clockgating is disabled and the
2741 * suspend callbacks are run. suspend puts the hardware and software state
2742 * in each IP into a state suitable for suspend.
2743 * Returns 0 on success, negative error code on failure.
2744 */
2745 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2746 {
2747 int r;
2748
2749 if (amdgpu_sriov_vf(adev))
2750 amdgpu_virt_request_full_gpu(adev, false);
2751
2752 r = amdgpu_device_ip_suspend_phase1(adev);
2753 if (r)
2754 return r;
2755 r = amdgpu_device_ip_suspend_phase2(adev);
2756
2757 if (amdgpu_sriov_vf(adev))
2758 amdgpu_virt_release_full_gpu(adev, false);
2759
2760 return r;
2761 }
2762
2763 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2764 {
2765 int i, r;
2766
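/* IP blocks that must be re-initialized first after an SR-IOV reset, in order */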
2767 static enum amd_ip_block_type ip_order[] = {
2768 AMD_IP_BLOCK_TYPE_GMC,
2769 AMD_IP_BLOCK_TYPE_COMMON,
2770 AMD_IP_BLOCK_TYPE_PSP,
2771 AMD_IP_BLOCK_TYPE_IH,
2772 };
2773
2774 for (i = 0; i < adev->num_ip_blocks; i++) {
2775 int j;
2776 struct amdgpu_ip_block *block;
2777
2778 block = &adev->ip_blocks[i];
2779 block->status.hw = false;
2780
2781 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2782
2783 if (block->version->type != ip_order[j] ||
2784 !block->status.valid)
2785 continue;
2786
2787 r = block->version->funcs->hw_init(adev);
2788 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2789 if (r)
2790 return r;
2791 block->status.hw = true;
2792 }
2793 }
2794
2795 return 0;
2796 }
2797
2798 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2799 {
2800 int i, r;
2801
2802 static enum amd_ip_block_type ip_order[] = {
2803 AMD_IP_BLOCK_TYPE_SMC,
2804 AMD_IP_BLOCK_TYPE_DCE,
2805 AMD_IP_BLOCK_TYPE_GFX,
2806 AMD_IP_BLOCK_TYPE_SDMA,
2807 AMD_IP_BLOCK_TYPE_UVD,
2808 AMD_IP_BLOCK_TYPE_VCE,
2809 AMD_IP_BLOCK_TYPE_VCN
2810 };
2811
2812 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2813 int j;
2814 struct amdgpu_ip_block *block;
2815
2816 for (j = 0; j < adev->num_ip_blocks; j++) {
2817 block = &adev->ip_blocks[j];
2818
2819 if (block->version->type != ip_order[i] ||
2820 !block->status.valid ||
2821 block->status.hw)
2822 continue;
2823
2824 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2825 r = block->version->funcs->resume(adev);
2826 else
2827 r = block->version->funcs->hw_init(adev);
2828
2829 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2830 if (r)
2831 return r;
2832 block->status.hw = true;
2833 }
2834 }
2835
2836 return 0;
2837 }
2838
2839 /**
2840 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2841 *
2842 * @adev: amdgpu_device pointer
2843 *
2844 * First resume function for hardware IPs. The list of all the hardware
2845 * IPs that make up the asic is walked and the resume callbacks are run for
2846 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2847 * after a suspend and updates the software state as necessary. This
2848 * function is also used for restoring the GPU after a GPU reset.
2849 * Returns 0 on success, negative error code on failure.
2850 */
2851 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2852 {
2853 int i, r;
2854
2855 for (i = 0; i < adev->num_ip_blocks; i++) {
2856 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2857 continue;
2858 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2859 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2860 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2861
2862 r = adev->ip_blocks[i].version->funcs->resume(adev);
2863 if (r) {
2864 DRM_ERROR("resume of IP block <%s> failed %d\n",
2865 adev->ip_blocks[i].version->funcs->name, r);
2866 return r;
2867 }
2868 adev->ip_blocks[i].status.hw = true;
2869 }
2870 }
2871
2872 return 0;
2873 }
2874
2875 /**
2876 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2877 *
2878 * @adev: amdgpu_device pointer
2879 *
2880 * Second resume function for hardware IPs. The list of all the hardware
2881 * IPs that make up the asic is walked and the resume callbacks are run for
2882 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2883 * functional state after a suspend and updates the software state as
2884 * necessary. This function is also used for restoring the GPU after a GPU
2885 * reset.
2886 * Returns 0 on success, negative error code on failure.
2887 */
2888 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2889 {
2890 int i, r;
2891
2892 for (i = 0; i < adev->num_ip_blocks; i++) {
2893 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2894 continue;
2895 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2896 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2898 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2899 continue;
2900 r = adev->ip_blocks[i].version->funcs->resume(adev);
2901 if (r) {
2902 DRM_ERROR("resume of IP block <%s> failed %d\n",
2903 adev->ip_blocks[i].version->funcs->name, r);
2904 return r;
2905 }
2906 adev->ip_blocks[i].status.hw = true;
2907 }
2908
2909 return 0;
2910 }
2911
2912 /**
2913 * amdgpu_device_ip_resume - run resume for hardware IPs
2914 *
2915 * @adev: amdgpu_device pointer
2916 *
2917 * Main resume function for hardware IPs. The hardware IPs
2918 * are split into two resume functions because they are
2919 * also used in recovering from a GPU reset and some additional
2920 * steps need to be taken between them. In this case (S3/S4) they are
2921 * run sequentially.
2922 * Returns 0 on success, negative error code on failure.
2923 */
2924 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2925 {
2926 int r;
2927
2928 r = amdgpu_amdkfd_resume_iommu(adev);
2929 if (r)
2930 return r;
2931
2932 r = amdgpu_device_ip_resume_phase1(adev);
2933 if (r)
2934 return r;
2935
2936 r = amdgpu_device_fw_loading(adev);
2937 if (r)
2938 return r;
2939
2940 r = amdgpu_device_ip_resume_phase2(adev);
2941
2942 return r;
2943 }
2944
2945 /**
2946 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2947 *
2948 * @adev: amdgpu_device pointer
2949 *
2950 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2951 */
2952 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2953 {
2954 if (amdgpu_sriov_vf(adev)) {
2955 if (adev->is_atom_fw) {
2956 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2957 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2958 } else {
2959 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2960 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2961 }
2962
2963 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2964 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2965 }
2966 }
2967
2968 /**
2969 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2970 *
2971 * @asic_type: AMD asic type
2972 *
2973 * Check if there is DC (new modesetting infrastructure) support for an asic.
2974 * returns true if DC has support, false if not.
2975 */
2976 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2977 {
2978 switch (asic_type) {
2979 #if defined(CONFIG_DRM_AMD_DC)
2980 #if defined(CONFIG_DRM_AMD_DC_SI)
2981 case CHIP_TAHITI:
2982 case CHIP_PITCAIRN:
2983 case CHIP_VERDE:
2984 case CHIP_OLAND:
2985 #endif
2986 case CHIP_BONAIRE:
2987 case CHIP_KAVERI:
2988 case CHIP_KABINI:
2989 case CHIP_MULLINS:
2990 /*
2991 * We have systems in the wild with these ASICs that require
2992 * LVDS and VGA support which is not supported with DC.
2993 *
2994 * Fallback to the non-DC driver here by default so as not to
2995 * cause regressions.
2996 */
2997 return amdgpu_dc > 0;
2998 case CHIP_HAWAII:
2999 case CHIP_CARRIZO:
3000 case CHIP_STONEY:
3001 case CHIP_POLARIS10:
3002 case CHIP_POLARIS11:
3003 case CHIP_POLARIS12:
3004 case CHIP_VEGAM:
3005 case CHIP_TONGA:
3006 case CHIP_FIJI:
3007 case CHIP_VEGA10:
3008 case CHIP_VEGA12:
3009 case CHIP_VEGA20:
3010 #if defined(CONFIG_DRM_AMD_DC_DCN)
3011 case CHIP_RAVEN:
3012 case CHIP_NAVI10:
3013 case CHIP_NAVI14:
3014 case CHIP_NAVI12:
3015 case CHIP_RENOIR:
3016 #endif
3017 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3018 case CHIP_SIENNA_CICHLID:
3019 case CHIP_NAVY_FLOUNDER:
3020 #endif
3021 return amdgpu_dc != 0;
3022 #endif
3023 default:
3024 if (amdgpu_dc > 0)
3025 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3026 "but isn't supported by ASIC, ignoring\n");
3027 return false;
3028 }
3029 }
3030
3031 /**
3032 * amdgpu_device_has_dc_support - check if dc is supported
3033 *
3034 * @adev: amdgpu_device pointer
3035 *
3036 * Returns true for supported, false for not supported
3037 */
3038 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3039 {
3040 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3041 return false;
3042
3043 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3044 }
3045
3046
3047 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3048 {
3049 struct amdgpu_device *adev =
3050 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3051 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3052
3053 /* It's a bug to not have a hive within this function */
3054 if (WARN_ON(!hive))
3055 return;
3056
3057 /*
3058 * Use task barrier to synchronize all xgmi reset works across the
3059 * hive. task_barrier_enter and task_barrier_exit will block
3060 * until all the threads running the xgmi reset works reach
3061 * those points. task_barrier_full will do both blocks.
3062 */
3063 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3064
3065 task_barrier_enter(&hive->tb);
3066 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3067
3068 if (adev->asic_reset_res)
3069 goto fail;
3070
3071 task_barrier_exit(&hive->tb);
3072 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3073
3074 if (adev->asic_reset_res)
3075 goto fail;
3076
3077 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3078 adev->mmhub.funcs->reset_ras_error_count(adev);
3079 } else {
3080
3081 task_barrier_full(&hive->tb);
3082 adev->asic_reset_res = amdgpu_asic_reset(adev);
3083 }
3084
3085 fail:
3086 if (adev->asic_reset_res)
3087 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3088 adev->asic_reset_res, adev_to_drm(adev)->unique);
3089 amdgpu_put_xgmi_hive(hive);
3090 }
3091
3092 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3093 {
3094 char *input = amdgpu_lockup_timeout;
3095 char *timeout_setting = NULL;
3096 int index = 0;
3097 long timeout;
3098 int ret = 0;
3099
3100 /*
3101 * By default the timeout for non-compute jobs is 10000 ms
3102 * and no timeout is enforced on compute jobs.
3103 * In SR-IOV or passthrough mode, the default timeout for
3104 * compute jobs is 60000 ms. Example strings are noted after this function.
3105 */
3106 adev->gfx_timeout = msecs_to_jiffies(10000);
3107 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3108 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3109 adev->compute_timeout = msecs_to_jiffies(60000);
3110 else
3111 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3112
3113 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3114 while ((timeout_setting = strsep(&input, ",")) &&
3115 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3116 ret = kstrtol(timeout_setting, 0, &timeout);
3117 if (ret)
3118 return ret;
3119
3120 if (timeout == 0) {
3121 index++;
3122 continue;
3123 } else if (timeout < 0) {
3124 timeout = MAX_SCHEDULE_TIMEOUT;
3125 } else {
3126 timeout = msecs_to_jiffies(timeout);
3127 }
3128
3129 switch (index++) {
3130 case 0:
3131 adev->gfx_timeout = timeout;
3132 break;
3133 case 1:
3134 adev->compute_timeout = timeout;
3135 break;
3136 case 2:
3137 adev->sdma_timeout = timeout;
3138 break;
3139 case 3:
3140 adev->video_timeout = timeout;
3141 break;
3142 default:
3143 break;
3144 }
3145 }
3146 /*
3147 * There is only one value specified and
3148 * it should apply to all non-compute jobs.
3149 */
3150 if (index == 1) {
3151 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3152 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3153 adev->compute_timeout = adev->gfx_timeout;
3154 }
3155 }
3156
3157 return ret;
3158 }
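/*
 * Illustrative lockup_timeout strings for the parser above (values in ms,
 * 0 keeps the default, a negative value disables the timeout):
 *   amdgpu.lockup_timeout=10000                   - all non-compute jobs
 *   amdgpu.lockup_timeout=10000,60000,10000,10000 - gfx,compute,sdma,video
 */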
3159
3160 static const struct attribute *amdgpu_dev_attributes[] = {
3161 &dev_attr_product_name.attr,
3162 &dev_attr_product_number.attr,
3163 &dev_attr_serial_number.attr,
3164 &dev_attr_pcie_replay_count.attr,
3165 NULL
3166 };
3167
3168
3169 /**
3170 * amdgpu_device_init - initialize the driver
3171 *
3172 * @adev: amdgpu_device pointer
3173 * @flags: driver flags
3174 *
3175 * Initializes the driver info and hw (all asics).
3176 * Returns 0 for success or an error on failure.
3177 * Called at driver startup.
3178 */
3179 int amdgpu_device_init(struct amdgpu_device *adev,
3180 uint32_t flags)
3181 {
3182 struct drm_device *ddev = adev_to_drm(adev);
3183 struct pci_dev *pdev = adev->pdev;
3184 int r, i;
3185 bool boco = false;
3186 u32 max_MBps;
3187
3188 adev->shutdown = false;
3189 adev->flags = flags;
3190
3191 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3192 adev->asic_type = amdgpu_force_asic_type;
3193 else
3194 adev->asic_type = flags & AMD_ASIC_MASK;
3195
3196 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3197 if (amdgpu_emu_mode == 1)
3198 adev->usec_timeout *= 10;
3199 adev->gmc.gart_size = 512 * 1024 * 1024;
3200 adev->accel_working = false;
3201 adev->num_rings = 0;
3202 adev->mman.buffer_funcs = NULL;
3203 adev->mman.buffer_funcs_ring = NULL;
3204 adev->vm_manager.vm_pte_funcs = NULL;
3205 adev->vm_manager.vm_pte_num_scheds = 0;
3206 adev->gmc.gmc_funcs = NULL;
3207 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3208 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3209
3210 adev->smc_rreg = &amdgpu_invalid_rreg;
3211 adev->smc_wreg = &amdgpu_invalid_wreg;
3212 adev->pcie_rreg = &amdgpu_invalid_rreg;
3213 adev->pcie_wreg = &amdgpu_invalid_wreg;
3214 adev->pciep_rreg = &amdgpu_invalid_rreg;
3215 adev->pciep_wreg = &amdgpu_invalid_wreg;
3216 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3217 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3218 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3219 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3220 adev->didt_rreg = &amdgpu_invalid_rreg;
3221 adev->didt_wreg = &amdgpu_invalid_wreg;
3222 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3223 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3224 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3225 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3226
3227 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3228 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3229 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3230
3231 /* mutex initialization is all done here so we
3232 * can call these functions later without locking issues */
3233 atomic_set(&adev->irq.ih.lock, 0);
3234 mutex_init(&adev->firmware.mutex);
3235 mutex_init(&adev->pm.mutex);
3236 mutex_init(&adev->gfx.gpu_clock_mutex);
3237 mutex_init(&adev->srbm_mutex);
3238 mutex_init(&adev->gfx.pipe_reserve_mutex);
3239 mutex_init(&adev->gfx.gfx_off_mutex);
3240 mutex_init(&adev->grbm_idx_mutex);
3241 mutex_init(&adev->mn_lock);
3242 mutex_init(&adev->virt.vf_errors.lock);
3243 hash_init(adev->mn_hash);
3244 atomic_set(&adev->in_gpu_reset, 0);
3245 init_rwsem(&adev->reset_sem);
3246 mutex_init(&adev->psp.mutex);
3247 mutex_init(&adev->notifier_lock);
3248
3249 r = amdgpu_device_check_arguments(adev);
3250 if (r)
3251 return r;
3252
3253 spin_lock_init(&adev->mmio_idx_lock);
3254 spin_lock_init(&adev->smc_idx_lock);
3255 spin_lock_init(&adev->pcie_idx_lock);
3256 spin_lock_init(&adev->uvd_ctx_idx_lock);
3257 spin_lock_init(&adev->didt_idx_lock);
3258 spin_lock_init(&adev->gc_cac_idx_lock);
3259 spin_lock_init(&adev->se_cac_idx_lock);
3260 spin_lock_init(&adev->audio_endpt_idx_lock);
3261 spin_lock_init(&adev->mm_stats.lock);
3262
3263 INIT_LIST_HEAD(&adev->shadow_list);
3264 mutex_init(&adev->shadow_list_lock);
3265
3266 INIT_DELAYED_WORK(&adev->delayed_init_work,
3267 amdgpu_device_delayed_init_work_handler);
3268 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3269 amdgpu_device_delay_enable_gfx_off);
3270
3271 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3272
3273 adev->gfx.gfx_off_req_count = 1;
3274 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3275
3276 atomic_set(&adev->throttling_logging_enabled, 1);
3277 /*
3278 * If throttling continues, logging will be performed every minute
3279 * to avoid log flooding. "-1" is subtracted since the thermal
3280 * throttling interrupt comes every second. Thus, the total logging
3281 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3282 * for the throttling interrupt) = 60 seconds.
3283 */
3284 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3285 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3286
3287 /* Registers mapping */
3288 /* TODO: block userspace mapping of io register */
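/* the register BAR is BAR 5 on BONAIRE and newer, BAR 2 on older ASICs */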
3289 if (adev->asic_type >= CHIP_BONAIRE) {
3290 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3291 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3292 } else {
3293 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3294 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3295 }
3296
3297 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3298 if (adev->rmmio == NULL) {
3299 return -ENOMEM;
3300 }
3301 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3302 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3303
3304 /* io port mapping */
3305 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3306 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3307 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3308 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3309 break;
3310 }
3311 }
3312 if (adev->rio_mem == NULL)
3313 DRM_INFO("PCI I/O BAR is not found.\n");
3314
3315 /* enable PCIE atomic ops */
3316 r = pci_enable_atomic_ops_to_root(adev->pdev,
3317 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3318 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3319 if (r) {
3320 adev->have_atomics_support = false;
3321 DRM_INFO("PCIE atomic ops is not supported\n");
3322 } else {
3323 adev->have_atomics_support = true;
3324 }
3325
3326 amdgpu_device_get_pcie_info(adev);
3327
3328 if (amdgpu_mcbp)
3329 DRM_INFO("MCBP is enabled\n");
3330
3331 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3332 adev->enable_mes = true;
3333
3334 /* detect hw virtualization here */
3335 amdgpu_detect_virtualization(adev);
3336
3337 r = amdgpu_device_get_job_timeout_settings(adev);
3338 if (r) {
3339 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3340 return r;
3341 }
3342
3343 /* early init functions */
3344 r = amdgpu_device_ip_early_init(adev);
3345 if (r)
3346 return r;
3347
3348 /* doorbell bar mapping and doorbell index init*/
3349 amdgpu_device_doorbell_init(adev);
3350
3351 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3352 /* this will fail for cards that aren't VGA class devices, just
3353 * ignore it */
3354 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3355
3356 if (amdgpu_device_supports_boco(ddev))
3357 boco = true;
3358 if (amdgpu_has_atpx() &&
3359 (amdgpu_is_atpx_hybrid() ||
3360 amdgpu_has_atpx_dgpu_power_cntl()) &&
3361 !pci_is_thunderbolt_attached(adev->pdev))
3362 vga_switcheroo_register_client(adev->pdev,
3363 &amdgpu_switcheroo_ops, boco);
3364 if (boco)
3365 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3366
3367 if (amdgpu_emu_mode == 1) {
3368 /* post the asic on emulation mode */
3369 emu_soc_asic_init(adev);
3370 goto fence_driver_init;
3371 }
3372
3373 /* detect if we are with an SRIOV vbios */
3374 amdgpu_device_detect_sriov_bios(adev);
3375
3376 /* check if we need to reset the asic
3377 * E.g., driver was not cleanly unloaded previously, etc.
3378 */
3379 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3380 r = amdgpu_asic_reset(adev);
3381 if (r) {
3382 dev_err(adev->dev, "asic reset on init failed\n");
3383 goto failed;
3384 }
3385 }
3386
3387 pci_enable_pcie_error_reporting(adev->ddev.pdev);
3388
3389 /* Post card if necessary */
3390 if (amdgpu_device_need_post(adev)) {
3391 if (!adev->bios) {
3392 dev_err(adev->dev, "no vBIOS found\n");
3393 r = -EINVAL;
3394 goto failed;
3395 }
3396 DRM_INFO("GPU posting now...\n");
3397 r = amdgpu_device_asic_init(adev);
3398 if (r) {
3399 dev_err(adev->dev, "gpu post error!\n");
3400 goto failed;
3401 }
3402 }
3403
3404 if (adev->is_atom_fw) {
3405 /* Initialize clocks */
3406 r = amdgpu_atomfirmware_get_clock_info(adev);
3407 if (r) {
3408 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3409 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3410 goto failed;
3411 }
3412 } else {
3413 /* Initialize clocks */
3414 r = amdgpu_atombios_get_clock_info(adev);
3415 if (r) {
3416 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3417 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3418 goto failed;
3419 }
3420 /* init i2c buses */
3421 if (!amdgpu_device_has_dc_support(adev))
3422 amdgpu_atombios_i2c_init(adev);
3423 }
3424
3425 fence_driver_init:
3426 /* Fence driver */
3427 r = amdgpu_fence_driver_init(adev);
3428 if (r) {
3429 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3430 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3431 goto failed;
3432 }
3433
3434 /* init the mode config */
3435 drm_mode_config_init(adev_to_drm(adev));
3436
3437 r = amdgpu_device_ip_init(adev);
3438 if (r) {
3439 /* failed in exclusive mode due to timeout */
3440 if (amdgpu_sriov_vf(adev) &&
3441 !amdgpu_sriov_runtime(adev) &&
3442 amdgpu_virt_mmio_blocked(adev) &&
3443 !amdgpu_virt_wait_reset(adev)) {
3444 dev_err(adev->dev, "VF exclusive mode timeout\n");
3445 /* Don't send request since VF is inactive. */
3446 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3447 adev->virt.ops = NULL;
3448 r = -EAGAIN;
3449 goto failed;
3450 }
3451 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3452 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3453 goto failed;
3454 }
3455
3456 dev_info(adev->dev,
3457 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3458 adev->gfx.config.max_shader_engines,
3459 adev->gfx.config.max_sh_per_se,
3460 adev->gfx.config.max_cu_per_sh,
3461 adev->gfx.cu_info.number);
3462
3463 adev->accel_working = true;
3464
3465 amdgpu_vm_check_compute_bug(adev);
3466
3467 /* Initialize the buffer migration limit. */
3468 if (amdgpu_moverate >= 0)
3469 max_MBps = amdgpu_moverate;
3470 else
3471 max_MBps = 8; /* Allow 8 MB/s. */
3472 /* Get a log2 for easy divisions. */
3473 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3474
3475 amdgpu_fbdev_init(adev);
3476
3477 r = amdgpu_pm_sysfs_init(adev);
3478 if (r) {
3479 adev->pm_sysfs_en = false;
3480 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3481 } else
3482 adev->pm_sysfs_en = true;
3483
3484 r = amdgpu_ucode_sysfs_init(adev);
3485 if (r) {
3486 adev->ucode_sysfs_en = false;
3487 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3488 } else
3489 adev->ucode_sysfs_en = true;
3490
3491 if ((amdgpu_testing & 1)) {
3492 if (adev->accel_working)
3493 amdgpu_test_moves(adev);
3494 else
3495 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3496 }
3497 if (amdgpu_benchmarking) {
3498 if (adev->accel_working)
3499 amdgpu_benchmark(adev, amdgpu_benchmarking);
3500 else
3501 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3502 }
3503
3504 /*
3505 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3506 * Otherwise the mgpu fan boost feature will be skipped because the
3507 * gpu instance count would be too low.
3508 */
3509 amdgpu_register_gpu_instance(adev);
3510
3511 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3512 * explicit gating rather than handling it automatically.
3513 */
3514 r = amdgpu_device_ip_late_init(adev);
3515 if (r) {
3516 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3517 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3518 goto failed;
3519 }
3520
3521 /* must succeed. */
3522 amdgpu_ras_resume(adev);
3523
3524 queue_delayed_work(system_wq, &adev->delayed_init_work,
3525 msecs_to_jiffies(AMDGPU_RESUME_MS));
3526
3527 if (amdgpu_sriov_vf(adev))
3528 flush_delayed_work(&adev->delayed_init_work);
3529
3530 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3531 if (r)
3532 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3533
3534 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3535 r = amdgpu_pmu_init(adev);
3536 if (r)
3537 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
}
3538
3539 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3540 if (amdgpu_device_cache_pci_state(adev->pdev))
3541 pci_restore_state(pdev);
3542
3543 return 0;
3544
3545 failed:
3546 amdgpu_vf_error_trans_all(adev);
3547 if (boco)
3548 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3549
3550 return r;
3551 }
3552
3553 /**
3554 * amdgpu_device_fini - tear down the driver
3555 *
3556 * @adev: amdgpu_device pointer
3557 *
3558 * Tear down the driver info (all asics).
3559 * Called at driver shutdown.
3560 */
3561 void amdgpu_device_fini(struct amdgpu_device *adev)
3562 {
3563 dev_info(adev->dev, "amdgpu: finishing device.\n");
3564 flush_delayed_work(&adev->delayed_init_work);
3565 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3566 adev->shutdown = true;
3567
3568 kfree(adev->pci_state);
3569
3570 /* make sure the IB tests have finished before entering exclusive mode
3571 * to avoid preemption during the IB tests
3572 */
3573 if (amdgpu_sriov_vf(adev)) {
3574 amdgpu_virt_request_full_gpu(adev, false);
3575 amdgpu_virt_fini_data_exchange(adev);
3576 }
3577
3578 /* disable all interrupts */
3579 amdgpu_irq_disable_all(adev);
3580 if (adev->mode_info.mode_config_initialized) {
3581 if (!amdgpu_device_has_dc_support(adev))
3582 drm_helper_force_disable_all(adev_to_drm(adev));
3583 else
3584 drm_atomic_helper_shutdown(adev_to_drm(adev));
3585 }
3586 amdgpu_fence_driver_fini(adev);
3587 if (adev->pm_sysfs_en)
3588 amdgpu_pm_sysfs_fini(adev);
3589 amdgpu_fbdev_fini(adev);
3590 amdgpu_device_ip_fini(adev);
3591 release_firmware(adev->firmware.gpu_info_fw);
3592 adev->firmware.gpu_info_fw = NULL;
3593 adev->accel_working = false;
3594 /* free i2c buses */
3595 if (!amdgpu_device_has_dc_support(adev))
3596 amdgpu_i2c_fini(adev);
3597
3598 if (amdgpu_emu_mode != 1)
3599 amdgpu_atombios_fini(adev);
3600
3601 kfree(adev->bios);
3602 adev->bios = NULL;
3603 if (amdgpu_has_atpx() &&
3604 (amdgpu_is_atpx_hybrid() ||
3605 amdgpu_has_atpx_dgpu_power_cntl()) &&
3606 !pci_is_thunderbolt_attached(adev->pdev))
3607 vga_switcheroo_unregister_client(adev->pdev);
3608 if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3609 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3610 vga_client_register(adev->pdev, NULL, NULL, NULL);
3611 if (adev->rio_mem)
3612 pci_iounmap(adev->pdev, adev->rio_mem);
3613 adev->rio_mem = NULL;
3614 iounmap(adev->rmmio);
3615 adev->rmmio = NULL;
3616 amdgpu_device_doorbell_fini(adev);
3617
3618 if (adev->ucode_sysfs_en)
3619 amdgpu_ucode_sysfs_fini(adev);
3620
3621 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3622 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3623 amdgpu_pmu_fini(adev);
3624 if (adev->mman.discovery_bin)
3625 amdgpu_discovery_fini(adev);
3626 }
3627
3628
3629 /*
3630 * Suspend & resume.
3631 */
3632 /**
3633 * amdgpu_device_suspend - initiate device suspend
3634 *
3635 * @dev: drm dev pointer
3636 * @fbcon: notify the fbdev of suspend
3637 *
3638 * Puts the hw in the suspend state (all asics).
3639 * Returns 0 for success or an error on failure.
3640 * Called at driver suspend.
3641 */
3642 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3643 {
3644 struct amdgpu_device *adev;
3645 struct drm_crtc *crtc;
3646 struct drm_connector *connector;
3647 struct drm_connector_list_iter iter;
3648 int r;
3649
3650 adev = drm_to_adev(dev);
3651
3652 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3653 return 0;
3654
3655 adev->in_suspend = true;
3656 drm_kms_helper_poll_disable(dev);
3657
3658 if (fbcon)
3659 amdgpu_fbdev_set_suspend(adev, 1);
3660
3661 cancel_delayed_work_sync(&adev->delayed_init_work);
3662
3663 if (!amdgpu_device_has_dc_support(adev)) {
3664 /* turn off display hw */
3665 drm_modeset_lock_all(dev);
3666 drm_connector_list_iter_begin(dev, &iter);
3667 drm_for_each_connector_iter(connector, &iter)
3668 drm_helper_connector_dpms(connector,
3669 DRM_MODE_DPMS_OFF);
3670 drm_connector_list_iter_end(&iter);
3671 drm_modeset_unlock_all(dev);
3672 /* unpin the front buffers and cursors */
3673 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3674 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3675 struct drm_framebuffer *fb = crtc->primary->fb;
3676 struct amdgpu_bo *robj;
3677
3678 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3679 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3680 r = amdgpu_bo_reserve(aobj, true);
3681 if (r == 0) {
3682 amdgpu_bo_unpin(aobj);
3683 amdgpu_bo_unreserve(aobj);
3684 }
3685 }
3686
3687 if (fb == NULL || fb->obj[0] == NULL) {
3688 continue;
3689 }
3690 robj = gem_to_amdgpu_bo(fb->obj[0]);
3691 /* don't unpin kernel fb objects */
3692 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3693 r = amdgpu_bo_reserve(robj, true);
3694 if (r == 0) {
3695 amdgpu_bo_unpin(robj);
3696 amdgpu_bo_unreserve(robj);
3697 }
3698 }
3699 }
3700 }
3701
3702 amdgpu_ras_suspend(adev);
3703
3704 r = amdgpu_device_ip_suspend_phase1(adev);
3705
3706 amdgpu_amdkfd_suspend(adev, !fbcon);
3707
3708 /* evict vram memory */
3709 amdgpu_bo_evict_vram(adev);
3710
3711 amdgpu_fence_driver_suspend(adev);
3712
3713 r = amdgpu_device_ip_suspend_phase2(adev);
3714
3715 /* evict remaining vram memory
3716 * This second call to evict vram is to evict the gart page table
3717 * using the CPU.
3718 */
3719 amdgpu_bo_evict_vram(adev);
3720
3721 return 0;
3722 }
3723
3724 /**
3725 * amdgpu_device_resume - initiate device resume
3726 *
3727 * @dev: drm dev pointer
3728 * @fbcon: notify the fbdev of resume
3729 *
3730 * Bring the hw back to operating state (all asics).
3731 * Returns 0 for success or an error on failure.
3732 * Called at driver resume.
3733 */
3734 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3735 {
3736 struct drm_connector *connector;
3737 struct drm_connector_list_iter iter;
3738 struct amdgpu_device *adev = drm_to_adev(dev);
3739 struct drm_crtc *crtc;
3740 int r = 0;
3741
3742 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3743 return 0;
3744
3745 /* post card */
3746 if (amdgpu_device_need_post(adev)) {
3747 r = amdgpu_device_asic_init(adev);
3748 if (r)
3749 dev_err(adev->dev, "amdgpu asic init failed\n");
3750 }
3751
3752 r = amdgpu_device_ip_resume(adev);
3753 if (r) {
3754 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3755 return r;
3756 }
3757 amdgpu_fence_driver_resume(adev);
3758
3759
3760 r = amdgpu_device_ip_late_init(adev);
3761 if (r)
3762 return r;
3763
3764 queue_delayed_work(system_wq, &adev->delayed_init_work,
3765 msecs_to_jiffies(AMDGPU_RESUME_MS));
3766
3767 if (!amdgpu_device_has_dc_support(adev)) {
3768 /* pin cursors */
3769 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3770 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3771
3772 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3773 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3774 r = amdgpu_bo_reserve(aobj, true);
3775 if (r == 0) {
3776 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3777 if (r != 0)
3778 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3779 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3780 amdgpu_bo_unreserve(aobj);
3781 }
3782 }
3783 }
3784 }
3785 r = amdgpu_amdkfd_resume(adev, !fbcon);
3786 if (r)
3787 return r;
3788
3789 /* Make sure IB tests flushed */
3790 flush_delayed_work(&adev->delayed_init_work);
3791
3792 /* blat the mode back in */
3793 if (fbcon) {
3794 if (!amdgpu_device_has_dc_support(adev)) {
3795 /* pre DCE11 */
3796 drm_helper_resume_force_mode(dev);
3797
3798 /* turn on display hw */
3799 drm_modeset_lock_all(dev);
3800
3801 drm_connector_list_iter_begin(dev, &iter);
3802 drm_for_each_connector_iter(connector, &iter)
3803 drm_helper_connector_dpms(connector,
3804 DRM_MODE_DPMS_ON);
3805 drm_connector_list_iter_end(&iter);
3806
3807 drm_modeset_unlock_all(dev);
3808 }
3809 amdgpu_fbdev_set_suspend(adev, 0);
3810 }
3811
3812 drm_kms_helper_poll_enable(dev);
3813
3814 amdgpu_ras_resume(adev);
3815
3816 /*
3817 * Most of the connector probing functions try to acquire runtime pm
3818 * refs to ensure that the GPU is powered on when connector polling is
3819 * performed. Since we're calling this from a runtime PM callback,
3820 * trying to acquire rpm refs will cause us to deadlock.
3821 *
3822 * Since we're guaranteed to be holding the rpm lock, it's safe to
3823 * temporarily disable the rpm helpers so this doesn't deadlock us.
3824 */
3825 #ifdef CONFIG_PM
3826 dev->dev->power.disable_depth++;
3827 #endif
3828 if (!amdgpu_device_has_dc_support(adev))
3829 drm_helper_hpd_irq_event(dev);
3830 else
3831 drm_kms_helper_hotplug_event(dev);
3832 #ifdef CONFIG_PM
3833 dev->dev->power.disable_depth--;
3834 #endif
3835 adev->in_suspend = false;
3836
3837 return 0;
3838 }
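/*
 * Illustrative sketch (not part of this file): how a system sleep
 * suspend/resume pair is expected to drive the two helpers above. The
 * real entry points live in the driver's dev_pm_ops in amdgpu_drv.c;
 * the example_* names below are hypothetical.
 */
#if 0
static int example_pmops_suspend(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);

	/* true: also notify fbdev/fbcon about the suspend */
	return amdgpu_device_suspend(drm_dev, true);
}

static int example_pmops_resume(struct device *dev)
{
	struct drm_device *drm_dev = dev_get_drvdata(dev);

	return amdgpu_device_resume(drm_dev, true);
}
#endif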
3839
3840 /**
3841 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3842 *
3843 * @adev: amdgpu_device pointer
3844 *
3845 * The list of all the hardware IPs that make up the asic is walked and
3846 * the check_soft_reset callbacks are run. check_soft_reset determines
3847 * if the asic is still hung or not.
3848 * Returns true if any of the IPs are still in a hung state, false if not.
3849 */
3850 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3851 {
3852 int i;
3853 bool asic_hang = false;
3854
3855 if (amdgpu_sriov_vf(adev))
3856 return true;
3857
3858 if (amdgpu_asic_need_full_reset(adev))
3859 return true;
3860
3861 for (i = 0; i < adev->num_ip_blocks; i++) {
3862 if (!adev->ip_blocks[i].status.valid)
3863 continue;
3864 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3865 adev->ip_blocks[i].status.hang =
3866 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3867 if (adev->ip_blocks[i].status.hang) {
3868 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3869 asic_hang = true;
3870 }
3871 }
3872 return asic_hang;
3873 }
3874
3875 /**
3876 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3877 *
3878 * @adev: amdgpu_device pointer
3879 *
3880 * The list of all the hardware IPs that make up the asic is walked and the
3881 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3882 * handles any IP specific hardware or software state changes that are
3883 * necessary for a soft reset to succeed.
3884 * Returns 0 on success, negative error code on failure.
3885 */
3886 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3887 {
3888 int i, r = 0;
3889
3890 for (i = 0; i < adev->num_ip_blocks; i++) {
3891 if (!adev->ip_blocks[i].status.valid)
3892 continue;
3893 if (adev->ip_blocks[i].status.hang &&
3894 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3895 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3896 if (r)
3897 return r;
3898 }
3899 }
3900
3901 return 0;
3902 }
3903
3904 /**
3905 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3906 *
3907 * @adev: amdgpu_device pointer
3908 *
3909 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3910 * reset is necessary to recover.
3911 * Returns true if a full asic reset is required, false if not.
3912 */
3913 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3914 {
3915 int i;
3916
3917 if (amdgpu_asic_need_full_reset(adev))
3918 return true;
3919
3920 for (i = 0; i < adev->num_ip_blocks; i++) {
3921 if (!adev->ip_blocks[i].status.valid)
3922 continue;
3923 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3924 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3925 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3926 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3927 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3928 if (adev->ip_blocks[i].status.hang) {
3929 dev_info(adev->dev, "Some blocks need a full reset!\n");
3930 return true;
3931 }
3932 }
3933 }
3934 return false;
3935 }
3936
3937 /**
3938 * amdgpu_device_ip_soft_reset - do a soft reset
3939 *
3940 * @adev: amdgpu_device pointer
3941 *
3942 * The list of all the hardware IPs that make up the asic is walked and the
3943 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3944 * IP specific hardware or software state changes that are necessary to soft
3945 * reset the IP.
3946 * Returns 0 on success, negative error code on failure.
3947 */
3948 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3949 {
3950 int i, r = 0;
3951
3952 for (i = 0; i < adev->num_ip_blocks; i++) {
3953 if (!adev->ip_blocks[i].status.valid)
3954 continue;
3955 if (adev->ip_blocks[i].status.hang &&
3956 adev->ip_blocks[i].version->funcs->soft_reset) {
3957 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3958 if (r)
3959 return r;
3960 }
3961 }
3962
3963 return 0;
3964 }
3965
3966 /**
3967 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3968 *
3969 * @adev: amdgpu_device pointer
3970 *
3971 * The list of all the hardware IPs that make up the asic is walked and the
3972 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3973 * handles any IP specific hardware or software state changes that are
3974 * necessary after the IP has been soft reset.
3975 * Returns 0 on success, negative error code on failure.
3976 */
3977 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3978 {
3979 int i, r = 0;
3980
3981 for (i = 0; i < adev->num_ip_blocks; i++) {
3982 if (!adev->ip_blocks[i].status.valid)
3983 continue;
3984 if (adev->ip_blocks[i].status.hang &&
3985 adev->ip_blocks[i].version->funcs->post_soft_reset)
3986 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3987 if (r)
3988 return r;
3989 }
3990
3991 return 0;
3992 }
3993
3994 /**
3995 * amdgpu_device_recover_vram - Recover some VRAM contents
3996 *
3997 * @adev: amdgpu_device pointer
3998 *
3999 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4000 * restore things like GPUVM page tables after a GPU reset where
4001 * the contents of VRAM might be lost.
4002 *
4003 * Returns:
4004 * 0 on success, negative error code on failure.
4005 */
4006 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4007 {
4008 struct dma_fence *fence = NULL, *next = NULL;
4009 struct amdgpu_bo *shadow;
4010 long r = 1, tmo;
4011
4012 if (amdgpu_sriov_runtime(adev))
4013 tmo = msecs_to_jiffies(8000);
4014 else
4015 tmo = msecs_to_jiffies(100);
4016
4017 dev_info(adev->dev, "recover vram bo from shadow start\n");
4018 mutex_lock(&adev->shadow_list_lock);
4019 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4020
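/* Each entry is a GTT-resident shadow BO whose ->parent is the VRAM BO it backs up across the reset. */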
4021 /* No need to recover an evicted BO */
4022 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4023 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4024 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4025 continue;
4026
4027 r = amdgpu_bo_restore_shadow(shadow, &next);
4028 if (r)
4029 break;
4030
4031 if (fence) {
4032 tmo = dma_fence_wait_timeout(fence, false, tmo);
4033 dma_fence_put(fence);
4034 fence = next;
4035 if (tmo == 0) {
4036 r = -ETIMEDOUT;
4037 break;
4038 } else if (tmo < 0) {
4039 r = tmo;
4040 break;
4041 }
4042 } else {
4043 fence = next;
4044 }
4045 }
4046 mutex_unlock(&adev->shadow_list_lock);
4047
4048 if (fence)
4049 tmo = dma_fence_wait_timeout(fence, false, tmo);
4050 dma_fence_put(fence);
4051
4052 if (r < 0 || tmo <= 0) {
4053 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4054 return -EIO;
4055 }
4056
4057 dev_info(adev->dev, "recover vram bo from shadow done\n");
4058 return 0;
4059 }
4060
4061
4062 /**
4063 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4064 *
4065 * @adev: amdgpu_device pointer
4066 * @from_hypervisor: request from hypervisor
4067 *
4068 * Do a VF FLR and reinitialize the ASIC.
4069 * Returns 0 on success, negative error code on failure.
4070 */
4071 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4072 bool from_hypervisor)
4073 {
4074 int r;
4075
4076 if (from_hypervisor)
4077 r = amdgpu_virt_request_full_gpu(adev, true);
4078 else
4079 r = amdgpu_virt_reset_gpu(adev);
4080 if (r)
4081 return r;
4082
4083 amdgpu_amdkfd_pre_reset(adev);
4084
4085 /* Resume IP prior to SMC */
4086 r = amdgpu_device_ip_reinit_early_sriov(adev);
4087 if (r)
4088 goto error;
4089
4090 amdgpu_virt_init_data_exchange(adev);
4091 /* we need to recover the GART prior to resuming SMC/CP/SDMA */
4092 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4093
4094 r = amdgpu_device_fw_loading(adev);
4095 if (r)
4096 return r;
4097
4098 /* now we are okay to resume SMC/CP/SDMA */
4099 r = amdgpu_device_ip_reinit_late_sriov(adev);
4100 if (r)
4101 goto error;
4102
4103 amdgpu_irq_gpu_reset_resume_helper(adev);
4104 r = amdgpu_ib_ring_tests(adev);
4105 amdgpu_amdkfd_post_reset(adev);
4106
4107 error:
4108 amdgpu_virt_release_full_gpu(adev, true);
4109 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4110 amdgpu_inc_vram_lost(adev);
4111 r = amdgpu_device_recover_vram(adev);
4112 }
4113
4114 return r;
4115 }
4116
4117 /**
4118 * amdgpu_device_has_job_running - check if there is any job in mirror list
4119 *
4120 * @adev: amdgpu_device pointer
4121 *
4122 * check if there is any job in mirror list
4123 */
4124 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4125 {
4126 int i;
4127 struct drm_sched_job *job;
4128
4129 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4130 struct amdgpu_ring *ring = adev->rings[i];
4131
4132 if (!ring || !ring->sched.thread)
4133 continue;
4134
4135 spin_lock(&ring->sched.job_list_lock);
4136 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4137 struct drm_sched_job, node);
4138 spin_unlock(&ring->sched.job_list_lock);
4139 if (job)
4140 return true;
4141 }
4142 return false;
4143 }
4144
4145 /**
4146 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4147 *
4148 * @adev: amdgpu_device pointer
4149 *
4150 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4151 * a hung GPU.
4152 */
4153 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4154 {
4155 if (!amdgpu_device_ip_check_soft_reset(adev)) {
4156 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4157 return false;
4158 }
4159
4160 if (amdgpu_gpu_recovery == 0)
4161 goto disabled;
4162
4163 if (amdgpu_sriov_vf(adev))
4164 return true;
4165
4166 if (amdgpu_gpu_recovery == -1) {
4167 switch (adev->asic_type) {
4168 case CHIP_BONAIRE:
4169 case CHIP_HAWAII:
4170 case CHIP_TOPAZ:
4171 case CHIP_TONGA:
4172 case CHIP_FIJI:
4173 case CHIP_POLARIS10:
4174 case CHIP_POLARIS11:
4175 case CHIP_POLARIS12:
4176 case CHIP_VEGAM:
4177 case CHIP_VEGA20:
4178 case CHIP_VEGA10:
4179 case CHIP_VEGA12:
4180 case CHIP_RAVEN:
4181 case CHIP_ARCTURUS:
4182 case CHIP_RENOIR:
4183 case CHIP_NAVI10:
4184 case CHIP_NAVI14:
4185 case CHIP_NAVI12:
4186 case CHIP_SIENNA_CICHLID:
4187 break;
4188 default:
4189 goto disabled;
4190 }
4191 }
4192
4193 return true;
4194
4195 disabled:
4196 dev_info(adev->dev, "GPU recovery disabled.\n");
4197 return false;
4198 }
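/*
 * Illustrative sketch (hypothetical wrapper): the typical caller is the
 * job timeout handler in amdgpu_job.c, which consults
 * amdgpu_device_should_recover_gpu() before kicking off a full recovery
 * via amdgpu_device_gpu_recover() further below.
 */
#if 0
static void example_handle_job_timeout(struct amdgpu_ring *ring,
				       struct amdgpu_job *job)
{
	struct amdgpu_device *adev = ring->adev;

	if (amdgpu_device_should_recover_gpu(adev))
		amdgpu_device_gpu_recover(adev, job);
	else
		DRM_INFO("ignoring job timeout, GPU recovery disabled\n");
}
#endif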
4199
4200
4201 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4202 struct amdgpu_job *job,
4203 bool *need_full_reset_arg)
4204 {
4205 int i, r = 0;
4206 bool need_full_reset = *need_full_reset_arg;
4207
4208 amdgpu_debugfs_wait_dump(adev);
4209
4210 if (amdgpu_sriov_vf(adev)) {
4211 /* stop the data exchange thread */
4212 amdgpu_virt_fini_data_exchange(adev);
4213 }
4214
4215 /* block all schedulers and reset given job's ring */
4216 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4217 struct amdgpu_ring *ring = adev->rings[i];
4218
4219 if (!ring || !ring->sched.thread)
4220 continue;
4221
4222 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4223 amdgpu_fence_driver_force_completion(ring);
4224 }
4225
4226 if (job)
4227 drm_sched_increase_karma(&job->base);
4228
4229 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4230 if (!amdgpu_sriov_vf(adev)) {
4231
4232 if (!need_full_reset)
4233 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4234
4235 if (!need_full_reset) {
4236 amdgpu_device_ip_pre_soft_reset(adev);
4237 r = amdgpu_device_ip_soft_reset(adev);
4238 amdgpu_device_ip_post_soft_reset(adev);
4239 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4240 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4241 need_full_reset = true;
4242 }
4243 }
4244
4245 if (need_full_reset)
4246 r = amdgpu_device_ip_suspend(adev);
4247
4248 *need_full_reset_arg = need_full_reset;
4249 }
4250
4251 return r;
4252 }
4253
4254 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4255 struct list_head *device_list_handle,
4256 bool *need_full_reset_arg,
4257 bool skip_hw_reset)
4258 {
4259 struct amdgpu_device *tmp_adev = NULL;
4260 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4261 int r = 0;
4262
4263 /*
4264 * ASIC reset has to be done on all XGMI hive nodes ASAP
4265 * to allow proper links negotiation in FW (within 1 sec)
4266 */
4267 if (!skip_hw_reset && need_full_reset) {
4268 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4269 /* For XGMI run all resets in parallel to speed up the process */
4270 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4271 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4272 r = -EALREADY;
4273 } else
4274 r = amdgpu_asic_reset(tmp_adev);
4275
4276 if (r) {
4277 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4278 r, adev_to_drm(tmp_adev)->unique);
4279 break;
4280 }
4281 }
4282
4283 /* For XGMI wait for all resets to complete before proceed */
4284 if (!r) {
4285 list_for_each_entry(tmp_adev, device_list_handle,
4286 gmc.xgmi.head) {
4287 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4288 flush_work(&tmp_adev->xgmi_reset_work);
4289 r = tmp_adev->asic_reset_res;
4290 if (r)
4291 break;
4292 }
4293 }
4294 }
4295 }
4296
4297 if (!r && amdgpu_ras_intr_triggered()) {
4298 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4299 if (tmp_adev->mmhub.funcs &&
4300 tmp_adev->mmhub.funcs->reset_ras_error_count)
4301 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4302 }
4303
4304 amdgpu_ras_intr_cleared();
4305 }
4306
4307 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4308 if (need_full_reset) {
4309 /* post card */
4310 if (amdgpu_device_asic_init(tmp_adev))
4311 dev_warn(tmp_adev->dev, "asic atom init failed!");
4312
4313 if (!r) {
4314 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4315 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4316 if (r)
4317 goto out;
4318
4319 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4320 if (r)
4321 goto out;
4322
4323 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4324 if (vram_lost) {
4325 DRM_INFO("VRAM is lost due to GPU reset!\n");
4326 amdgpu_inc_vram_lost(tmp_adev);
4327 }
4328
4329 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4330 if (r)
4331 goto out;
4332
4333 r = amdgpu_device_fw_loading(tmp_adev);
4334 if (r)
4335 return r;
4336
4337 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4338 if (r)
4339 goto out;
4340
4341 if (vram_lost)
4342 amdgpu_device_fill_reset_magic(tmp_adev);
4343
4344 /*
4345 * Add this ASIC back as tracked since the reset has already
4346 * completed successfully.
4347 */
4348 amdgpu_register_gpu_instance(tmp_adev);
4349
4350 r = amdgpu_device_ip_late_init(tmp_adev);
4351 if (r)
4352 goto out;
4353
4354 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4355
4356 /*
4357 * The GPU enters a bad state once the number of faulty
4358 * pages detected by ECC reaches the threshold, and RAS
4359 * recovery is scheduled next. So add a check here to
4360 * break recovery if the bad page threshold has indeed
4361 * been exceeded, and remind the user to either retire
4362 * this GPU or set a bigger bad_page_threshold value so
4363 * that this check passes the next time the driver is
4364 * probed.
4365 */
4366 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4367 /* must succeed. */
4368 amdgpu_ras_resume(tmp_adev);
4369 } else {
4370 r = -EINVAL;
4371 goto out;
4372 }
4373
4374 /* Update PSP FW topology after reset */
4375 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4376 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4377 }
4378 }
4379
4380 out:
4381 if (!r) {
4382 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4383 r = amdgpu_ib_ring_tests(tmp_adev);
4384 if (r) {
4385 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4386 need_full_reset = true;
4387 r = -EAGAIN;
4388 goto end;
4389 }
4390 }
4391
4392 if (!r)
4393 r = amdgpu_device_recover_vram(tmp_adev);
4394 else
4395 tmp_adev->asic_reset_res = r;
4396 }
4397
4398 end:
4399 *need_full_reset_arg = need_full_reset;
4400 return r;
4401 }
4402
4403 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4404 struct amdgpu_hive_info *hive)
4405 {
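/* Only one reset may own this adev at a time; bail out if another reset is already in flight. */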
4406 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4407 return false;
4408
4409 if (hive) {
4410 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4411 } else {
4412 down_write(&adev->reset_sem);
4413 }
4414
4415 atomic_inc(&adev->gpu_reset_counter);
4416 switch (amdgpu_asic_reset_method(adev)) {
4417 case AMD_RESET_METHOD_MODE1:
4418 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4419 break;
4420 case AMD_RESET_METHOD_MODE2:
4421 adev->mp1_state = PP_MP1_STATE_RESET;
4422 break;
4423 default:
4424 adev->mp1_state = PP_MP1_STATE_NONE;
4425 break;
4426 }
4427
4428 return true;
4429 }
4430
4431 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4432 {
4433 amdgpu_vf_error_trans_all(adev);
4434 adev->mp1_state = PP_MP1_STATE_NONE;
4435 atomic_set(&adev->in_gpu_reset, 0);
4436 up_write(&adev->reset_sem);
4437 }
4438
4439 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4440 {
4441 struct pci_dev *p = NULL;
4442
4443 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4444 adev->pdev->bus->number, 1);
4445 if (p) {
4446 pm_runtime_enable(&(p->dev));
4447 pm_runtime_resume(&(p->dev));
4448 }
4449
4450 pci_dev_put(p);
4451 }
4452
4453 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4454 {
4455 enum amd_reset_method reset_method;
4456 struct pci_dev *p = NULL;
4457 u64 expires;
4458
4459 /*
4460 * For now, only BACO and mode1 reset are confirmed to
4461 * suffer from the audio issue if the audio device is not properly suspended.
4462 */
4463 reset_method = amdgpu_asic_reset_method(adev);
4464 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4465 (reset_method != AMD_RESET_METHOD_MODE1))
4466 return -EINVAL;
4467
4468 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4469 adev->pdev->bus->number, 1);
4470 if (!p)
4471 return -ENODEV;
4472
4473 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4474 if (!expires)
4475 /*
4476 * If we cannot get the audio device autosuspend delay,
4477 * a fixed 4s interval will be used. Since 3s is the audio
4478 * controller's default autosuspend delay setting, the 4s
4479 * used here is guaranteed to cover it.
4480 */
4481 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4482
4483 while (!pm_runtime_status_suspended(&(p->dev))) {
4484 if (!pm_runtime_suspend(&(p->dev)))
4485 break;
4486
4487 if (expires < ktime_get_mono_fast_ns()) {
4488 dev_warn(adev->dev, "failed to suspend display audio\n");
4489 pci_dev_put(p);
4490 /* TODO: abort the succeeding gpu reset? */
4491 return -ETIMEDOUT;
4492 }
4493 }
4494
4495 pm_runtime_disable(&(p->dev));
4496
4497 pci_dev_put(p);
4498 return 0;
4499 }
4500
4501 /**
4502 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4503 *
4504 * @adev: amdgpu_device pointer
4505 * @job: which job triggered the hang
4506 *
4507 * Attempt to reset the GPU if it has hung (all asics).
4508 * Attempt a soft reset or a full reset and reinitialize the ASIC.
4509 * Returns 0 for success or an error on failure.
4510 */
4511
4512 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4513 struct amdgpu_job *job)
4514 {
4515 struct list_head device_list, *device_list_handle = NULL;
4516 bool need_full_reset = false;
4517 bool job_signaled = false;
4518 struct amdgpu_hive_info *hive = NULL;
4519 struct amdgpu_device *tmp_adev = NULL;
4520 int i, r = 0;
4521 bool need_emergency_restart = false;
4522 bool audio_suspended = false;
4523
4524 /*
4525 * Special case: RAS triggered and full reset isn't supported
4526 */
4527 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4528
4529 /*
4530 * Flush RAM to disk so that after reboot
4531 * the user can read log and see why the system rebooted.
4532 */
4533 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
4534 amdgpu_ras_get_context(adev)->reboot) {
4535 DRM_WARN("Emergency reboot.");
4536
4537 ksys_sync_helper();
4538 emergency_restart();
4539 }
4540
4541 dev_info(adev->dev, "GPU %s begin!\n",
4542 need_emergency_restart ? "jobs stop":"reset");
4543
4544 /*
4545 * Here we trylock to avoid a chain of resets executing, triggered
4546 * either by jobs on different adevs in an XGMI hive or by jobs on
4547 * different schedulers for the same device, while this TO handler is running.
4548 * We always reset all schedulers for a device and all devices in an XGMI
4549 * hive, so that should take care of them too.
4550 */
4551 hive = amdgpu_get_xgmi_hive(adev);
4552 if (hive) {
4553 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4554 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4555 job ? job->base.id : -1, hive->hive_id);
4556 amdgpu_put_xgmi_hive(hive);
4557 return 0;
4558 }
4559 mutex_lock(&hive->hive_lock);
4560 }
4561
4562 /*
4563 * Build list of devices to reset.
4564 * In case we are in XGMI hive mode, resort the device list
4565 * to put adev in the 1st position.
4566 */
4567 INIT_LIST_HEAD(&device_list);
4568 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4569 if (!hive)
4570 return -ENODEV;
4571 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4572 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4573 device_list_handle = &hive->device_list;
4574 } else {
4575 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4576 device_list_handle = &device_list;
4577 }
4578
4579 /* block all schedulers and reset given job's ring */
4580 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4581 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4582 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4583 job ? job->base.id : -1);
4584 r = 0;
4585 goto skip_recovery;
4586 }
4587
4588 /*
4589 * Try to put the audio codec into suspend state
4590 * before the gpu reset is started.
4591 *
4592 * This is needed because the power domain of the
4593 * graphics device is shared with the AZ power domain.
4594 * Without this, we may change the audio hardware from
4595 * behind the audio driver's back, which will trigger
4596 * audio codec errors.
4597 */
4598 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4599 audio_suspended = true;
4600
4601 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4602
4603 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4604
4605 if (!amdgpu_sriov_vf(tmp_adev))
4606 amdgpu_amdkfd_pre_reset(tmp_adev);
4607
4608 /*
4609 * Mark the ASICs to be reset as untracked first,
4610 * and add them back after the reset has completed.
4611 */
4612 amdgpu_unregister_gpu_instance(tmp_adev);
4613
4614 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4615
4616 /* disable ras on ALL IPs */
4617 if (!need_emergency_restart &&
4618 amdgpu_device_ip_need_full_reset(tmp_adev))
4619 amdgpu_ras_suspend(tmp_adev);
4620
4621 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4622 struct amdgpu_ring *ring = tmp_adev->rings[i];
4623
4624 if (!ring || !ring->sched.thread)
4625 continue;
4626
4627 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4628
4629 if (need_emergency_restart)
4630 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4631 }
4632 }
4633
4634 if (need_emergency_restart)
4635 goto skip_sched_resume;
4636
4637 /*
4638 * Must check guilty signal here since after this point all old
4639 * HW fences are force signaled.
4640 *
4641 * job->base holds a reference to parent fence
4642 */
4643 if (job && job->base.s_fence->parent &&
4644 dma_fence_is_signaled(job->base.s_fence->parent)) {
4645 job_signaled = true;
4646 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4647 goto skip_hw_reset;
4648 }
4649
4650 retry: /* Rest of adevs pre asic reset from XGMI hive. */
4651 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4652 r = amdgpu_device_pre_asic_reset(tmp_adev,
4653 (tmp_adev == adev) ? job : NULL,
4654 &need_full_reset);
4655 /* TODO: should we stop here? */
4656 if (r) {
4657 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4658 r, adev_to_drm(tmp_adev)->unique);
4659 tmp_adev->asic_reset_res = r;
4660 }
4661 }
4662
4663 /* Actual ASIC resets if needed.*/
4664 /* TODO Implement XGMI hive reset logic for SRIOV */
4665 if (amdgpu_sriov_vf(adev)) {
4666 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4667 if (r)
4668 adev->asic_reset_res = r;
4669 } else {
4670 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4671 if (r && r == -EAGAIN)
4672 goto retry;
4673 }
4674
4675 skip_hw_reset:
4676
4677 /* Post ASIC reset for all devs .*/
4678 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4679
4680 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4681 struct amdgpu_ring *ring = tmp_adev->rings[i];
4682
4683 if (!ring || !ring->sched.thread)
4684 continue;
4685
4686 /* No point in resubmitting jobs if we didn't do a HW reset */
4687 if (!tmp_adev->asic_reset_res && !job_signaled)
4688 drm_sched_resubmit_jobs(&ring->sched);
4689
4690 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4691 }
4692
4693 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4694 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4695 }
4696
4697 tmp_adev->asic_reset_res = 0;
4698
4699 if (r) {
4700 /* bad news, how do we tell this to userspace? */
4701 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4702 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4703 } else {
4704 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4705 }
4706 }
4707
4708 skip_sched_resume:
4709 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4710 /* unlock kfd: SRIOV would do it separately */
4711 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4712 amdgpu_amdkfd_post_reset(tmp_adev);
4713 if (audio_suspended)
4714 amdgpu_device_resume_display_audio(tmp_adev);
4715 amdgpu_device_unlock_adev(tmp_adev);
4716 }
4717
4718 skip_recovery:
4719 if (hive) {
4720 atomic_set(&hive->in_reset, 0);
4721 mutex_unlock(&hive->hive_lock);
4722 amdgpu_put_xgmi_hive(hive);
4723 }
4724
4725 if (r)
4726 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4727 return r;
4728 }
4729
4730 /**
4731 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4732 *
4733 * @adev: amdgpu_device pointer
4734 *
4735 * Fetches and stores in the driver the PCIE capabilities (gen speed
4736 * and lanes) of the slot the device is in. Handles APUs and
4737 * virtualized environments where PCIE config space may not be available.
4738 */
4739 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4740 {
4741 struct pci_dev *pdev;
4742 enum pci_bus_speed speed_cap, platform_speed_cap;
4743 enum pcie_link_width platform_link_width;
4744
4745 if (amdgpu_pcie_gen_cap)
4746 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4747
4748 if (amdgpu_pcie_lane_cap)
4749 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4750
4751 /* covers APUs as well */
4752 if (pci_is_root_bus(adev->pdev->bus)) {
4753 if (adev->pm.pcie_gen_mask == 0)
4754 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4755 if (adev->pm.pcie_mlw_mask == 0)
4756 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4757 return;
4758 }
4759
4760 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4761 return;
4762
4763 pcie_bandwidth_available(adev->pdev, NULL,
4764 &platform_speed_cap, &platform_link_width);
4765
4766 if (adev->pm.pcie_gen_mask == 0) {
4767 /* asic caps */
4768 pdev = adev->pdev;
4769 speed_cap = pcie_get_speed_cap(pdev);
4770 if (speed_cap == PCI_SPEED_UNKNOWN) {
4771 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4772 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4773 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4774 } else {
4775 if (speed_cap == PCIE_SPEED_16_0GT)
4776 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4777 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4778 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4779 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4780 else if (speed_cap == PCIE_SPEED_8_0GT)
4781 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4782 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4783 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4784 else if (speed_cap == PCIE_SPEED_5_0GT)
4785 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4786 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4787 else
4788 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4789 }
4790 /* platform caps */
4791 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4792 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4793 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4794 } else {
4795 if (platform_speed_cap == PCIE_SPEED_16_0GT)
4796 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4797 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4798 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4799 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4800 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4801 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4802 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4803 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4804 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4805 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4806 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4807 else
4808 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4809
4810 }
4811 }
4812 if (adev->pm.pcie_mlw_mask == 0) {
4813 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4814 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4815 } else {
4816 switch (platform_link_width) {
4817 case PCIE_LNK_X32:
4818 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4819 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4821 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4822 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4823 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4824 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4825 break;
4826 case PCIE_LNK_X16:
4827 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4828 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4829 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4830 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4831 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4833 break;
4834 case PCIE_LNK_X12:
4835 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4836 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4837 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4838 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4839 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4840 break;
4841 case PCIE_LNK_X8:
4842 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4843 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4844 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4845 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4846 break;
4847 case PCIE_LNK_X4:
4848 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4849 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4850 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4851 break;
4852 case PCIE_LNK_X2:
4853 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4854 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4855 break;
4856 case PCIE_LNK_X1:
4857 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4858 break;
4859 default:
4860 break;
4861 }
4862 }
4863 }
4864 }
4865
4866 int amdgpu_device_baco_enter(struct drm_device *dev)
4867 {
4868 struct amdgpu_device *adev = drm_to_adev(dev);
4869 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4870
4871 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4872 return -ENOTSUPP;
4873
4874 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4875 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4876
4877 return amdgpu_dpm_baco_enter(adev);
4878 }
4879
4880 int amdgpu_device_baco_exit(struct drm_device *dev)
4881 {
4882 struct amdgpu_device *adev = drm_to_adev(dev);
4883 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4884 int ret = 0;
4885
4886 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4887 return -ENOTSUPP;
4888
4889 ret = amdgpu_dpm_baco_exit(adev);
4890 if (ret)
4891 return ret;
4892
4893 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4894 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4895
4896 return 0;
4897 }
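/*
 * Illustrative sketch (hypothetical names): the BACO enter/exit pair is
 * intended to bracket runtime power management of a BACO-capable dGPU,
 * similar to what the runtime-PM hooks in amdgpu_drv.c do.
 */
#if 0
static int example_runtime_suspend(struct drm_device *drm_dev)
{
	return amdgpu_device_baco_enter(drm_dev);
}

static int example_runtime_resume(struct drm_device *drm_dev)
{
	return amdgpu_device_baco_exit(drm_dev);
}
#endif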
4898
4899 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4900 {
4901 int i;
4902
4903 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4904 struct amdgpu_ring *ring = adev->rings[i];
4905
4906 if (!ring || !ring->sched.thread)
4907 continue;
4908
4909 cancel_delayed_work_sync(&ring->sched.work_tdr);
4910 }
4911 }
4912
4913 /**
4914 * amdgpu_pci_error_detected - Called when a PCI error is detected.
4915 * @pdev: PCI device struct
4916 * @state: PCI channel state
4917 *
4918 * Description: Called when a PCI error is detected.
4919 *
4920 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4921 */
4922 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4923 {
4924 struct drm_device *dev = pci_get_drvdata(pdev);
4925 struct amdgpu_device *adev = drm_to_adev(dev);
4926 int i;
4927
4928 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4929
4930 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4931 DRM_WARN("No support for XGMI hive yet...");
4932 return PCI_ERS_RESULT_DISCONNECT;
4933 }
4934
4935 switch (state) {
4936 case pci_channel_io_normal:
4937 return PCI_ERS_RESULT_CAN_RECOVER;
4938 /* Fatal error, prepare for slot reset */
4939 case pci_channel_io_frozen:
4940 /*
4941 * Cancel and wait for all TDRs in progress if we fail to
4942 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4943 *
4944 * Locking adev->reset_sem will prevent any external access
4945 * to GPU during PCI error recovery
4946 */
4947 while (!amdgpu_device_lock_adev(adev, NULL))
4948 amdgpu_cancel_all_tdr(adev);
4949
4950 /*
4951 * Block any work scheduling as we do for regular GPU reset
4952 * for the duration of the recovery
4953 */
4954 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4955 struct amdgpu_ring *ring = adev->rings[i];
4956
4957 if (!ring || !ring->sched.thread)
4958 continue;
4959
4960 drm_sched_stop(&ring->sched, NULL);
4961 }
4962 return PCI_ERS_RESULT_NEED_RESET;
4963 case pci_channel_io_perm_failure:
4964 /* Permanent error, prepare for device removal */
4965 return PCI_ERS_RESULT_DISCONNECT;
4966 }
4967
4968 return PCI_ERS_RESULT_NEED_RESET;
4969 }
4970
4971 /**
4972 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4973 * @pdev: pointer to PCI device
4974 */
4975 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4976 {
4977
4978 DRM_INFO("PCI error: mmio enabled callback!!\n");
4979
4980 /* TODO - dump whatever for debugging purposes */
4981
4982 /* This is called only if amdgpu_pci_error_detected returns
4983 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4984 * works, no need to reset slot.
4985 */
4986
4987 return PCI_ERS_RESULT_RECOVERED;
4988 }
4989
4990 /**
4991 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4992 * @pdev: PCI device struct
4993 *
4994 * Description: This routine is called by the pci error recovery
4995 * code after the PCI slot has been reset, just before we
4996 * should resume normal operations.
4997 */
4998 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4999 {
5000 struct drm_device *dev = pci_get_drvdata(pdev);
5001 struct amdgpu_device *adev = drm_to_adev(dev);
5002 int r, i;
5003 bool need_full_reset = true;
5004 u32 memsize;
5005 struct list_head device_list;
5006
5007 DRM_INFO("PCI error: slot reset callback!!\n");
5008
5009 INIT_LIST_HEAD(&device_list);
5010 list_add_tail(&adev->gmc.xgmi.head, &device_list);
5011
5012 /* wait for asic to come out of reset */
5013 msleep(500);
5014
5015 /* Restore PCI confspace */
5016 amdgpu_device_load_pci_state(pdev);
5017
5018 /* confirm ASIC came out of reset */
5019 for (i = 0; i < adev->usec_timeout; i++) {
5020 memsize = amdgpu_asic_get_config_memsize(adev);
5021
5022 if (memsize != 0xffffffff)
5023 break;
5024 udelay(1);
5025 }
5026 if (memsize == 0xffffffff) {
5027 r = -ETIME;
5028 goto out;
5029 }
5030
5031 adev->in_pci_err_recovery = true;
5032 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5033 adev->in_pci_err_recovery = false;
5034 if (r)
5035 goto out;
5036
5037 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5038
5039 out:
5040 if (!r) {
5041 if (amdgpu_device_cache_pci_state(adev->pdev))
5042 pci_restore_state(adev->pdev);
5043
5044 DRM_INFO("PCIe error recovery succeeded\n");
5045 } else {
5046 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5047 amdgpu_device_unlock_adev(adev);
5048 }
5049
5050 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5051 }
5052
5053 /**
5054 * amdgpu_pci_resume() - resume normal ops after PCI reset
5055 * @pdev: pointer to PCI device
5056 *
5057 * Called when the error recovery driver tells us that it is
5058 * OK to resume normal operation. Restart the schedulers so
5059 * that halted jobs can resume.
5060 */
5061 void amdgpu_pci_resume(struct pci_dev *pdev)
5062 {
5063 struct drm_device *dev = pci_get_drvdata(pdev);
5064 struct amdgpu_device *adev = drm_to_adev(dev);
5065 int i;
5066
5067
5068 DRM_INFO("PCI error: resume callback!!\n");
5069
5070 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5071 struct amdgpu_ring *ring = adev->rings[i];
5072
5073 if (!ring || !ring->sched.thread)
5074 continue;
5075
5076
5077 drm_sched_resubmit_jobs(&ring->sched);
5078 drm_sched_start(&ring->sched, true);
5079 }
5080
5081 amdgpu_device_unlock_adev(adev);
5082 }
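/*
 * The four PCI error callbacks above are handed to the PCI core through
 * a struct pci_error_handlers in amdgpu_drv.c, roughly as sketched
 * below (example name, shown here only for orientation).
 */
#if 0
static const struct pci_error_handlers example_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
	.slot_reset	= amdgpu_pci_slot_reset,
	.resume		= amdgpu_pci_resume,
};
#endif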
5083
5084 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5085 {
5086 struct drm_device *dev = pci_get_drvdata(pdev);
5087 struct amdgpu_device *adev = drm_to_adev(dev);
5088 int r;
5089
5090 r = pci_save_state(pdev);
5091 if (!r) {
5092 kfree(adev->pci_state);
5093
5094 adev->pci_state = pci_store_saved_state(pdev);
5095
5096 if (!adev->pci_state) {
5097 DRM_ERROR("Failed to store PCI saved state");
5098 return false;
5099 }
5100 } else {
5101 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5102 return false;
5103 }
5104
5105 return true;
5106 }
5107
5108 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5109 {
5110 struct drm_device *dev = pci_get_drvdata(pdev);
5111 struct amdgpu_device *adev = drm_to_adev(dev);
5112 int r;
5113
5114 if (!adev->pci_state)
5115 return false;
5116
5117 r = pci_load_saved_state(pdev, adev->pci_state);
5118
5119 if (!r) {
5120 pci_restore_state(pdev);
5121 } else {
5122 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5123 return false;
5124 }
5125
5126 return true;
5127 }
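/*
 * Usage note (illustrative, mirroring calls elsewhere in this file):
 * amdgpu_device_cache_pci_state() takes a snapshot while the config
 * space is known-good, and amdgpu_device_load_pci_state() puts that
 * snapshot back before re-initializing the ASIC after a slot reset.
 */
#if 0
	/* snapshot while the device is healthy (end of init) */
	if (amdgpu_device_cache_pci_state(adev->pdev))
		pci_restore_state(adev->pdev);

	/* ... later, in the slot-reset path ... */
	amdgpu_device_load_pci_state(pdev);
#endif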
5128
5129
5130