1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83
84 #define AMDGPU_RESUME_MS 2000
85
86 const char *amdgpu_asic_name[] = {
87 "TAHITI",
88 "PITCAIRN",
89 "VERDE",
90 "OLAND",
91 "HAINAN",
92 "BONAIRE",
93 "KAVERI",
94 "KABINI",
95 "HAWAII",
96 "MULLINS",
97 "TOPAZ",
98 "TONGA",
99 "FIJI",
100 "CARRIZO",
101 "STONEY",
102 "POLARIS10",
103 "POLARIS11",
104 "POLARIS12",
105 "VEGAM",
106 "VEGA10",
107 "VEGA12",
108 "VEGA20",
109 "RAVEN",
110 "ARCTURUS",
111 "RENOIR",
112 "NAVI10",
113 "NAVI14",
114 "NAVI12",
115 "SIENNA_CICHLID",
116 "NAVY_FLOUNDER",
117 "LAST",
118 };
119
120 /**
121 * DOC: pcie_replay_count
122 *
123 * The amdgpu driver provides a sysfs API for reporting the total number
124 * of PCIe replays (NAKs)
125 * The file pcie_replay_count is used for this and returns the total
126 * number of replays as a sum of the NAKs generated and NAKs received
127 */
128
amdgpu_device_get_pcie_replay_count(struct device * dev,struct device_attribute * attr,char * buf)129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130 struct device_attribute *attr, char *buf)
131 {
132 struct drm_device *ddev = dev_get_drvdata(dev);
133 struct amdgpu_device *adev = drm_to_adev(ddev);
134 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135
136 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140 amdgpu_device_get_pcie_replay_count, NULL);
141
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143
144 /**
145 * DOC: product_name
146 *
147 * The amdgpu driver provides a sysfs API for reporting the product name
148 * for the device
149 * The file serial_number is used for this and returns the product name
150 * as returned from the FRU.
151 * NOTE: This is only available for certain server cards
152 */
153
amdgpu_device_get_product_name(struct device * dev,struct device_attribute * attr,char * buf)154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155 struct device_attribute *attr, char *buf)
156 {
157 struct drm_device *ddev = dev_get_drvdata(dev);
158 struct amdgpu_device *adev = drm_to_adev(ddev);
159
160 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162
163 static DEVICE_ATTR(product_name, S_IRUGO,
164 amdgpu_device_get_product_name, NULL);
165
166 /**
167 * DOC: product_number
168 *
169 * The amdgpu driver provides a sysfs API for reporting the part number
170 * for the device
171 * The file serial_number is used for this and returns the part number
172 * as returned from the FRU.
173 * NOTE: This is only available for certain server cards
174 */
175
amdgpu_device_get_product_number(struct device * dev,struct device_attribute * attr,char * buf)176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177 struct device_attribute *attr, char *buf)
178 {
179 struct drm_device *ddev = dev_get_drvdata(dev);
180 struct amdgpu_device *adev = drm_to_adev(ddev);
181
182 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184
185 static DEVICE_ATTR(product_number, S_IRUGO,
186 amdgpu_device_get_product_number, NULL);
187
188 /**
189 * DOC: serial_number
190 *
191 * The amdgpu driver provides a sysfs API for reporting the serial number
192 * for the device
193 * The file serial_number is used for this and returns the serial number
194 * as returned from the FRU.
195 * NOTE: This is only available for certain server cards
196 */
197
amdgpu_device_get_serial_number(struct device * dev,struct device_attribute * attr,char * buf)198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199 struct device_attribute *attr, char *buf)
200 {
201 struct drm_device *ddev = dev_get_drvdata(dev);
202 struct amdgpu_device *adev = drm_to_adev(ddev);
203
204 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208 amdgpu_device_get_serial_number, NULL);
209
210 /**
211 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212 *
213 * @dev: drm_device pointer
214 *
215 * Returns true if the device is a dGPU with HG/PX power control,
216 * otherwise return false.
217 */
amdgpu_device_supports_boco(struct drm_device * dev)218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220 struct amdgpu_device *adev = drm_to_adev(dev);
221
222 if (adev->flags & AMD_IS_PX)
223 return true;
224 return false;
225 }
226
227 /**
228 * amdgpu_device_supports_baco - Does the device support BACO
229 *
230 * @dev: drm_device pointer
231 *
232 * Returns true if the device supporte BACO,
233 * otherwise return false.
234 */
amdgpu_device_supports_baco(struct drm_device * dev)235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237 struct amdgpu_device *adev = drm_to_adev(dev);
238
239 return amdgpu_asic_supports_baco(adev);
240 }
241
242 /*
243 * VRAM access helper functions
244 */
245
246 /**
247 * amdgpu_device_vram_access - read/write a buffer in vram
248 *
249 * @adev: amdgpu_device pointer
250 * @pos: offset of the buffer in vram
251 * @buf: virtual address of the buffer in system memory
252 * @size: read/write size, sizeof(@buf) must > @size
253 * @write: true - write to vram, otherwise - read from vram
254 */
amdgpu_device_vram_access(struct amdgpu_device * adev,loff_t pos,uint32_t * buf,size_t size,bool write)255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256 uint32_t *buf, size_t size, bool write)
257 {
258 unsigned long flags;
259 uint32_t hi = ~0;
260 uint64_t last;
261
262
263 #ifdef CONFIG_64BIT
264 last = min(pos + size, adev->gmc.visible_vram_size);
265 if (last > pos) {
266 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267 size_t count = last - pos;
268
269 if (write) {
270 memcpy_toio(addr, buf, count);
271 mb();
272 amdgpu_asic_flush_hdp(adev, NULL);
273 } else {
274 amdgpu_asic_invalidate_hdp(adev, NULL);
275 mb();
276 memcpy_fromio(buf, addr, count);
277 }
278
279 if (count == size)
280 return;
281
282 pos += count;
283 buf += count / 4;
284 size -= count;
285 }
286 #endif
287
288 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289 for (last = pos + size; pos < last; pos += 4) {
290 uint32_t tmp = pos >> 31;
291
292 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293 if (tmp != hi) {
294 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295 hi = tmp;
296 }
297 if (write)
298 WREG32_NO_KIQ(mmMM_DATA, *buf++);
299 else
300 *buf++ = RREG32_NO_KIQ(mmMM_DATA);
301 }
302 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
304
305 /*
306 * register access helper functions.
307 */
308 /**
309 * amdgpu_device_rreg - read a memory mapped IO or indirect register
310 *
311 * @adev: amdgpu_device pointer
312 * @reg: dword aligned register offset
313 * @acc_flags: access flags which require special behavior
314 *
315 * Returns the 32 bit value from the offset specified.
316 */
amdgpu_device_rreg(struct amdgpu_device * adev,uint32_t reg,uint32_t acc_flags)317 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
318 uint32_t reg, uint32_t acc_flags)
319 {
320 uint32_t ret;
321
322 if (adev->in_pci_err_recovery)
323 return 0;
324
325 if ((reg * 4) < adev->rmmio_size) {
326 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
327 amdgpu_sriov_runtime(adev) &&
328 down_read_trylock(&adev->reset_sem)) {
329 ret = amdgpu_kiq_rreg(adev, reg);
330 up_read(&adev->reset_sem);
331 } else {
332 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
333 }
334 } else {
335 ret = adev->pcie_rreg(adev, reg * 4);
336 }
337
338 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
339
340 return ret;
341 }
342
343 /*
344 * MMIO register read with bytes helper functions
345 * @offset:bytes offset from MMIO start
346 *
347 */
348
349 /**
350 * amdgpu_mm_rreg8 - read a memory mapped IO register
351 *
352 * @adev: amdgpu_device pointer
353 * @offset: byte aligned register offset
354 *
355 * Returns the 8 bit value from the offset specified.
356 */
amdgpu_mm_rreg8(struct amdgpu_device * adev,uint32_t offset)357 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
358 {
359 if (adev->in_pci_err_recovery)
360 return 0;
361
362 if (offset < adev->rmmio_size)
363 return (readb(adev->rmmio + offset));
364 BUG();
365 }
366
367 /*
368 * MMIO register write with bytes helper functions
369 * @offset:bytes offset from MMIO start
370 * @value: the value want to be written to the register
371 *
372 */
373 /**
374 * amdgpu_mm_wreg8 - read a memory mapped IO register
375 *
376 * @adev: amdgpu_device pointer
377 * @offset: byte aligned register offset
378 * @value: 8 bit value to write
379 *
380 * Writes the value specified to the offset specified.
381 */
amdgpu_mm_wreg8(struct amdgpu_device * adev,uint32_t offset,uint8_t value)382 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
383 {
384 if (adev->in_pci_err_recovery)
385 return;
386
387 if (offset < adev->rmmio_size)
388 writeb(value, adev->rmmio + offset);
389 else
390 BUG();
391 }
392
393 /**
394 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
395 *
396 * @adev: amdgpu_device pointer
397 * @reg: dword aligned register offset
398 * @v: 32 bit value to write to the register
399 * @acc_flags: access flags which require special behavior
400 *
401 * Writes the value specified to the offset specified.
402 */
amdgpu_device_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t acc_flags)403 void amdgpu_device_wreg(struct amdgpu_device *adev,
404 uint32_t reg, uint32_t v,
405 uint32_t acc_flags)
406 {
407 if (adev->in_pci_err_recovery)
408 return;
409
410 if ((reg * 4) < adev->rmmio_size) {
411 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
412 amdgpu_sriov_runtime(adev) &&
413 down_read_trylock(&adev->reset_sem)) {
414 amdgpu_kiq_wreg(adev, reg, v);
415 up_read(&adev->reset_sem);
416 } else {
417 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
418 }
419 } else {
420 adev->pcie_wreg(adev, reg * 4, v);
421 }
422
423 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
424 }
425
426 /*
427 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
428 *
429 * this function is invoked only the debugfs register access
430 * */
amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device * adev,uint32_t reg,uint32_t v)431 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
432 uint32_t reg, uint32_t v)
433 {
434 if (adev->in_pci_err_recovery)
435 return;
436
437 if (amdgpu_sriov_fullaccess(adev) &&
438 adev->gfx.rlc.funcs &&
439 adev->gfx.rlc.funcs->is_rlcg_access_range) {
440 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
441 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
442 } else {
443 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
444 }
445 }
446
447 /**
448 * amdgpu_io_rreg - read an IO register
449 *
450 * @adev: amdgpu_device pointer
451 * @reg: dword aligned register offset
452 *
453 * Returns the 32 bit value from the offset specified.
454 */
amdgpu_io_rreg(struct amdgpu_device * adev,u32 reg)455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
456 {
457 if (adev->in_pci_err_recovery)
458 return 0;
459
460 if ((reg * 4) < adev->rio_mem_size)
461 return ioread32(adev->rio_mem + (reg * 4));
462 else {
463 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
464 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
465 }
466 }
467
468 /**
469 * amdgpu_io_wreg - write to an IO register
470 *
471 * @adev: amdgpu_device pointer
472 * @reg: dword aligned register offset
473 * @v: 32 bit value to write to the register
474 *
475 * Writes the value specified to the offset specified.
476 */
amdgpu_io_wreg(struct amdgpu_device * adev,u32 reg,u32 v)477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
478 {
479 if (adev->in_pci_err_recovery)
480 return;
481
482 if ((reg * 4) < adev->rio_mem_size)
483 iowrite32(v, adev->rio_mem + (reg * 4));
484 else {
485 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
486 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
487 }
488 }
489
490 /**
491 * amdgpu_mm_rdoorbell - read a doorbell dword
492 *
493 * @adev: amdgpu_device pointer
494 * @index: doorbell index
495 *
496 * Returns the value in the doorbell aperture at the
497 * requested doorbell index (CIK).
498 */
amdgpu_mm_rdoorbell(struct amdgpu_device * adev,u32 index)499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
500 {
501 if (adev->in_pci_err_recovery)
502 return 0;
503
504 if (index < adev->doorbell.num_doorbells) {
505 return readl(adev->doorbell.ptr + index);
506 } else {
507 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
508 return 0;
509 }
510 }
511
512 /**
513 * amdgpu_mm_wdoorbell - write a doorbell dword
514 *
515 * @adev: amdgpu_device pointer
516 * @index: doorbell index
517 * @v: value to write
518 *
519 * Writes @v to the doorbell aperture at the
520 * requested doorbell index (CIK).
521 */
amdgpu_mm_wdoorbell(struct amdgpu_device * adev,u32 index,u32 v)522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
523 {
524 if (adev->in_pci_err_recovery)
525 return;
526
527 if (index < adev->doorbell.num_doorbells) {
528 writel(v, adev->doorbell.ptr + index);
529 } else {
530 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
531 }
532 }
533
534 /**
535 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
536 *
537 * @adev: amdgpu_device pointer
538 * @index: doorbell index
539 *
540 * Returns the value in the doorbell aperture at the
541 * requested doorbell index (VEGA10+).
542 */
amdgpu_mm_rdoorbell64(struct amdgpu_device * adev,u32 index)543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
544 {
545 if (adev->in_pci_err_recovery)
546 return 0;
547
548 if (index < adev->doorbell.num_doorbells) {
549 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
550 } else {
551 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
552 return 0;
553 }
554 }
555
556 /**
557 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
558 *
559 * @adev: amdgpu_device pointer
560 * @index: doorbell index
561 * @v: value to write
562 *
563 * Writes @v to the doorbell aperture at the
564 * requested doorbell index (VEGA10+).
565 */
amdgpu_mm_wdoorbell64(struct amdgpu_device * adev,u32 index,u64 v)566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
567 {
568 if (adev->in_pci_err_recovery)
569 return;
570
571 if (index < adev->doorbell.num_doorbells) {
572 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
573 } else {
574 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
575 }
576 }
577
578 /**
579 * amdgpu_device_indirect_rreg - read an indirect register
580 *
581 * @adev: amdgpu_device pointer
582 * @pcie_index: mmio register offset
583 * @pcie_data: mmio register offset
584 *
585 * Returns the value of indirect register @reg_addr
586 */
amdgpu_device_indirect_rreg(struct amdgpu_device * adev,u32 pcie_index,u32 pcie_data,u32 reg_addr)587 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
588 u32 pcie_index, u32 pcie_data,
589 u32 reg_addr)
590 {
591 unsigned long flags;
592 u32 r;
593 void __iomem *pcie_index_offset;
594 void __iomem *pcie_data_offset;
595
596 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
597 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
598 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
599
600 writel(reg_addr, pcie_index_offset);
601 readl(pcie_index_offset);
602 r = readl(pcie_data_offset);
603 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
604
605 return r;
606 }
607
608 /**
609 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
610 *
611 * @adev: amdgpu_device pointer
612 * @pcie_index: mmio register offset
613 * @pcie_data: mmio register offset
614 *
615 * Returns the value of indirect register @reg_addr
616 */
amdgpu_device_indirect_rreg64(struct amdgpu_device * adev,u32 pcie_index,u32 pcie_data,u32 reg_addr)617 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
618 u32 pcie_index, u32 pcie_data,
619 u32 reg_addr)
620 {
621 unsigned long flags;
622 u64 r;
623 void __iomem *pcie_index_offset;
624 void __iomem *pcie_data_offset;
625
626 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
627 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
628 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
629
630 /* read low 32 bits */
631 writel(reg_addr, pcie_index_offset);
632 readl(pcie_index_offset);
633 r = readl(pcie_data_offset);
634 /* read high 32 bits */
635 writel(reg_addr + 4, pcie_index_offset);
636 readl(pcie_index_offset);
637 r |= ((u64)readl(pcie_data_offset) << 32);
638 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
639
640 return r;
641 }
642
643 /**
644 * amdgpu_device_indirect_wreg - write an indirect register address
645 *
646 * @adev: amdgpu_device pointer
647 * @pcie_index: mmio register offset
648 * @pcie_data: mmio register offset
649 * @reg_addr: indirect register offset
650 * @reg_data: indirect register data
651 *
652 */
amdgpu_device_indirect_wreg(struct amdgpu_device * adev,u32 pcie_index,u32 pcie_data,u32 reg_addr,u32 reg_data)653 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
654 u32 pcie_index, u32 pcie_data,
655 u32 reg_addr, u32 reg_data)
656 {
657 unsigned long flags;
658 void __iomem *pcie_index_offset;
659 void __iomem *pcie_data_offset;
660
661 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
662 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
663 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
664
665 writel(reg_addr, pcie_index_offset);
666 readl(pcie_index_offset);
667 writel(reg_data, pcie_data_offset);
668 readl(pcie_data_offset);
669 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
670 }
671
672 /**
673 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
674 *
675 * @adev: amdgpu_device pointer
676 * @pcie_index: mmio register offset
677 * @pcie_data: mmio register offset
678 * @reg_addr: indirect register offset
679 * @reg_data: indirect register data
680 *
681 */
amdgpu_device_indirect_wreg64(struct amdgpu_device * adev,u32 pcie_index,u32 pcie_data,u32 reg_addr,u64 reg_data)682 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
683 u32 pcie_index, u32 pcie_data,
684 u32 reg_addr, u64 reg_data)
685 {
686 unsigned long flags;
687 void __iomem *pcie_index_offset;
688 void __iomem *pcie_data_offset;
689
690 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693
694 /* write low 32 bits */
695 writel(reg_addr, pcie_index_offset);
696 readl(pcie_index_offset);
697 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
698 readl(pcie_data_offset);
699 /* write high 32 bits */
700 writel(reg_addr + 4, pcie_index_offset);
701 readl(pcie_index_offset);
702 writel((u32)(reg_data >> 32), pcie_data_offset);
703 readl(pcie_data_offset);
704 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705 }
706
707 /**
708 * amdgpu_invalid_rreg - dummy reg read function
709 *
710 * @adev: amdgpu_device pointer
711 * @reg: offset of register
712 *
713 * Dummy register read function. Used for register blocks
714 * that certain asics don't have (all asics).
715 * Returns the value in the register.
716 */
amdgpu_invalid_rreg(struct amdgpu_device * adev,uint32_t reg)717 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
718 {
719 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
720 BUG();
721 return 0;
722 }
723
724 /**
725 * amdgpu_invalid_wreg - dummy reg write function
726 *
727 * @adev: amdgpu_device pointer
728 * @reg: offset of register
729 * @v: value to write to the register
730 *
731 * Dummy register read function. Used for register blocks
732 * that certain asics don't have (all asics).
733 */
amdgpu_invalid_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v)734 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
735 {
736 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
737 reg, v);
738 BUG();
739 }
740
741 /**
742 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
743 *
744 * @adev: amdgpu_device pointer
745 * @reg: offset of register
746 *
747 * Dummy register read function. Used for register blocks
748 * that certain asics don't have (all asics).
749 * Returns the value in the register.
750 */
amdgpu_invalid_rreg64(struct amdgpu_device * adev,uint32_t reg)751 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
752 {
753 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
754 BUG();
755 return 0;
756 }
757
758 /**
759 * amdgpu_invalid_wreg64 - dummy reg write function
760 *
761 * @adev: amdgpu_device pointer
762 * @reg: offset of register
763 * @v: value to write to the register
764 *
765 * Dummy register read function. Used for register blocks
766 * that certain asics don't have (all asics).
767 */
amdgpu_invalid_wreg64(struct amdgpu_device * adev,uint32_t reg,uint64_t v)768 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
769 {
770 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
771 reg, v);
772 BUG();
773 }
774
775 /**
776 * amdgpu_block_invalid_rreg - dummy reg read function
777 *
778 * @adev: amdgpu_device pointer
779 * @block: offset of instance
780 * @reg: offset of register
781 *
782 * Dummy register read function. Used for register blocks
783 * that certain asics don't have (all asics).
784 * Returns the value in the register.
785 */
amdgpu_block_invalid_rreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg)786 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
787 uint32_t block, uint32_t reg)
788 {
789 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
790 reg, block);
791 BUG();
792 return 0;
793 }
794
795 /**
796 * amdgpu_block_invalid_wreg - dummy reg write function
797 *
798 * @adev: amdgpu_device pointer
799 * @block: offset of instance
800 * @reg: offset of register
801 * @v: value to write to the register
802 *
803 * Dummy register read function. Used for register blocks
804 * that certain asics don't have (all asics).
805 */
amdgpu_block_invalid_wreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg,uint32_t v)806 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
807 uint32_t block,
808 uint32_t reg, uint32_t v)
809 {
810 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
811 reg, block, v);
812 BUG();
813 }
814
815 /**
816 * amdgpu_device_asic_init - Wrapper for atom asic_init
817 *
818 * @adev: amdgpu_device pointer
819 *
820 * Does any asic specific work and then calls atom asic init.
821 */
amdgpu_device_asic_init(struct amdgpu_device * adev)822 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
823 {
824 amdgpu_asic_pre_asic_init(adev);
825
826 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
827 }
828
829 /**
830 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
831 *
832 * @adev: amdgpu_device pointer
833 *
834 * Allocates a scratch page of VRAM for use by various things in the
835 * driver.
836 */
amdgpu_device_vram_scratch_init(struct amdgpu_device * adev)837 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
838 {
839 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
840 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
841 &adev->vram_scratch.robj,
842 &adev->vram_scratch.gpu_addr,
843 (void **)&adev->vram_scratch.ptr);
844 }
845
846 /**
847 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
848 *
849 * @adev: amdgpu_device pointer
850 *
851 * Frees the VRAM scratch page.
852 */
amdgpu_device_vram_scratch_fini(struct amdgpu_device * adev)853 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
854 {
855 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
856 }
857
858 /**
859 * amdgpu_device_program_register_sequence - program an array of registers.
860 *
861 * @adev: amdgpu_device pointer
862 * @registers: pointer to the register array
863 * @array_size: size of the register array
864 *
865 * Programs an array or registers with and and or masks.
866 * This is a helper for setting golden registers.
867 */
amdgpu_device_program_register_sequence(struct amdgpu_device * adev,const u32 * registers,const u32 array_size)868 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
869 const u32 *registers,
870 const u32 array_size)
871 {
872 u32 tmp, reg, and_mask, or_mask;
873 int i;
874
875 if (array_size % 3)
876 return;
877
878 for (i = 0; i < array_size; i +=3) {
879 reg = registers[i + 0];
880 and_mask = registers[i + 1];
881 or_mask = registers[i + 2];
882
883 if (and_mask == 0xffffffff) {
884 tmp = or_mask;
885 } else {
886 tmp = RREG32(reg);
887 tmp &= ~and_mask;
888 if (adev->family >= AMDGPU_FAMILY_AI)
889 tmp |= (or_mask & and_mask);
890 else
891 tmp |= or_mask;
892 }
893 WREG32(reg, tmp);
894 }
895 }
896
897 /**
898 * amdgpu_device_pci_config_reset - reset the GPU
899 *
900 * @adev: amdgpu_device pointer
901 *
902 * Resets the GPU using the pci config reset sequence.
903 * Only applicable to asics prior to vega10.
904 */
amdgpu_device_pci_config_reset(struct amdgpu_device * adev)905 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
906 {
907 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
908 }
909
910 /*
911 * GPU doorbell aperture helpers function.
912 */
913 /**
914 * amdgpu_device_doorbell_init - Init doorbell driver information.
915 *
916 * @adev: amdgpu_device pointer
917 *
918 * Init doorbell driver information (CIK)
919 * Returns 0 on success, error on failure.
920 */
amdgpu_device_doorbell_init(struct amdgpu_device * adev)921 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
922 {
923
924 /* No doorbell on SI hardware generation */
925 if (adev->asic_type < CHIP_BONAIRE) {
926 adev->doorbell.base = 0;
927 adev->doorbell.size = 0;
928 adev->doorbell.num_doorbells = 0;
929 adev->doorbell.ptr = NULL;
930 return 0;
931 }
932
933 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
934 return -EINVAL;
935
936 amdgpu_asic_init_doorbell_index(adev);
937
938 /* doorbell bar mapping */
939 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
940 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
941
942 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
943 adev->doorbell_index.max_assignment+1);
944 if (adev->doorbell.num_doorbells == 0)
945 return -EINVAL;
946
947 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
948 * paging queue doorbell use the second page. The
949 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
950 * doorbells are in the first page. So with paging queue enabled,
951 * the max num_doorbells should + 1 page (0x400 in dword)
952 */
953 if (adev->asic_type >= CHIP_VEGA10)
954 adev->doorbell.num_doorbells += 0x400;
955
956 adev->doorbell.ptr = ioremap(adev->doorbell.base,
957 adev->doorbell.num_doorbells *
958 sizeof(u32));
959 if (adev->doorbell.ptr == NULL)
960 return -ENOMEM;
961
962 return 0;
963 }
964
965 /**
966 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
967 *
968 * @adev: amdgpu_device pointer
969 *
970 * Tear down doorbell driver information (CIK)
971 */
amdgpu_device_doorbell_fini(struct amdgpu_device * adev)972 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
973 {
974 iounmap(adev->doorbell.ptr);
975 adev->doorbell.ptr = NULL;
976 }
977
978
979
980 /*
981 * amdgpu_device_wb_*()
982 * Writeback is the method by which the GPU updates special pages in memory
983 * with the status of certain GPU events (fences, ring pointers,etc.).
984 */
985
986 /**
987 * amdgpu_device_wb_fini - Disable Writeback and free memory
988 *
989 * @adev: amdgpu_device pointer
990 *
991 * Disables Writeback and frees the Writeback memory (all asics).
992 * Used at driver shutdown.
993 */
amdgpu_device_wb_fini(struct amdgpu_device * adev)994 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
995 {
996 if (adev->wb.wb_obj) {
997 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
998 &adev->wb.gpu_addr,
999 (void **)&adev->wb.wb);
1000 adev->wb.wb_obj = NULL;
1001 }
1002 }
1003
1004 /**
1005 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1006 *
1007 * @adev: amdgpu_device pointer
1008 *
1009 * Initializes writeback and allocates writeback memory (all asics).
1010 * Used at driver startup.
1011 * Returns 0 on success or an -error on failure.
1012 */
amdgpu_device_wb_init(struct amdgpu_device * adev)1013 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1014 {
1015 int r;
1016
1017 if (adev->wb.wb_obj == NULL) {
1018 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1019 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1020 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1021 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1022 (void **)&adev->wb.wb);
1023 if (r) {
1024 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1025 return r;
1026 }
1027
1028 adev->wb.num_wb = AMDGPU_MAX_WB;
1029 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1030
1031 /* clear wb memory */
1032 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1033 }
1034
1035 return 0;
1036 }
1037
1038 /**
1039 * amdgpu_device_wb_get - Allocate a wb entry
1040 *
1041 * @adev: amdgpu_device pointer
1042 * @wb: wb index
1043 *
1044 * Allocate a wb slot for use by the driver (all asics).
1045 * Returns 0 on success or -EINVAL on failure.
1046 */
amdgpu_device_wb_get(struct amdgpu_device * adev,u32 * wb)1047 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1048 {
1049 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1050
1051 if (offset < adev->wb.num_wb) {
1052 __set_bit(offset, adev->wb.used);
1053 *wb = offset << 3; /* convert to dw offset */
1054 return 0;
1055 } else {
1056 return -EINVAL;
1057 }
1058 }
1059
1060 /**
1061 * amdgpu_device_wb_free - Free a wb entry
1062 *
1063 * @adev: amdgpu_device pointer
1064 * @wb: wb index
1065 *
1066 * Free a wb slot allocated for use by the driver (all asics)
1067 */
amdgpu_device_wb_free(struct amdgpu_device * adev,u32 wb)1068 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1069 {
1070 wb >>= 3;
1071 if (wb < adev->wb.num_wb)
1072 __clear_bit(wb, adev->wb.used);
1073 }
1074
1075 /**
1076 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1077 *
1078 * @adev: amdgpu_device pointer
1079 *
1080 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1081 * to fail, but if any of the BARs is not accessible after the size we abort
1082 * driver loading by returning -ENODEV.
1083 */
amdgpu_device_resize_fb_bar(struct amdgpu_device * adev)1084 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1085 {
1086 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1087 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1088 struct pci_bus *root;
1089 struct resource *res;
1090 unsigned i;
1091 u16 cmd;
1092 int r;
1093
1094 /* Bypass for VF */
1095 if (amdgpu_sriov_vf(adev))
1096 return 0;
1097
1098 /* skip if the bios has already enabled large BAR */
1099 if (adev->gmc.real_vram_size &&
1100 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1101 return 0;
1102
1103 /* Check if the root BUS has 64bit memory resources */
1104 root = adev->pdev->bus;
1105 while (root->parent)
1106 root = root->parent;
1107
1108 pci_bus_for_each_resource(root, res, i) {
1109 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1110 res->start > 0x100000000ull)
1111 break;
1112 }
1113
1114 /* Trying to resize is pointless without a root hub window above 4GB */
1115 if (!res)
1116 return 0;
1117
1118 /* Disable memory decoding while we change the BAR addresses and size */
1119 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1120 pci_write_config_word(adev->pdev, PCI_COMMAND,
1121 cmd & ~PCI_COMMAND_MEMORY);
1122
1123 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1124 amdgpu_device_doorbell_fini(adev);
1125 if (adev->asic_type >= CHIP_BONAIRE)
1126 pci_release_resource(adev->pdev, 2);
1127
1128 pci_release_resource(adev->pdev, 0);
1129
1130 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1131 if (r == -ENOSPC)
1132 DRM_INFO("Not enough PCI address space for a large BAR.");
1133 else if (r && r != -ENOTSUPP)
1134 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1135
1136 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1137
1138 /* When the doorbell or fb BAR isn't available we have no chance of
1139 * using the device.
1140 */
1141 r = amdgpu_device_doorbell_init(adev);
1142 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1143 return -ENODEV;
1144
1145 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1146
1147 return 0;
1148 }
1149
1150 /*
1151 * GPU helpers function.
1152 */
1153 /**
1154 * amdgpu_device_need_post - check if the hw need post or not
1155 *
1156 * @adev: amdgpu_device pointer
1157 *
1158 * Check if the asic has been initialized (all asics) at driver startup
1159 * or post is needed if hw reset is performed.
1160 * Returns true if need or false if not.
1161 */
amdgpu_device_need_post(struct amdgpu_device * adev)1162 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1163 {
1164 uint32_t reg;
1165
1166 if (amdgpu_sriov_vf(adev))
1167 return false;
1168
1169 if (amdgpu_passthrough(adev)) {
1170 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1171 * some old smc fw still need driver do vPost otherwise gpu hang, while
1172 * those smc fw version above 22.15 doesn't have this flaw, so we force
1173 * vpost executed for smc version below 22.15
1174 */
1175 if (adev->asic_type == CHIP_FIJI) {
1176 int err;
1177 uint32_t fw_ver;
1178 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1179 /* force vPost if error occured */
1180 if (err)
1181 return true;
1182
1183 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1184 if (fw_ver < 0x00160e00)
1185 return true;
1186 }
1187 }
1188
1189 if (adev->has_hw_reset) {
1190 adev->has_hw_reset = false;
1191 return true;
1192 }
1193
1194 /* bios scratch used on CIK+ */
1195 if (adev->asic_type >= CHIP_BONAIRE)
1196 return amdgpu_atombios_scratch_need_asic_init(adev);
1197
1198 /* check MEM_SIZE for older asics */
1199 reg = amdgpu_asic_get_config_memsize(adev);
1200
1201 if ((reg != 0) && (reg != 0xffffffff))
1202 return false;
1203
1204 return true;
1205 }
1206
1207 /* if we get transitioned to only one device, take VGA back */
1208 /**
1209 * amdgpu_device_vga_set_decode - enable/disable vga decode
1210 *
1211 * @cookie: amdgpu_device pointer
1212 * @state: enable/disable vga decode
1213 *
1214 * Enable/disable vga decode (all asics).
1215 * Returns VGA resource flags.
1216 */
amdgpu_device_vga_set_decode(void * cookie,bool state)1217 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1218 {
1219 struct amdgpu_device *adev = cookie;
1220 amdgpu_asic_set_vga_state(adev, state);
1221 if (state)
1222 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1223 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1224 else
1225 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1226 }
1227
1228 /**
1229 * amdgpu_device_check_block_size - validate the vm block size
1230 *
1231 * @adev: amdgpu_device pointer
1232 *
1233 * Validates the vm block size specified via module parameter.
1234 * The vm block size defines number of bits in page table versus page directory,
1235 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1236 * page table and the remaining bits are in the page directory.
1237 */
amdgpu_device_check_block_size(struct amdgpu_device * adev)1238 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1239 {
1240 /* defines number of bits in page table versus page directory,
1241 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1242 * page table and the remaining bits are in the page directory */
1243 if (amdgpu_vm_block_size == -1)
1244 return;
1245
1246 if (amdgpu_vm_block_size < 9) {
1247 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1248 amdgpu_vm_block_size);
1249 amdgpu_vm_block_size = -1;
1250 }
1251 }
1252
1253 /**
1254 * amdgpu_device_check_vm_size - validate the vm size
1255 *
1256 * @adev: amdgpu_device pointer
1257 *
1258 * Validates the vm size in GB specified via module parameter.
1259 * The VM size is the size of the GPU virtual memory space in GB.
1260 */
amdgpu_device_check_vm_size(struct amdgpu_device * adev)1261 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1262 {
1263 /* no need to check the default value */
1264 if (amdgpu_vm_size == -1)
1265 return;
1266
1267 if (amdgpu_vm_size < 1) {
1268 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1269 amdgpu_vm_size);
1270 amdgpu_vm_size = -1;
1271 }
1272 }
1273
amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device * adev)1274 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1275 {
1276 struct sysinfo si;
1277 bool is_os_64 = (sizeof(void *) == 8);
1278 uint64_t total_memory;
1279 uint64_t dram_size_seven_GB = 0x1B8000000;
1280 uint64_t dram_size_three_GB = 0xB8000000;
1281
1282 if (amdgpu_smu_memory_pool_size == 0)
1283 return;
1284
1285 if (!is_os_64) {
1286 DRM_WARN("Not 64-bit OS, feature not supported\n");
1287 goto def_value;
1288 }
1289 si_meminfo(&si);
1290 total_memory = (uint64_t)si.totalram * si.mem_unit;
1291
1292 if ((amdgpu_smu_memory_pool_size == 1) ||
1293 (amdgpu_smu_memory_pool_size == 2)) {
1294 if (total_memory < dram_size_three_GB)
1295 goto def_value1;
1296 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1297 (amdgpu_smu_memory_pool_size == 8)) {
1298 if (total_memory < dram_size_seven_GB)
1299 goto def_value1;
1300 } else {
1301 DRM_WARN("Smu memory pool size not supported\n");
1302 goto def_value;
1303 }
1304 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1305
1306 return;
1307
1308 def_value1:
1309 DRM_WARN("No enough system memory\n");
1310 def_value:
1311 adev->pm.smu_prv_buffer_size = 0;
1312 }
1313
1314 /**
1315 * amdgpu_device_check_arguments - validate module params
1316 *
1317 * @adev: amdgpu_device pointer
1318 *
1319 * Validates certain module parameters and updates
1320 * the associated values used by the driver (all asics).
1321 */
amdgpu_device_check_arguments(struct amdgpu_device * adev)1322 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1323 {
1324 if (amdgpu_sched_jobs < 4) {
1325 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1326 amdgpu_sched_jobs);
1327 amdgpu_sched_jobs = 4;
1328 } else if (!is_power_of_2(amdgpu_sched_jobs)){
1329 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1330 amdgpu_sched_jobs);
1331 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1332 }
1333
1334 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1335 /* gart size must be greater or equal to 32M */
1336 dev_warn(adev->dev, "gart size (%d) too small\n",
1337 amdgpu_gart_size);
1338 amdgpu_gart_size = -1;
1339 }
1340
1341 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1342 /* gtt size must be greater or equal to 32M */
1343 dev_warn(adev->dev, "gtt size (%d) too small\n",
1344 amdgpu_gtt_size);
1345 amdgpu_gtt_size = -1;
1346 }
1347
1348 /* valid range is between 4 and 9 inclusive */
1349 if (amdgpu_vm_fragment_size != -1 &&
1350 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1351 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1352 amdgpu_vm_fragment_size = -1;
1353 }
1354
1355 if (amdgpu_sched_hw_submission < 2) {
1356 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1357 amdgpu_sched_hw_submission);
1358 amdgpu_sched_hw_submission = 2;
1359 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1360 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1361 amdgpu_sched_hw_submission);
1362 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1363 }
1364
1365 amdgpu_device_check_smu_prv_buffer_size(adev);
1366
1367 amdgpu_device_check_vm_size(adev);
1368
1369 amdgpu_device_check_block_size(adev);
1370
1371 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1372
1373 amdgpu_gmc_tmz_set(adev);
1374
1375 if (amdgpu_num_kcq == -1) {
1376 amdgpu_num_kcq = 8;
1377 } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1378 amdgpu_num_kcq = 8;
1379 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1380 }
1381
1382 amdgpu_gmc_noretry_set(adev);
1383
1384 return 0;
1385 }
1386
1387 /**
1388 * amdgpu_switcheroo_set_state - set switcheroo state
1389 *
1390 * @pdev: pci dev pointer
1391 * @state: vga_switcheroo state
1392 *
1393 * Callback for the switcheroo driver. Suspends or resumes the
1394 * the asics before or after it is powered up using ACPI methods.
1395 */
amdgpu_switcheroo_set_state(struct pci_dev * pdev,enum vga_switcheroo_state state)1396 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1397 enum vga_switcheroo_state state)
1398 {
1399 struct drm_device *dev = pci_get_drvdata(pdev);
1400 int r;
1401
1402 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1403 return;
1404
1405 if (state == VGA_SWITCHEROO_ON) {
1406 pr_info("switched on\n");
1407 /* don't suspend or resume card normally */
1408 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1409
1410 pci_set_power_state(dev->pdev, PCI_D0);
1411 amdgpu_device_load_pci_state(dev->pdev);
1412 r = pci_enable_device(dev->pdev);
1413 if (r)
1414 DRM_WARN("pci_enable_device failed (%d)\n", r);
1415 amdgpu_device_resume(dev, true);
1416
1417 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1418 drm_kms_helper_poll_enable(dev);
1419 } else {
1420 pr_info("switched off\n");
1421 drm_kms_helper_poll_disable(dev);
1422 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1423 amdgpu_device_suspend(dev, true);
1424 amdgpu_device_cache_pci_state(dev->pdev);
1425 /* Shut down the device */
1426 pci_disable_device(dev->pdev);
1427 pci_set_power_state(dev->pdev, PCI_D3cold);
1428 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1429 }
1430 }
1431
1432 /**
1433 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1434 *
1435 * @pdev: pci dev pointer
1436 *
1437 * Callback for the switcheroo driver. Check of the switcheroo
1438 * state can be changed.
1439 * Returns true if the state can be changed, false if not.
1440 */
amdgpu_switcheroo_can_switch(struct pci_dev * pdev)1441 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1442 {
1443 struct drm_device *dev = pci_get_drvdata(pdev);
1444
1445 /*
1446 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1447 * locking inversion with the driver load path. And the access here is
1448 * completely racy anyway. So don't bother with locking for now.
1449 */
1450 return atomic_read(&dev->open_count) == 0;
1451 }
1452
1453 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1454 .set_gpu_state = amdgpu_switcheroo_set_state,
1455 .reprobe = NULL,
1456 .can_switch = amdgpu_switcheroo_can_switch,
1457 };
1458
1459 /**
1460 * amdgpu_device_ip_set_clockgating_state - set the CG state
1461 *
1462 * @dev: amdgpu_device pointer
1463 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1464 * @state: clockgating state (gate or ungate)
1465 *
1466 * Sets the requested clockgating state for all instances of
1467 * the hardware IP specified.
1468 * Returns the error code from the last instance.
1469 */
amdgpu_device_ip_set_clockgating_state(void * dev,enum amd_ip_block_type block_type,enum amd_clockgating_state state)1470 int amdgpu_device_ip_set_clockgating_state(void *dev,
1471 enum amd_ip_block_type block_type,
1472 enum amd_clockgating_state state)
1473 {
1474 struct amdgpu_device *adev = dev;
1475 int i, r = 0;
1476
1477 for (i = 0; i < adev->num_ip_blocks; i++) {
1478 if (!adev->ip_blocks[i].status.valid)
1479 continue;
1480 if (adev->ip_blocks[i].version->type != block_type)
1481 continue;
1482 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1483 continue;
1484 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1485 (void *)adev, state);
1486 if (r)
1487 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1488 adev->ip_blocks[i].version->funcs->name, r);
1489 }
1490 return r;
1491 }
1492
1493 /**
1494 * amdgpu_device_ip_set_powergating_state - set the PG state
1495 *
1496 * @dev: amdgpu_device pointer
1497 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1498 * @state: powergating state (gate or ungate)
1499 *
1500 * Sets the requested powergating state for all instances of
1501 * the hardware IP specified.
1502 * Returns the error code from the last instance.
1503 */
amdgpu_device_ip_set_powergating_state(void * dev,enum amd_ip_block_type block_type,enum amd_powergating_state state)1504 int amdgpu_device_ip_set_powergating_state(void *dev,
1505 enum amd_ip_block_type block_type,
1506 enum amd_powergating_state state)
1507 {
1508 struct amdgpu_device *adev = dev;
1509 int i, r = 0;
1510
1511 for (i = 0; i < adev->num_ip_blocks; i++) {
1512 if (!adev->ip_blocks[i].status.valid)
1513 continue;
1514 if (adev->ip_blocks[i].version->type != block_type)
1515 continue;
1516 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1517 continue;
1518 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1519 (void *)adev, state);
1520 if (r)
1521 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1522 adev->ip_blocks[i].version->funcs->name, r);
1523 }
1524 return r;
1525 }
1526
1527 /**
1528 * amdgpu_device_ip_get_clockgating_state - get the CG state
1529 *
1530 * @adev: amdgpu_device pointer
1531 * @flags: clockgating feature flags
1532 *
1533 * Walks the list of IPs on the device and updates the clockgating
1534 * flags for each IP.
1535 * Updates @flags with the feature flags for each hardware IP where
1536 * clockgating is enabled.
1537 */
amdgpu_device_ip_get_clockgating_state(struct amdgpu_device * adev,u32 * flags)1538 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1539 u32 *flags)
1540 {
1541 int i;
1542
1543 for (i = 0; i < adev->num_ip_blocks; i++) {
1544 if (!adev->ip_blocks[i].status.valid)
1545 continue;
1546 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1547 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1548 }
1549 }
1550
1551 /**
1552 * amdgpu_device_ip_wait_for_idle - wait for idle
1553 *
1554 * @adev: amdgpu_device pointer
1555 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1556 *
1557 * Waits for the request hardware IP to be idle.
1558 * Returns 0 for success or a negative error code on failure.
1559 */
amdgpu_device_ip_wait_for_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)1560 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1561 enum amd_ip_block_type block_type)
1562 {
1563 int i, r;
1564
1565 for (i = 0; i < adev->num_ip_blocks; i++) {
1566 if (!adev->ip_blocks[i].status.valid)
1567 continue;
1568 if (adev->ip_blocks[i].version->type == block_type) {
1569 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1570 if (r)
1571 return r;
1572 break;
1573 }
1574 }
1575 return 0;
1576
1577 }
1578
1579 /**
1580 * amdgpu_device_ip_is_idle - is the hardware IP idle
1581 *
1582 * @adev: amdgpu_device pointer
1583 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1584 *
1585 * Check if the hardware IP is idle or not.
1586 * Returns true if it the IP is idle, false if not.
1587 */
amdgpu_device_ip_is_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)1588 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1589 enum amd_ip_block_type block_type)
1590 {
1591 int i;
1592
1593 for (i = 0; i < adev->num_ip_blocks; i++) {
1594 if (!adev->ip_blocks[i].status.valid)
1595 continue;
1596 if (adev->ip_blocks[i].version->type == block_type)
1597 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1598 }
1599 return true;
1600
1601 }
1602
1603 /**
1604 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1605 *
1606 * @adev: amdgpu_device pointer
1607 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1608 *
1609 * Returns a pointer to the hardware IP block structure
1610 * if it exists for the asic, otherwise NULL.
1611 */
1612 struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device * adev,enum amd_ip_block_type type)1613 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1614 enum amd_ip_block_type type)
1615 {
1616 int i;
1617
1618 for (i = 0; i < adev->num_ip_blocks; i++)
1619 if (adev->ip_blocks[i].version->type == type)
1620 return &adev->ip_blocks[i];
1621
1622 return NULL;
1623 }
1624
1625 /**
1626 * amdgpu_device_ip_block_version_cmp
1627 *
1628 * @adev: amdgpu_device pointer
1629 * @type: enum amd_ip_block_type
1630 * @major: major version
1631 * @minor: minor version
1632 *
1633 * return 0 if equal or greater
1634 * return 1 if smaller or the ip_block doesn't exist
1635 */
amdgpu_device_ip_block_version_cmp(struct amdgpu_device * adev,enum amd_ip_block_type type,u32 major,u32 minor)1636 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1637 enum amd_ip_block_type type,
1638 u32 major, u32 minor)
1639 {
1640 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1641
1642 if (ip_block && ((ip_block->version->major > major) ||
1643 ((ip_block->version->major == major) &&
1644 (ip_block->version->minor >= minor))))
1645 return 0;
1646
1647 return 1;
1648 }
1649
1650 /**
1651 * amdgpu_device_ip_block_add
1652 *
1653 * @adev: amdgpu_device pointer
1654 * @ip_block_version: pointer to the IP to add
1655 *
1656 * Adds the IP block driver information to the collection of IPs
1657 * on the asic.
1658 */
amdgpu_device_ip_block_add(struct amdgpu_device * adev,const struct amdgpu_ip_block_version * ip_block_version)1659 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1660 const struct amdgpu_ip_block_version *ip_block_version)
1661 {
1662 if (!ip_block_version)
1663 return -EINVAL;
1664
1665 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1666 ip_block_version->funcs->name);
1667
1668 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1669
1670 return 0;
1671 }
1672
1673 /**
1674 * amdgpu_device_enable_virtual_display - enable virtual display feature
1675 *
1676 * @adev: amdgpu_device pointer
1677 *
1678 * Enabled the virtual display feature if the user has enabled it via
1679 * the module parameter virtual_display. This feature provides a virtual
1680 * display hardware on headless boards or in virtualized environments.
1681 * This function parses and validates the configuration string specified by
1682 * the user and configues the virtual display configuration (number of
1683 * virtual connectors, crtcs, etc.) specified.
1684 */
amdgpu_device_enable_virtual_display(struct amdgpu_device * adev)1685 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1686 {
1687 adev->enable_virtual_display = false;
1688
1689 if (amdgpu_virtual_display) {
1690 struct drm_device *ddev = adev_to_drm(adev);
1691 const char *pci_address_name = pci_name(ddev->pdev);
1692 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1693
1694 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1695 pciaddstr_tmp = pciaddstr;
1696 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1697 pciaddname = strsep(&pciaddname_tmp, ",");
1698 if (!strcmp("all", pciaddname)
1699 || !strcmp(pci_address_name, pciaddname)) {
1700 long num_crtc;
1701 int res = -1;
1702
1703 adev->enable_virtual_display = true;
1704
1705 if (pciaddname_tmp)
1706 res = kstrtol(pciaddname_tmp, 10,
1707 &num_crtc);
1708
1709 if (!res) {
1710 if (num_crtc < 1)
1711 num_crtc = 1;
1712 if (num_crtc > 6)
1713 num_crtc = 6;
1714 adev->mode_info.num_crtc = num_crtc;
1715 } else {
1716 adev->mode_info.num_crtc = 1;
1717 }
1718 break;
1719 }
1720 }
1721
1722 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1723 amdgpu_virtual_display, pci_address_name,
1724 adev->enable_virtual_display, adev->mode_info.num_crtc);
1725
1726 kfree(pciaddstr);
1727 }
1728 }
1729
1730 /**
1731 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1732 *
1733 * @adev: amdgpu_device pointer
1734 *
1735 * Parses the asic configuration parameters specified in the gpu info
1736 * firmware and makes them availale to the driver for use in configuring
1737 * the asic.
1738 * Returns 0 on success, -EINVAL on failure.
1739 */
amdgpu_device_parse_gpu_info_fw(struct amdgpu_device * adev)1740 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1741 {
1742 const char *chip_name;
1743 char fw_name[40];
1744 int err;
1745 const struct gpu_info_firmware_header_v1_0 *hdr;
1746
1747 adev->firmware.gpu_info_fw = NULL;
1748
1749 if (adev->mman.discovery_bin) {
1750 amdgpu_discovery_get_gfx_info(adev);
1751
1752 /*
1753 * FIXME: The bounding box is still needed by Navi12, so
1754 * temporarily read it from gpu_info firmware. Should be droped
1755 * when DAL no longer needs it.
1756 */
1757 if (adev->asic_type != CHIP_NAVI12)
1758 return 0;
1759 }
1760
1761 switch (adev->asic_type) {
1762 #ifdef CONFIG_DRM_AMDGPU_SI
1763 case CHIP_VERDE:
1764 case CHIP_TAHITI:
1765 case CHIP_PITCAIRN:
1766 case CHIP_OLAND:
1767 case CHIP_HAINAN:
1768 #endif
1769 #ifdef CONFIG_DRM_AMDGPU_CIK
1770 case CHIP_BONAIRE:
1771 case CHIP_HAWAII:
1772 case CHIP_KAVERI:
1773 case CHIP_KABINI:
1774 case CHIP_MULLINS:
1775 #endif
1776 case CHIP_TOPAZ:
1777 case CHIP_TONGA:
1778 case CHIP_FIJI:
1779 case CHIP_POLARIS10:
1780 case CHIP_POLARIS11:
1781 case CHIP_POLARIS12:
1782 case CHIP_VEGAM:
1783 case CHIP_CARRIZO:
1784 case CHIP_STONEY:
1785 case CHIP_VEGA20:
1786 case CHIP_SIENNA_CICHLID:
1787 case CHIP_NAVY_FLOUNDER:
1788 default:
1789 return 0;
1790 case CHIP_VEGA10:
1791 chip_name = "vega10";
1792 break;
1793 case CHIP_VEGA12:
1794 chip_name = "vega12";
1795 break;
1796 case CHIP_RAVEN:
1797 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1798 chip_name = "raven2";
1799 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1800 chip_name = "picasso";
1801 else
1802 chip_name = "raven";
1803 break;
1804 case CHIP_ARCTURUS:
1805 chip_name = "arcturus";
1806 break;
1807 case CHIP_RENOIR:
1808 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1809 chip_name = "renoir";
1810 else
1811 chip_name = "green_sardine";
1812 break;
1813 case CHIP_NAVI10:
1814 chip_name = "navi10";
1815 break;
1816 case CHIP_NAVI14:
1817 chip_name = "navi14";
1818 break;
1819 case CHIP_NAVI12:
1820 chip_name = "navi12";
1821 break;
1822 }
1823
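	/* e.g. this builds "amdgpu/raven2_gpu_info.bin" when chip_name is "raven2" */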
1824 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1825 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1826 if (err) {
1827 dev_err(adev->dev,
1828 "Failed to load gpu_info firmware \"%s\"\n",
1829 fw_name);
1830 goto out;
1831 }
1832 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1833 if (err) {
1834 dev_err(adev->dev,
1835 "Failed to validate gpu_info firmware \"%s\"\n",
1836 fw_name);
1837 goto out;
1838 }
1839
1840 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1841 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1842
1843 switch (hdr->version_major) {
1844 case 1:
1845 {
1846 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1847 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1848 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1849
1850 /*
1851 * Should be dropped when DAL no longer needs it.
1852 */
1853 if (adev->asic_type == CHIP_NAVI12)
1854 goto parse_soc_bounding_box;
1855
1856 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1857 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1858 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1859 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1860 adev->gfx.config.max_texture_channel_caches =
1861 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1862 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1863 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1864 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1865 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1866 adev->gfx.config.double_offchip_lds_buf =
1867 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1868 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1869 adev->gfx.cu_info.max_waves_per_simd =
1870 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1871 adev->gfx.cu_info.max_scratch_slots_per_cu =
1872 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1873 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1874 if (hdr->version_minor >= 1) {
1875 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1876 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1877 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1878 adev->gfx.config.num_sc_per_sh =
1879 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1880 adev->gfx.config.num_packer_per_sc =
1881 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1882 }
1883
1884 parse_soc_bounding_box:
1885 /*
1886 * The SoC bounding box info is not integrated into the discovery table,
1887 * so it still has to be parsed from the gpu info firmware when needed.
1888 */
1889 if (hdr->version_minor == 2) {
1890 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1891 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1892 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1893 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1894 }
1895 break;
1896 }
1897 default:
1898 dev_err(adev->dev,
1899 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1900 err = -EINVAL;
1901 goto out;
1902 }
1903 out:
1904 return err;
1905 }
1906
1907 /**
1908 * amdgpu_device_ip_early_init - run early init for hardware IPs
1909 *
1910 * @adev: amdgpu_device pointer
1911 *
1912 * Early initialization pass for hardware IPs. The hardware IPs that make
1913 * up each asic are discovered and each IP's early_init callback is run. This
1914 * is the first stage in initializing the asic.
1915 * Returns 0 on success, negative error code on failure.
1916 */
1917 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1918 {
1919 int i, r;
1920
1921 amdgpu_device_enable_virtual_display(adev);
1922
1923 if (amdgpu_sriov_vf(adev)) {
1924 r = amdgpu_virt_request_full_gpu(adev, true);
1925 if (r)
1926 return r;
1927 }
1928
1929 switch (adev->asic_type) {
1930 #ifdef CONFIG_DRM_AMDGPU_SI
1931 case CHIP_VERDE:
1932 case CHIP_TAHITI:
1933 case CHIP_PITCAIRN:
1934 case CHIP_OLAND:
1935 case CHIP_HAINAN:
1936 adev->family = AMDGPU_FAMILY_SI;
1937 r = si_set_ip_blocks(adev);
1938 if (r)
1939 return r;
1940 break;
1941 #endif
1942 #ifdef CONFIG_DRM_AMDGPU_CIK
1943 case CHIP_BONAIRE:
1944 case CHIP_HAWAII:
1945 case CHIP_KAVERI:
1946 case CHIP_KABINI:
1947 case CHIP_MULLINS:
1948 if (adev->flags & AMD_IS_APU)
1949 adev->family = AMDGPU_FAMILY_KV;
1950 else
1951 adev->family = AMDGPU_FAMILY_CI;
1952
1953 r = cik_set_ip_blocks(adev);
1954 if (r)
1955 return r;
1956 break;
1957 #endif
1958 case CHIP_TOPAZ:
1959 case CHIP_TONGA:
1960 case CHIP_FIJI:
1961 case CHIP_POLARIS10:
1962 case CHIP_POLARIS11:
1963 case CHIP_POLARIS12:
1964 case CHIP_VEGAM:
1965 case CHIP_CARRIZO:
1966 case CHIP_STONEY:
1967 if (adev->flags & AMD_IS_APU)
1968 adev->family = AMDGPU_FAMILY_CZ;
1969 else
1970 adev->family = AMDGPU_FAMILY_VI;
1971
1972 r = vi_set_ip_blocks(adev);
1973 if (r)
1974 return r;
1975 break;
1976 case CHIP_VEGA10:
1977 case CHIP_VEGA12:
1978 case CHIP_VEGA20:
1979 case CHIP_RAVEN:
1980 case CHIP_ARCTURUS:
1981 case CHIP_RENOIR:
1982 if (adev->flags & AMD_IS_APU)
1983 adev->family = AMDGPU_FAMILY_RV;
1984 else
1985 adev->family = AMDGPU_FAMILY_AI;
1986
1987 r = soc15_set_ip_blocks(adev);
1988 if (r)
1989 return r;
1990 break;
1991 case CHIP_NAVI10:
1992 case CHIP_NAVI14:
1993 case CHIP_NAVI12:
1994 case CHIP_SIENNA_CICHLID:
1995 case CHIP_NAVY_FLOUNDER:
1996 adev->family = AMDGPU_FAMILY_NV;
1997
1998 r = nv_set_ip_blocks(adev);
1999 if (r)
2000 return r;
2001 break;
2002 default:
2003 /* FIXME: not supported yet */
2004 return -EINVAL;
2005 }
2006
2007 amdgpu_amdkfd_device_probe(adev);
2008
2009 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2010 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2011 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2012
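	/*
	 * Each bit i of the ip_block_mask module parameter enables IP block i;
	 * e.g. (hypothetical value) a mask of 0xffffffdf clears bit 5 and
	 * disables only the sixth IP block in the list.
	 */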
2013 for (i = 0; i < adev->num_ip_blocks; i++) {
2014 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2015 DRM_ERROR("disabled ip block: %d <%s>\n",
2016 i, adev->ip_blocks[i].version->funcs->name);
2017 adev->ip_blocks[i].status.valid = false;
2018 } else {
2019 if (adev->ip_blocks[i].version->funcs->early_init) {
2020 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2021 if (r == -ENOENT) {
2022 adev->ip_blocks[i].status.valid = false;
2023 } else if (r) {
2024 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2025 adev->ip_blocks[i].version->funcs->name, r);
2026 return r;
2027 } else {
2028 adev->ip_blocks[i].status.valid = true;
2029 }
2030 } else {
2031 adev->ip_blocks[i].status.valid = true;
2032 }
2033 }
2034 /* get the vbios after the asic_funcs are set up */
2035 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2036 r = amdgpu_device_parse_gpu_info_fw(adev);
2037 if (r)
2038 return r;
2039
2040 /* Read BIOS */
2041 if (!amdgpu_get_bios(adev))
2042 return -EINVAL;
2043
2044 r = amdgpu_atombios_init(adev);
2045 if (r) {
2046 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2047 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2048 return r;
2049 }
2050
2051 /* get pf2vf msg info at its earliest time */
2052 if (amdgpu_sriov_vf(adev))
2053 amdgpu_virt_init_data_exchange(adev);
2054
2055 }
2056 }
2057
2058 adev->cg_flags &= amdgpu_cg_mask;
2059 adev->pg_flags &= amdgpu_pg_mask;
2060
2061 return 0;
2062 }
2063
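/*
 * Phase 1 of hw_init: bring up only the blocks the rest of init depends on,
 * i.e. COMMON, IH and, when running under SR-IOV, PSP.
 */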
2064 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2065 {
2066 int i, r;
2067
2068 for (i = 0; i < adev->num_ip_blocks; i++) {
2069 if (!adev->ip_blocks[i].status.sw)
2070 continue;
2071 if (adev->ip_blocks[i].status.hw)
2072 continue;
2073 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2074 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2075 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2076 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2077 if (r) {
2078 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2079 adev->ip_blocks[i].version->funcs->name, r);
2080 return r;
2081 }
2082 adev->ip_blocks[i].status.hw = true;
2083 }
2084 }
2085
2086 return 0;
2087 }
2088
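/* Phase 2 of hw_init: bring up every remaining sw-initialized block. */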
2089 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2090 {
2091 int i, r;
2092
2093 for (i = 0; i < adev->num_ip_blocks; i++) {
2094 if (!adev->ip_blocks[i].status.sw)
2095 continue;
2096 if (adev->ip_blocks[i].status.hw)
2097 continue;
2098 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2099 if (r) {
2100 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2101 adev->ip_blocks[i].version->funcs->name, r);
2102 return r;
2103 }
2104 adev->ip_blocks[i].status.hw = true;
2105 }
2106
2107 return 0;
2108 }
2109
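/*
 * Initialize (or resume, on reset/suspend paths) the PSP block first on
 * VEGA10 and newer so it can load firmware, then let the power code load
 * the SMU firmware where applicable.
 */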
2110 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2111 {
2112 int r = 0;
2113 int i;
2114 uint32_t smu_version;
2115
2116 if (adev->asic_type >= CHIP_VEGA10) {
2117 for (i = 0; i < adev->num_ip_blocks; i++) {
2118 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2119 continue;
2120
2121 /* no need to do the fw loading again if already done*/
2122 if (adev->ip_blocks[i].status.hw == true)
2123 break;
2124
2125 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2126 r = adev->ip_blocks[i].version->funcs->resume(adev);
2127 if (r) {
2128 DRM_ERROR("resume of IP block <%s> failed %d\n",
2129 adev->ip_blocks[i].version->funcs->name, r);
2130 return r;
2131 }
2132 } else {
2133 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2134 if (r) {
2135 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2136 adev->ip_blocks[i].version->funcs->name, r);
2137 return r;
2138 }
2139 }
2140
2141 adev->ip_blocks[i].status.hw = true;
2142 break;
2143 }
2144 }
2145
2146 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2147 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2148
2149 return r;
2150 }
2151
2152 /**
2153 * amdgpu_device_ip_init - run init for hardware IPs
2154 *
2155 * @adev: amdgpu_device pointer
2156 *
2157 * Main initialization pass for hardware IPs. The list of all the hardware
2158 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2159 * are run. sw_init initializes the software state associated with each IP
2160 * and hw_init initializes the hardware associated with each IP.
2161 * Returns 0 on success, negative error code on failure.
2162 */
2163 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2164 {
2165 int i, r;
2166
2167 r = amdgpu_ras_init(adev);
2168 if (r)
2169 return r;
2170
2171 for (i = 0; i < adev->num_ip_blocks; i++) {
2172 if (!adev->ip_blocks[i].status.valid)
2173 continue;
2174 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2175 if (r) {
2176 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2177 adev->ip_blocks[i].version->funcs->name, r);
2178 goto init_failed;
2179 }
2180 adev->ip_blocks[i].status.sw = true;
2181
2182 /* need to do gmc hw init early so we can allocate gpu mem */
2183 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2184 /* Try to reserve bad pages early */
2185 if (amdgpu_sriov_vf(adev))
2186 amdgpu_virt_exchange_data(adev);
2187
2188 r = amdgpu_device_vram_scratch_init(adev);
2189 if (r) {
2190 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2191 goto init_failed;
2192 }
2193 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2194 if (r) {
2195 DRM_ERROR("hw_init %d failed %d\n", i, r);
2196 goto init_failed;
2197 }
2198 r = amdgpu_device_wb_init(adev);
2199 if (r) {
2200 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2201 goto init_failed;
2202 }
2203 adev->ip_blocks[i].status.hw = true;
2204
2205 /* right after GMC hw init, we create CSA */
2206 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2207 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2208 AMDGPU_GEM_DOMAIN_VRAM,
2209 AMDGPU_CSA_SIZE);
2210 if (r) {
2211 DRM_ERROR("allocate CSA failed %d\n", r);
2212 goto init_failed;
2213 }
2214 }
2215 }
2216 }
2217
2218 if (amdgpu_sriov_vf(adev))
2219 amdgpu_virt_init_data_exchange(adev);
2220
2221 r = amdgpu_ib_pool_init(adev);
2222 if (r) {
2223 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2224 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2225 goto init_failed;
2226 }
2227
2228 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2229 if (r)
2230 goto init_failed;
2231
2232 r = amdgpu_device_ip_hw_init_phase1(adev);
2233 if (r)
2234 goto init_failed;
2235
2236 r = amdgpu_device_fw_loading(adev);
2237 if (r)
2238 goto init_failed;
2239
2240 r = amdgpu_device_ip_hw_init_phase2(adev);
2241 if (r)
2242 goto init_failed;
2243
2244 /*
2245 * Retired pages will be loaded from eeprom and reserved here.
2246 * This must be called after amdgpu_device_ip_hw_init_phase2, since
2247 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2248 * functional for I2C communication, which is only true at this point.
2249 *
2250 * amdgpu_ras_recovery_init may fail, but the caller only treats a
2251 * bad-gpu failure as fatal and stops the amdgpu init process
2252 * accordingly. For other failures it still releases all the
2253 * resources and prints an error message, rather than returning a
2254 * negative value to the upper level.
2255 *
2256 * Note: theoretically, this should be called before all VRAM allocations
2257 * to protect retired pages from being abused.
2258 */
2259 r = amdgpu_ras_recovery_init(adev);
2260 if (r)
2261 goto init_failed;
2262
2263 if (adev->gmc.xgmi.num_physical_nodes > 1)
2264 amdgpu_xgmi_add_device(adev);
2265 amdgpu_amdkfd_device_init(adev);
2266
2267 amdgpu_fru_get_product_info(adev);
2268
2269 init_failed:
2270 if (amdgpu_sriov_vf(adev))
2271 amdgpu_virt_release_full_gpu(adev, true);
2272
2273 return r;
2274 }
2275
2276 /**
2277 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2278 *
2279 * @adev: amdgpu_device pointer
2280 *
2281 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2282 * this function before a GPU reset. If the value is retained after a
2283 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2284 */
2285 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2286 {
2287 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2288 }
2289
2290 /**
2291 * amdgpu_device_check_vram_lost - check if vram is valid
2292 *
2293 * @adev: amdgpu_device pointer
2294 *
2295 * Checks the reset magic value written to the gart pointer in VRAM.
2296 * The driver calls this after a GPU reset to see if the contents of
2297 * VRAM have been lost or not.
2298 * Returns true if vram is lost, false if not.
2299 */
2300 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2301 {
2302 if (memcmp(adev->gart.ptr, adev->reset_magic,
2303 AMDGPU_RESET_MAGIC_NUM))
2304 return true;
2305
2306 if (!amdgpu_in_reset(adev))
2307 return false;
2308
2309 /*
2310 * For all ASICs with baco/mode1 reset, the VRAM is
2311 * always assumed to be lost.
2312 */
2313 switch (amdgpu_asic_reset_method(adev)) {
2314 case AMD_RESET_METHOD_BACO:
2315 case AMD_RESET_METHOD_MODE1:
2316 return true;
2317 default:
2318 return false;
2319 }
2320 }
2321
2322 /**
2323 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2324 *
2325 * @adev: amdgpu_device pointer
2326 * @state: clockgating state (gate or ungate)
2327 *
2328 * The list of all the hardware IPs that make up the asic is walked and the
2329 * set_clockgating_state callbacks are run. Gating walks the list forward,
2330 * ungating walks it in reverse. The late init pass uses this to enable
2331 * clockgating; the fini and suspend paths use it to disable clockgating.
2332 * Returns 0 on success, negative error code on failure.
2333 */
2334
2335 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2336 enum amd_clockgating_state state)
2337 {
2338 int i, j, r;
2339
2340 if (amdgpu_emu_mode == 1)
2341 return 0;
2342
2343 for (j = 0; j < adev->num_ip_blocks; j++) {
2344 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2345 if (!adev->ip_blocks[i].status.late_initialized)
2346 continue;
2347 /* skip CG for VCE/UVD, it's handled specially */
2348 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2349 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2350 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2351 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2352 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2353 /* enable clockgating to save power */
2354 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2355 state);
2356 if (r) {
2357 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2358 adev->ip_blocks[i].version->funcs->name, r);
2359 return r;
2360 }
2361 }
2362 }
2363
2364 return 0;
2365 }
2366
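/* Same walk as amdgpu_device_set_cg_state(), but toggling powergating. */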
2367 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2368 {
2369 int i, j, r;
2370
2371 if (amdgpu_emu_mode == 1)
2372 return 0;
2373
2374 for (j = 0; j < adev->num_ip_blocks; j++) {
2375 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2376 if (!adev->ip_blocks[i].status.late_initialized)
2377 continue;
2378 /* skip PG for VCE/UVD, it's handled specially */
2379 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2380 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2381 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2382 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2383 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2384 /* enable powergating to save power */
2385 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2386 state);
2387 if (r) {
2388 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2389 adev->ip_blocks[i].version->funcs->name, r);
2390 return r;
2391 }
2392 }
2393 }
2394 return 0;
2395 }
2396
2397 static int amdgpu_device_enable_mgpu_fan_boost(void)
2398 {
2399 struct amdgpu_gpu_instance *gpu_ins;
2400 struct amdgpu_device *adev;
2401 int i, ret = 0;
2402
2403 mutex_lock(&mgpu_info.mutex);
2404
2405 /*
2406 * MGPU fan boost feature should be enabled
2407 * only when there are two or more dGPUs in
2408 * the system
2409 */
2410 if (mgpu_info.num_dgpu < 2)
2411 goto out;
2412
2413 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2414 gpu_ins = &(mgpu_info.gpu_ins[i]);
2415 adev = gpu_ins->adev;
2416 if (!(adev->flags & AMD_IS_APU) &&
2417 !gpu_ins->mgpu_fan_enabled) {
2418 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2419 if (ret)
2420 break;
2421
2422 gpu_ins->mgpu_fan_enabled = 1;
2423 }
2424 }
2425
2426 out:
2427 mutex_unlock(&mgpu_info.mutex);
2428
2429 return ret;
2430 }
2431
2432 /**
2433 * amdgpu_device_ip_late_init - run late init for hardware IPs
2434 *
2435 * @adev: amdgpu_device pointer
2436 *
2437 * Late initialization pass for hardware IPs. The list of all the hardware
2438 * IPs that make up the asic is walked and the late_init callbacks are run.
2439 * late_init covers any special initialization that an IP requires
2440 * after all of the IPs have been initialized or something that needs to happen
2441 * late in the init process.
2442 * Returns 0 on success, negative error code on failure.
2443 */
2444 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2445 {
2446 struct amdgpu_gpu_instance *gpu_instance;
2447 int i = 0, r;
2448
2449 for (i = 0; i < adev->num_ip_blocks; i++) {
2450 if (!adev->ip_blocks[i].status.hw)
2451 continue;
2452 if (adev->ip_blocks[i].version->funcs->late_init) {
2453 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2454 if (r) {
2455 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2456 adev->ip_blocks[i].version->funcs->name, r);
2457 return r;
2458 }
2459 }
2460 adev->ip_blocks[i].status.late_initialized = true;
2461 }
2462
2463 amdgpu_ras_set_error_query_ready(adev, true);
2464
2465 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2466 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2467
2468 amdgpu_device_fill_reset_magic(adev);
2469
2470 r = amdgpu_device_enable_mgpu_fan_boost();
2471 if (r)
2472 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2473
2474
2475 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2476 mutex_lock(&mgpu_info.mutex);
2477
2478 /*
2479 * Reset the device p-state to low, as it was booted at high.
2480 *
2481 * This should be performed only after all devices from the same
2482 * hive have been initialized.
2483 *
2484 * However, the number of devices in the hive is not known in
2485 * advance; it is counted one by one as devices are initialized.
2486 *
2487 * So we wait until all XGMI-interlinked devices are initialized.
2488 * This may add some delay, as those devices may belong to
2489 * different hives, but that should be OK.
2490 */
2491 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2492 for (i = 0; i < mgpu_info.num_gpu; i++) {
2493 gpu_instance = &(mgpu_info.gpu_ins[i]);
2494 if (gpu_instance->adev->flags & AMD_IS_APU)
2495 continue;
2496
2497 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2498 AMDGPU_XGMI_PSTATE_MIN);
2499 if (r) {
2500 DRM_ERROR("pstate setting failed (%d).\n", r);
2501 break;
2502 }
2503 }
2504 }
2505
2506 mutex_unlock(&mgpu_info.mutex);
2507 }
2508
2509 return 0;
2510 }
2511
2512 /**
2513 * amdgpu_device_ip_fini - run fini for hardware IPs
2514 *
2515 * @adev: amdgpu_device pointer
2516 *
2517 * Main teardown pass for hardware IPs. The list of all the hardware
2518 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2519 * are run. hw_fini tears down the hardware associated with each IP
2520 * and sw_fini tears down any software state associated with each IP.
2521 * Returns 0 on success, negative error code on failure.
2522 */
2523 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2524 {
2525 int i, r;
2526
2527 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2528 amdgpu_virt_release_ras_err_handler_data(adev);
2529
2530 amdgpu_ras_pre_fini(adev);
2531
2532 if (adev->gmc.xgmi.num_physical_nodes > 1)
2533 amdgpu_xgmi_remove_device(adev);
2534
2535 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2536 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2537
2538 amdgpu_amdkfd_device_fini(adev);
2539
2540 /* need to disable SMC first */
2541 for (i = 0; i < adev->num_ip_blocks; i++) {
2542 if (!adev->ip_blocks[i].status.hw)
2543 continue;
2544 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2545 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2546 /* XXX handle errors */
2547 if (r) {
2548 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2549 adev->ip_blocks[i].version->funcs->name, r);
2550 }
2551 adev->ip_blocks[i].status.hw = false;
2552 break;
2553 }
2554 }
2555
2556 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2557 if (!adev->ip_blocks[i].status.hw)
2558 continue;
2559
2560 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2561 /* XXX handle errors */
2562 if (r) {
2563 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2564 adev->ip_blocks[i].version->funcs->name, r);
2565 }
2566
2567 adev->ip_blocks[i].status.hw = false;
2568 }
2569
2570
2571 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2572 if (!adev->ip_blocks[i].status.sw)
2573 continue;
2574
2575 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2576 amdgpu_ucode_free_bo(adev);
2577 amdgpu_free_static_csa(&adev->virt.csa_obj);
2578 amdgpu_device_wb_fini(adev);
2579 amdgpu_device_vram_scratch_fini(adev);
2580 amdgpu_ib_pool_fini(adev);
2581 }
2582
2583 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2584 /* XXX handle errors */
2585 if (r) {
2586 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2587 adev->ip_blocks[i].version->funcs->name, r);
2588 }
2589 adev->ip_blocks[i].status.sw = false;
2590 adev->ip_blocks[i].status.valid = false;
2591 }
2592
2593 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2594 if (!adev->ip_blocks[i].status.late_initialized)
2595 continue;
2596 if (adev->ip_blocks[i].version->funcs->late_fini)
2597 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2598 adev->ip_blocks[i].status.late_initialized = false;
2599 }
2600
2601 amdgpu_ras_fini(adev);
2602
2603 if (amdgpu_sriov_vf(adev))
2604 if (amdgpu_virt_release_full_gpu(adev, false))
2605 DRM_ERROR("failed to release exclusive mode on fini\n");
2606
2607 return 0;
2608 }
2609
2610 /**
2611 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2612 *
2613 * @work: work_struct.
2614 */
2615 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2616 {
2617 struct amdgpu_device *adev =
2618 container_of(work, struct amdgpu_device, delayed_init_work.work);
2619 int r;
2620
2621 r = amdgpu_ib_ring_tests(adev);
2622 if (r)
2623 DRM_ERROR("ib ring test failed (%d).\n", r);
2624 }
2625
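/*
 * Delayed work handler that actually enters GFXOFF once no gfx_off
 * requests are outstanding.
 */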
2626 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2627 {
2628 struct amdgpu_device *adev =
2629 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2630
2631 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2632 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2633
2634 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2635 adev->gfx.gfx_off_state = true;
2636 }
2637
2638 /**
2639 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2640 *
2641 * @adev: amdgpu_device pointer
2642 *
2643 * Phase 1 of the suspend path. Clockgating and powergating are disabled,
2644 * then the list of hardware IPs is walked and the suspend callback is run
2645 * for the display (DCE) IPs only. suspend puts the hardware and software
2646 * state in each IP into a state suitable for suspend.
2647 * Returns 0 on success, negative error code on failure.
2648 */
2649 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2650 {
2651 int i, r;
2652
2653 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2654 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2655
2656 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2657 if (!adev->ip_blocks[i].status.valid)
2658 continue;
2659
2660 /* displays are handled separately */
2661 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2662 continue;
2663
2664 /* XXX handle errors */
2665 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2666 /* XXX handle errors */
2667 if (r) {
2668 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2669 adev->ip_blocks[i].version->funcs->name, r);
2670 return r;
2671 }
2672
2673 adev->ip_blocks[i].status.hw = false;
2674 }
2675
2676 return 0;
2677 }
2678
2679 /**
2680 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2681 *
2682 * @adev: amdgpu_device pointer
2683 *
2684 * Phase 2 of the suspend path. The list of all the hardware IPs that make
2685 * up the asic is walked and the suspend callbacks are run for every block
2686 * except the displays handled in phase 1. suspend puts the hardware and
2687 * software state in each IP into a state suitable for suspend.
2688 * Returns 0 on success, negative error code on failure.
2689 */
2690 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2691 {
2692 int i, r;
2693
2694 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2695 if (!adev->ip_blocks[i].status.valid)
2696 continue;
2697 /* displays are handled in phase1 */
2698 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2699 continue;
2700 /* PSP lost connection when err_event_athub occurs */
2701 if (amdgpu_ras_intr_triggered() &&
2702 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2703 adev->ip_blocks[i].status.hw = false;
2704 continue;
2705 }
2706 /* XXX handle errors */
2707 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2708 /* XXX handle errors */
2709 if (r) {
2710 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2711 adev->ip_blocks[i].version->funcs->name, r);
2712 }
2713 adev->ip_blocks[i].status.hw = false;
2714 /* handle putting the SMC in the appropriate state */
2715 if(!amdgpu_sriov_vf(adev)){
2716 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2717 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2718 if (r) {
2719 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2720 adev->mp1_state, r);
2721 return r;
2722 }
2723 }
2724 }
2725 adev->ip_blocks[i].status.hw = false;
2726 }
2727
2728 return 0;
2729 }
2730
2731 /**
2732 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2733 *
2734 * @adev: amdgpu_device pointer
2735 *
2736 * Main suspend function for hardware IPs. The list of all the hardware
2737 * IPs that make up the asic is walked, clockgating is disabled and the
2738 * suspend callbacks are run. suspend puts the hardware and software state
2739 * in each IP into a state suitable for suspend.
2740 * Returns 0 on success, negative error code on failure.
2741 */
2742 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2743 {
2744 int r;
2745
2746 if (amdgpu_sriov_vf(adev))
2747 amdgpu_virt_request_full_gpu(adev, false);
2748
2749 r = amdgpu_device_ip_suspend_phase1(adev);
2750 if (r)
2751 return r;
2752 r = amdgpu_device_ip_suspend_phase2(adev);
2753
2754 if (amdgpu_sriov_vf(adev))
2755 amdgpu_virt_release_full_gpu(adev, false);
2756
2757 return r;
2758 }
2759
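/*
 * After an SR-IOV reset, re-initialize the minimal set of blocks (listed in
 * ip_order below) that everything else depends on.
 */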
2760 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2761 {
2762 int i, r;
2763
2764 static enum amd_ip_block_type ip_order[] = {
2765 AMD_IP_BLOCK_TYPE_GMC,
2766 AMD_IP_BLOCK_TYPE_COMMON,
2767 AMD_IP_BLOCK_TYPE_PSP,
2768 AMD_IP_BLOCK_TYPE_IH,
2769 };
2770
2771 for (i = 0; i < adev->num_ip_blocks; i++) {
2772 int j;
2773 struct amdgpu_ip_block *block;
2774
2775 block = &adev->ip_blocks[i];
2776 block->status.hw = false;
2777
2778 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2779
2780 if (block->version->type != ip_order[j] ||
2781 !block->status.valid)
2782 continue;
2783
2784 r = block->version->funcs->hw_init(adev);
2785 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2786 if (r)
2787 return r;
2788 block->status.hw = true;
2789 }
2790 }
2791
2792 return 0;
2793 }
2794
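/*
 * After the early SR-IOV re-init, bring the remaining blocks back up in a
 * fixed order; the SMC block is resumed rather than re-initialized.
 */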
2795 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2796 {
2797 int i, r;
2798
2799 static enum amd_ip_block_type ip_order[] = {
2800 AMD_IP_BLOCK_TYPE_SMC,
2801 AMD_IP_BLOCK_TYPE_DCE,
2802 AMD_IP_BLOCK_TYPE_GFX,
2803 AMD_IP_BLOCK_TYPE_SDMA,
2804 AMD_IP_BLOCK_TYPE_UVD,
2805 AMD_IP_BLOCK_TYPE_VCE,
2806 AMD_IP_BLOCK_TYPE_VCN
2807 };
2808
2809 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2810 int j;
2811 struct amdgpu_ip_block *block;
2812
2813 for (j = 0; j < adev->num_ip_blocks; j++) {
2814 block = &adev->ip_blocks[j];
2815
2816 if (block->version->type != ip_order[i] ||
2817 !block->status.valid ||
2818 block->status.hw)
2819 continue;
2820
2821 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2822 r = block->version->funcs->resume(adev);
2823 else
2824 r = block->version->funcs->hw_init(adev);
2825
2826 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2827 if (r)
2828 return r;
2829 block->status.hw = true;
2830 }
2831 }
2832
2833 return 0;
2834 }
2835
2836 /**
2837 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2838 *
2839 * @adev: amdgpu_device pointer
2840 *
2841 * First resume function for hardware IPs. The list of all the hardware
2842 * IPs that make up the asic is walked and the resume callbacks are run for
2843 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2844 * after a suspend and updates the software state as necessary. This
2845 * function is also used for restoring the GPU after a GPU reset.
2846 * Returns 0 on success, negative error code on failure.
2847 */
2848 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2849 {
2850 int i, r;
2851
2852 for (i = 0; i < adev->num_ip_blocks; i++) {
2853 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2854 continue;
2855 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2856 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2857 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2858
2859 r = adev->ip_blocks[i].version->funcs->resume(adev);
2860 if (r) {
2861 DRM_ERROR("resume of IP block <%s> failed %d\n",
2862 adev->ip_blocks[i].version->funcs->name, r);
2863 return r;
2864 }
2865 adev->ip_blocks[i].status.hw = true;
2866 }
2867 }
2868
2869 return 0;
2870 }
2871
2872 /**
2873 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2874 *
2875 * @adev: amdgpu_device pointer
2876 *
2877 * Second resume function for hardware IPs. The list of all the hardware
2878 * IPs that make up the asic is walked and the resume callbacks are run for
2879 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2880 * functional state after a suspend and updates the software state as
2881 * necessary. This function is also used for restoring the GPU after a GPU
2882 * reset.
2883 * Returns 0 on success, negative error code on failure.
2884 */
2885 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2886 {
2887 int i, r;
2888
2889 for (i = 0; i < adev->num_ip_blocks; i++) {
2890 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2891 continue;
2892 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2893 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2894 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2895 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2896 continue;
2897 r = adev->ip_blocks[i].version->funcs->resume(adev);
2898 if (r) {
2899 DRM_ERROR("resume of IP block <%s> failed %d\n",
2900 adev->ip_blocks[i].version->funcs->name, r);
2901 return r;
2902 }
2903 adev->ip_blocks[i].status.hw = true;
2904 }
2905
2906 return 0;
2907 }
2908
2909 /**
2910 * amdgpu_device_ip_resume - run resume for hardware IPs
2911 *
2912 * @adev: amdgpu_device pointer
2913 *
2914 * Main resume function for hardware IPs. The hardware IPs
2915 * are split into two resume functions because they are
2916 * also used in recovering from a GPU reset and some additional
2917 * steps need to be taken between them. In this case (S3/S4) they are
2918 * run sequentially.
2919 * Returns 0 on success, negative error code on failure.
2920 */
2921 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2922 {
2923 int r;
2924
2925 r = amdgpu_amdkfd_resume_iommu(adev);
2926 if (r)
2927 return r;
2928
2929 r = amdgpu_device_ip_resume_phase1(adev);
2930 if (r)
2931 return r;
2932
2933 r = amdgpu_device_fw_loading(adev);
2934 if (r)
2935 return r;
2936
2937 r = amdgpu_device_ip_resume_phase2(adev);
2938
2939 return r;
2940 }
2941
2942 /**
2943 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2944 *
2945 * @adev: amdgpu_device pointer
2946 *
2947 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2948 */
2949 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2950 {
2951 if (amdgpu_sriov_vf(adev)) {
2952 if (adev->is_atom_fw) {
2953 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2954 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2955 } else {
2956 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2957 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2958 }
2959
2960 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2961 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2962 }
2963 }
2964
2965 /**
2966 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2967 *
2968 * @asic_type: AMD asic type
2969 *
2970 * Check if there is DC (the new modesetting infrastructure) support for an asic.
2971 * returns true if DC has support, false if not.
2972 */
2973 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2974 {
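	/*
	 * The dc module parameter selects the display stack: a value > 0 forces
	 * DC on, 0 forces the legacy path, and negative values (auto) fall back
	 * to the per-ASIC defaults below.
	 */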
2975 switch (asic_type) {
2976 #if defined(CONFIG_DRM_AMD_DC)
2977 #if defined(CONFIG_DRM_AMD_DC_SI)
2978 case CHIP_TAHITI:
2979 case CHIP_PITCAIRN:
2980 case CHIP_VERDE:
2981 case CHIP_OLAND:
2982 #endif
2983 case CHIP_BONAIRE:
2984 case CHIP_KAVERI:
2985 case CHIP_KABINI:
2986 case CHIP_MULLINS:
2987 /*
2988 * We have systems in the wild with these ASICs that require
2989 * LVDS and VGA support which is not supported with DC.
2990 *
2991 * Fallback to the non-DC driver here by default so as not to
2992 * cause regressions.
2993 */
2994 return amdgpu_dc > 0;
2995 case CHIP_HAWAII:
2996 case CHIP_CARRIZO:
2997 case CHIP_STONEY:
2998 case CHIP_POLARIS10:
2999 case CHIP_POLARIS11:
3000 case CHIP_POLARIS12:
3001 case CHIP_VEGAM:
3002 case CHIP_TONGA:
3003 case CHIP_FIJI:
3004 case CHIP_VEGA10:
3005 case CHIP_VEGA12:
3006 case CHIP_VEGA20:
3007 #if defined(CONFIG_DRM_AMD_DC_DCN)
3008 case CHIP_RAVEN:
3009 case CHIP_NAVI10:
3010 case CHIP_NAVI14:
3011 case CHIP_NAVI12:
3012 case CHIP_RENOIR:
3013 #endif
3014 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3015 case CHIP_SIENNA_CICHLID:
3016 case CHIP_NAVY_FLOUNDER:
3017 #endif
3018 return amdgpu_dc != 0;
3019 #endif
3020 default:
3021 if (amdgpu_dc > 0)
3022 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3023 "but isn't supported by ASIC, ignoring\n");
3024 return false;
3025 }
3026 }
3027
3028 /**
3029 * amdgpu_device_has_dc_support - check if dc is supported
3030 *
3031 * @adev: amdgpu_device pointer
3032 *
3033 * Returns true for supported, false for not supported
3034 */
3035 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3036 {
3037 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3038 return false;
3039
3040 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3041 }
3042
3043
3044 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3045 {
3046 struct amdgpu_device *adev =
3047 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3048 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3049
3050 /* It's a bug to not have a hive within this function */
3051 if (WARN_ON(!hive))
3052 return;
3053
3054 /*
3055 * Use task barrier to synchronize all xgmi reset works across the
3056 * hive. task_barrier_enter and task_barrier_exit will block
3057 * until all the threads running the xgmi reset works reach
3058 * those points. task_barrier_full will do both blocks.
3059 */
3060 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3061
3062 task_barrier_enter(&hive->tb);
3063 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3064
3065 if (adev->asic_reset_res)
3066 goto fail;
3067
3068 task_barrier_exit(&hive->tb);
3069 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3070
3071 if (adev->asic_reset_res)
3072 goto fail;
3073
3074 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3075 adev->mmhub.funcs->reset_ras_error_count(adev);
3076 } else {
3077
3078 task_barrier_full(&hive->tb);
3079 adev->asic_reset_res = amdgpu_asic_reset(adev);
3080 }
3081
3082 fail:
3083 if (adev->asic_reset_res)
3084 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3085 adev->asic_reset_res, adev_to_drm(adev)->unique);
3086 amdgpu_put_xgmi_hive(hive);
3087 }
3088
3089 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3090 {
3091 char *input = amdgpu_lockup_timeout;
3092 char *timeout_setting = NULL;
3093 int index = 0;
3094 long timeout;
3095 int ret = 0;
3096
3097 /*
3098 * By default the timeout for non-compute jobs is 10000 ms and
3099 * no timeout is enforced on compute jobs.
3100 * In SR-IOV or passthrough mode, the default timeout for compute
3101 * jobs is 60000 ms.
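 * For example (hypothetical values), amdgpu.lockup_timeout=10000,60000,10000,10000
 * sets the gfx, compute, sdma and video timeouts (in ms) respectively;
 * a single value applies to all non-compute jobs, 0 keeps the default
 * and a negative value disables the timeout.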
3102 */
3103 adev->gfx_timeout = msecs_to_jiffies(10000);
3104 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3105 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3106 adev->compute_timeout = msecs_to_jiffies(60000);
3107 else
3108 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3109
3110 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3111 while ((timeout_setting = strsep(&input, ",")) &&
3112 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3113 ret = kstrtol(timeout_setting, 0, &timeout);
3114 if (ret)
3115 return ret;
3116
3117 if (timeout == 0) {
3118 index++;
3119 continue;
3120 } else if (timeout < 0) {
3121 timeout = MAX_SCHEDULE_TIMEOUT;
3122 } else {
3123 timeout = msecs_to_jiffies(timeout);
3124 }
3125
3126 switch (index++) {
3127 case 0:
3128 adev->gfx_timeout = timeout;
3129 break;
3130 case 1:
3131 adev->compute_timeout = timeout;
3132 break;
3133 case 2:
3134 adev->sdma_timeout = timeout;
3135 break;
3136 case 3:
3137 adev->video_timeout = timeout;
3138 break;
3139 default:
3140 break;
3141 }
3142 }
3143 /*
3144 * There is only one value specified and
3145 * it should apply to all non-compute jobs.
3146 */
3147 if (index == 1) {
3148 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3149 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3150 adev->compute_timeout = adev->gfx_timeout;
3151 }
3152 }
3153
3154 return ret;
3155 }
3156
3157 static const struct attribute *amdgpu_dev_attributes[] = {
3158 &dev_attr_product_name.attr,
3159 &dev_attr_product_number.attr,
3160 &dev_attr_serial_number.attr,
3161 &dev_attr_pcie_replay_count.attr,
3162 NULL
3163 };
3164
3165
3166 /**
3167 * amdgpu_device_init - initialize the driver
3168 *
3169 * @adev: amdgpu_device pointer
3170 * @flags: driver flags
3171 *
3172 * Initializes the driver info and hw (all asics).
3173 * Returns 0 for success or an error on failure.
3174 * Called at driver startup.
3175 */
3176 int amdgpu_device_init(struct amdgpu_device *adev,
3177 uint32_t flags)
3178 {
3179 struct drm_device *ddev = adev_to_drm(adev);
3180 struct pci_dev *pdev = adev->pdev;
3181 int r, i;
3182 bool boco = false;
3183 u32 max_MBps;
3184
3185 adev->shutdown = false;
3186 adev->flags = flags;
3187
3188 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3189 adev->asic_type = amdgpu_force_asic_type;
3190 else
3191 adev->asic_type = flags & AMD_ASIC_MASK;
3192
3193 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3194 if (amdgpu_emu_mode == 1)
3195 adev->usec_timeout *= 10;
3196 adev->gmc.gart_size = 512 * 1024 * 1024;
3197 adev->accel_working = false;
3198 adev->num_rings = 0;
3199 adev->mman.buffer_funcs = NULL;
3200 adev->mman.buffer_funcs_ring = NULL;
3201 adev->vm_manager.vm_pte_funcs = NULL;
3202 adev->vm_manager.vm_pte_num_scheds = 0;
3203 adev->gmc.gmc_funcs = NULL;
3204 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3205 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3206
3207 adev->smc_rreg = &amdgpu_invalid_rreg;
3208 adev->smc_wreg = &amdgpu_invalid_wreg;
3209 adev->pcie_rreg = &amdgpu_invalid_rreg;
3210 adev->pcie_wreg = &amdgpu_invalid_wreg;
3211 adev->pciep_rreg = &amdgpu_invalid_rreg;
3212 adev->pciep_wreg = &amdgpu_invalid_wreg;
3213 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3214 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3215 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3216 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3217 adev->didt_rreg = &amdgpu_invalid_rreg;
3218 adev->didt_wreg = &amdgpu_invalid_wreg;
3219 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3220 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3221 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3222 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3223
3224 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3225 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3226 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3227
3228 /* mutex initializations are all done here so we
3229 * can call these functions again later without locking issues */
3230 atomic_set(&adev->irq.ih.lock, 0);
3231 mutex_init(&adev->firmware.mutex);
3232 mutex_init(&adev->pm.mutex);
3233 mutex_init(&adev->gfx.gpu_clock_mutex);
3234 mutex_init(&adev->srbm_mutex);
3235 mutex_init(&adev->gfx.pipe_reserve_mutex);
3236 mutex_init(&adev->gfx.gfx_off_mutex);
3237 mutex_init(&adev->grbm_idx_mutex);
3238 mutex_init(&adev->mn_lock);
3239 mutex_init(&adev->virt.vf_errors.lock);
3240 hash_init(adev->mn_hash);
3241 atomic_set(&adev->in_gpu_reset, 0);
3242 init_rwsem(&adev->reset_sem);
3243 mutex_init(&adev->psp.mutex);
3244 mutex_init(&adev->notifier_lock);
3245
3246 r = amdgpu_device_check_arguments(adev);
3247 if (r)
3248 return r;
3249
3250 spin_lock_init(&adev->mmio_idx_lock);
3251 spin_lock_init(&adev->smc_idx_lock);
3252 spin_lock_init(&adev->pcie_idx_lock);
3253 spin_lock_init(&adev->uvd_ctx_idx_lock);
3254 spin_lock_init(&adev->didt_idx_lock);
3255 spin_lock_init(&adev->gc_cac_idx_lock);
3256 spin_lock_init(&adev->se_cac_idx_lock);
3257 spin_lock_init(&adev->audio_endpt_idx_lock);
3258 spin_lock_init(&adev->mm_stats.lock);
3259
3260 INIT_LIST_HEAD(&adev->shadow_list);
3261 mutex_init(&adev->shadow_list_lock);
3262
3263 INIT_DELAYED_WORK(&adev->delayed_init_work,
3264 amdgpu_device_delayed_init_work_handler);
3265 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3266 amdgpu_device_delay_enable_gfx_off);
3267
3268 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3269
3270 adev->gfx.gfx_off_req_count = 1;
3271 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3272
3273 atomic_set(&adev->throttling_logging_enabled, 1);
3274 /*
3275 * If throttling continues, logging will be performed every minute
3276 * to avoid log flooding. "-1" is subtracted since the thermal
3277 * throttling interrupt comes every second. Thus, the total logging
3278 * interval is 59 seconds (ratelimited printk interval) + 1 second
3279 * (waiting for the throttling interrupt) = 60 seconds.
3280 */
3281 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3282 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3283
3284 /* Registers mapping */
3285 /* TODO: block userspace mapping of io register */
3286 if (adev->asic_type >= CHIP_BONAIRE) {
3287 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3288 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3289 } else {
3290 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3291 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3292 }
3293
3294 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3295 if (adev->rmmio == NULL) {
3296 return -ENOMEM;
3297 }
3298 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3299 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3300
3301 /* io port mapping */
3302 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3303 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3304 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3305 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3306 break;
3307 }
3308 }
3309 if (adev->rio_mem == NULL)
3310 DRM_INFO("PCI I/O BAR is not found.\n");
3311
3312 /* enable PCIE atomic ops */
3313 r = pci_enable_atomic_ops_to_root(adev->pdev,
3314 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3315 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3316 if (r) {
3317 adev->have_atomics_support = false;
3318 DRM_INFO("PCIE atomic ops is not supported\n");
3319 } else {
3320 adev->have_atomics_support = true;
3321 }
3322
3323 amdgpu_device_get_pcie_info(adev);
3324
3325 if (amdgpu_mcbp)
3326 DRM_INFO("MCBP is enabled\n");
3327
3328 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3329 adev->enable_mes = true;
3330
3331 /* detect hw virtualization here */
3332 amdgpu_detect_virtualization(adev);
3333
3334 r = amdgpu_device_get_job_timeout_settings(adev);
3335 if (r) {
3336 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3337 return r;
3338 }
3339
3340 /* early init functions */
3341 r = amdgpu_device_ip_early_init(adev);
3342 if (r)
3343 return r;
3344
3345 /* doorbell bar mapping and doorbell index init*/
3346 amdgpu_device_doorbell_init(adev);
3347
3348 /* if we have more than one VGA card, then disable the amdgpu VGA resources */
3349 /* this will fail for cards that aren't VGA class devices, just
3350 * ignore it */
3351 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3352
3353 if (amdgpu_device_supports_boco(ddev))
3354 boco = true;
3355 if (amdgpu_has_atpx() &&
3356 (amdgpu_is_atpx_hybrid() ||
3357 amdgpu_has_atpx_dgpu_power_cntl()) &&
3358 !pci_is_thunderbolt_attached(adev->pdev))
3359 vga_switcheroo_register_client(adev->pdev,
3360 &amdgpu_switcheroo_ops, boco);
3361 if (boco)
3362 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3363
3364 if (amdgpu_emu_mode == 1) {
3365 /* post the asic on emulation mode */
3366 emu_soc_asic_init(adev);
3367 goto fence_driver_init;
3368 }
3369
3370 /* detect if we are with an SRIOV vbios */
3371 amdgpu_device_detect_sriov_bios(adev);
3372
3373 /* check if we need to reset the asic
3374 * E.g., driver was not cleanly unloaded previously, etc.
3375 */
3376 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3377 r = amdgpu_asic_reset(adev);
3378 if (r) {
3379 dev_err(adev->dev, "asic reset on init failed\n");
3380 goto failed;
3381 }
3382 }
3383
3384 pci_enable_pcie_error_reporting(adev->ddev.pdev);
3385
3386 /* Post card if necessary */
3387 if (amdgpu_device_need_post(adev)) {
3388 if (!adev->bios) {
3389 dev_err(adev->dev, "no vBIOS found\n");
3390 r = -EINVAL;
3391 goto failed;
3392 }
3393 DRM_INFO("GPU posting now...\n");
3394 r = amdgpu_device_asic_init(adev);
3395 if (r) {
3396 dev_err(adev->dev, "gpu post error!\n");
3397 goto failed;
3398 }
3399 }
3400
3401 if (adev->is_atom_fw) {
3402 /* Initialize clocks */
3403 r = amdgpu_atomfirmware_get_clock_info(adev);
3404 if (r) {
3405 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3406 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3407 goto failed;
3408 }
3409 } else {
3410 /* Initialize clocks */
3411 r = amdgpu_atombios_get_clock_info(adev);
3412 if (r) {
3413 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3414 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3415 goto failed;
3416 }
3417 /* init i2c buses */
3418 if (!amdgpu_device_has_dc_support(adev))
3419 amdgpu_atombios_i2c_init(adev);
3420 }
3421
3422 fence_driver_init:
3423 /* Fence driver */
3424 r = amdgpu_fence_driver_init(adev);
3425 if (r) {
3426 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3427 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3428 goto failed;
3429 }
3430
3431 /* init the mode config */
3432 drm_mode_config_init(adev_to_drm(adev));
3433
3434 r = amdgpu_device_ip_init(adev);
3435 if (r) {
3436 /* failed in exclusive mode due to timeout */
3437 if (amdgpu_sriov_vf(adev) &&
3438 !amdgpu_sriov_runtime(adev) &&
3439 amdgpu_virt_mmio_blocked(adev) &&
3440 !amdgpu_virt_wait_reset(adev)) {
3441 dev_err(adev->dev, "VF exclusive mode timeout\n");
3442 /* Don't send request since VF is inactive. */
3443 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3444 adev->virt.ops = NULL;
3445 r = -EAGAIN;
3446 goto failed;
3447 }
3448 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3449 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3450 goto failed;
3451 }
3452
3453 dev_info(adev->dev,
3454 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3455 adev->gfx.config.max_shader_engines,
3456 adev->gfx.config.max_sh_per_se,
3457 adev->gfx.config.max_cu_per_sh,
3458 adev->gfx.cu_info.number);
3459
3460 adev->accel_working = true;
3461
3462 amdgpu_vm_check_compute_bug(adev);
3463
3464 /* Initialize the buffer migration limit. */
3465 if (amdgpu_moverate >= 0)
3466 max_MBps = amdgpu_moverate;
3467 else
3468 max_MBps = 8; /* Allow 8 MB/s. */
3469 /* Get a log2 for easy divisions. */
3470 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
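	/* e.g. the default of 8 MB/s gives log2_max_MBps = 3 */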
3471
3472 amdgpu_fbdev_init(adev);
3473
3474 r = amdgpu_pm_sysfs_init(adev);
3475 if (r) {
3476 adev->pm_sysfs_en = false;
3477 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3478 } else
3479 adev->pm_sysfs_en = true;
3480
3481 r = amdgpu_ucode_sysfs_init(adev);
3482 if (r) {
3483 adev->ucode_sysfs_en = false;
3484 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3485 } else
3486 adev->ucode_sysfs_en = true;
3487
3488 if ((amdgpu_testing & 1)) {
3489 if (adev->accel_working)
3490 amdgpu_test_moves(adev);
3491 else
3492 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3493 }
3494 if (amdgpu_benchmarking) {
3495 if (adev->accel_working)
3496 amdgpu_benchmark(adev, amdgpu_benchmarking);
3497 else
3498 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3499 }
3500
3501 /*
3502 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3503 * Otherwise the mgpu fan boost feature will be skipped because the
3504 * gpu instance count would be too low.
3505 */
3506 amdgpu_register_gpu_instance(adev);
3507
3508 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3509 * explicit gating rather than handling it automatically.
3510 */
3511 r = amdgpu_device_ip_late_init(adev);
3512 if (r) {
3513 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3514 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3515 goto failed;
3516 }
3517
3518 /* must succeed. */
3519 amdgpu_ras_resume(adev);
3520
3521 queue_delayed_work(system_wq, &adev->delayed_init_work,
3522 msecs_to_jiffies(AMDGPU_RESUME_MS));
3523
3524 if (amdgpu_sriov_vf(adev))
3525 flush_delayed_work(&adev->delayed_init_work);
3526
3527 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3528 if (r)
3529 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3530
3531 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3532 r = amdgpu_pmu_init(adev);
3533 if (r)
3534 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3535
3536 /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3537 if (amdgpu_device_cache_pci_state(adev->pdev))
3538 pci_restore_state(pdev);
3539
3540 return 0;
3541
3542 failed:
3543 amdgpu_vf_error_trans_all(adev);
3544 if (boco)
3545 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3546
3547 return r;
3548 }
3549
3550 /**
3551 * amdgpu_device_fini - tear down the driver
3552 *
3553 * @adev: amdgpu_device pointer
3554 *
3555 * Tear down the driver info (all asics).
3556 * Called at driver shutdown.
3557 */
3558 void amdgpu_device_fini(struct amdgpu_device *adev)
3559 {
3560 dev_info(adev->dev, "amdgpu: finishing device.\n");
3561 flush_delayed_work(&adev->delayed_init_work);
3562 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3563 adev->shutdown = true;
3564
3565 kfree(adev->pci_state);
3566
3567 	/* make sure the IB test has finished before entering exclusive mode
3568 	 * to avoid preempting the IB test
3569 	 */
3570 if (amdgpu_sriov_vf(adev)) {
3571 amdgpu_virt_request_full_gpu(adev, false);
3572 amdgpu_virt_fini_data_exchange(adev);
3573 }
3574
3575 /* disable all interrupts */
3576 amdgpu_irq_disable_all(adev);
3577 	if (adev->mode_info.mode_config_initialized) {
3578 if (!amdgpu_device_has_dc_support(adev))
3579 drm_helper_force_disable_all(adev_to_drm(adev));
3580 else
3581 drm_atomic_helper_shutdown(adev_to_drm(adev));
3582 }
3583 amdgpu_fence_driver_fini(adev);
3584 if (adev->pm_sysfs_en)
3585 amdgpu_pm_sysfs_fini(adev);
3586 amdgpu_fbdev_fini(adev);
3587 amdgpu_device_ip_fini(adev);
3588 release_firmware(adev->firmware.gpu_info_fw);
3589 adev->firmware.gpu_info_fw = NULL;
3590 adev->accel_working = false;
3591 /* free i2c buses */
3592 if (!amdgpu_device_has_dc_support(adev))
3593 amdgpu_i2c_fini(adev);
3594
3595 if (amdgpu_emu_mode != 1)
3596 amdgpu_atombios_fini(adev);
3597
3598 kfree(adev->bios);
3599 adev->bios = NULL;
3600 if (amdgpu_has_atpx() &&
3601 (amdgpu_is_atpx_hybrid() ||
3602 amdgpu_has_atpx_dgpu_power_cntl()) &&
3603 !pci_is_thunderbolt_attached(adev->pdev))
3604 vga_switcheroo_unregister_client(adev->pdev);
3605 if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3606 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3607 vga_client_register(adev->pdev, NULL, NULL, NULL);
3608 if (adev->rio_mem)
3609 pci_iounmap(adev->pdev, adev->rio_mem);
3610 adev->rio_mem = NULL;
3611 iounmap(adev->rmmio);
3612 adev->rmmio = NULL;
3613 amdgpu_device_doorbell_fini(adev);
3614
3615 if (adev->ucode_sysfs_en)
3616 amdgpu_ucode_sysfs_fini(adev);
3617
3618 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3619 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3620 amdgpu_pmu_fini(adev);
3621 if (adev->mman.discovery_bin)
3622 amdgpu_discovery_fini(adev);
3623 }
3624
3625
3626 /*
3627 * Suspend & resume.
3628 */
3629 /**
3630 * amdgpu_device_suspend - initiate device suspend
3631 *
3632 * @dev: drm dev pointer
3633 * @fbcon: notify the fbdev of suspend
3634 *
3635 * Puts the hw in the suspend state (all asics).
3636 * Returns 0 for success or an error on failure.
3637 * Called at driver suspend.
3638 */
3639 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3640 {
3641 struct amdgpu_device *adev;
3642 struct drm_crtc *crtc;
3643 struct drm_connector *connector;
3644 struct drm_connector_list_iter iter;
3645 int r;
3646
3647 adev = drm_to_adev(dev);
3648
3649 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3650 return 0;
3651
3652 adev->in_suspend = true;
3653 drm_kms_helper_poll_disable(dev);
3654
3655 if (fbcon)
3656 amdgpu_fbdev_set_suspend(adev, 1);
3657
3658 cancel_delayed_work_sync(&adev->delayed_init_work);
3659
3660 if (!amdgpu_device_has_dc_support(adev)) {
3661 /* turn off display hw */
3662 drm_modeset_lock_all(dev);
3663 drm_connector_list_iter_begin(dev, &iter);
3664 drm_for_each_connector_iter(connector, &iter)
3665 drm_helper_connector_dpms(connector,
3666 DRM_MODE_DPMS_OFF);
3667 drm_connector_list_iter_end(&iter);
3668 drm_modeset_unlock_all(dev);
3669 /* unpin the front buffers and cursors */
3670 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3671 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3672 struct drm_framebuffer *fb = crtc->primary->fb;
3673 struct amdgpu_bo *robj;
3674
3675 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3676 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3677 r = amdgpu_bo_reserve(aobj, true);
3678 if (r == 0) {
3679 amdgpu_bo_unpin(aobj);
3680 amdgpu_bo_unreserve(aobj);
3681 }
3682 }
3683
3684 if (fb == NULL || fb->obj[0] == NULL) {
3685 continue;
3686 }
3687 robj = gem_to_amdgpu_bo(fb->obj[0]);
3688 /* don't unpin kernel fb objects */
3689 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3690 r = amdgpu_bo_reserve(robj, true);
3691 if (r == 0) {
3692 amdgpu_bo_unpin(robj);
3693 amdgpu_bo_unreserve(robj);
3694 }
3695 }
3696 }
3697 }
3698
3699 amdgpu_ras_suspend(adev);
3700
3701 r = amdgpu_device_ip_suspend_phase1(adev);
3702
3703 amdgpu_amdkfd_suspend(adev, !fbcon);
3704
3705 /* evict vram memory */
3706 amdgpu_bo_evict_vram(adev);
3707
3708 amdgpu_fence_driver_suspend(adev);
3709
3710 r = amdgpu_device_ip_suspend_phase2(adev);
3711
3712 /* evict remaining vram memory
3713 * This second call to evict vram is to evict the gart page table
3714 * using the CPU.
3715 */
3716 amdgpu_bo_evict_vram(adev);
3717
3718 return 0;
3719 }
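
/*
 * Illustrative (hypothetical) caller sketch, only to show the expected
 * suspend/resume pairing; the real glue lives in the driver's pm_ops:
 *
 *	static int example_pmops_suspend(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_suspend(drm_dev, true);
 *	}
 *
 * with a matching example_pmops_resume() calling
 * amdgpu_device_resume(drm_dev, true) on the way back up.
 */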
3720
3721 /**
3722 * amdgpu_device_resume - initiate device resume
3723 *
3724 * @dev: drm dev pointer
3725 * @fbcon: notify the fbdev of resume
3726 *
3727 * Bring the hw back to operating state (all asics).
3728 * Returns 0 for success or an error on failure.
3729 * Called at driver resume.
3730 */
3731 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3732 {
3733 struct drm_connector *connector;
3734 struct drm_connector_list_iter iter;
3735 struct amdgpu_device *adev = drm_to_adev(dev);
3736 struct drm_crtc *crtc;
3737 int r = 0;
3738
3739 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3740 return 0;
3741
3742 /* post card */
3743 if (amdgpu_device_need_post(adev)) {
3744 r = amdgpu_device_asic_init(adev);
3745 if (r)
3746 dev_err(adev->dev, "amdgpu asic init failed\n");
3747 }
3748
3749 r = amdgpu_device_ip_resume(adev);
3750 if (r) {
3751 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3752 return r;
3753 }
3754 amdgpu_fence_driver_resume(adev);
3755
3756
3757 r = amdgpu_device_ip_late_init(adev);
3758 if (r)
3759 return r;
3760
3761 queue_delayed_work(system_wq, &adev->delayed_init_work,
3762 msecs_to_jiffies(AMDGPU_RESUME_MS));
3763
3764 if (!amdgpu_device_has_dc_support(adev)) {
3765 /* pin cursors */
3766 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3767 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3768
3769 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3770 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3771 r = amdgpu_bo_reserve(aobj, true);
3772 if (r == 0) {
3773 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3774 if (r != 0)
3775 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3776 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3777 amdgpu_bo_unreserve(aobj);
3778 }
3779 }
3780 }
3781 }
3782 r = amdgpu_amdkfd_resume(adev, !fbcon);
3783 if (r)
3784 return r;
3785
3786 	/* Make sure the IB tests have finished (flush the delayed init work) */
3787 flush_delayed_work(&adev->delayed_init_work);
3788
3789 /* blat the mode back in */
3790 if (fbcon) {
3791 if (!amdgpu_device_has_dc_support(adev)) {
3792 /* pre DCE11 */
3793 drm_helper_resume_force_mode(dev);
3794
3795 /* turn on display hw */
3796 drm_modeset_lock_all(dev);
3797
3798 drm_connector_list_iter_begin(dev, &iter);
3799 drm_for_each_connector_iter(connector, &iter)
3800 drm_helper_connector_dpms(connector,
3801 DRM_MODE_DPMS_ON);
3802 drm_connector_list_iter_end(&iter);
3803
3804 drm_modeset_unlock_all(dev);
3805 }
3806 amdgpu_fbdev_set_suspend(adev, 0);
3807 }
3808
3809 drm_kms_helper_poll_enable(dev);
3810
3811 amdgpu_ras_resume(adev);
3812
3813 /*
3814 * Most of the connector probing functions try to acquire runtime pm
3815 * refs to ensure that the GPU is powered on when connector polling is
3816 * performed. Since we're calling this from a runtime PM callback,
3817 * trying to acquire rpm refs will cause us to deadlock.
3818 *
3819 * Since we're guaranteed to be holding the rpm lock, it's safe to
3820 * temporarily disable the rpm helpers so this doesn't deadlock us.
3821 */
3822 #ifdef CONFIG_PM
3823 dev->dev->power.disable_depth++;
3824 #endif
3825 if (!amdgpu_device_has_dc_support(adev))
3826 drm_helper_hpd_irq_event(dev);
3827 else
3828 drm_kms_helper_hotplug_event(dev);
3829 #ifdef CONFIG_PM
3830 dev->dev->power.disable_depth--;
3831 #endif
3832 adev->in_suspend = false;
3833
3834 return 0;
3835 }
3836
3837 /**
3838 * amdgpu_device_ip_check_soft_reset - check whether any IP blocks are hung
3839 *
3840 * @adev: amdgpu_device pointer
3841 *
3842 * The list of all the hardware IPs that make up the asic is walked and
3843 * the check_soft_reset callbacks are run. check_soft_reset determines
3844 * if the asic is still hung or not.
3845 * Returns true if any of the IPs are still in a hung state, false if not.
3846 */
3847 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3848 {
3849 int i;
3850 bool asic_hang = false;
3851
3852 if (amdgpu_sriov_vf(adev))
3853 return true;
3854
3855 if (amdgpu_asic_need_full_reset(adev))
3856 return true;
3857
3858 for (i = 0; i < adev->num_ip_blocks; i++) {
3859 if (!adev->ip_blocks[i].status.valid)
3860 continue;
3861 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3862 adev->ip_blocks[i].status.hang =
3863 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3864 if (adev->ip_blocks[i].status.hang) {
3865 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3866 asic_hang = true;
3867 }
3868 }
3869 return asic_hang;
3870 }
3871
3872 /**
3873 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3874 *
3875 * @adev: amdgpu_device pointer
3876 *
3877 * The list of all the hardware IPs that make up the asic is walked and the
3878 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3879 * handles any IP specific hardware or software state changes that are
3880 * necessary for a soft reset to succeed.
3881 * Returns 0 on success, negative error code on failure.
3882 */
3883 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3884 {
3885 int i, r = 0;
3886
3887 for (i = 0; i < adev->num_ip_blocks; i++) {
3888 if (!adev->ip_blocks[i].status.valid)
3889 continue;
3890 if (adev->ip_blocks[i].status.hang &&
3891 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3892 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3893 if (r)
3894 return r;
3895 }
3896 }
3897
3898 return 0;
3899 }
3900
3901 /**
3902 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3903 *
3904 * @adev: amdgpu_device pointer
3905 *
3906 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3907 * reset is necessary to recover.
3908 * Returns true if a full asic reset is required, false if not.
3909 */
3910 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3911 {
3912 int i;
3913
3914 if (amdgpu_asic_need_full_reset(adev))
3915 return true;
3916
3917 for (i = 0; i < adev->num_ip_blocks; i++) {
3918 if (!adev->ip_blocks[i].status.valid)
3919 continue;
3920 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3921 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3922 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3923 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3924 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3925 if (adev->ip_blocks[i].status.hang) {
3926 				dev_info(adev->dev, "Some blocks need a full reset!\n");
3927 return true;
3928 }
3929 }
3930 }
3931 return false;
3932 }
3933
3934 /**
3935 * amdgpu_device_ip_soft_reset - do a soft reset
3936 *
3937 * @adev: amdgpu_device pointer
3938 *
3939 * The list of all the hardware IPs that make up the asic is walked and the
3940 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3941 * IP specific hardware or software state changes that are necessary to soft
3942 * reset the IP.
3943 * Returns 0 on success, negative error code on failure.
3944 */
3945 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3946 {
3947 int i, r = 0;
3948
3949 for (i = 0; i < adev->num_ip_blocks; i++) {
3950 if (!adev->ip_blocks[i].status.valid)
3951 continue;
3952 if (adev->ip_blocks[i].status.hang &&
3953 adev->ip_blocks[i].version->funcs->soft_reset) {
3954 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3955 if (r)
3956 return r;
3957 }
3958 }
3959
3960 return 0;
3961 }
3962
3963 /**
3964 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3965 *
3966 * @adev: amdgpu_device pointer
3967 *
3968 * The list of all the hardware IPs that make up the asic is walked and the
3969 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3970 * handles any IP specific hardware or software state changes that are
3971 * necessary after the IP has been soft reset.
3972 * Returns 0 on success, negative error code on failure.
3973 */
3974 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3975 {
3976 int i, r = 0;
3977
3978 for (i = 0; i < adev->num_ip_blocks; i++) {
3979 if (!adev->ip_blocks[i].status.valid)
3980 continue;
3981 if (adev->ip_blocks[i].status.hang &&
3982 adev->ip_blocks[i].version->funcs->post_soft_reset)
3983 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3984 if (r)
3985 return r;
3986 }
3987
3988 return 0;
3989 }
3990
3991 /**
3992 * amdgpu_device_recover_vram - Recover some VRAM contents
3993 *
3994 * @adev: amdgpu_device pointer
3995 *
3996 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
3997 * restore things like GPUVM page tables after a GPU reset where
3998 * the contents of VRAM might be lost.
3999 *
4000 * Returns:
4001 * 0 on success, negative error code on failure.
4002 */
4003 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4004 {
4005 struct dma_fence *fence = NULL, *next = NULL;
4006 struct amdgpu_bo *shadow;
4007 long r = 1, tmo;
4008
4009 if (amdgpu_sriov_runtime(adev))
4010 tmo = msecs_to_jiffies(8000);
4011 else
4012 tmo = msecs_to_jiffies(100);
4013
4014 dev_info(adev->dev, "recover vram bo from shadow start\n");
4015 mutex_lock(&adev->shadow_list_lock);
4016 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4017
4018 /* No need to recover an evicted BO */
4019 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4020 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4021 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4022 continue;
4023
4024 r = amdgpu_bo_restore_shadow(shadow, &next);
4025 if (r)
4026 break;
4027
4028 if (fence) {
4029 tmo = dma_fence_wait_timeout(fence, false, tmo);
4030 dma_fence_put(fence);
4031 fence = next;
4032 if (tmo == 0) {
4033 r = -ETIMEDOUT;
4034 break;
4035 } else if (tmo < 0) {
4036 r = tmo;
4037 break;
4038 }
4039 } else {
4040 fence = next;
4041 }
4042 }
4043 mutex_unlock(&adev->shadow_list_lock);
4044
4045 if (fence)
4046 tmo = dma_fence_wait_timeout(fence, false, tmo);
4047 dma_fence_put(fence);
4048
4049 if (r < 0 || tmo <= 0) {
4050 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4051 return -EIO;
4052 }
4053
4054 dev_info(adev->dev, "recover vram bo from shadow done\n");
4055 return 0;
4056 }
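
/*
 * Note on the loop above: it pipelines the shadow restores and only waits on
 * the fence of the previous restore while the next one is already queued.
 * The same timeout budget (tmo) is carried across waits, since
 * dma_fence_wait_timeout() returns the remaining time, so a slow chain of
 * restores is reported as a failure rather than stalling recovery forever.
 */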
4057
4058
4059 /**
4060 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4061 *
4062 * @adev: amdgpu_device pointer
4063 * @from_hypervisor: request from hypervisor
4064 *
4065 * Do a VF FLR and reinitialize the ASIC.
4066 * Returns 0 on success, negative error code on failure.
4067 */
4068 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4069 bool from_hypervisor)
4070 {
4071 int r;
4072
4073 if (from_hypervisor)
4074 r = amdgpu_virt_request_full_gpu(adev, true);
4075 else
4076 r = amdgpu_virt_reset_gpu(adev);
4077 if (r)
4078 return r;
4079
4080 amdgpu_amdkfd_pre_reset(adev);
4081
4082 /* Resume IP prior to SMC */
4083 r = amdgpu_device_ip_reinit_early_sriov(adev);
4084 if (r)
4085 goto error;
4086
4087 amdgpu_virt_init_data_exchange(adev);
4088 	/* we need to recover the GART prior to resuming SMC/CP/SDMA */
4089 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4090
4091 r = amdgpu_device_fw_loading(adev);
4092 if (r)
4093 return r;
4094
4095 /* now we are okay to resume SMC/CP/SDMA */
4096 r = amdgpu_device_ip_reinit_late_sriov(adev);
4097 if (r)
4098 goto error;
4099
4100 amdgpu_irq_gpu_reset_resume_helper(adev);
4101 r = amdgpu_ib_ring_tests(adev);
4102 amdgpu_amdkfd_post_reset(adev);
4103
4104 error:
4105 amdgpu_virt_release_full_gpu(adev, true);
4106 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4107 amdgpu_inc_vram_lost(adev);
4108 r = amdgpu_device_recover_vram(adev);
4109 }
4110
4111 return r;
4112 }
4113
4114 /**
4115 * amdgpu_device_has_job_running - check if there is any job in mirror list
4116 *
4117 * @adev: amdgpu_device pointer
4118 *
4119 * Check whether any ring's mirror list still holds a job; returns true if so.
4120 */
4121 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4122 {
4123 int i;
4124 struct drm_sched_job *job;
4125
4126 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4127 struct amdgpu_ring *ring = adev->rings[i];
4128
4129 if (!ring || !ring->sched.thread)
4130 continue;
4131
4132 spin_lock(&ring->sched.job_list_lock);
4133 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4134 struct drm_sched_job, node);
4135 spin_unlock(&ring->sched.job_list_lock);
4136 if (job)
4137 return true;
4138 }
4139 return false;
4140 }
4141
4142 /**
4143 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4144 *
4145 * @adev: amdgpu_device pointer
4146 *
4147 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4148 * a hung GPU.
4149 */
4150 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4151 {
4152 if (!amdgpu_device_ip_check_soft_reset(adev)) {
4153 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4154 return false;
4155 }
4156
4157 if (amdgpu_gpu_recovery == 0)
4158 goto disabled;
4159
4160 if (amdgpu_sriov_vf(adev))
4161 return true;
4162
4163 if (amdgpu_gpu_recovery == -1) {
4164 switch (adev->asic_type) {
4165 case CHIP_BONAIRE:
4166 case CHIP_HAWAII:
4167 case CHIP_TOPAZ:
4168 case CHIP_TONGA:
4169 case CHIP_FIJI:
4170 case CHIP_POLARIS10:
4171 case CHIP_POLARIS11:
4172 case CHIP_POLARIS12:
4173 case CHIP_VEGAM:
4174 case CHIP_VEGA20:
4175 case CHIP_VEGA10:
4176 case CHIP_VEGA12:
4177 case CHIP_RAVEN:
4178 case CHIP_ARCTURUS:
4179 case CHIP_RENOIR:
4180 case CHIP_NAVI10:
4181 case CHIP_NAVI14:
4182 case CHIP_NAVI12:
4183 case CHIP_SIENNA_CICHLID:
4184 break;
4185 default:
4186 goto disabled;
4187 }
4188 }
4189
4190 return true;
4191
4192 disabled:
4193 dev_info(adev->dev, "GPU recovery disabled.\n");
4194 return false;
4195 }
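
/*
 * Note: amdgpu_gpu_recovery mirrors the gpu_recovery module parameter
 * (-1 = auto, i.e. the per-ASIC list above, 0 = disabled, 1 = enabled).
 * For example, booting with amdgpu.gpu_recovery=1 is expected to make this
 * function return true for any ASIC that reports a hang, while 0 always
 * takes the "disabled" path.
 */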
4196
4197
4198 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4199 struct amdgpu_job *job,
4200 bool *need_full_reset_arg)
4201 {
4202 int i, r = 0;
4203 bool need_full_reset = *need_full_reset_arg;
4204
4205 amdgpu_debugfs_wait_dump(adev);
4206
4207 if (amdgpu_sriov_vf(adev)) {
4208 /* stop the data exchange thread */
4209 amdgpu_virt_fini_data_exchange(adev);
4210 }
4211
4212 /* block all schedulers and reset given job's ring */
4213 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4214 struct amdgpu_ring *ring = adev->rings[i];
4215
4216 if (!ring || !ring->sched.thread)
4217 continue;
4218
4219 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4220 amdgpu_fence_driver_force_completion(ring);
4221 }
4222
4223 	if (job)
4224 drm_sched_increase_karma(&job->base);
4225
4226 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4227 if (!amdgpu_sriov_vf(adev)) {
4228
4229 if (!need_full_reset)
4230 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4231
4232 if (!need_full_reset) {
4233 amdgpu_device_ip_pre_soft_reset(adev);
4234 r = amdgpu_device_ip_soft_reset(adev);
4235 amdgpu_device_ip_post_soft_reset(adev);
4236 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4237 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4238 need_full_reset = true;
4239 }
4240 }
4241
4242 if (need_full_reset)
4243 r = amdgpu_device_ip_suspend(adev);
4244
4245 *need_full_reset_arg = need_full_reset;
4246 }
4247
4248 return r;
4249 }
4250
4251 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4252 struct list_head *device_list_handle,
4253 bool *need_full_reset_arg,
4254 bool skip_hw_reset)
4255 {
4256 struct amdgpu_device *tmp_adev = NULL;
4257 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4258 int r = 0;
4259
4260 /*
4261 	 * ASIC reset has to be done on all XGMI hive nodes ASAP
4262 * to allow proper links negotiation in FW (within 1 sec)
4263 */
4264 if (!skip_hw_reset && need_full_reset) {
4265 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4266 /* For XGMI run all resets in parallel to speed up the process */
4267 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4268 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4269 r = -EALREADY;
4270 } else
4271 r = amdgpu_asic_reset(tmp_adev);
4272
4273 if (r) {
4274 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4275 r, adev_to_drm(tmp_adev)->unique);
4276 break;
4277 }
4278 }
4279
4280 /* For XGMI wait for all resets to complete before proceed */
4281 if (!r) {
4282 list_for_each_entry(tmp_adev, device_list_handle,
4283 gmc.xgmi.head) {
4284 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4285 flush_work(&tmp_adev->xgmi_reset_work);
4286 r = tmp_adev->asic_reset_res;
4287 if (r)
4288 break;
4289 }
4290 }
4291 }
4292 }
4293
4294 if (!r && amdgpu_ras_intr_triggered()) {
4295 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4296 if (tmp_adev->mmhub.funcs &&
4297 tmp_adev->mmhub.funcs->reset_ras_error_count)
4298 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4299 }
4300
4301 amdgpu_ras_intr_cleared();
4302 }
4303
4304 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4305 if (need_full_reset) {
4306 /* post card */
4307 if (amdgpu_device_asic_init(tmp_adev))
4308 dev_warn(tmp_adev->dev, "asic atom init failed!");
4309
4310 if (!r) {
4311 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4312 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4313 if (r)
4314 goto out;
4315
4316 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4317 if (r)
4318 goto out;
4319
4320 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4321 if (vram_lost) {
4322 DRM_INFO("VRAM is lost due to GPU reset!\n");
4323 amdgpu_inc_vram_lost(tmp_adev);
4324 }
4325
4326 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4327 if (r)
4328 goto out;
4329
4330 r = amdgpu_device_fw_loading(tmp_adev);
4331 if (r)
4332 return r;
4333
4334 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4335 if (r)
4336 goto out;
4337
4338 if (vram_lost)
4339 amdgpu_device_fill_reset_magic(tmp_adev);
4340
4341 /*
4342 				 * Add this ASIC back as tracked since the reset
4343 				 * completed successfully.
4344 */
4345 amdgpu_register_gpu_instance(tmp_adev);
4346
4347 r = amdgpu_device_ip_late_init(tmp_adev);
4348 if (r)
4349 goto out;
4350
4351 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4352
4353 /*
4354 				 * The GPU enters a bad state once the number of
4355 				 * faulty pages reported by ECC reaches the
4356 				 * threshold, and RAS recovery is scheduled next.
4357 				 * Check here and break recovery if the bad page
4358 				 * threshold has indeed been exceeded, reminding
4359 				 * the user to either retire this GPU or set a
4360 				 * bigger bad_page_threshold value before probing
4361 				 * the driver again.
4362 */
4363 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4364 /* must succeed. */
4365 amdgpu_ras_resume(tmp_adev);
4366 } else {
4367 r = -EINVAL;
4368 goto out;
4369 }
4370
4371 /* Update PSP FW topology after reset */
4372 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4373 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4374 }
4375 }
4376
4377 out:
4378 if (!r) {
4379 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4380 r = amdgpu_ib_ring_tests(tmp_adev);
4381 if (r) {
4382 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4383 need_full_reset = true;
4384 r = -EAGAIN;
4385 goto end;
4386 }
4387 }
4388
4389 if (!r)
4390 r = amdgpu_device_recover_vram(tmp_adev);
4391 else
4392 tmp_adev->asic_reset_res = r;
4393 }
4394
4395 end:
4396 *need_full_reset_arg = need_full_reset;
4397 return r;
4398 }
4399
4400 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4401 struct amdgpu_hive_info *hive)
4402 {
4403 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4404 return false;
4405
4406 if (hive) {
4407 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4408 } else {
4409 down_write(&adev->reset_sem);
4410 }
4411
4412 atomic_inc(&adev->gpu_reset_counter);
4413 switch (amdgpu_asic_reset_method(adev)) {
4414 case AMD_RESET_METHOD_MODE1:
4415 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4416 break;
4417 case AMD_RESET_METHOD_MODE2:
4418 adev->mp1_state = PP_MP1_STATE_RESET;
4419 break;
4420 default:
4421 adev->mp1_state = PP_MP1_STATE_NONE;
4422 break;
4423 }
4424
4425 return true;
4426 }
4427
4428 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4429 {
4430 amdgpu_vf_error_trans_all(adev);
4431 adev->mp1_state = PP_MP1_STATE_NONE;
4432 atomic_set(&adev->in_gpu_reset, 0);
4433 up_write(&adev->reset_sem);
4434 }
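
/*
 * amdgpu_device_lock_adev() and amdgpu_device_unlock_adev() are meant to be
 * used strictly as a pair around a recovery attempt: the lock side claims
 * in_gpu_reset, takes reset_sem and programs mp1_state for the chosen reset
 * method, while the unlock side flushes pending VF errors, restores
 * mp1_state to PP_MP1_STATE_NONE and releases the semaphore.
 */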
4435
4436 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4437 {
4438 struct pci_dev *p = NULL;
4439
4440 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4441 adev->pdev->bus->number, 1);
4442 if (p) {
4443 pm_runtime_enable(&(p->dev));
4444 pm_runtime_resume(&(p->dev));
4445 }
4446
4447 pci_dev_put(p);
4448 }
4449
4450 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4451 {
4452 enum amd_reset_method reset_method;
4453 struct pci_dev *p = NULL;
4454 u64 expires;
4455
4456 /*
4457 * For now, only BACO and mode1 reset are confirmed
4458 	 * to suffer the audio issue when the audio device is not properly suspended.
4459 */
4460 reset_method = amdgpu_asic_reset_method(adev);
4461 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4462 (reset_method != AMD_RESET_METHOD_MODE1))
4463 return -EINVAL;
4464
4465 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4466 adev->pdev->bus->number, 1);
4467 if (!p)
4468 return -ENODEV;
4469
4470 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4471 if (!expires)
4472 /*
4473 * If we cannot get the audio device autosuspend delay,
4474 		 * a fixed 4s interval will be used. Since 3s is the audio
4475 		 * controller's default autosuspend delay setting, the 4s
4476 		 * used here is guaranteed to cover it.
4477 */
4478 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4479
4480 while (!pm_runtime_status_suspended(&(p->dev))) {
4481 if (!pm_runtime_suspend(&(p->dev)))
4482 break;
4483
4484 if (expires < ktime_get_mono_fast_ns()) {
4485 dev_warn(adev->dev, "failed to suspend display audio\n");
4486 pci_dev_put(p);
4487 /* TODO: abort the succeeding gpu reset? */
4488 return -ETIMEDOUT;
4489 }
4490 }
4491
4492 pm_runtime_disable(&(p->dev));
4493
4494 pci_dev_put(p);
4495 return 0;
4496 }
4497
4498 /**
4499 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4500 *
4501 * @adev: amdgpu_device pointer
4502 * @job: which job triggered the hang
4503 *
4504 * Attempt to reset the GPU if it has hung (all asics).
4505 * Attempt to do a soft reset or full reset and reinitialize the ASIC.
4506 * Returns 0 for success or an error on failure.
4507 */
4508
4509 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4510 struct amdgpu_job *job)
4511 {
4512 struct list_head device_list, *device_list_handle = NULL;
4513 bool need_full_reset = false;
4514 bool job_signaled = false;
4515 struct amdgpu_hive_info *hive = NULL;
4516 struct amdgpu_device *tmp_adev = NULL;
4517 int i, r = 0;
4518 bool need_emergency_restart = false;
4519 bool audio_suspended = false;
4520
4521 /*
4522 * Special case: RAS triggered and full reset isn't supported
4523 */
4524 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4525
4526 /*
4527 * Flush RAM to disk so that after reboot
4528 	 * the user can read the log and see why the system rebooted.
4529 */
4530 if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4531 DRM_WARN("Emergency reboot.");
4532
4533 ksys_sync_helper();
4534 emergency_restart();
4535 }
4536
4537 dev_info(adev->dev, "GPU %s begin!\n",
4538 need_emergency_restart ? "jobs stop":"reset");
4539
4540 /*
4541 	 * Here we trylock to avoid a chain of resets executing, triggered
4542 	 * either by jobs on different adevs in the XGMI hive or by jobs on
4543 	 * different schedulers of the same device, while this TO handler is running.
4544 	 * We always reset all schedulers for a device and all devices in an
4545 	 * XGMI hive, so that should take care of them too.
4546 */
4547 hive = amdgpu_get_xgmi_hive(adev);
4548 if (hive) {
4549 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4550 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4551 job ? job->base.id : -1, hive->hive_id);
4552 amdgpu_put_xgmi_hive(hive);
4553 return 0;
4554 }
4555 mutex_lock(&hive->hive_lock);
4556 }
4557
4558 /*
4559 * Build list of devices to reset.
4560 * In case we are in XGMI hive mode, resort the device list
4561 * to put adev in the 1st position.
4562 */
4563 INIT_LIST_HEAD(&device_list);
4564 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4565 if (!hive)
4566 return -ENODEV;
4567 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4568 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4569 device_list_handle = &hive->device_list;
4570 } else {
4571 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4572 device_list_handle = &device_list;
4573 }
4574
4575 /* block all schedulers and reset given job's ring */
4576 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4577 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4578 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4579 job ? job->base.id : -1);
4580 r = 0;
4581 goto skip_recovery;
4582 }
4583
4584 /*
4585 		 * Try to put the audio codec into suspend state
4586 		 * before the gpu reset starts.
4587 		 *
4588 		 * The power domain of the graphics device is shared
4589 		 * with the AZ power domain. Without this, we may
4590 		 * change the audio hardware from behind the audio
4591 		 * driver's back, which will trigger
4592 		 * some audio codec errors.
4593 */
4594 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4595 audio_suspended = true;
4596
4597 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4598
4599 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4600
4601 if (!amdgpu_sriov_vf(tmp_adev))
4602 amdgpu_amdkfd_pre_reset(tmp_adev);
4603
4604 /*
4605 		 * Mark the ASICs to be reset as untracked first,
4606 		 * and add them back after the reset has completed.
4607 */
4608 amdgpu_unregister_gpu_instance(tmp_adev);
4609
4610 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4611
4612 /* disable ras on ALL IPs */
4613 if (!need_emergency_restart &&
4614 amdgpu_device_ip_need_full_reset(tmp_adev))
4615 amdgpu_ras_suspend(tmp_adev);
4616
4617 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4618 struct amdgpu_ring *ring = tmp_adev->rings[i];
4619
4620 if (!ring || !ring->sched.thread)
4621 continue;
4622
4623 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4624
4625 if (need_emergency_restart)
4626 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4627 }
4628 }
4629
4630 if (need_emergency_restart)
4631 goto skip_sched_resume;
4632
4633 /*
4634 * Must check guilty signal here since after this point all old
4635 * HW fences are force signaled.
4636 *
4637 * job->base holds a reference to parent fence
4638 */
4639 if (job && job->base.s_fence->parent &&
4640 dma_fence_is_signaled(job->base.s_fence->parent)) {
4641 job_signaled = true;
4642 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4643 goto skip_hw_reset;
4644 }
4645
4646 retry: /* Rest of adevs pre asic reset from XGMI hive. */
4647 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4648 r = amdgpu_device_pre_asic_reset(tmp_adev,
4649 (tmp_adev == adev) ? job : NULL,
4650 &need_full_reset);
4651 		/* TODO: should we stop? */
4652 if (r) {
4653 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4654 r, adev_to_drm(tmp_adev)->unique);
4655 tmp_adev->asic_reset_res = r;
4656 }
4657 }
4658
4659 /* Actual ASIC resets if needed.*/
4660 /* TODO Implement XGMI hive reset logic for SRIOV */
4661 if (amdgpu_sriov_vf(adev)) {
4662 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4663 if (r)
4664 adev->asic_reset_res = r;
4665 } else {
4666 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4667 if (r && r == -EAGAIN)
4668 goto retry;
4669 }
4670
4671 skip_hw_reset:
4672
4673 	/* Post ASIC reset for all devs. */
4674 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4675
4676 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4677 struct amdgpu_ring *ring = tmp_adev->rings[i];
4678
4679 if (!ring || !ring->sched.thread)
4680 continue;
4681
4682 			/* No point in resubmitting jobs if we didn't do a HW reset */
4683 if (!tmp_adev->asic_reset_res && !job_signaled)
4684 drm_sched_resubmit_jobs(&ring->sched);
4685
4686 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4687 }
4688
4689 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4690 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4691 }
4692
4693 tmp_adev->asic_reset_res = 0;
4694
4695 if (r) {
4696 /* bad news, how to tell it to userspace ? */
4697 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4698 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4699 } else {
4700 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4701 }
4702 }
4703
4704 skip_sched_resume:
4705 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4706 /*unlock kfd: SRIOV would do it separately */
4707 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4708 amdgpu_amdkfd_post_reset(tmp_adev);
4709 if (audio_suspended)
4710 amdgpu_device_resume_display_audio(tmp_adev);
4711 amdgpu_device_unlock_adev(tmp_adev);
4712 }
4713
4714 skip_recovery:
4715 if (hive) {
4716 atomic_set(&hive->in_reset, 0);
4717 mutex_unlock(&hive->hive_lock);
4718 amdgpu_put_xgmi_hive(hive);
4719 }
4720
4721 if (r)
4722 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4723 return r;
4724 }
4725
4726 /**
4727 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4728 *
4729 * @adev: amdgpu_device pointer
4730 *
4731 * Fetches and stores in the driver the PCIE capabilities (gen speed
4732 * and lanes) of the slot the device is in. Handles APUs and
4733 * virtualized environments where PCIE config space may not be available.
4734 */
4735 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4736 {
4737 struct pci_dev *pdev;
4738 enum pci_bus_speed speed_cap, platform_speed_cap;
4739 enum pcie_link_width platform_link_width;
4740
4741 if (amdgpu_pcie_gen_cap)
4742 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4743
4744 if (amdgpu_pcie_lane_cap)
4745 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4746
4747 /* covers APUs as well */
4748 if (pci_is_root_bus(adev->pdev->bus)) {
4749 if (adev->pm.pcie_gen_mask == 0)
4750 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4751 if (adev->pm.pcie_mlw_mask == 0)
4752 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4753 return;
4754 }
4755
4756 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4757 return;
4758
4759 pcie_bandwidth_available(adev->pdev, NULL,
4760 &platform_speed_cap, &platform_link_width);
4761
4762 if (adev->pm.pcie_gen_mask == 0) {
4763 /* asic caps */
4764 pdev = adev->pdev;
4765 speed_cap = pcie_get_speed_cap(pdev);
4766 if (speed_cap == PCI_SPEED_UNKNOWN) {
4767 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4768 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4769 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4770 } else {
4771 if (speed_cap == PCIE_SPEED_16_0GT)
4772 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4773 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4774 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4775 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4776 else if (speed_cap == PCIE_SPEED_8_0GT)
4777 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4778 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4779 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4780 else if (speed_cap == PCIE_SPEED_5_0GT)
4781 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4782 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4783 else
4784 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4785 }
4786 /* platform caps */
4787 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4788 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4789 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4790 } else {
4791 if (platform_speed_cap == PCIE_SPEED_16_0GT)
4792 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4793 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4794 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4795 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4796 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4797 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4798 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4799 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4800 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4801 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4802 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4803 else
4804 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4805
4806 }
4807 }
4808 if (adev->pm.pcie_mlw_mask == 0) {
4809 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4810 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4811 } else {
4812 switch (platform_link_width) {
4813 case PCIE_LNK_X32:
4814 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4815 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4816 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4817 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4818 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4819 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4821 break;
4822 case PCIE_LNK_X16:
4823 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4824 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4825 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4826 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4827 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4828 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4829 break;
4830 case PCIE_LNK_X12:
4831 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4833 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4834 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4835 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4836 break;
4837 case PCIE_LNK_X8:
4838 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4839 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4840 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4841 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4842 break;
4843 case PCIE_LNK_X4:
4844 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4845 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4846 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4847 break;
4848 case PCIE_LNK_X2:
4849 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4850 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4851 break;
4852 case PCIE_LNK_X1:
4853 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4854 break;
4855 default:
4856 break;
4857 }
4858 }
4859 }
4860 }
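
/*
 * Note: when both the pcie_gen_cap and pcie_lane_cap module parameters are
 * set to non-zero values, the probing above is skipped entirely and those
 * values are used as the masks directly; any override is therefore assumed
 * to be expressed in the same CAIL_* bitfield terms used here.
 */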
4861
4862 int amdgpu_device_baco_enter(struct drm_device *dev)
4863 {
4864 struct amdgpu_device *adev = drm_to_adev(dev);
4865 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4866
4867 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4868 return -ENOTSUPP;
4869
4870 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4871 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4872
4873 return amdgpu_dpm_baco_enter(adev);
4874 }
4875
4876 int amdgpu_device_baco_exit(struct drm_device *dev)
4877 {
4878 struct amdgpu_device *adev = drm_to_adev(dev);
4879 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4880 int ret = 0;
4881
4882 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4883 return -ENOTSUPP;
4884
4885 ret = amdgpu_dpm_baco_exit(adev);
4886 if (ret)
4887 return ret;
4888
4889 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4890 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4891
4892 return 0;
4893 }
4894
4895 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4896 {
4897 int i;
4898
4899 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4900 struct amdgpu_ring *ring = adev->rings[i];
4901
4902 if (!ring || !ring->sched.thread)
4903 continue;
4904
4905 cancel_delayed_work_sync(&ring->sched.work_tdr);
4906 }
4907 }
4908
4909 /**
4910 * amdgpu_pci_error_detected - Called when a PCI error is detected.
4911 * @pdev: PCI device struct
4912 * @state: PCI channel state
4913 *
4914 * Description: Called when a PCI error is detected.
4915 *
4916 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4917 */
4918 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4919 {
4920 struct drm_device *dev = pci_get_drvdata(pdev);
4921 struct amdgpu_device *adev = drm_to_adev(dev);
4922 int i;
4923
4924 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4925
4926 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4927 DRM_WARN("No support for XGMI hive yet...");
4928 return PCI_ERS_RESULT_DISCONNECT;
4929 }
4930
4931 switch (state) {
4932 case pci_channel_io_normal:
4933 return PCI_ERS_RESULT_CAN_RECOVER;
4934 /* Fatal error, prepare for slot reset */
4935 case pci_channel_io_frozen:
4936 /*
4937 * Cancel and wait for all TDRs in progress if failing to
4938 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4939 *
4940 * Locking adev->reset_sem will prevent any external access
4941 * to GPU during PCI error recovery
4942 */
4943 while (!amdgpu_device_lock_adev(adev, NULL))
4944 amdgpu_cancel_all_tdr(adev);
4945
4946 /*
4947 * Block any work scheduling as we do for regular GPU reset
4948 * for the duration of the recovery
4949 */
4950 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4951 struct amdgpu_ring *ring = adev->rings[i];
4952
4953 if (!ring || !ring->sched.thread)
4954 continue;
4955
4956 drm_sched_stop(&ring->sched, NULL);
4957 }
4958 return PCI_ERS_RESULT_NEED_RESET;
4959 case pci_channel_io_perm_failure:
4960 /* Permanent error, prepare for device removal */
4961 return PCI_ERS_RESULT_DISCONNECT;
4962 }
4963
4964 return PCI_ERS_RESULT_NEED_RESET;
4965 }
4966
4967 /**
4968 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4969 * @pdev: pointer to PCI device
4970 */
4971 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4972 {
4973
4974 DRM_INFO("PCI error: mmio enabled callback!!\n");
4975
4976 /* TODO - dump whatever for debugging purposes */
4977
4978 	/* This is called only if amdgpu_pci_error_detected returns
4979 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4980 * works, no need to reset slot.
4981 */
4982
4983 return PCI_ERS_RESULT_RECOVERED;
4984 }
4985
4986 /**
4987 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4988 * @pdev: PCI device struct
4989 *
4990 * Description: This routine is called by the pci error recovery
4991 * code after the PCI slot has been reset, just before we
4992 * should resume normal operations.
4993 */
4994 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4995 {
4996 struct drm_device *dev = pci_get_drvdata(pdev);
4997 struct amdgpu_device *adev = drm_to_adev(dev);
4998 int r, i;
4999 bool need_full_reset = true;
5000 u32 memsize;
5001 struct list_head device_list;
5002
5003 DRM_INFO("PCI error: slot reset callback!!\n");
5004
5005 INIT_LIST_HEAD(&device_list);
5006 list_add_tail(&adev->gmc.xgmi.head, &device_list);
5007
5008 /* wait for asic to come out of reset */
5009 msleep(500);
5010
5011 /* Restore PCI confspace */
5012 amdgpu_device_load_pci_state(pdev);
5013
5014 /* confirm ASIC came out of reset */
5015 for (i = 0; i < adev->usec_timeout; i++) {
5016 memsize = amdgpu_asic_get_config_memsize(adev);
5017
5018 if (memsize != 0xffffffff)
5019 break;
5020 udelay(1);
5021 }
5022 if (memsize == 0xffffffff) {
5023 r = -ETIME;
5024 goto out;
5025 }
5026
5027 adev->in_pci_err_recovery = true;
5028 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5029 adev->in_pci_err_recovery = false;
5030 if (r)
5031 goto out;
5032
5033 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5034
5035 out:
5036 if (!r) {
5037 if (amdgpu_device_cache_pci_state(adev->pdev))
5038 pci_restore_state(adev->pdev);
5039
5040 DRM_INFO("PCIe error recovery succeeded\n");
5041 } else {
5042 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5043 amdgpu_device_unlock_adev(adev);
5044 }
5045
5046 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5047 }
5048
5049 /**
5050 * amdgpu_pci_resume() - resume normal ops after PCI reset
5051 * @pdev: pointer to PCI device
5052 *
5053  * Called when the error recovery driver tells us that it is
5054  * OK to resume normal operation. Restart the halted schedulers
5055  * so that queued work can resume.
5056 */
5057 void amdgpu_pci_resume(struct pci_dev *pdev)
5058 {
5059 struct drm_device *dev = pci_get_drvdata(pdev);
5060 struct amdgpu_device *adev = drm_to_adev(dev);
5061 int i;
5062
5063
5064 DRM_INFO("PCI error: resume callback!!\n");
5065
5066 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5067 struct amdgpu_ring *ring = adev->rings[i];
5068
5069 if (!ring || !ring->sched.thread)
5070 continue;
5071
5072
5073 drm_sched_resubmit_jobs(&ring->sched);
5074 drm_sched_start(&ring->sched, true);
5075 }
5076
5077 amdgpu_device_unlock_adev(adev);
5078 }
5079
5080 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5081 {
5082 struct drm_device *dev = pci_get_drvdata(pdev);
5083 struct amdgpu_device *adev = drm_to_adev(dev);
5084 int r;
5085
5086 r = pci_save_state(pdev);
5087 if (!r) {
5088 kfree(adev->pci_state);
5089
5090 adev->pci_state = pci_store_saved_state(pdev);
5091
5092 if (!adev->pci_state) {
5093 DRM_ERROR("Failed to store PCI saved state");
5094 return false;
5095 }
5096 } else {
5097 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5098 return false;
5099 }
5100
5101 return true;
5102 }
5103
5104 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5105 {
5106 struct drm_device *dev = pci_get_drvdata(pdev);
5107 struct amdgpu_device *adev = drm_to_adev(dev);
5108 int r;
5109
5110 if (!adev->pci_state)
5111 return false;
5112
5113 r = pci_load_saved_state(pdev, adev->pci_state);
5114
5115 if (!r) {
5116 pci_restore_state(pdev);
5117 } else {
5118 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5119 return false;
5120 }
5121
5122 return true;
5123 }
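
/*
 * These two helpers are intended to be used as a pair:
 * amdgpu_device_cache_pci_state() snapshots the config space (at init and
 * again after a successful slot reset), and amdgpu_device_load_pci_state()
 * replays that snapshot from amdgpu_pci_slot_reset() before the ASIC is
 * re-initialized.
 */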
5124
5125
5126