1 /*
2 * Copyright 2008 Advanced Micro Devices, Inc.
3 * Copyright 2008 Red Hat Inc.
4 * Copyright 2009 Jerome Glisse.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors: Dave Airlie
25 * Alex Deucher
26 * Jerome Glisse
27 */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83
84 #define AMDGPU_RESUME_MS 2000
85
86 const char *amdgpu_asic_name[] = {
87 "TAHITI",
88 "PITCAIRN",
89 "VERDE",
90 "OLAND",
91 "HAINAN",
92 "BONAIRE",
93 "KAVERI",
94 "KABINI",
95 "HAWAII",
96 "MULLINS",
97 "TOPAZ",
98 "TONGA",
99 "FIJI",
100 "CARRIZO",
101 "STONEY",
102 "POLARIS10",
103 "POLARIS11",
104 "POLARIS12",
105 "VEGAM",
106 "VEGA10",
107 "VEGA12",
108 "VEGA20",
109 "RAVEN",
110 "ARCTURUS",
111 "RENOIR",
112 "NAVI10",
113 "NAVI14",
114 "NAVI12",
115 "SIENNA_CICHLID",
116 "NAVY_FLOUNDER",
117 "LAST",
118 };
119
120 /**
121 * DOC: pcie_replay_count
122 *
123 * The amdgpu driver provides a sysfs API for reporting the total number
124 * of PCIe replays (NAKs)
125 * The file pcie_replay_count is used for this and returns the total
126 * number of replays as a sum of the NAKs generated and NAKs received
127 */
128
amdgpu_device_get_pcie_replay_count(struct device * dev,struct device_attribute * attr,char * buf)129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130 struct device_attribute *attr, char *buf)
131 {
132 struct drm_device *ddev = dev_get_drvdata(dev);
133 struct amdgpu_device *adev = drm_to_adev(ddev);
134 uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135
136 return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140 amdgpu_device_get_pcie_replay_count, NULL);
141
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143
144 /**
145 * DOC: product_name
146 *
147 * The amdgpu driver provides a sysfs API for reporting the product name
148 * for the device
149 * The file serial_number is used for this and returns the product name
150 * as returned from the FRU.
151 * NOTE: This is only available for certain server cards
152 */
153
amdgpu_device_get_product_name(struct device * dev,struct device_attribute * attr,char * buf)154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155 struct device_attribute *attr, char *buf)
156 {
157 struct drm_device *ddev = dev_get_drvdata(dev);
158 struct amdgpu_device *adev = drm_to_adev(ddev);
159
160 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162
163 static DEVICE_ATTR(product_name, S_IRUGO,
164 amdgpu_device_get_product_name, NULL);
165
166 /**
167 * DOC: product_number
168 *
169 * The amdgpu driver provides a sysfs API for reporting the part number
170 * for the device
171 * The file serial_number is used for this and returns the part number
172 * as returned from the FRU.
173 * NOTE: This is only available for certain server cards
174 */
175
amdgpu_device_get_product_number(struct device * dev,struct device_attribute * attr,char * buf)176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177 struct device_attribute *attr, char *buf)
178 {
179 struct drm_device *ddev = dev_get_drvdata(dev);
180 struct amdgpu_device *adev = drm_to_adev(ddev);
181
182 return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184
185 static DEVICE_ATTR(product_number, S_IRUGO,
186 amdgpu_device_get_product_number, NULL);
187
188 /**
189 * DOC: serial_number
190 *
191 * The amdgpu driver provides a sysfs API for reporting the serial number
192 * for the device
193 * The file serial_number is used for this and returns the serial number
194 * as returned from the FRU.
195 * NOTE: This is only available for certain server cards
196 */
197
amdgpu_device_get_serial_number(struct device * dev,struct device_attribute * attr,char * buf)198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199 struct device_attribute *attr, char *buf)
200 {
201 struct drm_device *ddev = dev_get_drvdata(dev);
202 struct amdgpu_device *adev = drm_to_adev(ddev);
203
204 return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208 amdgpu_device_get_serial_number, NULL);
209
210 /**
211 * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212 *
213 * @dev: drm_device pointer
214 *
215 * Returns true if the device is a dGPU with HG/PX power control,
216 * otherwise return false.
217 */
amdgpu_device_supports_boco(struct drm_device * dev)218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220 struct amdgpu_device *adev = drm_to_adev(dev);
221
222 if (adev->flags & AMD_IS_PX)
223 return true;
224 return false;
225 }
226
227 /**
228 * amdgpu_device_supports_baco - Does the device support BACO
229 *
230 * @dev: drm_device pointer
231 *
232 * Returns true if the device supporte BACO,
233 * otherwise return false.
234 */
amdgpu_device_supports_baco(struct drm_device * dev)235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237 struct amdgpu_device *adev = drm_to_adev(dev);
238
239 return amdgpu_asic_supports_baco(adev);
240 }
241
242 /*
243 * VRAM access helper functions
244 */
245
246 /**
247 * amdgpu_device_vram_access - read/write a buffer in vram
248 *
249 * @adev: amdgpu_device pointer
250 * @pos: offset of the buffer in vram
251 * @buf: virtual address of the buffer in system memory
252 * @size: read/write size, sizeof(@buf) must > @size
253 * @write: true - write to vram, otherwise - read from vram
254 */
amdgpu_device_vram_access(struct amdgpu_device * adev,loff_t pos,uint32_t * buf,size_t size,bool write)255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
256 uint32_t *buf, size_t size, bool write)
257 {
258 unsigned long flags;
259 uint32_t hi = ~0;
260 uint64_t last;
261
262
263 #ifdef CONFIG_64BIT
264 last = min(pos + size, adev->gmc.visible_vram_size);
265 if (last > pos) {
266 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
267 size_t count = last - pos;
268
269 if (write) {
270 memcpy_toio(addr, buf, count);
271 mb();
272 amdgpu_asic_flush_hdp(adev, NULL);
273 } else {
274 amdgpu_asic_invalidate_hdp(adev, NULL);
275 mb();
276 memcpy_fromio(buf, addr, count);
277 }
278
279 if (count == size)
280 return;
281
282 pos += count;
283 buf += count / 4;
284 size -= count;
285 }
286 #endif
287
288 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
289 for (last = pos + size; pos < last; pos += 4) {
290 uint32_t tmp = pos >> 31;
291
292 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
293 if (tmp != hi) {
294 WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
295 hi = tmp;
296 }
297 if (write)
298 WREG32_NO_KIQ(mmMM_DATA, *buf++);
299 else
300 *buf++ = RREG32_NO_KIQ(mmMM_DATA);
301 }
302 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
303 }
304
305 /*
306 * register access helper functions.
307 */
308 /**
309 * amdgpu_device_rreg - read a memory mapped IO or indirect register
310 *
311 * @adev: amdgpu_device pointer
312 * @reg: dword aligned register offset
313 * @acc_flags: access flags which require special behavior
314 *
315 * Returns the 32 bit value from the offset specified.
316 */
amdgpu_device_rreg(struct amdgpu_device * adev,uint32_t reg,uint32_t acc_flags)317 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
318 uint32_t reg, uint32_t acc_flags)
319 {
320 uint32_t ret;
321
322 if (adev->in_pci_err_recovery)
323 return 0;
324
325 if ((reg * 4) < adev->rmmio_size) {
326 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
327 amdgpu_sriov_runtime(adev) &&
328 down_read_trylock(&adev->reset_sem)) {
329 ret = amdgpu_kiq_rreg(adev, reg);
330 up_read(&adev->reset_sem);
331 } else {
332 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
333 }
334 } else {
335 ret = adev->pcie_rreg(adev, reg * 4);
336 }
337
338 trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
339
340 return ret;
341 }
342
343 /*
344 * MMIO register read with bytes helper functions
345 * @offset:bytes offset from MMIO start
346 *
347 */
348
349 /**
350 * amdgpu_mm_rreg8 - read a memory mapped IO register
351 *
352 * @adev: amdgpu_device pointer
353 * @offset: byte aligned register offset
354 *
355 * Returns the 8 bit value from the offset specified.
356 */
amdgpu_mm_rreg8(struct amdgpu_device * adev,uint32_t offset)357 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
358 {
359 if (adev->in_pci_err_recovery)
360 return 0;
361
362 if (offset < adev->rmmio_size)
363 return (readb(adev->rmmio + offset));
364 BUG();
365 }
366
367 /*
368 * MMIO register write with bytes helper functions
369 * @offset:bytes offset from MMIO start
370 * @value: the value want to be written to the register
371 *
372 */
373 /**
374 * amdgpu_mm_wreg8 - read a memory mapped IO register
375 *
376 * @adev: amdgpu_device pointer
377 * @offset: byte aligned register offset
378 * @value: 8 bit value to write
379 *
380 * Writes the value specified to the offset specified.
381 */
amdgpu_mm_wreg8(struct amdgpu_device * adev,uint32_t offset,uint8_t value)382 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
383 {
384 if (adev->in_pci_err_recovery)
385 return;
386
387 if (offset < adev->rmmio_size)
388 writeb(value, adev->rmmio + offset);
389 else
390 BUG();
391 }
392
393 /**
394 * amdgpu_device_wreg - write to a memory mapped IO or indirect register
395 *
396 * @adev: amdgpu_device pointer
397 * @reg: dword aligned register offset
398 * @v: 32 bit value to write to the register
399 * @acc_flags: access flags which require special behavior
400 *
401 * Writes the value specified to the offset specified.
402 */
amdgpu_device_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v,uint32_t acc_flags)403 void amdgpu_device_wreg(struct amdgpu_device *adev,
404 uint32_t reg, uint32_t v,
405 uint32_t acc_flags)
406 {
407 if (adev->in_pci_err_recovery)
408 return;
409
410 if ((reg * 4) < adev->rmmio_size) {
411 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
412 amdgpu_sriov_runtime(adev) &&
413 down_read_trylock(&adev->reset_sem)) {
414 amdgpu_kiq_wreg(adev, reg, v);
415 up_read(&adev->reset_sem);
416 } else {
417 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
418 }
419 } else {
420 adev->pcie_wreg(adev, reg * 4, v);
421 }
422
423 trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
424 }
425
426 /*
427 * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
428 *
429 * this function is invoked only the debugfs register access
430 * */
amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device * adev,uint32_t reg,uint32_t v)431 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
432 uint32_t reg, uint32_t v)
433 {
434 if (adev->in_pci_err_recovery)
435 return;
436
437 if (amdgpu_sriov_fullaccess(adev) &&
438 adev->gfx.rlc.funcs &&
439 adev->gfx.rlc.funcs->is_rlcg_access_range) {
440 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
441 return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
442 } else {
443 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
444 }
445 }
446
447 /**
448 * amdgpu_io_rreg - read an IO register
449 *
450 * @adev: amdgpu_device pointer
451 * @reg: dword aligned register offset
452 *
453 * Returns the 32 bit value from the offset specified.
454 */
amdgpu_io_rreg(struct amdgpu_device * adev,u32 reg)455 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
456 {
457 if (adev->in_pci_err_recovery)
458 return 0;
459
460 if ((reg * 4) < adev->rio_mem_size)
461 return ioread32(adev->rio_mem + (reg * 4));
462 else {
463 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
464 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
465 }
466 }
467
468 /**
469 * amdgpu_io_wreg - write to an IO register
470 *
471 * @adev: amdgpu_device pointer
472 * @reg: dword aligned register offset
473 * @v: 32 bit value to write to the register
474 *
475 * Writes the value specified to the offset specified.
476 */
amdgpu_io_wreg(struct amdgpu_device * adev,u32 reg,u32 v)477 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
478 {
479 if (adev->in_pci_err_recovery)
480 return;
481
482 if ((reg * 4) < adev->rio_mem_size)
483 iowrite32(v, adev->rio_mem + (reg * 4));
484 else {
485 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
486 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
487 }
488 }
489
490 /**
491 * amdgpu_mm_rdoorbell - read a doorbell dword
492 *
493 * @adev: amdgpu_device pointer
494 * @index: doorbell index
495 *
496 * Returns the value in the doorbell aperture at the
497 * requested doorbell index (CIK).
498 */
amdgpu_mm_rdoorbell(struct amdgpu_device * adev,u32 index)499 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
500 {
501 if (adev->in_pci_err_recovery)
502 return 0;
503
504 if (index < adev->doorbell.num_doorbells) {
505 return readl(adev->doorbell.ptr + index);
506 } else {
507 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
508 return 0;
509 }
510 }
511
512 /**
513 * amdgpu_mm_wdoorbell - write a doorbell dword
514 *
515 * @adev: amdgpu_device pointer
516 * @index: doorbell index
517 * @v: value to write
518 *
519 * Writes @v to the doorbell aperture at the
520 * requested doorbell index (CIK).
521 */
amdgpu_mm_wdoorbell(struct amdgpu_device * adev,u32 index,u32 v)522 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
523 {
524 if (adev->in_pci_err_recovery)
525 return;
526
527 if (index < adev->doorbell.num_doorbells) {
528 writel(v, adev->doorbell.ptr + index);
529 } else {
530 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
531 }
532 }
533
534 /**
535 * amdgpu_mm_rdoorbell64 - read a doorbell Qword
536 *
537 * @adev: amdgpu_device pointer
538 * @index: doorbell index
539 *
540 * Returns the value in the doorbell aperture at the
541 * requested doorbell index (VEGA10+).
542 */
amdgpu_mm_rdoorbell64(struct amdgpu_device * adev,u32 index)543 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
544 {
545 if (adev->in_pci_err_recovery)
546 return 0;
547
548 if (index < adev->doorbell.num_doorbells) {
549 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
550 } else {
551 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
552 return 0;
553 }
554 }
555
556 /**
557 * amdgpu_mm_wdoorbell64 - write a doorbell Qword
558 *
559 * @adev: amdgpu_device pointer
560 * @index: doorbell index
561 * @v: value to write
562 *
563 * Writes @v to the doorbell aperture at the
564 * requested doorbell index (VEGA10+).
565 */
amdgpu_mm_wdoorbell64(struct amdgpu_device * adev,u32 index,u64 v)566 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
567 {
568 if (adev->in_pci_err_recovery)
569 return;
570
571 if (index < adev->doorbell.num_doorbells) {
572 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
573 } else {
574 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
575 }
576 }
577
578 /**
579 * amdgpu_device_indirect_rreg - read an indirect register
580 *
581 * @adev: amdgpu_device pointer
582 * @pcie_index: mmio register offset
583 * @pcie_data: mmio register offset
584 *
585 * Returns the value of indirect register @reg_addr
586 */
amdgpu_device_indirect_rreg(struct amdgpu_device * adev,u32 pcie_index,u32 pcie_data,u32 reg_addr)587 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
588 u32 pcie_index, u32 pcie_data,
589 u32 reg_addr)
590 {
591 unsigned long flags;
592 u32 r;
593 void __iomem *pcie_index_offset;
594 void __iomem *pcie_data_offset;
595
596 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
597 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
598 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
599
600 writel(reg_addr, pcie_index_offset);
601 readl(pcie_index_offset);
602 r = readl(pcie_data_offset);
603 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
604
605 return r;
606 }
607
608 /**
609 * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
610 *
611 * @adev: amdgpu_device pointer
612 * @pcie_index: mmio register offset
613 * @pcie_data: mmio register offset
614 *
615 * Returns the value of indirect register @reg_addr
616 */
amdgpu_device_indirect_rreg64(struct amdgpu_device * adev,u32 pcie_index,u32 pcie_data,u32 reg_addr)617 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
618 u32 pcie_index, u32 pcie_data,
619 u32 reg_addr)
620 {
621 unsigned long flags;
622 u64 r;
623 void __iomem *pcie_index_offset;
624 void __iomem *pcie_data_offset;
625
626 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
627 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
628 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
629
630 /* read low 32 bits */
631 writel(reg_addr, pcie_index_offset);
632 readl(pcie_index_offset);
633 r = readl(pcie_data_offset);
634 /* read high 32 bits */
635 writel(reg_addr + 4, pcie_index_offset);
636 readl(pcie_index_offset);
637 r |= ((u64)readl(pcie_data_offset) << 32);
638 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
639
640 return r;
641 }
642
643 /**
644 * amdgpu_device_indirect_wreg - write an indirect register address
645 *
646 * @adev: amdgpu_device pointer
647 * @pcie_index: mmio register offset
648 * @pcie_data: mmio register offset
649 * @reg_addr: indirect register offset
650 * @reg_data: indirect register data
651 *
652 */
amdgpu_device_indirect_wreg(struct amdgpu_device * adev,u32 pcie_index,u32 pcie_data,u32 reg_addr,u32 reg_data)653 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
654 u32 pcie_index, u32 pcie_data,
655 u32 reg_addr, u32 reg_data)
656 {
657 unsigned long flags;
658 void __iomem *pcie_index_offset;
659 void __iomem *pcie_data_offset;
660
661 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
662 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
663 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
664
665 writel(reg_addr, pcie_index_offset);
666 readl(pcie_index_offset);
667 writel(reg_data, pcie_data_offset);
668 readl(pcie_data_offset);
669 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
670 }
671
672 /**
673 * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
674 *
675 * @adev: amdgpu_device pointer
676 * @pcie_index: mmio register offset
677 * @pcie_data: mmio register offset
678 * @reg_addr: indirect register offset
679 * @reg_data: indirect register data
680 *
681 */
amdgpu_device_indirect_wreg64(struct amdgpu_device * adev,u32 pcie_index,u32 pcie_data,u32 reg_addr,u64 reg_data)682 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
683 u32 pcie_index, u32 pcie_data,
684 u32 reg_addr, u64 reg_data)
685 {
686 unsigned long flags;
687 void __iomem *pcie_index_offset;
688 void __iomem *pcie_data_offset;
689
690 spin_lock_irqsave(&adev->pcie_idx_lock, flags);
691 pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
692 pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
693
694 /* write low 32 bits */
695 writel(reg_addr, pcie_index_offset);
696 readl(pcie_index_offset);
697 writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
698 readl(pcie_data_offset);
699 /* write high 32 bits */
700 writel(reg_addr + 4, pcie_index_offset);
701 readl(pcie_index_offset);
702 writel((u32)(reg_data >> 32), pcie_data_offset);
703 readl(pcie_data_offset);
704 spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
705 }
706
707 /**
708 * amdgpu_invalid_rreg - dummy reg read function
709 *
710 * @adev: amdgpu_device pointer
711 * @reg: offset of register
712 *
713 * Dummy register read function. Used for register blocks
714 * that certain asics don't have (all asics).
715 * Returns the value in the register.
716 */
amdgpu_invalid_rreg(struct amdgpu_device * adev,uint32_t reg)717 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
718 {
719 DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
720 BUG();
721 return 0;
722 }
723
724 /**
725 * amdgpu_invalid_wreg - dummy reg write function
726 *
727 * @adev: amdgpu_device pointer
728 * @reg: offset of register
729 * @v: value to write to the register
730 *
731 * Dummy register read function. Used for register blocks
732 * that certain asics don't have (all asics).
733 */
amdgpu_invalid_wreg(struct amdgpu_device * adev,uint32_t reg,uint32_t v)734 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
735 {
736 DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
737 reg, v);
738 BUG();
739 }
740
741 /**
742 * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
743 *
744 * @adev: amdgpu_device pointer
745 * @reg: offset of register
746 *
747 * Dummy register read function. Used for register blocks
748 * that certain asics don't have (all asics).
749 * Returns the value in the register.
750 */
amdgpu_invalid_rreg64(struct amdgpu_device * adev,uint32_t reg)751 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
752 {
753 DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
754 BUG();
755 return 0;
756 }
757
758 /**
759 * amdgpu_invalid_wreg64 - dummy reg write function
760 *
761 * @adev: amdgpu_device pointer
762 * @reg: offset of register
763 * @v: value to write to the register
764 *
765 * Dummy register read function. Used for register blocks
766 * that certain asics don't have (all asics).
767 */
amdgpu_invalid_wreg64(struct amdgpu_device * adev,uint32_t reg,uint64_t v)768 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
769 {
770 DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
771 reg, v);
772 BUG();
773 }
774
775 /**
776 * amdgpu_block_invalid_rreg - dummy reg read function
777 *
778 * @adev: amdgpu_device pointer
779 * @block: offset of instance
780 * @reg: offset of register
781 *
782 * Dummy register read function. Used for register blocks
783 * that certain asics don't have (all asics).
784 * Returns the value in the register.
785 */
amdgpu_block_invalid_rreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg)786 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
787 uint32_t block, uint32_t reg)
788 {
789 DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
790 reg, block);
791 BUG();
792 return 0;
793 }
794
795 /**
796 * amdgpu_block_invalid_wreg - dummy reg write function
797 *
798 * @adev: amdgpu_device pointer
799 * @block: offset of instance
800 * @reg: offset of register
801 * @v: value to write to the register
802 *
803 * Dummy register read function. Used for register blocks
804 * that certain asics don't have (all asics).
805 */
amdgpu_block_invalid_wreg(struct amdgpu_device * adev,uint32_t block,uint32_t reg,uint32_t v)806 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
807 uint32_t block,
808 uint32_t reg, uint32_t v)
809 {
810 DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
811 reg, block, v);
812 BUG();
813 }
814
815 /**
816 * amdgpu_device_asic_init - Wrapper for atom asic_init
817 *
818 * @adev: amdgpu_device pointer
819 *
820 * Does any asic specific work and then calls atom asic init.
821 */
amdgpu_device_asic_init(struct amdgpu_device * adev)822 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
823 {
824 amdgpu_asic_pre_asic_init(adev);
825
826 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
827 }
828
829 /**
830 * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
831 *
832 * @adev: amdgpu_device pointer
833 *
834 * Allocates a scratch page of VRAM for use by various things in the
835 * driver.
836 */
amdgpu_device_vram_scratch_init(struct amdgpu_device * adev)837 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
838 {
839 return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
840 PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
841 &adev->vram_scratch.robj,
842 &adev->vram_scratch.gpu_addr,
843 (void **)&adev->vram_scratch.ptr);
844 }
845
846 /**
847 * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
848 *
849 * @adev: amdgpu_device pointer
850 *
851 * Frees the VRAM scratch page.
852 */
amdgpu_device_vram_scratch_fini(struct amdgpu_device * adev)853 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
854 {
855 amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
856 }
857
858 /**
859 * amdgpu_device_program_register_sequence - program an array of registers.
860 *
861 * @adev: amdgpu_device pointer
862 * @registers: pointer to the register array
863 * @array_size: size of the register array
864 *
865 * Programs an array or registers with and and or masks.
866 * This is a helper for setting golden registers.
867 */
amdgpu_device_program_register_sequence(struct amdgpu_device * adev,const u32 * registers,const u32 array_size)868 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
869 const u32 *registers,
870 const u32 array_size)
871 {
872 u32 tmp, reg, and_mask, or_mask;
873 int i;
874
875 if (array_size % 3)
876 return;
877
878 for (i = 0; i < array_size; i +=3) {
879 reg = registers[i + 0];
880 and_mask = registers[i + 1];
881 or_mask = registers[i + 2];
882
883 if (and_mask == 0xffffffff) {
884 tmp = or_mask;
885 } else {
886 tmp = RREG32(reg);
887 tmp &= ~and_mask;
888 if (adev->family >= AMDGPU_FAMILY_AI)
889 tmp |= (or_mask & and_mask);
890 else
891 tmp |= or_mask;
892 }
893 WREG32(reg, tmp);
894 }
895 }
896
897 /**
898 * amdgpu_device_pci_config_reset - reset the GPU
899 *
900 * @adev: amdgpu_device pointer
901 *
902 * Resets the GPU using the pci config reset sequence.
903 * Only applicable to asics prior to vega10.
904 */
amdgpu_device_pci_config_reset(struct amdgpu_device * adev)905 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
906 {
907 pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
908 }
909
910 /*
911 * GPU doorbell aperture helpers function.
912 */
913 /**
914 * amdgpu_device_doorbell_init - Init doorbell driver information.
915 *
916 * @adev: amdgpu_device pointer
917 *
918 * Init doorbell driver information (CIK)
919 * Returns 0 on success, error on failure.
920 */
amdgpu_device_doorbell_init(struct amdgpu_device * adev)921 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
922 {
923
924 /* No doorbell on SI hardware generation */
925 if (adev->asic_type < CHIP_BONAIRE) {
926 adev->doorbell.base = 0;
927 adev->doorbell.size = 0;
928 adev->doorbell.num_doorbells = 0;
929 adev->doorbell.ptr = NULL;
930 return 0;
931 }
932
933 if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
934 return -EINVAL;
935
936 amdgpu_asic_init_doorbell_index(adev);
937
938 /* doorbell bar mapping */
939 adev->doorbell.base = pci_resource_start(adev->pdev, 2);
940 adev->doorbell.size = pci_resource_len(adev->pdev, 2);
941
942 adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
943 adev->doorbell_index.max_assignment+1);
944 if (adev->doorbell.num_doorbells == 0)
945 return -EINVAL;
946
947 /* For Vega, reserve and map two pages on doorbell BAR since SDMA
948 * paging queue doorbell use the second page. The
949 * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
950 * doorbells are in the first page. So with paging queue enabled,
951 * the max num_doorbells should + 1 page (0x400 in dword)
952 */
953 if (adev->asic_type >= CHIP_VEGA10)
954 adev->doorbell.num_doorbells += 0x400;
955
956 adev->doorbell.ptr = ioremap(adev->doorbell.base,
957 adev->doorbell.num_doorbells *
958 sizeof(u32));
959 if (adev->doorbell.ptr == NULL)
960 return -ENOMEM;
961
962 return 0;
963 }
964
965 /**
966 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
967 *
968 * @adev: amdgpu_device pointer
969 *
970 * Tear down doorbell driver information (CIK)
971 */
amdgpu_device_doorbell_fini(struct amdgpu_device * adev)972 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
973 {
974 iounmap(adev->doorbell.ptr);
975 adev->doorbell.ptr = NULL;
976 }
977
978
979
980 /*
981 * amdgpu_device_wb_*()
982 * Writeback is the method by which the GPU updates special pages in memory
983 * with the status of certain GPU events (fences, ring pointers,etc.).
984 */
985
986 /**
987 * amdgpu_device_wb_fini - Disable Writeback and free memory
988 *
989 * @adev: amdgpu_device pointer
990 *
991 * Disables Writeback and frees the Writeback memory (all asics).
992 * Used at driver shutdown.
993 */
amdgpu_device_wb_fini(struct amdgpu_device * adev)994 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
995 {
996 if (adev->wb.wb_obj) {
997 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
998 &adev->wb.gpu_addr,
999 (void **)&adev->wb.wb);
1000 adev->wb.wb_obj = NULL;
1001 }
1002 }
1003
1004 /**
1005 * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1006 *
1007 * @adev: amdgpu_device pointer
1008 *
1009 * Initializes writeback and allocates writeback memory (all asics).
1010 * Used at driver startup.
1011 * Returns 0 on success or an -error on failure.
1012 */
amdgpu_device_wb_init(struct amdgpu_device * adev)1013 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1014 {
1015 int r;
1016
1017 if (adev->wb.wb_obj == NULL) {
1018 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1019 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1020 PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1021 &adev->wb.wb_obj, &adev->wb.gpu_addr,
1022 (void **)&adev->wb.wb);
1023 if (r) {
1024 dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1025 return r;
1026 }
1027
1028 adev->wb.num_wb = AMDGPU_MAX_WB;
1029 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1030
1031 /* clear wb memory */
1032 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1033 }
1034
1035 return 0;
1036 }
1037
1038 /**
1039 * amdgpu_device_wb_get - Allocate a wb entry
1040 *
1041 * @adev: amdgpu_device pointer
1042 * @wb: wb index
1043 *
1044 * Allocate a wb slot for use by the driver (all asics).
1045 * Returns 0 on success or -EINVAL on failure.
1046 */
amdgpu_device_wb_get(struct amdgpu_device * adev,u32 * wb)1047 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1048 {
1049 unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1050
1051 if (offset < adev->wb.num_wb) {
1052 __set_bit(offset, adev->wb.used);
1053 *wb = offset << 3; /* convert to dw offset */
1054 return 0;
1055 } else {
1056 return -EINVAL;
1057 }
1058 }
1059
1060 /**
1061 * amdgpu_device_wb_free - Free a wb entry
1062 *
1063 * @adev: amdgpu_device pointer
1064 * @wb: wb index
1065 *
1066 * Free a wb slot allocated for use by the driver (all asics)
1067 */
amdgpu_device_wb_free(struct amdgpu_device * adev,u32 wb)1068 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1069 {
1070 wb >>= 3;
1071 if (wb < adev->wb.num_wb)
1072 __clear_bit(wb, adev->wb.used);
1073 }
1074
1075 /**
1076 * amdgpu_device_resize_fb_bar - try to resize FB BAR
1077 *
1078 * @adev: amdgpu_device pointer
1079 *
1080 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1081 * to fail, but if any of the BARs is not accessible after the size we abort
1082 * driver loading by returning -ENODEV.
1083 */
amdgpu_device_resize_fb_bar(struct amdgpu_device * adev)1084 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1085 {
1086 u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1087 u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1088 struct pci_bus *root;
1089 struct resource *res;
1090 unsigned i;
1091 u16 cmd;
1092 int r;
1093
1094 if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1095 return 0;
1096
1097 /* Bypass for VF */
1098 if (amdgpu_sriov_vf(adev))
1099 return 0;
1100
1101 /* skip if the bios has already enabled large BAR */
1102 if (adev->gmc.real_vram_size &&
1103 (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1104 return 0;
1105
1106 /* Check if the root BUS has 64bit memory resources */
1107 root = adev->pdev->bus;
1108 while (root->parent)
1109 root = root->parent;
1110
1111 pci_bus_for_each_resource(root, res, i) {
1112 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1113 res->start > 0x100000000ull)
1114 break;
1115 }
1116
1117 /* Trying to resize is pointless without a root hub window above 4GB */
1118 if (!res)
1119 return 0;
1120
1121 /* Disable memory decoding while we change the BAR addresses and size */
1122 pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1123 pci_write_config_word(adev->pdev, PCI_COMMAND,
1124 cmd & ~PCI_COMMAND_MEMORY);
1125
1126 /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1127 amdgpu_device_doorbell_fini(adev);
1128 if (adev->asic_type >= CHIP_BONAIRE)
1129 pci_release_resource(adev->pdev, 2);
1130
1131 pci_release_resource(adev->pdev, 0);
1132
1133 r = pci_resize_resource(adev->pdev, 0, rbar_size);
1134 if (r == -ENOSPC)
1135 DRM_INFO("Not enough PCI address space for a large BAR.");
1136 else if (r && r != -ENOTSUPP)
1137 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1138
1139 pci_assign_unassigned_bus_resources(adev->pdev->bus);
1140
1141 /* When the doorbell or fb BAR isn't available we have no chance of
1142 * using the device.
1143 */
1144 r = amdgpu_device_doorbell_init(adev);
1145 if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1146 return -ENODEV;
1147
1148 pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1149
1150 return 0;
1151 }
1152
1153 /*
1154 * GPU helpers function.
1155 */
1156 /**
1157 * amdgpu_device_need_post - check if the hw need post or not
1158 *
1159 * @adev: amdgpu_device pointer
1160 *
1161 * Check if the asic has been initialized (all asics) at driver startup
1162 * or post is needed if hw reset is performed.
1163 * Returns true if need or false if not.
1164 */
amdgpu_device_need_post(struct amdgpu_device * adev)1165 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1166 {
1167 uint32_t reg;
1168
1169 if (amdgpu_sriov_vf(adev))
1170 return false;
1171
1172 if (amdgpu_passthrough(adev)) {
1173 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1174 * some old smc fw still need driver do vPost otherwise gpu hang, while
1175 * those smc fw version above 22.15 doesn't have this flaw, so we force
1176 * vpost executed for smc version below 22.15
1177 */
1178 if (adev->asic_type == CHIP_FIJI) {
1179 int err;
1180 uint32_t fw_ver;
1181 err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1182 /* force vPost if error occured */
1183 if (err)
1184 return true;
1185
1186 fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1187 release_firmware(adev->pm.fw);
1188 if (fw_ver < 0x00160e00)
1189 return true;
1190 }
1191 }
1192
1193 if (adev->has_hw_reset) {
1194 adev->has_hw_reset = false;
1195 return true;
1196 }
1197
1198 /* bios scratch used on CIK+ */
1199 if (adev->asic_type >= CHIP_BONAIRE)
1200 return amdgpu_atombios_scratch_need_asic_init(adev);
1201
1202 /* check MEM_SIZE for older asics */
1203 reg = amdgpu_asic_get_config_memsize(adev);
1204
1205 if ((reg != 0) && (reg != 0xffffffff))
1206 return false;
1207
1208 return true;
1209 }
1210
1211 /* if we get transitioned to only one device, take VGA back */
1212 /**
1213 * amdgpu_device_vga_set_decode - enable/disable vga decode
1214 *
1215 * @cookie: amdgpu_device pointer
1216 * @state: enable/disable vga decode
1217 *
1218 * Enable/disable vga decode (all asics).
1219 * Returns VGA resource flags.
1220 */
amdgpu_device_vga_set_decode(void * cookie,bool state)1221 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1222 {
1223 struct amdgpu_device *adev = cookie;
1224 amdgpu_asic_set_vga_state(adev, state);
1225 if (state)
1226 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1227 VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1228 else
1229 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1230 }
1231
1232 /**
1233 * amdgpu_device_check_block_size - validate the vm block size
1234 *
1235 * @adev: amdgpu_device pointer
1236 *
1237 * Validates the vm block size specified via module parameter.
1238 * The vm block size defines number of bits in page table versus page directory,
1239 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1240 * page table and the remaining bits are in the page directory.
1241 */
amdgpu_device_check_block_size(struct amdgpu_device * adev)1242 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1243 {
1244 /* defines number of bits in page table versus page directory,
1245 * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1246 * page table and the remaining bits are in the page directory */
1247 if (amdgpu_vm_block_size == -1)
1248 return;
1249
1250 if (amdgpu_vm_block_size < 9) {
1251 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1252 amdgpu_vm_block_size);
1253 amdgpu_vm_block_size = -1;
1254 }
1255 }
1256
1257 /**
1258 * amdgpu_device_check_vm_size - validate the vm size
1259 *
1260 * @adev: amdgpu_device pointer
1261 *
1262 * Validates the vm size in GB specified via module parameter.
1263 * The VM size is the size of the GPU virtual memory space in GB.
1264 */
amdgpu_device_check_vm_size(struct amdgpu_device * adev)1265 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1266 {
1267 /* no need to check the default value */
1268 if (amdgpu_vm_size == -1)
1269 return;
1270
1271 if (amdgpu_vm_size < 1) {
1272 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1273 amdgpu_vm_size);
1274 amdgpu_vm_size = -1;
1275 }
1276 }
1277
amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device * adev)1278 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1279 {
1280 struct sysinfo si;
1281 bool is_os_64 = (sizeof(void *) == 8);
1282 uint64_t total_memory;
1283 uint64_t dram_size_seven_GB = 0x1B8000000;
1284 uint64_t dram_size_three_GB = 0xB8000000;
1285
1286 if (amdgpu_smu_memory_pool_size == 0)
1287 return;
1288
1289 if (!is_os_64) {
1290 DRM_WARN("Not 64-bit OS, feature not supported\n");
1291 goto def_value;
1292 }
1293 si_meminfo(&si);
1294 total_memory = (uint64_t)si.totalram * si.mem_unit;
1295
1296 if ((amdgpu_smu_memory_pool_size == 1) ||
1297 (amdgpu_smu_memory_pool_size == 2)) {
1298 if (total_memory < dram_size_three_GB)
1299 goto def_value1;
1300 } else if ((amdgpu_smu_memory_pool_size == 4) ||
1301 (amdgpu_smu_memory_pool_size == 8)) {
1302 if (total_memory < dram_size_seven_GB)
1303 goto def_value1;
1304 } else {
1305 DRM_WARN("Smu memory pool size not supported\n");
1306 goto def_value;
1307 }
1308 adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1309
1310 return;
1311
1312 def_value1:
1313 DRM_WARN("No enough system memory\n");
1314 def_value:
1315 adev->pm.smu_prv_buffer_size = 0;
1316 }
1317
1318 /**
1319 * amdgpu_device_check_arguments - validate module params
1320 *
1321 * @adev: amdgpu_device pointer
1322 *
1323 * Validates certain module parameters and updates
1324 * the associated values used by the driver (all asics).
1325 */
amdgpu_device_check_arguments(struct amdgpu_device * adev)1326 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1327 {
1328 if (amdgpu_sched_jobs < 4) {
1329 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1330 amdgpu_sched_jobs);
1331 amdgpu_sched_jobs = 4;
1332 } else if (!is_power_of_2(amdgpu_sched_jobs)){
1333 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1334 amdgpu_sched_jobs);
1335 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1336 }
1337
1338 if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1339 /* gart size must be greater or equal to 32M */
1340 dev_warn(adev->dev, "gart size (%d) too small\n",
1341 amdgpu_gart_size);
1342 amdgpu_gart_size = -1;
1343 }
1344
1345 if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1346 /* gtt size must be greater or equal to 32M */
1347 dev_warn(adev->dev, "gtt size (%d) too small\n",
1348 amdgpu_gtt_size);
1349 amdgpu_gtt_size = -1;
1350 }
1351
1352 /* valid range is between 4 and 9 inclusive */
1353 if (amdgpu_vm_fragment_size != -1 &&
1354 (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1355 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1356 amdgpu_vm_fragment_size = -1;
1357 }
1358
1359 if (amdgpu_sched_hw_submission < 2) {
1360 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1361 amdgpu_sched_hw_submission);
1362 amdgpu_sched_hw_submission = 2;
1363 } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1364 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1365 amdgpu_sched_hw_submission);
1366 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1367 }
1368
1369 amdgpu_device_check_smu_prv_buffer_size(adev);
1370
1371 amdgpu_device_check_vm_size(adev);
1372
1373 amdgpu_device_check_block_size(adev);
1374
1375 adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1376
1377 amdgpu_gmc_tmz_set(adev);
1378
1379 if (amdgpu_num_kcq == -1) {
1380 amdgpu_num_kcq = 8;
1381 } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1382 amdgpu_num_kcq = 8;
1383 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1384 }
1385
1386 amdgpu_gmc_noretry_set(adev);
1387
1388 return 0;
1389 }
1390
1391 /**
1392 * amdgpu_switcheroo_set_state - set switcheroo state
1393 *
1394 * @pdev: pci dev pointer
1395 * @state: vga_switcheroo state
1396 *
1397 * Callback for the switcheroo driver. Suspends or resumes the
1398 * the asics before or after it is powered up using ACPI methods.
1399 */
amdgpu_switcheroo_set_state(struct pci_dev * pdev,enum vga_switcheroo_state state)1400 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1401 enum vga_switcheroo_state state)
1402 {
1403 struct drm_device *dev = pci_get_drvdata(pdev);
1404 int r;
1405
1406 if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1407 return;
1408
1409 if (state == VGA_SWITCHEROO_ON) {
1410 pr_info("switched on\n");
1411 /* don't suspend or resume card normally */
1412 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1413
1414 pci_set_power_state(dev->pdev, PCI_D0);
1415 amdgpu_device_load_pci_state(dev->pdev);
1416 r = pci_enable_device(dev->pdev);
1417 if (r)
1418 DRM_WARN("pci_enable_device failed (%d)\n", r);
1419 amdgpu_device_resume(dev, true);
1420
1421 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1422 drm_kms_helper_poll_enable(dev);
1423 } else {
1424 pr_info("switched off\n");
1425 drm_kms_helper_poll_disable(dev);
1426 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1427 amdgpu_device_suspend(dev, true);
1428 amdgpu_device_cache_pci_state(dev->pdev);
1429 /* Shut down the device */
1430 pci_disable_device(dev->pdev);
1431 pci_set_power_state(dev->pdev, PCI_D3cold);
1432 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1433 }
1434 }
1435
1436 /**
1437 * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1438 *
1439 * @pdev: pci dev pointer
1440 *
1441 * Callback for the switcheroo driver. Check of the switcheroo
1442 * state can be changed.
1443 * Returns true if the state can be changed, false if not.
1444 */
amdgpu_switcheroo_can_switch(struct pci_dev * pdev)1445 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1446 {
1447 struct drm_device *dev = pci_get_drvdata(pdev);
1448
1449 /*
1450 * FIXME: open_count is protected by drm_global_mutex but that would lead to
1451 * locking inversion with the driver load path. And the access here is
1452 * completely racy anyway. So don't bother with locking for now.
1453 */
1454 return atomic_read(&dev->open_count) == 0;
1455 }
1456
1457 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1458 .set_gpu_state = amdgpu_switcheroo_set_state,
1459 .reprobe = NULL,
1460 .can_switch = amdgpu_switcheroo_can_switch,
1461 };
1462
1463 /**
1464 * amdgpu_device_ip_set_clockgating_state - set the CG state
1465 *
1466 * @dev: amdgpu_device pointer
1467 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1468 * @state: clockgating state (gate or ungate)
1469 *
1470 * Sets the requested clockgating state for all instances of
1471 * the hardware IP specified.
1472 * Returns the error code from the last instance.
1473 */
amdgpu_device_ip_set_clockgating_state(void * dev,enum amd_ip_block_type block_type,enum amd_clockgating_state state)1474 int amdgpu_device_ip_set_clockgating_state(void *dev,
1475 enum amd_ip_block_type block_type,
1476 enum amd_clockgating_state state)
1477 {
1478 struct amdgpu_device *adev = dev;
1479 int i, r = 0;
1480
1481 for (i = 0; i < adev->num_ip_blocks; i++) {
1482 if (!adev->ip_blocks[i].status.valid)
1483 continue;
1484 if (adev->ip_blocks[i].version->type != block_type)
1485 continue;
1486 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1487 continue;
1488 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1489 (void *)adev, state);
1490 if (r)
1491 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1492 adev->ip_blocks[i].version->funcs->name, r);
1493 }
1494 return r;
1495 }
1496
1497 /**
1498 * amdgpu_device_ip_set_powergating_state - set the PG state
1499 *
1500 * @dev: amdgpu_device pointer
1501 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1502 * @state: powergating state (gate or ungate)
1503 *
1504 * Sets the requested powergating state for all instances of
1505 * the hardware IP specified.
1506 * Returns the error code from the last instance.
1507 */
amdgpu_device_ip_set_powergating_state(void * dev,enum amd_ip_block_type block_type,enum amd_powergating_state state)1508 int amdgpu_device_ip_set_powergating_state(void *dev,
1509 enum amd_ip_block_type block_type,
1510 enum amd_powergating_state state)
1511 {
1512 struct amdgpu_device *adev = dev;
1513 int i, r = 0;
1514
1515 for (i = 0; i < adev->num_ip_blocks; i++) {
1516 if (!adev->ip_blocks[i].status.valid)
1517 continue;
1518 if (adev->ip_blocks[i].version->type != block_type)
1519 continue;
1520 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1521 continue;
1522 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1523 (void *)adev, state);
1524 if (r)
1525 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1526 adev->ip_blocks[i].version->funcs->name, r);
1527 }
1528 return r;
1529 }
1530
1531 /**
1532 * amdgpu_device_ip_get_clockgating_state - get the CG state
1533 *
1534 * @adev: amdgpu_device pointer
1535 * @flags: clockgating feature flags
1536 *
1537 * Walks the list of IPs on the device and updates the clockgating
1538 * flags for each IP.
1539 * Updates @flags with the feature flags for each hardware IP where
1540 * clockgating is enabled.
1541 */
amdgpu_device_ip_get_clockgating_state(struct amdgpu_device * adev,u32 * flags)1542 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1543 u32 *flags)
1544 {
1545 int i;
1546
1547 for (i = 0; i < adev->num_ip_blocks; i++) {
1548 if (!adev->ip_blocks[i].status.valid)
1549 continue;
1550 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1551 adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1552 }
1553 }
1554
1555 /**
1556 * amdgpu_device_ip_wait_for_idle - wait for idle
1557 *
1558 * @adev: amdgpu_device pointer
1559 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1560 *
1561 * Waits for the request hardware IP to be idle.
1562 * Returns 0 for success or a negative error code on failure.
1563 */
amdgpu_device_ip_wait_for_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)1564 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1565 enum amd_ip_block_type block_type)
1566 {
1567 int i, r;
1568
1569 for (i = 0; i < adev->num_ip_blocks; i++) {
1570 if (!adev->ip_blocks[i].status.valid)
1571 continue;
1572 if (adev->ip_blocks[i].version->type == block_type) {
1573 r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1574 if (r)
1575 return r;
1576 break;
1577 }
1578 }
1579 return 0;
1580
1581 }
1582
1583 /**
1584 * amdgpu_device_ip_is_idle - is the hardware IP idle
1585 *
1586 * @adev: amdgpu_device pointer
1587 * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1588 *
1589 * Check if the hardware IP is idle or not.
1590 * Returns true if it the IP is idle, false if not.
1591 */
amdgpu_device_ip_is_idle(struct amdgpu_device * adev,enum amd_ip_block_type block_type)1592 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1593 enum amd_ip_block_type block_type)
1594 {
1595 int i;
1596
1597 for (i = 0; i < adev->num_ip_blocks; i++) {
1598 if (!adev->ip_blocks[i].status.valid)
1599 continue;
1600 if (adev->ip_blocks[i].version->type == block_type)
1601 return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1602 }
1603 return true;
1604
1605 }
1606
1607 /**
1608 * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1609 *
1610 * @adev: amdgpu_device pointer
1611 * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1612 *
1613 * Returns a pointer to the hardware IP block structure
1614 * if it exists for the asic, otherwise NULL.
1615 */
1616 struct amdgpu_ip_block *
amdgpu_device_ip_get_ip_block(struct amdgpu_device * adev,enum amd_ip_block_type type)1617 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1618 enum amd_ip_block_type type)
1619 {
1620 int i;
1621
1622 for (i = 0; i < adev->num_ip_blocks; i++)
1623 if (adev->ip_blocks[i].version->type == type)
1624 return &adev->ip_blocks[i];
1625
1626 return NULL;
1627 }
1628
1629 /**
1630 * amdgpu_device_ip_block_version_cmp
1631 *
1632 * @adev: amdgpu_device pointer
1633 * @type: enum amd_ip_block_type
1634 * @major: major version
1635 * @minor: minor version
1636 *
1637 * return 0 if equal or greater
1638 * return 1 if smaller or the ip_block doesn't exist
1639 */
amdgpu_device_ip_block_version_cmp(struct amdgpu_device * adev,enum amd_ip_block_type type,u32 major,u32 minor)1640 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1641 enum amd_ip_block_type type,
1642 u32 major, u32 minor)
1643 {
1644 struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1645
1646 if (ip_block && ((ip_block->version->major > major) ||
1647 ((ip_block->version->major == major) &&
1648 (ip_block->version->minor >= minor))))
1649 return 0;
1650
1651 return 1;
1652 }
1653
1654 /**
1655 * amdgpu_device_ip_block_add
1656 *
1657 * @adev: amdgpu_device pointer
1658 * @ip_block_version: pointer to the IP to add
1659 *
1660 * Adds the IP block driver information to the collection of IPs
1661 * on the asic.
1662 */
amdgpu_device_ip_block_add(struct amdgpu_device * adev,const struct amdgpu_ip_block_version * ip_block_version)1663 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1664 const struct amdgpu_ip_block_version *ip_block_version)
1665 {
1666 if (!ip_block_version)
1667 return -EINVAL;
1668
1669 DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1670 ip_block_version->funcs->name);
1671
1672 adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1673
1674 return 0;
1675 }
1676
1677 /**
1678 * amdgpu_device_enable_virtual_display - enable virtual display feature
1679 *
1680 * @adev: amdgpu_device pointer
1681 *
1682 * Enabled the virtual display feature if the user has enabled it via
1683 * the module parameter virtual_display. This feature provides a virtual
1684 * display hardware on headless boards or in virtualized environments.
1685 * This function parses and validates the configuration string specified by
1686 * the user and configues the virtual display configuration (number of
1687 * virtual connectors, crtcs, etc.) specified.
1688 */
amdgpu_device_enable_virtual_display(struct amdgpu_device * adev)1689 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1690 {
1691 adev->enable_virtual_display = false;
1692
1693 if (amdgpu_virtual_display) {
1694 struct drm_device *ddev = adev_to_drm(adev);
1695 const char *pci_address_name = pci_name(ddev->pdev);
1696 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1697
1698 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1699 pciaddstr_tmp = pciaddstr;
1700 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1701 pciaddname = strsep(&pciaddname_tmp, ",");
1702 if (!strcmp("all", pciaddname)
1703 || !strcmp(pci_address_name, pciaddname)) {
1704 long num_crtc;
1705 int res = -1;
1706
1707 adev->enable_virtual_display = true;
1708
1709 if (pciaddname_tmp)
1710 res = kstrtol(pciaddname_tmp, 10,
1711 &num_crtc);
1712
1713 if (!res) {
1714 if (num_crtc < 1)
1715 num_crtc = 1;
1716 if (num_crtc > 6)
1717 num_crtc = 6;
1718 adev->mode_info.num_crtc = num_crtc;
1719 } else {
1720 adev->mode_info.num_crtc = 1;
1721 }
1722 break;
1723 }
1724 }
1725
1726 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1727 amdgpu_virtual_display, pci_address_name,
1728 adev->enable_virtual_display, adev->mode_info.num_crtc);
1729
1730 kfree(pciaddstr);
1731 }
1732 }
1733
1734 /**
1735 * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1736 *
1737 * @adev: amdgpu_device pointer
1738 *
1739 * Parses the asic configuration parameters specified in the gpu info
1740 * firmware and makes them availale to the driver for use in configuring
1741 * the asic.
1742 * Returns 0 on success, -EINVAL on failure.
1743 */
amdgpu_device_parse_gpu_info_fw(struct amdgpu_device * adev)1744 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1745 {
1746 const char *chip_name;
1747 char fw_name[40];
1748 int err;
1749 const struct gpu_info_firmware_header_v1_0 *hdr;
1750
1751 adev->firmware.gpu_info_fw = NULL;
1752
1753 if (adev->mman.discovery_bin) {
1754 amdgpu_discovery_get_gfx_info(adev);
1755
1756 /*
1757 * FIXME: The bounding box is still needed by Navi12, so
1758 * temporarily read it from gpu_info firmware. Should be droped
1759 * when DAL no longer needs it.
1760 */
1761 if (adev->asic_type != CHIP_NAVI12)
1762 return 0;
1763 }
1764
1765 switch (adev->asic_type) {
1766 #ifdef CONFIG_DRM_AMDGPU_SI
1767 case CHIP_VERDE:
1768 case CHIP_TAHITI:
1769 case CHIP_PITCAIRN:
1770 case CHIP_OLAND:
1771 case CHIP_HAINAN:
1772 #endif
1773 #ifdef CONFIG_DRM_AMDGPU_CIK
1774 case CHIP_BONAIRE:
1775 case CHIP_HAWAII:
1776 case CHIP_KAVERI:
1777 case CHIP_KABINI:
1778 case CHIP_MULLINS:
1779 #endif
1780 case CHIP_TOPAZ:
1781 case CHIP_TONGA:
1782 case CHIP_FIJI:
1783 case CHIP_POLARIS10:
1784 case CHIP_POLARIS11:
1785 case CHIP_POLARIS12:
1786 case CHIP_VEGAM:
1787 case CHIP_CARRIZO:
1788 case CHIP_STONEY:
1789 case CHIP_VEGA20:
1790 case CHIP_SIENNA_CICHLID:
1791 case CHIP_NAVY_FLOUNDER:
1792 default:
1793 return 0;
1794 case CHIP_VEGA10:
1795 chip_name = "vega10";
1796 break;
1797 case CHIP_VEGA12:
1798 chip_name = "vega12";
1799 break;
1800 case CHIP_RAVEN:
1801 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1802 chip_name = "raven2";
1803 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1804 chip_name = "picasso";
1805 else
1806 chip_name = "raven";
1807 break;
1808 case CHIP_ARCTURUS:
1809 chip_name = "arcturus";
1810 break;
1811 case CHIP_RENOIR:
1812 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1813 chip_name = "renoir";
1814 else
1815 chip_name = "green_sardine";
1816 break;
1817 case CHIP_NAVI10:
1818 chip_name = "navi10";
1819 break;
1820 case CHIP_NAVI14:
1821 chip_name = "navi14";
1822 break;
1823 case CHIP_NAVI12:
1824 chip_name = "navi12";
1825 break;
1826 }
1827
1828 snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1829 err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1830 if (err) {
1831 dev_err(adev->dev,
1832 "Failed to load gpu_info firmware \"%s\"\n",
1833 fw_name);
1834 goto out;
1835 }
1836 err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1837 if (err) {
1838 dev_err(adev->dev,
1839 "Failed to validate gpu_info firmware \"%s\"\n",
1840 fw_name);
1841 goto out;
1842 }
1843
1844 hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1845 amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1846
1847 switch (hdr->version_major) {
1848 case 1:
1849 {
1850 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1851 (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1852 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1853
1854 /*
1855 * Should be dropped when DAL no longer needs it.
1856 */
1857 if (adev->asic_type == CHIP_NAVI12)
1858 goto parse_soc_bounding_box;
1859
1860 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1861 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1862 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1863 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1864 adev->gfx.config.max_texture_channel_caches =
1865 le32_to_cpu(gpu_info_fw->gc_num_tccs);
1866 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1867 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1868 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1869 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1870 adev->gfx.config.double_offchip_lds_buf =
1871 le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1872 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1873 adev->gfx.cu_info.max_waves_per_simd =
1874 le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1875 adev->gfx.cu_info.max_scratch_slots_per_cu =
1876 le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1877 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1878 if (hdr->version_minor >= 1) {
1879 const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1880 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1881 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1882 adev->gfx.config.num_sc_per_sh =
1883 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1884 adev->gfx.config.num_packer_per_sc =
1885 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1886 }
1887
1888 parse_soc_bounding_box:
1889 /*
1890 * soc bounding box info is not integrated in the discovery table,
1891 * so it still needs to be parsed from the gpu_info firmware when needed.
1892 */
1893 if (hdr->version_minor == 2) {
1894 const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1895 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1896 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1897 adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1898 }
1899 break;
1900 }
1901 default:
1902 dev_err(adev->dev,
1903 "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1904 err = -EINVAL;
1905 goto out;
1906 }
1907 out:
1908 return err;
1909 }
1910
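/*
 * A minimal sketch (assumptions noted, not driver code) of the layout
 * the parser above relies on: the versioned gpu_info payload starts at
 * ucode_array_offset_bytes from the beginning of the firmware image and
 * all fields are stored little-endian.
 *
 *	const struct gpu_info_firmware_header_v1_0 *hdr =
 *		(const void *)fw->data;
 *	const struct gpu_info_firmware_v1_0 *info =
 *		(const void *)(fw->data +
 *			le32_to_cpu(hdr->header.ucode_array_offset_bytes));
 *	u32 num_se = le32_to_cpu(info->gc_num_se);
 *
 * Here "fw" stands for the struct firmware pointer returned by
 * request_firmware(); the variable names are illustrative.
 */
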
1911 /**
1912 * amdgpu_device_ip_early_init - run early init for hardware IPs
1913 *
1914 * @adev: amdgpu_device pointer
1915 *
1916 * Early initialization pass for hardware IPs. The hardware IPs that make
1917 * up each asic are discovered and each IP's early_init callback is run. This
1918 * is the first stage in initializing the asic.
1919 * Returns 0 on success, negative error code on failure.
1920 */
1921 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1922 {
1923 int i, r;
1924
1925 amdgpu_device_enable_virtual_display(adev);
1926
1927 if (amdgpu_sriov_vf(adev)) {
1928 r = amdgpu_virt_request_full_gpu(adev, true);
1929 if (r)
1930 return r;
1931 }
1932
1933 switch (adev->asic_type) {
1934 #ifdef CONFIG_DRM_AMDGPU_SI
1935 case CHIP_VERDE:
1936 case CHIP_TAHITI:
1937 case CHIP_PITCAIRN:
1938 case CHIP_OLAND:
1939 case CHIP_HAINAN:
1940 adev->family = AMDGPU_FAMILY_SI;
1941 r = si_set_ip_blocks(adev);
1942 if (r)
1943 return r;
1944 break;
1945 #endif
1946 #ifdef CONFIG_DRM_AMDGPU_CIK
1947 case CHIP_BONAIRE:
1948 case CHIP_HAWAII:
1949 case CHIP_KAVERI:
1950 case CHIP_KABINI:
1951 case CHIP_MULLINS:
1952 if (adev->flags & AMD_IS_APU)
1953 adev->family = AMDGPU_FAMILY_KV;
1954 else
1955 adev->family = AMDGPU_FAMILY_CI;
1956
1957 r = cik_set_ip_blocks(adev);
1958 if (r)
1959 return r;
1960 break;
1961 #endif
1962 case CHIP_TOPAZ:
1963 case CHIP_TONGA:
1964 case CHIP_FIJI:
1965 case CHIP_POLARIS10:
1966 case CHIP_POLARIS11:
1967 case CHIP_POLARIS12:
1968 case CHIP_VEGAM:
1969 case CHIP_CARRIZO:
1970 case CHIP_STONEY:
1971 if (adev->flags & AMD_IS_APU)
1972 adev->family = AMDGPU_FAMILY_CZ;
1973 else
1974 adev->family = AMDGPU_FAMILY_VI;
1975
1976 r = vi_set_ip_blocks(adev);
1977 if (r)
1978 return r;
1979 break;
1980 case CHIP_VEGA10:
1981 case CHIP_VEGA12:
1982 case CHIP_VEGA20:
1983 case CHIP_RAVEN:
1984 case CHIP_ARCTURUS:
1985 case CHIP_RENOIR:
1986 if (adev->flags & AMD_IS_APU)
1987 adev->family = AMDGPU_FAMILY_RV;
1988 else
1989 adev->family = AMDGPU_FAMILY_AI;
1990
1991 r = soc15_set_ip_blocks(adev);
1992 if (r)
1993 return r;
1994 break;
1995 case CHIP_NAVI10:
1996 case CHIP_NAVI14:
1997 case CHIP_NAVI12:
1998 case CHIP_SIENNA_CICHLID:
1999 case CHIP_NAVY_FLOUNDER:
2000 adev->family = AMDGPU_FAMILY_NV;
2001
2002 r = nv_set_ip_blocks(adev);
2003 if (r)
2004 return r;
2005 break;
2006 default:
2007 /* FIXME: not supported yet */
2008 return -EINVAL;
2009 }
2010
2011 amdgpu_amdkfd_device_probe(adev);
2012
2013 adev->pm.pp_feature = amdgpu_pp_feature_mask;
2014 if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2015 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2016
2017 for (i = 0; i < adev->num_ip_blocks; i++) {
2018 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2019 DRM_ERROR("disabled ip block: %d <%s>\n",
2020 i, adev->ip_blocks[i].version->funcs->name);
2021 adev->ip_blocks[i].status.valid = false;
2022 } else {
2023 if (adev->ip_blocks[i].version->funcs->early_init) {
2024 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2025 if (r == -ENOENT) {
2026 adev->ip_blocks[i].status.valid = false;
2027 } else if (r) {
2028 DRM_ERROR("early_init of IP block <%s> failed %d\n",
2029 adev->ip_blocks[i].version->funcs->name, r);
2030 return r;
2031 } else {
2032 adev->ip_blocks[i].status.valid = true;
2033 }
2034 } else {
2035 adev->ip_blocks[i].status.valid = true;
2036 }
2037 }
2038 /* get the vbios after the asic_funcs are set up */
2039 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2040 r = amdgpu_device_parse_gpu_info_fw(adev);
2041 if (r)
2042 return r;
2043
2044 /* Read BIOS */
2045 if (!amdgpu_get_bios(adev))
2046 return -EINVAL;
2047
2048 r = amdgpu_atombios_init(adev);
2049 if (r) {
2050 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2051 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2052 return r;
2053 }
2054
2055 /* get pf2vf msg info at its earliest time */
2056 if (amdgpu_sriov_vf(adev))
2057 amdgpu_virt_init_data_exchange(adev);
2058
2059 }
2060 }
2061
2062 adev->cg_flags &= amdgpu_cg_mask;
2063 adev->pg_flags &= amdgpu_pg_mask;
2064
2065 return 0;
2066 }
2067
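/*
 * Editor's note (illustrative): amdgpu_ip_block_mask, consumed in the
 * loop above, is a bitmask in which bit i corresponds to IP block i in
 * registration order. To disable the block registered at index 5 on a
 * test system one might load the driver with:
 *
 *	modprobe amdgpu ip_block_mask=0xffffffdf
 *
 * The index-to-block mapping is ASIC dependent, so check the
 * "add ip block number %d <%s>" log lines first.
 */
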
2068 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2069 {
2070 int i, r;
2071
2072 for (i = 0; i < adev->num_ip_blocks; i++) {
2073 if (!adev->ip_blocks[i].status.sw)
2074 continue;
2075 if (adev->ip_blocks[i].status.hw)
2076 continue;
2077 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2078 (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2079 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2080 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2081 if (r) {
2082 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2083 adev->ip_blocks[i].version->funcs->name, r);
2084 return r;
2085 }
2086 adev->ip_blocks[i].status.hw = true;
2087 }
2088 }
2089
2090 return 0;
2091 }
2092
2093 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2094 {
2095 int i, r;
2096
2097 for (i = 0; i < adev->num_ip_blocks; i++) {
2098 if (!adev->ip_blocks[i].status.sw)
2099 continue;
2100 if (adev->ip_blocks[i].status.hw)
2101 continue;
2102 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2103 if (r) {
2104 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2105 adev->ip_blocks[i].version->funcs->name, r);
2106 return r;
2107 }
2108 adev->ip_blocks[i].status.hw = true;
2109 }
2110
2111 return 0;
2112 }
2113
2114 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2115 {
2116 int r = 0;
2117 int i;
2118 uint32_t smu_version;
2119
2120 if (adev->asic_type >= CHIP_VEGA10) {
2121 for (i = 0; i < adev->num_ip_blocks; i++) {
2122 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2123 continue;
2124
2125 /* no need to do the fw loading again if already done*/
2126 if (adev->ip_blocks[i].status.hw == true)
2127 break;
2128
2129 if (amdgpu_in_reset(adev) || adev->in_suspend) {
2130 r = adev->ip_blocks[i].version->funcs->resume(adev);
2131 if (r) {
2132 DRM_ERROR("resume of IP block <%s> failed %d\n",
2133 adev->ip_blocks[i].version->funcs->name, r);
2134 return r;
2135 }
2136 } else {
2137 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2138 if (r) {
2139 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2140 adev->ip_blocks[i].version->funcs->name, r);
2141 return r;
2142 }
2143 }
2144
2145 adev->ip_blocks[i].status.hw = true;
2146 break;
2147 }
2148 }
2149
2150 if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2151 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2152
2153 return r;
2154 }
2155
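/*
 * Orientation sketch (comment only): amdgpu_device_ip_init() below
 * strings the hardware bring-up helpers together roughly as follows.
 *
 *	sw_init for all valid blocks (GMC also gets hw_init early so
 *	VRAM scratch, write-back and the CSA can be allocated)
 *	amdgpu_device_ip_hw_init_phase1()  - COMMON, IH (and PSP for VF)
 *	amdgpu_device_fw_loading()         - PSP hw_init/resume, SMU firmware
 *	amdgpu_device_ip_hw_init_phase2()  - all remaining blocks
 */
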
2156 /**
2157 * amdgpu_device_ip_init - run init for hardware IPs
2158 *
2159 * @adev: amdgpu_device pointer
2160 *
2161 * Main initialization pass for hardware IPs. The list of all the hardware
2162 * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2163 * are run. sw_init initializes the software state associated with each IP
2164 * and hw_init initializes the hardware associated with each IP.
2165 * Returns 0 on success, negative error code on failure.
2166 */
2167 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2168 {
2169 int i, r;
2170
2171 r = amdgpu_ras_init(adev);
2172 if (r)
2173 return r;
2174
2175 for (i = 0; i < adev->num_ip_blocks; i++) {
2176 if (!adev->ip_blocks[i].status.valid)
2177 continue;
2178 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2179 if (r) {
2180 DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2181 adev->ip_blocks[i].version->funcs->name, r);
2182 goto init_failed;
2183 }
2184 adev->ip_blocks[i].status.sw = true;
2185
2186 /* need to do gmc hw init early so we can allocate gpu mem */
2187 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2188 /* Try to reserve bad pages early */
2189 if (amdgpu_sriov_vf(adev))
2190 amdgpu_virt_exchange_data(adev);
2191
2192 r = amdgpu_device_vram_scratch_init(adev);
2193 if (r) {
2194 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2195 goto init_failed;
2196 }
2197 r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2198 if (r) {
2199 DRM_ERROR("hw_init %d failed %d\n", i, r);
2200 goto init_failed;
2201 }
2202 r = amdgpu_device_wb_init(adev);
2203 if (r) {
2204 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2205 goto init_failed;
2206 }
2207 adev->ip_blocks[i].status.hw = true;
2208
2209 /* right after GMC hw init, we create CSA */
2210 if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2211 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2212 AMDGPU_GEM_DOMAIN_VRAM,
2213 AMDGPU_CSA_SIZE);
2214 if (r) {
2215 DRM_ERROR("allocate CSA failed %d\n", r);
2216 goto init_failed;
2217 }
2218 }
2219 }
2220 }
2221
2222 if (amdgpu_sriov_vf(adev))
2223 amdgpu_virt_init_data_exchange(adev);
2224
2225 r = amdgpu_ib_pool_init(adev);
2226 if (r) {
2227 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2228 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2229 goto init_failed;
2230 }
2231
2232 r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2233 if (r)
2234 goto init_failed;
2235
2236 r = amdgpu_device_ip_hw_init_phase1(adev);
2237 if (r)
2238 goto init_failed;
2239
2240 r = amdgpu_device_fw_loading(adev);
2241 if (r)
2242 goto init_failed;
2243
2244 r = amdgpu_device_ip_hw_init_phase2(adev);
2245 if (r)
2246 goto init_failed;
2247
2248 /*
2249 * retired pages will be loaded from eeprom and reserved here,
2250 * it should be called after amdgpu_device_ip_hw_init_phase2 since
2251 * for some ASICs the RAS EEPROM code relies on the SMU being fully
2252 * functional for I2C communication, which is only true at this point.
2253 *
2254 * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2255 * about failures caused by a bad gpu situation and stops the amdgpu
2256 * init process accordingly. For other failure cases it still releases
2257 * all the resources and prints an error message, rather than returning
2258 * a negative value to the upper level.
2259 *
2260 * Note: theoretically, this should be called before all vram allocations
2261 * to protect retired pages from being abused.
2262 */
2263 r = amdgpu_ras_recovery_init(adev);
2264 if (r)
2265 goto init_failed;
2266
2267 if (adev->gmc.xgmi.num_physical_nodes > 1)
2268 amdgpu_xgmi_add_device(adev);
2269 amdgpu_amdkfd_device_init(adev);
2270
2271 amdgpu_fru_get_product_info(adev);
2272
2273 init_failed:
2274 if (amdgpu_sriov_vf(adev))
2275 amdgpu_virt_release_full_gpu(adev, true);
2276
2277 return r;
2278 }
2279
2280 /**
2281 * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2282 *
2283 * @adev: amdgpu_device pointer
2284 *
2285 * Writes a reset magic value to the gart pointer in VRAM. The driver calls
2286 * this function before a GPU reset. If the value is retained after a
2287 * GPU reset, VRAM has not been lost. Some GPU resets may destroy VRAM contents.
2288 */
2289 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2290 {
2291 memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2292 }
2293
2294 /**
2295 * amdgpu_device_check_vram_lost - check if vram is valid
2296 *
2297 * @adev: amdgpu_device pointer
2298 *
2299 * Checks the reset magic value written to the gart pointer in VRAM.
2300 * The driver calls this after a GPU reset to see if the contents of
2301 * VRAM has been lost or not.
2302 * returns true if vram is lost, false if not.
2303 */
2304 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2305 {
2306 if (memcmp(adev->gart.ptr, adev->reset_magic,
2307 AMDGPU_RESET_MAGIC_NUM))
2308 return true;
2309
2310 if (!amdgpu_in_reset(adev))
2311 return false;
2312
2313 /*
2314 * For all ASICs with baco/mode1 reset, the VRAM is
2315 * always assumed to be lost.
2316 */
2317 switch (amdgpu_asic_reset_method(adev)) {
2318 case AMD_RESET_METHOD_BACO:
2319 case AMD_RESET_METHOD_MODE1:
2320 return true;
2321 default:
2322 return false;
2323 }
2324 }
2325
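/*
 * A minimal usage sketch (not driver code): how the two helpers above
 * pair up around a GPU reset; the surrounding steps are elided.
 *
 *	amdgpu_device_fill_reset_magic(adev);       // done at late init
 *	... ASIC reset happens ...
 *	if (amdgpu_device_check_vram_lost(adev))
 *		... treat VRAM contents as lost and restore them ...
 *
 * The real call sites are amdgpu_device_ip_late_init() and the GPU
 * reset path later in this file.
 */
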
2326 /**
2327 * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2328 *
2329 * @adev: amdgpu_device pointer
2330 * @state: clockgating state (gate or ungate)
2331 *
2332 * The list of all the hardware IPs that make up the asic is walked and the
2333 * set_clockgating_state callbacks are run.
2334 * The late initialization pass enables clockgating for the hardware IPs,
2335 * while the fini or suspend pass disables clockgating again.
2336 * Returns 0 on success, negative error code on failure.
2337 */
2338
2339 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2340 enum amd_clockgating_state state)
2341 {
2342 int i, j, r;
2343
2344 if (amdgpu_emu_mode == 1)
2345 return 0;
2346
2347 for (j = 0; j < adev->num_ip_blocks; j++) {
2348 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2349 if (!adev->ip_blocks[i].status.late_initialized)
2350 continue;
2351 /* skip CG for VCE/UVD, it's handled specially */
2352 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2353 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2354 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2355 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2356 adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2357 /* enable clockgating to save power */
2358 r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2359 state);
2360 if (r) {
2361 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2362 adev->ip_blocks[i].version->funcs->name, r);
2363 return r;
2364 }
2365 }
2366 }
2367
2368 return 0;
2369 }
2370
2371 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2372 {
2373 int i, j, r;
2374
2375 if (amdgpu_emu_mode == 1)
2376 return 0;
2377
2378 for (j = 0; j < adev->num_ip_blocks; j++) {
2379 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2380 if (!adev->ip_blocks[i].status.late_initialized)
2381 continue;
2382 /* skip PG for VCE/UVD, it's handled specially */
2383 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2384 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2385 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2386 adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2387 adev->ip_blocks[i].version->funcs->set_powergating_state) {
2388 /* enable powergating to save power */
2389 r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2390 state);
2391 if (r) {
2392 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2393 adev->ip_blocks[i].version->funcs->name, r);
2394 return r;
2395 }
2396 }
2397 }
2398 return 0;
2399 }
2400
2401 static int amdgpu_device_enable_mgpu_fan_boost(void)
2402 {
2403 struct amdgpu_gpu_instance *gpu_ins;
2404 struct amdgpu_device *adev;
2405 int i, ret = 0;
2406
2407 mutex_lock(&mgpu_info.mutex);
2408
2409 /*
2410 * MGPU fan boost feature should be enabled
2411 * only when there are two or more dGPUs in
2412 * the system
2413 */
2414 if (mgpu_info.num_dgpu < 2)
2415 goto out;
2416
2417 for (i = 0; i < mgpu_info.num_dgpu; i++) {
2418 gpu_ins = &(mgpu_info.gpu_ins[i]);
2419 adev = gpu_ins->adev;
2420 if (!(adev->flags & AMD_IS_APU) &&
2421 !gpu_ins->mgpu_fan_enabled) {
2422 ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2423 if (ret)
2424 break;
2425
2426 gpu_ins->mgpu_fan_enabled = 1;
2427 }
2428 }
2429
2430 out:
2431 mutex_unlock(&mgpu_info.mutex);
2432
2433 return ret;
2434 }
2435
2436 /**
2437 * amdgpu_device_ip_late_init - run late init for hardware IPs
2438 *
2439 * @adev: amdgpu_device pointer
2440 *
2441 * Late initialization pass for hardware IPs. The list of all the hardware
2442 * IPs that make up the asic is walked and the late_init callbacks are run.
2443 * late_init covers any special initialization that an IP requires
2444 * after all of the IPs have been initialized or something that needs to happen
2445 * late in the init process.
2446 * Returns 0 on success, negative error code on failure.
2447 */
2448 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2449 {
2450 struct amdgpu_gpu_instance *gpu_instance;
2451 int i = 0, r;
2452
2453 for (i = 0; i < adev->num_ip_blocks; i++) {
2454 if (!adev->ip_blocks[i].status.hw)
2455 continue;
2456 if (adev->ip_blocks[i].version->funcs->late_init) {
2457 r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2458 if (r) {
2459 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2460 adev->ip_blocks[i].version->funcs->name, r);
2461 return r;
2462 }
2463 }
2464 adev->ip_blocks[i].status.late_initialized = true;
2465 }
2466
2467 amdgpu_ras_set_error_query_ready(adev, true);
2468
2469 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2470 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2471
2472 amdgpu_device_fill_reset_magic(adev);
2473
2474 r = amdgpu_device_enable_mgpu_fan_boost();
2475 if (r)
2476 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2477
2478
2479 if (adev->gmc.xgmi.num_physical_nodes > 1) {
2480 mutex_lock(&mgpu_info.mutex);
2481
2482 /*
2483 * Reset device p-state to low as this was booted with high.
2484 *
2485 * This should be performed only after all devices from the same
2486 * hive have been initialized.
2487 *
2488 * However, the number of devices in the hive is not known in
2489 * advance, as it is counted one by one during device initialization.
2490 *
2491 * So, we wait until all XGMI-interlinked devices have initialized.
2492 * This may bring some delays as those devices may come from
2493 * different hives. But that should be OK.
2494 */
2495 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2496 for (i = 0; i < mgpu_info.num_gpu; i++) {
2497 gpu_instance = &(mgpu_info.gpu_ins[i]);
2498 if (gpu_instance->adev->flags & AMD_IS_APU)
2499 continue;
2500
2501 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2502 AMDGPU_XGMI_PSTATE_MIN);
2503 if (r) {
2504 DRM_ERROR("pstate setting failed (%d).\n", r);
2505 break;
2506 }
2507 }
2508 }
2509
2510 mutex_unlock(&mgpu_info.mutex);
2511 }
2512
2513 return 0;
2514 }
2515
2516 /**
2517 * amdgpu_device_ip_fini - run fini for hardware IPs
2518 *
2519 * @adev: amdgpu_device pointer
2520 *
2521 * Main teardown pass for hardware IPs. The list of all the hardware
2522 * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2523 * are run. hw_fini tears down the hardware associated with each IP
2524 * and sw_fini tears down any software state associated with each IP.
2525 * Returns 0 on success, negative error code on failure.
2526 */
2527 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2528 {
2529 int i, r;
2530
2531 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2532 amdgpu_virt_release_ras_err_handler_data(adev);
2533
2534 amdgpu_ras_pre_fini(adev);
2535
2536 if (adev->gmc.xgmi.num_physical_nodes > 1)
2537 amdgpu_xgmi_remove_device(adev);
2538
2539 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2540 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2541
2542 amdgpu_amdkfd_device_fini(adev);
2543
2544 /* need to disable SMC first */
2545 for (i = 0; i < adev->num_ip_blocks; i++) {
2546 if (!adev->ip_blocks[i].status.hw)
2547 continue;
2548 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2549 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2550 /* XXX handle errors */
2551 if (r) {
2552 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2553 adev->ip_blocks[i].version->funcs->name, r);
2554 }
2555 adev->ip_blocks[i].status.hw = false;
2556 break;
2557 }
2558 }
2559
2560 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2561 if (!adev->ip_blocks[i].status.hw)
2562 continue;
2563
2564 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2565 /* XXX handle errors */
2566 if (r) {
2567 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2568 adev->ip_blocks[i].version->funcs->name, r);
2569 }
2570
2571 adev->ip_blocks[i].status.hw = false;
2572 }
2573
2574
2575 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2576 if (!adev->ip_blocks[i].status.sw)
2577 continue;
2578
2579 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2580 amdgpu_ucode_free_bo(adev);
2581 amdgpu_free_static_csa(&adev->virt.csa_obj);
2582 amdgpu_device_wb_fini(adev);
2583 amdgpu_device_vram_scratch_fini(adev);
2584 amdgpu_ib_pool_fini(adev);
2585 }
2586
2587 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2588 /* XXX handle errors */
2589 if (r) {
2590 DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2591 adev->ip_blocks[i].version->funcs->name, r);
2592 }
2593 adev->ip_blocks[i].status.sw = false;
2594 adev->ip_blocks[i].status.valid = false;
2595 }
2596
2597 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2598 if (!adev->ip_blocks[i].status.late_initialized)
2599 continue;
2600 if (adev->ip_blocks[i].version->funcs->late_fini)
2601 adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2602 adev->ip_blocks[i].status.late_initialized = false;
2603 }
2604
2605 amdgpu_ras_fini(adev);
2606
2607 if (amdgpu_sriov_vf(adev))
2608 if (amdgpu_virt_release_full_gpu(adev, false))
2609 DRM_ERROR("failed to release exclusive mode on fini\n");
2610
2611 return 0;
2612 }
2613
2614 /**
2615 * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2616 *
2617 * @work: work_struct.
2618 */
2619 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2620 {
2621 struct amdgpu_device *adev =
2622 container_of(work, struct amdgpu_device, delayed_init_work.work);
2623 int r;
2624
2625 r = amdgpu_ib_ring_tests(adev);
2626 if (r)
2627 DRM_ERROR("ib ring test failed (%d).\n", r);
2628 }
2629
2630 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2631 {
2632 struct amdgpu_device *adev =
2633 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2634
2635 WARN_ON_ONCE(adev->gfx.gfx_off_state);
2636 WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2637
2638 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2639 adev->gfx.gfx_off_state = true;
2640 }
2641
2642 /**
2643 * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2644 *
2645 * @adev: amdgpu_device pointer
2646 *
2647 * First suspend pass for hardware IPs. Clockgating and powergating are
2648 * disabled and the suspend callbacks are run for the display (DCE) IP
2649 * blocks only; the remaining blocks are handled in phase 2. suspend puts
2650 * the hardware and software state in each IP into a state suitable for suspend.
2651 * Returns 0 on success, negative error code on failure.
2652 */
2653 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2654 {
2655 int i, r;
2656
2657 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2658 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2659
2660 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2661 if (!adev->ip_blocks[i].status.valid)
2662 continue;
2663
2664 /* displays are handled separately */
2665 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2666 continue;
2667
2668 /* XXX handle errors */
2669 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2670 /* XXX handle errors */
2671 if (r) {
2672 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2673 adev->ip_blocks[i].version->funcs->name, r);
2674 return r;
2675 }
2676
2677 adev->ip_blocks[i].status.hw = false;
2678 }
2679
2680 return 0;
2681 }
2682
2683 /**
2684 * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2685 *
2686 * @adev: amdgpu_device pointer
2687 *
2688 * Second suspend pass for hardware IPs. The list of all the hardware
2689 * IPs that make up the asic is walked and the suspend callbacks are run
2690 * for every IP block except the displays, which were handled in phase 1.
2691 * suspend puts the hardware and software state in each IP into a state suitable for suspend.
2692 * Returns 0 on success, negative error code on failure.
2693 */
2694 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2695 {
2696 int i, r;
2697
2698 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2699 if (!adev->ip_blocks[i].status.valid)
2700 continue;
2701 /* displays are handled in phase1 */
2702 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2703 continue;
2704 /* PSP lost connection when err_event_athub occurs */
2705 if (amdgpu_ras_intr_triggered() &&
2706 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2707 adev->ip_blocks[i].status.hw = false;
2708 continue;
2709 }
2710 /* XXX handle errors */
2711 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2712 /* XXX handle errors */
2713 if (r) {
2714 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2715 adev->ip_blocks[i].version->funcs->name, r);
2716 }
2717 adev->ip_blocks[i].status.hw = false;
2718 /* handle putting the SMC in the appropriate state */
2719 if (!amdgpu_sriov_vf(adev)) {
2720 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2721 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2722 if (r) {
2723 DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2724 adev->mp1_state, r);
2725 return r;
2726 }
2727 }
2728 }
2729 adev->ip_blocks[i].status.hw = false;
2730 }
2731
2732 return 0;
2733 }
2734
2735 /**
2736 * amdgpu_device_ip_suspend - run suspend for hardware IPs
2737 *
2738 * @adev: amdgpu_device pointer
2739 *
2740 * Main suspend function for hardware IPs. The list of all the hardware
2741 * IPs that make up the asic is walked, clockgating is disabled and the
2742 * suspend callbacks are run. suspend puts the hardware and software state
2743 * in each IP into a state suitable for suspend.
2744 * Returns 0 on success, negative error code on failure.
2745 */
2746 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2747 {
2748 int r;
2749
2750 if (amdgpu_sriov_vf(adev))
2751 amdgpu_virt_request_full_gpu(adev, false);
2752
2753 r = amdgpu_device_ip_suspend_phase1(adev);
2754 if (r)
2755 return r;
2756 r = amdgpu_device_ip_suspend_phase2(adev);
2757
2758 if (amdgpu_sriov_vf(adev))
2759 amdgpu_virt_release_full_gpu(adev, false);
2760
2761 return r;
2762 }
2763
2764 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2765 {
2766 int i, r;
2767
2768 static enum amd_ip_block_type ip_order[] = {
2769 AMD_IP_BLOCK_TYPE_GMC,
2770 AMD_IP_BLOCK_TYPE_COMMON,
2771 AMD_IP_BLOCK_TYPE_PSP,
2772 AMD_IP_BLOCK_TYPE_IH,
2773 };
2774
2775 for (i = 0; i < adev->num_ip_blocks; i++) {
2776 int j;
2777 struct amdgpu_ip_block *block;
2778
2779 block = &adev->ip_blocks[i];
2780 block->status.hw = false;
2781
2782 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2783
2784 if (block->version->type != ip_order[j] ||
2785 !block->status.valid)
2786 continue;
2787
2788 r = block->version->funcs->hw_init(adev);
2789 DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2790 if (r)
2791 return r;
2792 block->status.hw = true;
2793 }
2794 }
2795
2796 return 0;
2797 }
2798
2799 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2800 {
2801 int i, r;
2802
2803 static enum amd_ip_block_type ip_order[] = {
2804 AMD_IP_BLOCK_TYPE_SMC,
2805 AMD_IP_BLOCK_TYPE_DCE,
2806 AMD_IP_BLOCK_TYPE_GFX,
2807 AMD_IP_BLOCK_TYPE_SDMA,
2808 AMD_IP_BLOCK_TYPE_UVD,
2809 AMD_IP_BLOCK_TYPE_VCE,
2810 AMD_IP_BLOCK_TYPE_VCN
2811 };
2812
2813 for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2814 int j;
2815 struct amdgpu_ip_block *block;
2816
2817 for (j = 0; j < adev->num_ip_blocks; j++) {
2818 block = &adev->ip_blocks[j];
2819
2820 if (block->version->type != ip_order[i] ||
2821 !block->status.valid ||
2822 block->status.hw)
2823 continue;
2824
2825 if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2826 r = block->version->funcs->resume(adev);
2827 else
2828 r = block->version->funcs->hw_init(adev);
2829
2830 DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2831 if (r)
2832 return r;
2833 block->status.hw = true;
2834 }
2835 }
2836
2837 return 0;
2838 }
2839
2840 /**
2841 * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2842 *
2843 * @adev: amdgpu_device pointer
2844 *
2845 * First resume function for hardware IPs. The list of all the hardware
2846 * IPs that make up the asic is walked and the resume callbacks are run for
2847 * COMMON, GMC, and IH. resume puts the hardware into a functional state
2848 * after a suspend and updates the software state as necessary. This
2849 * function is also used for restoring the GPU after a GPU reset.
2850 * Returns 0 on success, negative error code on failure.
2851 */
2852 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2853 {
2854 int i, r;
2855
2856 for (i = 0; i < adev->num_ip_blocks; i++) {
2857 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2858 continue;
2859 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2860 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2861 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2862
2863 r = adev->ip_blocks[i].version->funcs->resume(adev);
2864 if (r) {
2865 DRM_ERROR("resume of IP block <%s> failed %d\n",
2866 adev->ip_blocks[i].version->funcs->name, r);
2867 return r;
2868 }
2869 adev->ip_blocks[i].status.hw = true;
2870 }
2871 }
2872
2873 return 0;
2874 }
2875
2876 /**
2877 * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2878 *
2879 * @adev: amdgpu_device pointer
2880 *
2881 * Second resume function for hardware IPs. The list of all the hardware
2882 * IPs that make up the asic is walked and the resume callbacks are run for
2883 * all blocks except COMMON, GMC, and IH. resume puts the hardware into a
2884 * functional state after a suspend and updates the software state as
2885 * necessary. This function is also used for restoring the GPU after a GPU
2886 * reset.
2887 * Returns 0 on success, negative error code on failure.
2888 */
2889 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2890 {
2891 int i, r;
2892
2893 for (i = 0; i < adev->num_ip_blocks; i++) {
2894 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2895 continue;
2896 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2897 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2898 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2899 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2900 continue;
2901 r = adev->ip_blocks[i].version->funcs->resume(adev);
2902 if (r) {
2903 DRM_ERROR("resume of IP block <%s> failed %d\n",
2904 adev->ip_blocks[i].version->funcs->name, r);
2905 return r;
2906 }
2907 adev->ip_blocks[i].status.hw = true;
2908 }
2909
2910 return 0;
2911 }
2912
2913 /**
2914 * amdgpu_device_ip_resume - run resume for hardware IPs
2915 *
2916 * @adev: amdgpu_device pointer
2917 *
2918 * Main resume function for hardware IPs. The hardware IPs
2919 * are split into two resume functions because they are
2920 * also used in recovering from a GPU reset and some additional
2921 * steps need to be taken between them. In this case (S3/S4) they are
2922 * run sequentially.
2923 * Returns 0 on success, negative error code on failure.
2924 */
2925 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2926 {
2927 int r;
2928
2929 r = amdgpu_amdkfd_resume_iommu(adev);
2930 if (r)
2931 return r;
2932
2933 r = amdgpu_device_ip_resume_phase1(adev);
2934 if (r)
2935 return r;
2936
2937 r = amdgpu_device_fw_loading(adev);
2938 if (r)
2939 return r;
2940
2941 r = amdgpu_device_ip_resume_phase2(adev);
2942
2943 return r;
2944 }
2945
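/*
 * Orientation sketch (comment only): during a GPU reset the two resume
 * phases are not run back-to-back as in the S3/S4 path above; the reset
 * code inserts extra steps between them, roughly:
 *
 *	amdgpu_device_ip_resume_phase1(adev);   // COMMON, GMC, IH
 *	... re-post the vbios / restore VRAM if it was lost ...
 *	amdgpu_device_fw_loading(adev);
 *	amdgpu_device_ip_resume_phase2(adev);   // remaining blocks
 */
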
2946 /**
2947 * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2948 *
2949 * @adev: amdgpu_device pointer
2950 *
2951 * Query the VBIOS data tables to determine if the board supports SR-IOV.
2952 */
2953 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2954 {
2955 if (amdgpu_sriov_vf(adev)) {
2956 if (adev->is_atom_fw) {
2957 if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2958 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2959 } else {
2960 if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2961 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2962 }
2963
2964 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2965 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2966 }
2967 }
2968
2969 /**
2970 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2971 *
2972 * @asic_type: AMD asic type
2973 *
2974 * Check if there is DC (new modesetting infrastructure) support for an asic.
2975 * returns true if DC has support, false if not.
2976 */
2977 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2978 {
2979 switch (asic_type) {
2980 #if defined(CONFIG_DRM_AMD_DC)
2981 #if defined(CONFIG_DRM_AMD_DC_SI)
2982 case CHIP_TAHITI:
2983 case CHIP_PITCAIRN:
2984 case CHIP_VERDE:
2985 case CHIP_OLAND:
2986 #endif
2987 case CHIP_BONAIRE:
2988 case CHIP_KAVERI:
2989 case CHIP_KABINI:
2990 case CHIP_MULLINS:
2991 /*
2992 * We have systems in the wild with these ASICs that require
2993 * LVDS and VGA support which is not supported with DC.
2994 *
2995 * Fallback to the non-DC driver here by default so as not to
2996 * cause regressions.
2997 */
2998 return amdgpu_dc > 0;
2999 case CHIP_HAWAII:
3000 case CHIP_CARRIZO:
3001 case CHIP_STONEY:
3002 case CHIP_POLARIS10:
3003 case CHIP_POLARIS11:
3004 case CHIP_POLARIS12:
3005 case CHIP_VEGAM:
3006 case CHIP_TONGA:
3007 case CHIP_FIJI:
3008 case CHIP_VEGA10:
3009 case CHIP_VEGA12:
3010 case CHIP_VEGA20:
3011 #if defined(CONFIG_DRM_AMD_DC_DCN)
3012 case CHIP_RAVEN:
3013 case CHIP_NAVI10:
3014 case CHIP_NAVI14:
3015 case CHIP_NAVI12:
3016 case CHIP_RENOIR:
3017 #endif
3018 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3019 case CHIP_SIENNA_CICHLID:
3020 case CHIP_NAVY_FLOUNDER:
3021 #endif
3022 return amdgpu_dc != 0;
3023 #endif
3024 default:
3025 if (amdgpu_dc > 0)
3026 DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3027 "but isn't supported by ASIC, ignoring\n");
3028 return false;
3029 }
3030 }
3031
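/*
 * Editor's note (illustrative): the amdgpu_dc module parameter checked
 * above follows the usual -1/0/1 convention: -1 (the default) lets the
 * driver decide per ASIC, 0 forces the legacy display path, and a
 * positive value opts in to DC even on the LVDS/VGA-era ASICs handled
 * first in the switch, e.g.:
 *
 *	modprobe amdgpu dc=1
 */
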
3032 /**
3033 * amdgpu_device_has_dc_support - check if dc is supported
3034 *
3035 * @adev: amdgpu_device pointer
3036 *
3037 * Returns true for supported, false for not supported
3038 */
3039 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3040 {
3041 if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3042 return false;
3043
3044 return amdgpu_device_asic_has_dc_support(adev->asic_type);
3045 }
3046
3047
3048 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3049 {
3050 struct amdgpu_device *adev =
3051 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3052 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3053
3054 /* It's a bug to not have a hive within this function */
3055 if (WARN_ON(!hive))
3056 return;
3057
3058 /*
3059 * Use task barrier to synchronize all xgmi reset works across the
3060 * hive. task_barrier_enter and task_barrier_exit will block
3061 * until all the threads running the xgmi reset works reach
3062 * those points. task_barrier_full will do both blocks.
3063 */
3064 if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3065
3066 task_barrier_enter(&hive->tb);
3067 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3068
3069 if (adev->asic_reset_res)
3070 goto fail;
3071
3072 task_barrier_exit(&hive->tb);
3073 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3074
3075 if (adev->asic_reset_res)
3076 goto fail;
3077
3078 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3079 adev->mmhub.funcs->reset_ras_error_count(adev);
3080 } else {
3081
3082 task_barrier_full(&hive->tb);
3083 adev->asic_reset_res = amdgpu_asic_reset(adev);
3084 }
3085
3086 fail:
3087 if (adev->asic_reset_res)
3088 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3089 adev->asic_reset_res, adev_to_drm(adev)->unique);
3090 amdgpu_put_xgmi_hive(hive);
3091 }
3092
3093 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3094 {
3095 char *input = amdgpu_lockup_timeout;
3096 char *timeout_setting = NULL;
3097 int index = 0;
3098 long timeout;
3099 int ret = 0;
3100
3101 /*
3102 * By default the timeout for non-compute jobs is 10000 ms,
3103 * and there is no timeout enforced on compute jobs.
3104 * In SR-IOV or passthrough mode, the timeout for compute
3105 * jobs is 60000 ms by default.
3106 */
3107 adev->gfx_timeout = msecs_to_jiffies(10000);
3108 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3109 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3110 adev->compute_timeout = msecs_to_jiffies(60000);
3111 else
3112 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3113
3114 if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3115 while ((timeout_setting = strsep(&input, ",")) &&
3116 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3117 ret = kstrtol(timeout_setting, 0, &timeout);
3118 if (ret)
3119 return ret;
3120
3121 if (timeout == 0) {
3122 index++;
3123 continue;
3124 } else if (timeout < 0) {
3125 timeout = MAX_SCHEDULE_TIMEOUT;
3126 } else {
3127 timeout = msecs_to_jiffies(timeout);
3128 }
3129
3130 switch (index++) {
3131 case 0:
3132 adev->gfx_timeout = timeout;
3133 break;
3134 case 1:
3135 adev->compute_timeout = timeout;
3136 break;
3137 case 2:
3138 adev->sdma_timeout = timeout;
3139 break;
3140 case 3:
3141 adev->video_timeout = timeout;
3142 break;
3143 default:
3144 break;
3145 }
3146 }
3147 /*
3148 * There is only one value specified and
3149 * it should apply to all non-compute jobs.
3150 */
3151 if (index == 1) {
3152 adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3153 if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3154 adev->compute_timeout = adev->gfx_timeout;
3155 }
3156 }
3157
3158 return ret;
3159 }
3160
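/*
 * Editor's note (illustrative): the lockup_timeout string parsed above
 * takes up to four comma-separated values in milliseconds, in the order
 * gfx, compute, sdma, video. A value of 0 keeps the default and a
 * negative value disables the timeout, e.g.:
 *
 *	modprobe amdgpu lockup_timeout=10000,60000,10000,10000
 *
 * A single value applies to all non-compute queues (and to compute as
 * well under SR-IOV or passthrough).
 */
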
3161 static const struct attribute *amdgpu_dev_attributes[] = {
3162 &dev_attr_product_name.attr,
3163 &dev_attr_product_number.attr,
3164 &dev_attr_serial_number.attr,
3165 &dev_attr_pcie_replay_count.attr,
3166 NULL
3167 };
3168
3169
3170 /**
3171 * amdgpu_device_init - initialize the driver
3172 *
3173 * @adev: amdgpu_device pointer
3174 * @flags: driver flags
3175 *
3176 * Initializes the driver info and hw (all asics).
3177 * Returns 0 for success or an error on failure.
3178 * Called at driver startup.
3179 */
3180 int amdgpu_device_init(struct amdgpu_device *adev,
3181 uint32_t flags)
3182 {
3183 struct drm_device *ddev = adev_to_drm(adev);
3184 struct pci_dev *pdev = adev->pdev;
3185 int r, i;
3186 bool boco = false;
3187 u32 max_MBps;
3188
3189 adev->shutdown = false;
3190 adev->flags = flags;
3191
3192 if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3193 adev->asic_type = amdgpu_force_asic_type;
3194 else
3195 adev->asic_type = flags & AMD_ASIC_MASK;
3196
3197 adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3198 if (amdgpu_emu_mode == 1)
3199 adev->usec_timeout *= 10;
3200 adev->gmc.gart_size = 512 * 1024 * 1024;
3201 adev->accel_working = false;
3202 adev->num_rings = 0;
3203 adev->mman.buffer_funcs = NULL;
3204 adev->mman.buffer_funcs_ring = NULL;
3205 adev->vm_manager.vm_pte_funcs = NULL;
3206 adev->vm_manager.vm_pte_num_scheds = 0;
3207 adev->gmc.gmc_funcs = NULL;
3208 adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3209 bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3210
3211 adev->smc_rreg = &amdgpu_invalid_rreg;
3212 adev->smc_wreg = &amdgpu_invalid_wreg;
3213 adev->pcie_rreg = &amdgpu_invalid_rreg;
3214 adev->pcie_wreg = &amdgpu_invalid_wreg;
3215 adev->pciep_rreg = &amdgpu_invalid_rreg;
3216 adev->pciep_wreg = &amdgpu_invalid_wreg;
3217 adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3218 adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3219 adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3220 adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3221 adev->didt_rreg = &amdgpu_invalid_rreg;
3222 adev->didt_wreg = &amdgpu_invalid_wreg;
3223 adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3224 adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3225 adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3226 adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3227
3228 DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3229 amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3230 pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3231
3232 /* mutex initialization is all done here so we
3233 * can call these functions without locking issues */
3234 atomic_set(&adev->irq.ih.lock, 0);
3235 mutex_init(&adev->firmware.mutex);
3236 mutex_init(&adev->pm.mutex);
3237 mutex_init(&adev->gfx.gpu_clock_mutex);
3238 mutex_init(&adev->srbm_mutex);
3239 mutex_init(&adev->gfx.pipe_reserve_mutex);
3240 mutex_init(&adev->gfx.gfx_off_mutex);
3241 mutex_init(&adev->grbm_idx_mutex);
3242 mutex_init(&adev->mn_lock);
3243 mutex_init(&adev->virt.vf_errors.lock);
3244 hash_init(adev->mn_hash);
3245 atomic_set(&adev->in_gpu_reset, 0);
3246 init_rwsem(&adev->reset_sem);
3247 mutex_init(&adev->psp.mutex);
3248 mutex_init(&adev->notifier_lock);
3249
3250 r = amdgpu_device_check_arguments(adev);
3251 if (r)
3252 return r;
3253
3254 spin_lock_init(&adev->mmio_idx_lock);
3255 spin_lock_init(&adev->smc_idx_lock);
3256 spin_lock_init(&adev->pcie_idx_lock);
3257 spin_lock_init(&adev->uvd_ctx_idx_lock);
3258 spin_lock_init(&adev->didt_idx_lock);
3259 spin_lock_init(&adev->gc_cac_idx_lock);
3260 spin_lock_init(&adev->se_cac_idx_lock);
3261 spin_lock_init(&adev->audio_endpt_idx_lock);
3262 spin_lock_init(&adev->mm_stats.lock);
3263
3264 INIT_LIST_HEAD(&adev->shadow_list);
3265 mutex_init(&adev->shadow_list_lock);
3266
3267 INIT_DELAYED_WORK(&adev->delayed_init_work,
3268 amdgpu_device_delayed_init_work_handler);
3269 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3270 amdgpu_device_delay_enable_gfx_off);
3271
3272 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3273
3274 adev->gfx.gfx_off_req_count = 1;
3275 adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3276
3277 atomic_set(&adev->throttling_logging_enabled, 1);
3278 /*
3279 * If throttling continues, logging will be performed every minute
3280 * to avoid log flooding. "-1" is subtracted since the thermal
3281 * throttling interrupt comes every second. Thus, the total logging
3282 * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3283 * for throttling interrupt) = 60 seconds.
3284 */
3285 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3286 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3287
3288 /* Registers mapping */
3289 /* TODO: block userspace mapping of io register */
3290 if (adev->asic_type >= CHIP_BONAIRE) {
3291 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3292 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3293 } else {
3294 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3295 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3296 }
3297
3298 adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3299 if (adev->rmmio == NULL) {
3300 return -ENOMEM;
3301 }
3302 DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3303 DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3304
3305 /* io port mapping */
3306 for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3307 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3308 adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3309 adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3310 break;
3311 }
3312 }
3313 if (adev->rio_mem == NULL)
3314 DRM_INFO("PCI I/O BAR is not found.\n");
3315
3316 /* enable PCIE atomic ops */
3317 r = pci_enable_atomic_ops_to_root(adev->pdev,
3318 PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3319 PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3320 if (r) {
3321 adev->have_atomics_support = false;
3322 DRM_INFO("PCIE atomic ops is not supported\n");
3323 } else {
3324 adev->have_atomics_support = true;
3325 }
3326
3327 amdgpu_device_get_pcie_info(adev);
3328
3329 if (amdgpu_mcbp)
3330 DRM_INFO("MCBP is enabled\n");
3331
3332 if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3333 adev->enable_mes = true;
3334
3335 /* detect hw virtualization here */
3336 amdgpu_detect_virtualization(adev);
3337
3338 r = amdgpu_device_get_job_timeout_settings(adev);
3339 if (r) {
3340 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3341 return r;
3342 }
3343
3344 /* early init functions */
3345 r = amdgpu_device_ip_early_init(adev);
3346 if (r)
3347 return r;
3348
3349 /* doorbell bar mapping and doorbell index init*/
3350 amdgpu_device_doorbell_init(adev);
3351
3352 /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3353 /* this will fail for cards that aren't VGA class devices, just
3354 * ignore it */
3355 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3356
3357 if (amdgpu_device_supports_boco(ddev))
3358 boco = true;
3359 if (amdgpu_has_atpx() &&
3360 (amdgpu_is_atpx_hybrid() ||
3361 amdgpu_has_atpx_dgpu_power_cntl()) &&
3362 !pci_is_thunderbolt_attached(adev->pdev))
3363 vga_switcheroo_register_client(adev->pdev,
3364 &amdgpu_switcheroo_ops, boco);
3365 if (boco)
3366 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3367
3368 if (amdgpu_emu_mode == 1) {
3369 /* post the asic on emulation mode */
3370 emu_soc_asic_init(adev);
3371 goto fence_driver_init;
3372 }
3373
3374 /* detect if we are with an SRIOV vbios */
3375 amdgpu_device_detect_sriov_bios(adev);
3376
3377 /* check if we need to reset the asic
3378 * E.g., driver was not cleanly unloaded previously, etc.
3379 */
3380 if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3381 r = amdgpu_asic_reset(adev);
3382 if (r) {
3383 dev_err(adev->dev, "asic reset on init failed\n");
3384 goto failed;
3385 }
3386 }
3387
3388 pci_enable_pcie_error_reporting(adev->ddev.pdev);
3389
3390 /* Post card if necessary */
3391 if (amdgpu_device_need_post(adev)) {
3392 if (!adev->bios) {
3393 dev_err(adev->dev, "no vBIOS found\n");
3394 r = -EINVAL;
3395 goto failed;
3396 }
3397 DRM_INFO("GPU posting now...\n");
3398 r = amdgpu_device_asic_init(adev);
3399 if (r) {
3400 dev_err(adev->dev, "gpu post error!\n");
3401 goto failed;
3402 }
3403 }
3404
3405 if (adev->is_atom_fw) {
3406 /* Initialize clocks */
3407 r = amdgpu_atomfirmware_get_clock_info(adev);
3408 if (r) {
3409 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3410 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3411 goto failed;
3412 }
3413 } else {
3414 /* Initialize clocks */
3415 r = amdgpu_atombios_get_clock_info(adev);
3416 if (r) {
3417 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3418 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3419 goto failed;
3420 }
3421 /* init i2c buses */
3422 if (!amdgpu_device_has_dc_support(adev))
3423 amdgpu_atombios_i2c_init(adev);
3424 }
3425
3426 fence_driver_init:
3427 /* Fence driver */
3428 r = amdgpu_fence_driver_init(adev);
3429 if (r) {
3430 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3431 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3432 goto failed;
3433 }
3434
3435 /* init the mode config */
3436 drm_mode_config_init(adev_to_drm(adev));
3437
3438 r = amdgpu_device_ip_init(adev);
3439 if (r) {
3440 /* failed in exclusive mode due to timeout */
3441 if (amdgpu_sriov_vf(adev) &&
3442 !amdgpu_sriov_runtime(adev) &&
3443 amdgpu_virt_mmio_blocked(adev) &&
3444 !amdgpu_virt_wait_reset(adev)) {
3445 dev_err(adev->dev, "VF exclusive mode timeout\n");
3446 /* Don't send request since VF is inactive. */
3447 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3448 adev->virt.ops = NULL;
3449 r = -EAGAIN;
3450 goto failed;
3451 }
3452 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3453 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3454 goto failed;
3455 }
3456
3457 dev_info(adev->dev,
3458 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3459 adev->gfx.config.max_shader_engines,
3460 adev->gfx.config.max_sh_per_se,
3461 adev->gfx.config.max_cu_per_sh,
3462 adev->gfx.cu_info.number);
3463
3464 adev->accel_working = true;
3465
3466 amdgpu_vm_check_compute_bug(adev);
3467
3468 /* Initialize the buffer migration limit. */
3469 if (amdgpu_moverate >= 0)
3470 max_MBps = amdgpu_moverate;
3471 else
3472 max_MBps = 8; /* Allow 8 MB/s. */
3473 /* Get a log2 for easy divisions. */
3474 adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3475
3476 amdgpu_fbdev_init(adev);
3477
3478 r = amdgpu_pm_sysfs_init(adev);
3479 if (r) {
3480 adev->pm_sysfs_en = false;
3481 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3482 } else
3483 adev->pm_sysfs_en = true;
3484
3485 r = amdgpu_ucode_sysfs_init(adev);
3486 if (r) {
3487 adev->ucode_sysfs_en = false;
3488 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3489 } else
3490 adev->ucode_sysfs_en = true;
3491
3492 if ((amdgpu_testing & 1)) {
3493 if (adev->accel_working)
3494 amdgpu_test_moves(adev);
3495 else
3496 DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3497 }
3498 if (amdgpu_benchmarking) {
3499 if (adev->accel_working)
3500 amdgpu_benchmark(adev, amdgpu_benchmarking);
3501 else
3502 DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3503 }
3504
3505 /*
3506 * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3507 * Otherwise the mgpu fan boost feature will be skipped because the
3508 * gpu instance has not been counted yet.
3509 */
3510 amdgpu_register_gpu_instance(adev);
3511
3512 /* enable clockgating, etc. after ib tests, etc. since some blocks require
3513 * explicit gating rather than handling it automatically.
3514 */
3515 r = amdgpu_device_ip_late_init(adev);
3516 if (r) {
3517 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3518 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3519 goto failed;
3520 }
3521
3522 /* must succeed. */
3523 amdgpu_ras_resume(adev);
3524
3525 queue_delayed_work(system_wq, &adev->delayed_init_work,
3526 msecs_to_jiffies(AMDGPU_RESUME_MS));
3527
3528 if (amdgpu_sriov_vf(adev))
3529 flush_delayed_work(&adev->delayed_init_work);
3530
3531 r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3532 if (r)
3533 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3534
3535 if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3536 r = amdgpu_pmu_init(adev);
3537 if (r)
3538 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3539 }
3540 /* Keep the stored PCI config space at hand for restore in case of a sudden PCI error */
3541 if (amdgpu_device_cache_pci_state(adev->pdev))
3542 pci_restore_state(pdev);
3543
3544 return 0;
3545
3546 failed:
3547 amdgpu_vf_error_trans_all(adev);
3548 if (boco)
3549 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3550
3551 return r;
3552 }
3553
3554 /**
3555 * amdgpu_device_fini - tear down the driver
3556 *
3557 * @adev: amdgpu_device pointer
3558 *
3559 * Tear down the driver info (all asics).
3560 * Called at driver shutdown.
3561 */
3562 void amdgpu_device_fini(struct amdgpu_device *adev)
3563 {
3564 dev_info(adev->dev, "amdgpu: finishing device.\n");
3565 flush_delayed_work(&adev->delayed_init_work);
3566 ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3567 adev->shutdown = true;
3568
3569 kfree(adev->pci_state);
3570
3571 /* make sure the IB tests have finished before entering exclusive mode
3572 * to avoid preemption during the IB tests
3573 */
3574 if (amdgpu_sriov_vf(adev)) {
3575 amdgpu_virt_request_full_gpu(adev, false);
3576 amdgpu_virt_fini_data_exchange(adev);
3577 }
3578
3579 /* disable all interrupts */
3580 amdgpu_irq_disable_all(adev);
3581 if (adev->mode_info.mode_config_initialized) {
3582 if (!amdgpu_device_has_dc_support(adev))
3583 drm_helper_force_disable_all(adev_to_drm(adev));
3584 else
3585 drm_atomic_helper_shutdown(adev_to_drm(adev));
3586 }
3587 amdgpu_fence_driver_fini(adev);
3588 if (adev->pm_sysfs_en)
3589 amdgpu_pm_sysfs_fini(adev);
3590 amdgpu_fbdev_fini(adev);
3591 amdgpu_device_ip_fini(adev);
3592 release_firmware(adev->firmware.gpu_info_fw);
3593 adev->firmware.gpu_info_fw = NULL;
3594 adev->accel_working = false;
3595 /* free i2c buses */
3596 if (!amdgpu_device_has_dc_support(adev))
3597 amdgpu_i2c_fini(adev);
3598
3599 if (amdgpu_emu_mode != 1)
3600 amdgpu_atombios_fini(adev);
3601
3602 kfree(adev->bios);
3603 adev->bios = NULL;
3604 if (amdgpu_has_atpx() &&
3605 (amdgpu_is_atpx_hybrid() ||
3606 amdgpu_has_atpx_dgpu_power_cntl()) &&
3607 !pci_is_thunderbolt_attached(adev->pdev))
3608 vga_switcheroo_unregister_client(adev->pdev);
3609 if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3610 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3611 vga_client_register(adev->pdev, NULL, NULL, NULL);
3612 if (adev->rio_mem)
3613 pci_iounmap(adev->pdev, adev->rio_mem);
3614 adev->rio_mem = NULL;
3615 iounmap(adev->rmmio);
3616 adev->rmmio = NULL;
3617 amdgpu_device_doorbell_fini(adev);
3618
3619 if (adev->ucode_sysfs_en)
3620 amdgpu_ucode_sysfs_fini(adev);
3621
3622 sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3623 if (IS_ENABLED(CONFIG_PERF_EVENTS))
3624 amdgpu_pmu_fini(adev);
3625 if (adev->mman.discovery_bin)
3626 amdgpu_discovery_fini(adev);
3627 }
3628
3629
3630 /*
3631 * Suspend & resume.
3632 */
3633 /**
3634 * amdgpu_device_suspend - initiate device suspend
3635 *
3636 * @dev: drm dev pointer
3637 * @fbcon : notify the fbdev of suspend
3638 *
3639 * Puts the hw in the suspend state (all asics).
3640 * Returns 0 for success or an error on failure.
3641 * Called at driver suspend.
3642 */
3643 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3644 {
3645 struct amdgpu_device *adev;
3646 struct drm_crtc *crtc;
3647 struct drm_connector *connector;
3648 struct drm_connector_list_iter iter;
3649 int r;
3650
3651 adev = drm_to_adev(dev);
3652
3653 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3654 return 0;
3655
3656 adev->in_suspend = true;
3657 drm_kms_helper_poll_disable(dev);
3658
3659 if (fbcon)
3660 amdgpu_fbdev_set_suspend(adev, 1);
3661
3662 cancel_delayed_work_sync(&adev->delayed_init_work);
3663
3664 if (!amdgpu_device_has_dc_support(adev)) {
3665 /* turn off display hw */
3666 drm_modeset_lock_all(dev);
3667 drm_connector_list_iter_begin(dev, &iter);
3668 drm_for_each_connector_iter(connector, &iter)
3669 drm_helper_connector_dpms(connector,
3670 DRM_MODE_DPMS_OFF);
3671 drm_connector_list_iter_end(&iter);
3672 drm_modeset_unlock_all(dev);
3673 /* unpin the front buffers and cursors */
3674 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3675 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3676 struct drm_framebuffer *fb = crtc->primary->fb;
3677 struct amdgpu_bo *robj;
3678
3679 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3680 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3681 r = amdgpu_bo_reserve(aobj, true);
3682 if (r == 0) {
3683 amdgpu_bo_unpin(aobj);
3684 amdgpu_bo_unreserve(aobj);
3685 }
3686 }
3687
3688 if (fb == NULL || fb->obj[0] == NULL) {
3689 continue;
3690 }
3691 robj = gem_to_amdgpu_bo(fb->obj[0]);
3692 /* don't unpin kernel fb objects */
3693 if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3694 r = amdgpu_bo_reserve(robj, true);
3695 if (r == 0) {
3696 amdgpu_bo_unpin(robj);
3697 amdgpu_bo_unreserve(robj);
3698 }
3699 }
3700 }
3701 }
3702
3703 amdgpu_ras_suspend(adev);
3704
3705 r = amdgpu_device_ip_suspend_phase1(adev);
3706
3707 amdgpu_amdkfd_suspend(adev, !fbcon);
3708
3709 /* evict vram memory */
3710 amdgpu_bo_evict_vram(adev);
3711
3712 amdgpu_fence_driver_suspend(adev);
3713
3714 r = amdgpu_device_ip_suspend_phase2(adev);
3715
3716 /* evict remaining vram memory
3717 * This second call to evict vram is to evict the gart page table
3718 * using the CPU.
3719 */
3720 amdgpu_bo_evict_vram(adev);
3721
3722 return 0;
3723 }
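/*
 * Illustrative caller sketch (hypothetical, not part of this file): a
 * dev_pm_ops suspend callback is expected to simply wrap this helper,
 * e.g.:
 *
 *	static int example_pmops_suspend(struct device *dev)
 *	{
 *		struct drm_device *drm_dev = dev_get_drvdata(dev);
 *
 *		return amdgpu_device_suspend(drm_dev, true);
 *	}
 */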
3724
3725 /**
3726 * amdgpu_device_resume - initiate device resume
3727 *
3728 * @dev: drm dev pointer
3729 * @fbcon : notify the fbdev of resume
3730 *
3731 * Bring the hw back to operating state (all asics).
3732 * Returns 0 for success or an error on failure.
3733 * Called at driver resume.
3734 */
3735 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3736 {
3737 struct drm_connector *connector;
3738 struct drm_connector_list_iter iter;
3739 struct amdgpu_device *adev = drm_to_adev(dev);
3740 struct drm_crtc *crtc;
3741 int r = 0;
3742
3743 if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3744 return 0;
3745
3746 /* post card */
3747 if (amdgpu_device_need_post(adev)) {
3748 r = amdgpu_device_asic_init(adev);
3749 if (r)
3750 dev_err(adev->dev, "amdgpu asic init failed\n");
3751 }
3752
3753 r = amdgpu_device_ip_resume(adev);
3754 if (r) {
3755 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3756 return r;
3757 }
3758 amdgpu_fence_driver_resume(adev);
3759
3760
3761 r = amdgpu_device_ip_late_init(adev);
3762 if (r)
3763 return r;
3764
3765 queue_delayed_work(system_wq, &adev->delayed_init_work,
3766 msecs_to_jiffies(AMDGPU_RESUME_MS));
3767
3768 if (!amdgpu_device_has_dc_support(adev)) {
3769 /* pin cursors */
3770 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3771 struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3772
3773 if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3774 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3775 r = amdgpu_bo_reserve(aobj, true);
3776 if (r == 0) {
3777 r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3778 if (r != 0)
3779 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3780 amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3781 amdgpu_bo_unreserve(aobj);
3782 }
3783 }
3784 }
3785 }
3786 r = amdgpu_amdkfd_resume(adev, !fbcon);
3787 if (r)
3788 return r;
3789
3790 /* Make sure IB tests flushed */
3791 flush_delayed_work(&adev->delayed_init_work);
3792
3793 /* blat the mode back in */
3794 if (fbcon) {
3795 if (!amdgpu_device_has_dc_support(adev)) {
3796 /* pre DCE11 */
3797 drm_helper_resume_force_mode(dev);
3798
3799 /* turn on display hw */
3800 drm_modeset_lock_all(dev);
3801
3802 drm_connector_list_iter_begin(dev, &iter);
3803 drm_for_each_connector_iter(connector, &iter)
3804 drm_helper_connector_dpms(connector,
3805 DRM_MODE_DPMS_ON);
3806 drm_connector_list_iter_end(&iter);
3807
3808 drm_modeset_unlock_all(dev);
3809 }
3810 amdgpu_fbdev_set_suspend(adev, 0);
3811 }
3812
3813 drm_kms_helper_poll_enable(dev);
3814
3815 amdgpu_ras_resume(adev);
3816
3817 /*
3818 * Most of the connector probing functions try to acquire runtime pm
3819 * refs to ensure that the GPU is powered on when connector polling is
3820 * performed. Since we're calling this from a runtime PM callback,
3821 * trying to acquire rpm refs will cause us to deadlock.
3822 *
3823 * Since we're guaranteed to be holding the rpm lock, it's safe to
3824 * temporarily disable the rpm helpers so this doesn't deadlock us.
3825 */
3826 #ifdef CONFIG_PM
3827 dev->dev->power.disable_depth++;
3828 #endif
3829 if (!amdgpu_device_has_dc_support(adev))
3830 drm_helper_hpd_irq_event(dev);
3831 else
3832 drm_kms_helper_hotplug_event(dev);
3833 #ifdef CONFIG_PM
3834 dev->dev->power.disable_depth--;
3835 #endif
3836 adev->in_suspend = false;
3837
3838 return 0;
3839 }
3840
3841 /**
3842 * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3843 *
3844 * @adev: amdgpu_device pointer
3845 *
3846 * The list of all the hardware IPs that make up the asic is walked and
3847 * the check_soft_reset callbacks are run. check_soft_reset determines
3848 * if the asic is still hung or not.
3849 * Returns true if any of the IPs are still in a hung state, false if not.
3850 */
3851 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3852 {
3853 int i;
3854 bool asic_hang = false;
3855
3856 if (amdgpu_sriov_vf(adev))
3857 return true;
3858
3859 if (amdgpu_asic_need_full_reset(adev))
3860 return true;
3861
3862 for (i = 0; i < adev->num_ip_blocks; i++) {
3863 if (!adev->ip_blocks[i].status.valid)
3864 continue;
3865 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3866 adev->ip_blocks[i].status.hang =
3867 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3868 if (adev->ip_blocks[i].status.hang) {
3869 dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3870 asic_hang = true;
3871 }
3872 }
3873 return asic_hang;
3874 }
3875
3876 /**
3877 * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3878 *
3879 * @adev: amdgpu_device pointer
3880 *
3881 * The list of all the hardware IPs that make up the asic is walked and the
3882 * pre_soft_reset callbacks are run if the block is hung. pre_soft_reset
3883 * handles any IP specific hardware or software state changes that are
3884 * necessary for a soft reset to succeed.
3885 * Returns 0 on success, negative error code on failure.
3886 */
3887 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3888 {
3889 int i, r = 0;
3890
3891 for (i = 0; i < adev->num_ip_blocks; i++) {
3892 if (!adev->ip_blocks[i].status.valid)
3893 continue;
3894 if (adev->ip_blocks[i].status.hang &&
3895 adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3896 r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3897 if (r)
3898 return r;
3899 }
3900 }
3901
3902 return 0;
3903 }
3904
3905 /**
3906 * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3907 *
3908 * @adev: amdgpu_device pointer
3909 *
3910 * Some hardware IPs cannot be soft reset. If they are hung, a full gpu
3911 * reset is necessary to recover.
3912 * Returns true if a full asic reset is required, false if not.
3913 */
3914 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3915 {
3916 int i;
3917
3918 if (amdgpu_asic_need_full_reset(adev))
3919 return true;
3920
3921 for (i = 0; i < adev->num_ip_blocks; i++) {
3922 if (!adev->ip_blocks[i].status.valid)
3923 continue;
3924 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3925 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3926 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3927 (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3928 adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3929 if (adev->ip_blocks[i].status.hang) {
3930 dev_info(adev->dev, "Some blocks need a full reset!\n");
3931 return true;
3932 }
3933 }
3934 }
3935 return false;
3936 }
3937
3938 /**
3939 * amdgpu_device_ip_soft_reset - do a soft reset
3940 *
3941 * @adev: amdgpu_device pointer
3942 *
3943 * The list of all the hardware IPs that make up the asic is walked and the
3944 * soft_reset callbacks are run if the block is hung. soft_reset handles any
3945 * IP specific hardware or software state changes that are necessary to soft
3946 * reset the IP.
3947 * Returns 0 on success, negative error code on failure.
3948 */
3949 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3950 {
3951 int i, r = 0;
3952
3953 for (i = 0; i < adev->num_ip_blocks; i++) {
3954 if (!adev->ip_blocks[i].status.valid)
3955 continue;
3956 if (adev->ip_blocks[i].status.hang &&
3957 adev->ip_blocks[i].version->funcs->soft_reset) {
3958 r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3959 if (r)
3960 return r;
3961 }
3962 }
3963
3964 return 0;
3965 }
3966
3967 /**
3968 * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3969 *
3970 * @adev: amdgpu_device pointer
3971 *
3972 * The list of all the hardware IPs that make up the asic is walked and the
3973 * post_soft_reset callbacks are run if the asic was hung. post_soft_reset
3974 * handles any IP specific hardware or software state changes that are
3975 * necessary after the IP has been soft reset.
3976 * Returns 0 on success, negative error code on failure.
3977 */
3978 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3979 {
3980 int i, r = 0;
3981
3982 for (i = 0; i < adev->num_ip_blocks; i++) {
3983 if (!adev->ip_blocks[i].status.valid)
3984 continue;
3985 if (adev->ip_blocks[i].status.hang &&
3986 adev->ip_blocks[i].version->funcs->post_soft_reset)
3987 r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3988 if (r)
3989 return r;
3990 }
3991
3992 return 0;
3993 }
3994
3995 /**
3996 * amdgpu_device_recover_vram - Recover some VRAM contents
3997 *
3998 * @adev: amdgpu_device pointer
3999 *
4000 * Restores the contents of VRAM buffers from the shadows in GTT. Used to
4001 * restore things like GPUVM page tables after a GPU reset where
4002 * the contents of VRAM might be lost.
4003 *
4004 * Returns:
4005 * 0 on success, negative error code on failure.
4006 */
4007 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4008 {
4009 struct dma_fence *fence = NULL, *next = NULL;
4010 struct amdgpu_bo *shadow;
4011 long r = 1, tmo;
4012
4013 if (amdgpu_sriov_runtime(adev))
4014 tmo = msecs_to_jiffies(8000);
4015 else
4016 tmo = msecs_to_jiffies(100);
4017
4018 dev_info(adev->dev, "recover vram bo from shadow start\n");
4019 mutex_lock(&adev->shadow_list_lock);
4020 list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4021
4022 /* No need to recover an evicted BO */
4023 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4024 shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4025 shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4026 continue;
4027
4028 r = amdgpu_bo_restore_shadow(shadow, &next);
4029 if (r)
4030 break;
4031
4032 if (fence) {
4033 tmo = dma_fence_wait_timeout(fence, false, tmo);
4034 dma_fence_put(fence);
4035 fence = next;
4036 if (tmo == 0) {
4037 r = -ETIMEDOUT;
4038 break;
4039 } else if (tmo < 0) {
4040 r = tmo;
4041 break;
4042 }
4043 } else {
4044 fence = next;
4045 }
4046 }
4047 mutex_unlock(&adev->shadow_list_lock);
4048
4049 if (fence)
4050 tmo = dma_fence_wait_timeout(fence, false, tmo);
4051 dma_fence_put(fence);
4052
4053 if (r < 0 || tmo <= 0) {
4054 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4055 return -EIO;
4056 }
4057
4058 dev_info(adev->dev, "recover vram bo from shadow done\n");
4059 return 0;
4060 }
4061
4062
4063 /**
4064 * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4065 *
4066 * @adev: amdgpu_device pointer
4067 * @from_hypervisor: request from hypervisor
4068 *
4069 * Do a VF FLR and reinitialize the ASIC.
4070 * Returns 0 on success, negative error code on failure.
4071 */
4072 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4073 bool from_hypervisor)
4074 {
4075 int r;
4076
4077 if (from_hypervisor)
4078 r = amdgpu_virt_request_full_gpu(adev, true);
4079 else
4080 r = amdgpu_virt_reset_gpu(adev);
4081 if (r)
4082 return r;
4083
4084 amdgpu_amdkfd_pre_reset(adev);
4085
4086 /* Resume IP prior to SMC */
4087 r = amdgpu_device_ip_reinit_early_sriov(adev);
4088 if (r)
4089 goto error;
4090
4091 amdgpu_virt_init_data_exchange(adev);
4092 /* we need to recover the GART table prior to resuming SMC/CP/SDMA */
4093 amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4094
4095 r = amdgpu_device_fw_loading(adev);
4096 if (r)
4097 return r;
4098
4099 /* now we are okay to resume SMC/CP/SDMA */
4100 r = amdgpu_device_ip_reinit_late_sriov(adev);
4101 if (r)
4102 goto error;
4103
4104 amdgpu_irq_gpu_reset_resume_helper(adev);
4105 r = amdgpu_ib_ring_tests(adev);
4106 amdgpu_amdkfd_post_reset(adev);
4107
4108 error:
4109 amdgpu_virt_release_full_gpu(adev, true);
4110 if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4111 amdgpu_inc_vram_lost(adev);
4112 r = amdgpu_device_recover_vram(adev);
4113 }
4114
4115 return r;
4116 }
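/*
 * Summary of the SR-IOV reset flow above: request a VF FLR (or ask the
 * host to reset the GPU), reinit the early IP blocks, recover the GART
 * table, reload firmware, reinit the late IP blocks, rerun the IB ring
 * tests, and finally release full GPU access, recovering VRAM from the
 * shadow BOs if the host reported VRAM as lost.
 */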
4117
4118 /**
4119 * amdgpu_device_has_job_running - check if there is any job in mirror list
4120 *
4121 * @adev: amdgpu_device pointer
4122 *
4123 * Check whether any job is still pending in the scheduler mirror lists; returns true if so.
4124 */
4125 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4126 {
4127 int i;
4128 struct drm_sched_job *job;
4129
4130 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4131 struct amdgpu_ring *ring = adev->rings[i];
4132
4133 if (!ring || !ring->sched.thread)
4134 continue;
4135
4136 spin_lock(&ring->sched.job_list_lock);
4137 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4138 struct drm_sched_job, node);
4139 spin_unlock(&ring->sched.job_list_lock);
4140 if (job)
4141 return true;
4142 }
4143 return false;
4144 }
4145
4146 /**
4147 * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4148 *
4149 * @adev: amdgpu_device pointer
4150 *
4151 * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4152 * a hung GPU.
4153 */
4154 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4155 {
4156 if (!amdgpu_device_ip_check_soft_reset(adev)) {
4157 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4158 return false;
4159 }
4160
4161 if (amdgpu_gpu_recovery == 0)
4162 goto disabled;
4163
4164 if (amdgpu_sriov_vf(adev))
4165 return true;
4166
4167 if (amdgpu_gpu_recovery == -1) {
4168 switch (adev->asic_type) {
4169 case CHIP_BONAIRE:
4170 case CHIP_HAWAII:
4171 case CHIP_TOPAZ:
4172 case CHIP_TONGA:
4173 case CHIP_FIJI:
4174 case CHIP_POLARIS10:
4175 case CHIP_POLARIS11:
4176 case CHIP_POLARIS12:
4177 case CHIP_VEGAM:
4178 case CHIP_VEGA20:
4179 case CHIP_VEGA10:
4180 case CHIP_VEGA12:
4181 case CHIP_RAVEN:
4182 case CHIP_ARCTURUS:
4183 case CHIP_RENOIR:
4184 case CHIP_NAVI10:
4185 case CHIP_NAVI14:
4186 case CHIP_NAVI12:
4187 case CHIP_SIENNA_CICHLID:
4188 break;
4189 default:
4190 goto disabled;
4191 }
4192 }
4193
4194 return true;
4195
4196 disabled:
4197 dev_info(adev->dev, "GPU recovery disabled.\n");
4198 return false;
4199 }
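/*
 * Note on amdgpu_gpu_recovery as used above: 0 disables recovery
 * entirely, -1 defers to the per-ASIC default encoded in the switch
 * statement, and any other value enables recovery unconditionally.
 */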
4200
4201
4202 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4203 struct amdgpu_job *job,
4204 bool *need_full_reset_arg)
4205 {
4206 int i, r = 0;
4207 bool need_full_reset = *need_full_reset_arg;
4208
4209 amdgpu_debugfs_wait_dump(adev);
4210
4211 if (amdgpu_sriov_vf(adev)) {
4212 /* stop the data exchange thread */
4213 amdgpu_virt_fini_data_exchange(adev);
4214 }
4215
4216 /* block all schedulers and reset given job's ring */
4217 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4218 struct amdgpu_ring *ring = adev->rings[i];
4219
4220 if (!ring || !ring->sched.thread)
4221 continue;
4222
4223 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4224 amdgpu_fence_driver_force_completion(ring);
4225 }
4226
4227 if (job)
4228 drm_sched_increase_karma(&job->base);
4229
4230 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4231 if (!amdgpu_sriov_vf(adev)) {
4232
4233 if (!need_full_reset)
4234 need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4235
4236 if (!need_full_reset) {
4237 amdgpu_device_ip_pre_soft_reset(adev);
4238 r = amdgpu_device_ip_soft_reset(adev);
4239 amdgpu_device_ip_post_soft_reset(adev);
4240 if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4241 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4242 need_full_reset = true;
4243 }
4244 }
4245
4246 if (need_full_reset)
4247 r = amdgpu_device_ip_suspend(adev);
4248
4249 *need_full_reset_arg = need_full_reset;
4250 }
4251
4252 return r;
4253 }
4254
4255 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4256 struct list_head *device_list_handle,
4257 bool *need_full_reset_arg,
4258 bool skip_hw_reset)
4259 {
4260 struct amdgpu_device *tmp_adev = NULL;
4261 bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4262 int r = 0;
4263
4264 /*
4265 * ASIC reset has to be done on all XGMI hive nodes ASAP
4266 * to allow proper link negotiation in the FW (within 1 sec)
4267 */
4268 if (!skip_hw_reset && need_full_reset) {
4269 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4270 /* For XGMI run all resets in parallel to speed up the process */
4271 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4272 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4273 r = -EALREADY;
4274 } else
4275 r = amdgpu_asic_reset(tmp_adev);
4276
4277 if (r) {
4278 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4279 r, adev_to_drm(tmp_adev)->unique);
4280 break;
4281 }
4282 }
4283
4284 /* For XGMI wait for all resets to complete before proceed */
4285 if (!r) {
4286 list_for_each_entry(tmp_adev, device_list_handle,
4287 gmc.xgmi.head) {
4288 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4289 flush_work(&tmp_adev->xgmi_reset_work);
4290 r = tmp_adev->asic_reset_res;
4291 if (r)
4292 break;
4293 }
4294 }
4295 }
4296 }
4297
4298 if (!r && amdgpu_ras_intr_triggered()) {
4299 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4300 if (tmp_adev->mmhub.funcs &&
4301 tmp_adev->mmhub.funcs->reset_ras_error_count)
4302 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4303 }
4304
4305 amdgpu_ras_intr_cleared();
4306 }
4307
4308 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4309 if (need_full_reset) {
4310 /* post card */
4311 if (amdgpu_device_asic_init(tmp_adev))
4312 dev_warn(tmp_adev->dev, "asic atom init failed!");
4313
4314 if (!r) {
4315 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4316 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4317 if (r)
4318 goto out;
4319
4320 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4321 if (r)
4322 goto out;
4323
4324 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4325 if (vram_lost) {
4326 DRM_INFO("VRAM is lost due to GPU reset!\n");
4327 amdgpu_inc_vram_lost(tmp_adev);
4328 }
4329
4330 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4331 if (r)
4332 goto out;
4333
4334 r = amdgpu_device_fw_loading(tmp_adev);
4335 if (r)
4336 return r;
4337
4338 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4339 if (r)
4340 goto out;
4341
4342 if (vram_lost)
4343 amdgpu_device_fill_reset_magic(tmp_adev);
4344
4345 /*
4346 * Add this ASIC back as tracked since the reset has already
4347 * completed successfully.
4348 */
4349 amdgpu_register_gpu_instance(tmp_adev);
4350
4351 r = amdgpu_device_ip_late_init(tmp_adev);
4352 if (r)
4353 goto out;
4354
4355 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4356
4357 /*
4358 * The GPU enters a bad state once the number of faulty pages
4359 * reported by ECC reaches the threshold, and RAS recovery is
4360 * scheduled next. So add one check here to break the recovery
4361 * if the bad page threshold has indeed been exceeded, and
4362 * remind the user to retire this GPU or to set a bigger
4363 * bad_page_threshold value to work around this the next time
4364 * the driver is probed.
4366 */
4367 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4368 /* must succeed. */
4369 amdgpu_ras_resume(tmp_adev);
4370 } else {
4371 r = -EINVAL;
4372 goto out;
4373 }
4374
4375 /* Update PSP FW topology after reset */
4376 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4377 r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4378 }
4379 }
4380
4381 out:
4382 if (!r) {
4383 amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4384 r = amdgpu_ib_ring_tests(tmp_adev);
4385 if (r) {
4386 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4387 need_full_reset = true;
4388 r = -EAGAIN;
4389 goto end;
4390 }
4391 }
4392
4393 if (!r)
4394 r = amdgpu_device_recover_vram(tmp_adev);
4395 else
4396 tmp_adev->asic_reset_res = r;
4397 }
4398
4399 end:
4400 *need_full_reset_arg = need_full_reset;
4401 return r;
4402 }
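/*
 * Note: amdgpu_do_asic_reset() returns -EAGAIN with *need_full_reset_arg
 * forced to true when the post-reset IB ring tests fail, and
 * amdgpu_device_gpu_recover() below uses that to retry with a full ASIC
 * reset.
 */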
4403
4404 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4405 struct amdgpu_hive_info *hive)
4406 {
4407 if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4408 return false;
4409
4410 if (hive) {
4411 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4412 } else {
4413 down_write(&adev->reset_sem);
4414 }
4415
4416 atomic_inc(&adev->gpu_reset_counter);
4417 switch (amdgpu_asic_reset_method(adev)) {
4418 case AMD_RESET_METHOD_MODE1:
4419 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4420 break;
4421 case AMD_RESET_METHOD_MODE2:
4422 adev->mp1_state = PP_MP1_STATE_RESET;
4423 break;
4424 default:
4425 adev->mp1_state = PP_MP1_STATE_NONE;
4426 break;
4427 }
4428
4429 return true;
4430 }
4431
4432 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4433 {
4434 amdgpu_vf_error_trans_all(adev);
4435 adev->mp1_state = PP_MP1_STATE_NONE;
4436 atomic_set(&adev->in_gpu_reset, 0);
4437 up_write(&adev->reset_sem);
4438 }
4439
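/*
 * The two audio helpers below look up PCI function 1 on the GPU's own
 * bus/slot, which is assumed to be the HDMI/DP audio function sharing
 * the device's power domain (see the codec-suspend comment in
 * amdgpu_device_gpu_recover()).
 */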
4440 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4441 {
4442 struct pci_dev *p = NULL;
4443
4444 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4445 adev->pdev->bus->number, 1);
4446 if (p) {
4447 pm_runtime_enable(&(p->dev));
4448 pm_runtime_resume(&(p->dev));
4449 }
4450
4451 pci_dev_put(p);
4452 }
4453
4454 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4455 {
4456 enum amd_reset_method reset_method;
4457 struct pci_dev *p = NULL;
4458 u64 expires;
4459
4460 /*
4461 * For now, only BACO and mode1 reset are confirmed to suffer
4462 * from the audio issue if the audio device is not properly suspended.
4463 */
4464 reset_method = amdgpu_asic_reset_method(adev);
4465 if ((reset_method != AMD_RESET_METHOD_BACO) &&
4466 (reset_method != AMD_RESET_METHOD_MODE1))
4467 return -EINVAL;
4468
4469 p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4470 adev->pdev->bus->number, 1);
4471 if (!p)
4472 return -ENODEV;
4473
4474 expires = pm_runtime_autosuspend_expiration(&(p->dev));
4475 if (!expires)
4476 /*
4477 * If we cannot get the audio device autosuspend delay,
4478 * a fixed 4s interval is used. Since the audio controller's
4479 * default autosuspend delay is 3s, the 4s used here is
4480 * guaranteed to cover it.
4481 */
4482 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4483
4484 while (!pm_runtime_status_suspended(&(p->dev))) {
4485 if (!pm_runtime_suspend(&(p->dev)))
4486 break;
4487
4488 if (expires < ktime_get_mono_fast_ns()) {
4489 dev_warn(adev->dev, "failed to suspend display audio\n");
4490 pci_dev_put(p);
4491 /* TODO: abort the succeeding gpu reset? */
4492 return -ETIMEDOUT;
4493 }
4494 }
4495
4496 pm_runtime_disable(&(p->dev));
4497
4498 pci_dev_put(p);
4499 return 0;
4500 }
4501
4502 /**
4503 * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4504 *
4505 * @adev: amdgpu_device pointer
4506 * @job: which job triggered the hang
4507 *
4508 * Attempt to reset the GPU if it has hung (all asics).
4509 * Attempt a soft reset or full reset and reinitialize the ASIC.
4510 * Returns 0 for success or an error on failure.
4511 */
4512
4513 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4514 struct amdgpu_job *job)
4515 {
4516 struct list_head device_list, *device_list_handle = NULL;
4517 bool need_full_reset = false;
4518 bool job_signaled = false;
4519 struct amdgpu_hive_info *hive = NULL;
4520 struct amdgpu_device *tmp_adev = NULL;
4521 int i, r = 0;
4522 bool need_emergency_restart = false;
4523 bool audio_suspended = false;
4524
4525 /*
4526 * Special case: RAS triggered and full reset isn't supported
4527 */
4528 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4529
4530 /*
4531 * Flush RAM to disk so that after reboot
4532 * the user can read log and see why the system rebooted.
4533 */
4534 if (need_emergency_restart && amdgpu_ras_get_context(adev) &&
4535 amdgpu_ras_get_context(adev)->reboot) {
4536 DRM_WARN("Emergency reboot.");
4537
4538 ksys_sync_helper();
4539 emergency_restart();
4540 }
4541
4542 dev_info(adev->dev, "GPU %s begin!\n",
4543 need_emergency_restart ? "jobs stop":"reset");
4544
4545 /*
4546 * Here we trylock to avoid a chain of resets executing, triggered
4547 * either by jobs on different adevs in an XGMI hive or by jobs on
4548 * different schedulers of the same device, while this TO handler
4549 * is running. We always reset all schedulers of a device and all
4550 * devices of an XGMI hive, so that should take care of them too.
4551 */
4552 hive = amdgpu_get_xgmi_hive(adev);
4553 if (hive) {
4554 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4555 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4556 job ? job->base.id : -1, hive->hive_id);
4557 amdgpu_put_xgmi_hive(hive);
4558 return 0;
4559 }
4560 mutex_lock(&hive->hive_lock);
4561 }
4562
4563 /*
4564 * Build list of devices to reset.
4565 * In case we are in XGMI hive mode, resort the device list
4566 * to put adev in the 1st position.
4567 */
4568 INIT_LIST_HEAD(&device_list);
4569 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4570 if (!hive)
4571 return -ENODEV;
4572 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4573 list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4574 device_list_handle = &hive->device_list;
4575 } else {
4576 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4577 device_list_handle = &device_list;
4578 }
4579
4580 /* block all schedulers and reset given job's ring */
4581 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4582 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4583 dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4584 job ? job->base.id : -1);
4585 r = 0;
4586 goto skip_recovery;
4587 }
4588
4589 /*
4590 * Try to put the audio codec into the suspend state
4591 * before the gpu reset starts.
4592 *
4593 * The power domain of the graphics device is shared
4594 * with the AZ power domain. Without this, we may
4595 * change the audio hardware from behind the audio
4596 * driver's back, which will trigger some audio
4597 * codec errors.
4598 */
4599 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4600 audio_suspended = true;
4601
4602 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4603
4604 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4605
4606 if (!amdgpu_sriov_vf(tmp_adev))
4607 amdgpu_amdkfd_pre_reset(tmp_adev);
4608
4609 /*
4610 * Mark the ASICs to be reset as untracked first,
4611 * and add them back after the reset completes.
4612 */
4613 amdgpu_unregister_gpu_instance(tmp_adev);
4614
4615 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4616
4617 /* disable ras on ALL IPs */
4618 if (!need_emergency_restart &&
4619 amdgpu_device_ip_need_full_reset(tmp_adev))
4620 amdgpu_ras_suspend(tmp_adev);
4621
4622 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4623 struct amdgpu_ring *ring = tmp_adev->rings[i];
4624
4625 if (!ring || !ring->sched.thread)
4626 continue;
4627
4628 drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4629
4630 if (need_emergency_restart)
4631 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4632 }
4633 }
4634
4635 if (need_emergency_restart)
4636 goto skip_sched_resume;
4637
4638 /*
4639 * Must check guilty signal here since after this point all old
4640 * HW fences are force signaled.
4641 *
4642 * job->base holds a reference to parent fence
4643 */
4644 if (job && job->base.s_fence->parent &&
4645 dma_fence_is_signaled(job->base.s_fence->parent)) {
4646 job_signaled = true;
4647 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4648 goto skip_hw_reset;
4649 }
4650
4651 retry: /* Rest of adevs pre asic reset from XGMI hive. */
4652 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4653 r = amdgpu_device_pre_asic_reset(tmp_adev,
4654 (tmp_adev == adev) ? job : NULL,
4655 &need_full_reset);
4656 /*TODO Should we stop ?*/
4657 if (r) {
4658 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4659 r, adev_to_drm(tmp_adev)->unique);
4660 tmp_adev->asic_reset_res = r;
4661 }
4662 }
4663
4664 /* Actual ASIC resets if needed.*/
4665 /* TODO Implement XGMI hive reset logic for SRIOV */
4666 if (amdgpu_sriov_vf(adev)) {
4667 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4668 if (r)
4669 adev->asic_reset_res = r;
4670 } else {
4671 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4672 if (r && r == -EAGAIN)
4673 goto retry;
4674 }
4675
4676 skip_hw_reset:
4677
4678 /* Post ASIC reset for all devs .*/
4679 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4680
4681 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4682 struct amdgpu_ring *ring = tmp_adev->rings[i];
4683
4684 if (!ring || !ring->sched.thread)
4685 continue;
4686
4687 /* No point to resubmit jobs if we didn't HW reset*/
4688 if (!tmp_adev->asic_reset_res && !job_signaled)
4689 drm_sched_resubmit_jobs(&ring->sched);
4690
4691 drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4692 }
4693
4694 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4695 drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4696 }
4697
4698 tmp_adev->asic_reset_res = 0;
4699
4700 if (r) {
4701 /* bad news, how to tell it to userspace ? */
4702 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4703 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4704 } else {
4705 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4706 }
4707 }
4708
4709 skip_sched_resume:
4710 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4711 /*unlock kfd: SRIOV would do it separately */
4712 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4713 amdgpu_amdkfd_post_reset(tmp_adev);
4714 if (audio_suspended)
4715 amdgpu_device_resume_display_audio(tmp_adev);
4716 amdgpu_device_unlock_adev(tmp_adev);
4717 }
4718
4719 skip_recovery:
4720 if (hive) {
4721 atomic_set(&hive->in_reset, 0);
4722 mutex_unlock(&hive->hive_lock);
4723 amdgpu_put_xgmi_hive(hive);
4724 }
4725
4726 if (r)
4727 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4728 return r;
4729 }
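/*
 * Illustrative usage (hypothetical, not part of this file): a scheduler
 * timeout handler would pass the guilty job, while reset paths without a
 * guilty job pass NULL, e.g.:
 *
 *	amdgpu_device_gpu_recover(adev, job);	// job timeout
 *	amdgpu_device_gpu_recover(adev, NULL);	// no guilty job known
 */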
4730
4731 /**
4732 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
4733 *
4734 * @adev: amdgpu_device pointer
4735 *
4736 * Fetches and stores in the driver the PCIE capabilities (gen speed
4737 * and lanes) of the slot the device is in. Handles APUs and
4738 * virtualized environments where PCIE config space may not be available.
4739 */
4740 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4741 {
4742 struct pci_dev *pdev;
4743 enum pci_bus_speed speed_cap, platform_speed_cap;
4744 enum pcie_link_width platform_link_width;
4745
4746 if (amdgpu_pcie_gen_cap)
4747 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4748
4749 if (amdgpu_pcie_lane_cap)
4750 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4751
4752 /* covers APUs as well */
4753 if (pci_is_root_bus(adev->pdev->bus)) {
4754 if (adev->pm.pcie_gen_mask == 0)
4755 adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4756 if (adev->pm.pcie_mlw_mask == 0)
4757 adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4758 return;
4759 }
4760
4761 if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4762 return;
4763
4764 pcie_bandwidth_available(adev->pdev, NULL,
4765 &platform_speed_cap, &platform_link_width);
4766
4767 if (adev->pm.pcie_gen_mask == 0) {
4768 /* asic caps */
4769 pdev = adev->pdev;
4770 speed_cap = pcie_get_speed_cap(pdev);
4771 if (speed_cap == PCI_SPEED_UNKNOWN) {
4772 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4773 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4774 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4775 } else {
4776 if (speed_cap == PCIE_SPEED_16_0GT)
4777 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4778 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4779 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4780 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4781 else if (speed_cap == PCIE_SPEED_8_0GT)
4782 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4783 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4784 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4785 else if (speed_cap == PCIE_SPEED_5_0GT)
4786 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4787 CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4788 else
4789 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4790 }
4791 /* platform caps */
4792 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4793 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4794 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4795 } else {
4796 if (platform_speed_cap == PCIE_SPEED_16_0GT)
4797 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4798 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4799 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4800 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4801 else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4802 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4803 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4804 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4805 else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4806 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4807 CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4808 else
4809 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4810
4811 }
4812 }
4813 if (adev->pm.pcie_mlw_mask == 0) {
4814 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4815 adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4816 } else {
4817 switch (platform_link_width) {
4818 case PCIE_LNK_X32:
4819 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4820 CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4821 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4822 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4823 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4824 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4825 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4826 break;
4827 case PCIE_LNK_X16:
4828 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4829 CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4830 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4831 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4832 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4833 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4834 break;
4835 case PCIE_LNK_X12:
4836 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4837 CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4838 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4839 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4840 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4841 break;
4842 case PCIE_LNK_X8:
4843 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4844 CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4845 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4846 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4847 break;
4848 case PCIE_LNK_X4:
4849 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4850 CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4851 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4852 break;
4853 case PCIE_LNK_X2:
4854 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4855 CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4856 break;
4857 case PCIE_LNK_X1:
4858 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4859 break;
4860 default:
4861 break;
4862 }
4863 }
4864 }
4865 }
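/*
 * The pcie_gen_mask/pcie_mlw_mask values filled in above combine the
 * ASIC capabilities (CAIL_ASIC_PCIE_*) with the platform capabilities
 * (CAIL_PCIE_*); consumers outside this file (assumed to be the PCIe
 * DPM code) pick link speed/width levels from these masks.
 */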
4866
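/*
 * BACO (commonly expanded as "Bus Active, Chip Off") keeps the PCIe
 * interface alive while powering down the rest of the chip. When RAS is
 * supported, doorbell interrupts are disabled across BACO entry and
 * re-enabled on exit, as done in the two helpers below.
 */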
4867 int amdgpu_device_baco_enter(struct drm_device *dev)
4868 {
4869 struct amdgpu_device *adev = drm_to_adev(dev);
4870 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4871
4872 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4873 return -ENOTSUPP;
4874
4875 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4876 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4877
4878 return amdgpu_dpm_baco_enter(adev);
4879 }
4880
4881 int amdgpu_device_baco_exit(struct drm_device *dev)
4882 {
4883 struct amdgpu_device *adev = drm_to_adev(dev);
4884 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4885 int ret = 0;
4886
4887 if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4888 return -ENOTSUPP;
4889
4890 ret = amdgpu_dpm_baco_exit(adev);
4891 if (ret)
4892 return ret;
4893
4894 if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
4895 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4896
4897 return 0;
4898 }
4899
4900 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4901 {
4902 int i;
4903
4904 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4905 struct amdgpu_ring *ring = adev->rings[i];
4906
4907 if (!ring || !ring->sched.thread)
4908 continue;
4909
4910 cancel_delayed_work_sync(&ring->sched.work_tdr);
4911 }
4912 }
4913
4914 /**
4915 * amdgpu_pci_error_detected - Called when a PCI error is detected.
4916 * @pdev: PCI device struct
4917 * @state: PCI channel state
4918 *
4919 * Description: Called when a PCI error is detected.
4920 *
4921 * Return: PCI_ERS_RESULT_CAN_RECOVER, PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4922 */
4923 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4924 {
4925 struct drm_device *dev = pci_get_drvdata(pdev);
4926 struct amdgpu_device *adev = drm_to_adev(dev);
4927 int i;
4928
4929 DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4930
4931 if (adev->gmc.xgmi.num_physical_nodes > 1) {
4932 DRM_WARN("No support for XGMI hive yet...");
4933 return PCI_ERS_RESULT_DISCONNECT;
4934 }
4935
4936 switch (state) {
4937 case pci_channel_io_normal:
4938 return PCI_ERS_RESULT_CAN_RECOVER;
4939 /* Fatal error, prepare for slot reset */
4940 case pci_channel_io_frozen:
4941 /*
4942 * Cancel and wait for all TDRs in progress if failing to
4943 * set adev->in_gpu_reset in amdgpu_device_lock_adev
4944 *
4945 * Locking adev->reset_sem will prevent any external access
4946 * to GPU during PCI error recovery
4947 */
4948 while (!amdgpu_device_lock_adev(adev, NULL))
4949 amdgpu_cancel_all_tdr(adev);
4950
4951 /*
4952 * Block any work scheduling as we do for regular GPU reset
4953 * for the duration of the recovery
4954 */
4955 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4956 struct amdgpu_ring *ring = adev->rings[i];
4957
4958 if (!ring || !ring->sched.thread)
4959 continue;
4960
4961 drm_sched_stop(&ring->sched, NULL);
4962 }
4963 return PCI_ERS_RESULT_NEED_RESET;
4964 case pci_channel_io_perm_failure:
4965 /* Permanent error, prepare for device removal */
4966 return PCI_ERS_RESULT_DISCONNECT;
4967 }
4968
4969 return PCI_ERS_RESULT_NEED_RESET;
4970 }
4971
4972 /**
4973 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4974 * @pdev: pointer to PCI device
4975 */
4976 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4977 {
4978
4979 DRM_INFO("PCI error: mmio enabled callback!!\n");
4980
4981 /* TODO - dump whatever for debugging purposes */
4982
4983 * This is called only if amdgpu_pci_error_detected returns
4984 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4985 * works, no need to reset slot.
4986 */
4987
4988 return PCI_ERS_RESULT_RECOVERED;
4989 }
4990
4991 /**
4992 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4993 * @pdev: PCI device struct
4994 *
4995 * Description: This routine is called by the pci error recovery
4996 * code after the PCI slot has been reset, just before we
4997 * should resume normal operations.
4998 */
4999 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5000 {
5001 struct drm_device *dev = pci_get_drvdata(pdev);
5002 struct amdgpu_device *adev = drm_to_adev(dev);
5003 int r, i;
5004 bool need_full_reset = true;
5005 u32 memsize;
5006 struct list_head device_list;
5007
5008 DRM_INFO("PCI error: slot reset callback!!\n");
5009
5010 INIT_LIST_HEAD(&device_list);
5011 list_add_tail(&adev->gmc.xgmi.head, &device_list);
5012
5013 /* wait for asic to come out of reset */
5014 msleep(500);
5015
5016 /* Restore PCI confspace */
5017 amdgpu_device_load_pci_state(pdev);
5018
5019 /* confirm ASIC came out of reset */
5020 for (i = 0; i < adev->usec_timeout; i++) {
5021 memsize = amdgpu_asic_get_config_memsize(adev);
5022
5023 if (memsize != 0xffffffff)
5024 break;
5025 udelay(1);
5026 }
5027 if (memsize == 0xffffffff) {
5028 r = -ETIME;
5029 goto out;
5030 }
5031
5032 adev->in_pci_err_recovery = true;
5033 r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5034 adev->in_pci_err_recovery = false;
5035 if (r)
5036 goto out;
5037
5038 r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5039
5040 out:
5041 if (!r) {
5042 if (amdgpu_device_cache_pci_state(adev->pdev))
5043 pci_restore_state(adev->pdev);
5044
5045 DRM_INFO("PCIe error recovery succeeded\n");
5046 } else {
5047 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5048 amdgpu_device_unlock_adev(adev);
5049 }
5050
5051 return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5052 }
5053
5054 /**
5055 * amdgpu_pci_resume() - resume normal ops after PCI reset
5056 * @pdev: pointer to PCI device
5057 *
5058 * Called when the error recovery driver tells us that it is
5059 * OK to resume normal operation and restart the schedulers
5060 * that were halted during recovery.
5061 */
5062 void amdgpu_pci_resume(struct pci_dev *pdev)
5063 {
5064 struct drm_device *dev = pci_get_drvdata(pdev);
5065 struct amdgpu_device *adev = drm_to_adev(dev);
5066 int i;
5067
5068
5069 DRM_INFO("PCI error: resume callback!!\n");
5070
5071 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5072 struct amdgpu_ring *ring = adev->rings[i];
5073
5074 if (!ring || !ring->sched.thread)
5075 continue;
5076
5077
5078 drm_sched_resubmit_jobs(&ring->sched);
5079 drm_sched_start(&ring->sched, true);
5080 }
5081
5082 amdgpu_device_unlock_adev(adev);
5083 }
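/*
 * Illustrative wiring sketch (hypothetical, not part of this file): the
 * PCI error callbacks above are meant to be plugged into a struct
 * pci_error_handlers registered by the PCI driver, e.g.:
 *
 *	static const struct pci_error_handlers example_pci_err_handlers = {
 *		.error_detected	= amdgpu_pci_error_detected,
 *		.mmio_enabled	= amdgpu_pci_mmio_enabled,
 *		.slot_reset	= amdgpu_pci_slot_reset,
 *		.resume		= amdgpu_pci_resume,
 *	};
 */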
5084
5085 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5086 {
5087 struct drm_device *dev = pci_get_drvdata(pdev);
5088 struct amdgpu_device *adev = drm_to_adev(dev);
5089 int r;
5090
5091 r = pci_save_state(pdev);
5092 if (!r) {
5093 kfree(adev->pci_state);
5094
5095 adev->pci_state = pci_store_saved_state(pdev);
5096
5097 if (!adev->pci_state) {
5098 DRM_ERROR("Failed to store PCI saved state");
5099 return false;
5100 }
5101 } else {
5102 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5103 return false;
5104 }
5105
5106 return true;
5107 }
5108
5109 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5110 {
5111 struct drm_device *dev = pci_get_drvdata(pdev);
5112 struct amdgpu_device *adev = drm_to_adev(dev);
5113 int r;
5114
5115 if (!adev->pci_state)
5116 return false;
5117
5118 r = pci_load_saved_state(pdev, adev->pci_state);
5119
5120 if (!r) {
5121 pci_restore_state(pdev);
5122 } else {
5123 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5124 return false;
5125 }
5126
5127 return true;
5128 }
5129
5130
5131