1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4
5 #include <asm/cpu.h>
6
7 #include "mce_amd.h"
8
/* Per-family decode callbacks for banks 0-2; filled in by mce_amd_init(). */
static struct amd_decoder_ops fam_ops;

/*
 * Mask handed to XEC() when extracting the extended error code. Starts out
 * as 0xf; mce_amd_init() widens it for F15h, F16h and Scalable MCA systems.
 */
static u8 xec_mask = 0xf;

/*
 * Optional DRAM ECC decoder hook. Set by amd_register_ecc_decoder(), cleared
 * by amd_unregister_ecc_decoder(); NULL while no decoder is registered.
 */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
14
/*
 * Install @f as the DRAM ECC decoder. The hook is invoked from the MC4 and
 * SMCA UMC decode paths with a node/die id and the MCE record. A previously
 * installed hook is overwritten without any check.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
20
amd_unregister_ecc_decoder(void (* f)(int,struct mce *))21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
22 {
23 if (decode_dram_ecc) {
24 WARN_ON(decode_dram_ecc != f);
25
26 decode_dram_ecc = NULL;
27 }
28 }
29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
30
/*
 * String representations of the different MCA-reported error-code fields;
 * see F3x48 or MSR0000_0411.
 */
35
/* transaction type */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};
38
/* cache level */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};
41
/* memory transaction type */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
46
/* participating processor (exported for use outside this file) */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
50
/* request timeout */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
53
/* memory or i/o */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};
56
/* internal error type */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
59
/*
 * F15h MC1 descriptions. f15h_mc1_mce() maps the extended error code to an
 * index: xec 0x0-0xa directly, xec 0xd -> 11, xec 0x10-0x15 -> 12-17.
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",
	[11] = "PFB valid bit parity error",		/* xec = 0xd */
	[12] = "Microcode Patch Buffer",		/* xec = 0x10 */
	[13] = "uop queue",
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue",
};
80
/*
 * F15h MC2 descriptions. f15h_mc2_mce() indexes this table with xec - 0x4
 * for xec 0x4-0xc and with xec - 0x7 for xec 0x10-0x14.
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",		/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",
	[9]  = "L2 Tag ECC error",			/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error",
};
97
/*
 * MC4 descriptions. decode_mc4_mce() indexes this table with xec for
 * xec 0x0-0xe and with xec - 13 for xec 0x1c-0x1f.
 */
static const char * const mc4_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",
	[15] = "L3 data cache ECC error",		/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory",
};
119
/* MC5 descriptions, indexed directly by xec (0x0-0xd) in decode_mc5_mce(). */
static const char * const mc5_mce_desc[] = {
	[0]  = "CPU Watchdog timer expire",
	[1]  = "Wakeup array dest tag",
	[2]  = "AG payload array",
	[3]  = "EX payload array",
	[4]  = "IDRF array",
	[5]  = "Retire dispatch queue",
	[6]  = "Mapper checkpoint array",
	[7]  = "Physical register file EX0 port",
	[8]  = "Physical register file EX1 port",
	[9]  = "Physical register file AG0 port",
	[10] = "Physical register file AG1 port",
	[11] = "Flag register file",
	[12] = "DE error occurred",
	[13] = "Retire status queue",
};
136
/* MC6 descriptions, indexed directly by xec (0x0-0x5) in decode_mc6_mce(). */
static const char * const mc6_mce_desc[] = {
	[0] = "Hardware Assertion",
	[1] = "Free List",
	[2] = "Physical Register File",
	[3] = "Retire Queue",
	[4] = "Scheduler table",
	[5] = "Status Register File",
};
145
/*
 * Scalable MCA error strings. Each smca_*_mce_desc table is looked up through
 * smca_mce_descs[] and indexed by the extended error code.
 */
static const char * const smca_ls_mce_desc[] = {
	[0]  = "Load queue parity error",
	[1]  = "Store queue parity error",
	[2]  = "Miss address buffer payload parity error",
	[3]  = "Level 1 TLB parity error",
	[4]  = "DC Tag error type 5",
	[5]  = "DC Tag error type 6",
	[6]  = "DC Tag error type 1",
	[7]  = "Internal error type 1",
	[8]  = "Internal error type 2",
	[9]  = "System Read Data Error Thread 0",
	[10] = "System Read Data Error Thread 1",
	[11] = "DC Tag error type 2",
	[12] = "DC Data error type 1 and poison consumption",
	[13] = "DC Data error type 2",
	[14] = "DC Data error type 3",
	[15] = "DC Tag error type 4",
	[16] = "Level 2 TLB parity error",
	[17] = "PDC parity error",
	[18] = "DC Tag error type 3",
	[19] = "DC Tag error type 5",
	[20] = "L2 Fill Data error",
};
170
/* SMCA_LS_V2 descriptions, indexed by extended error code. */
static const char * const smca_ls2_mce_desc[] = {
	[0]  = "An ECC error was detected on a data cache read by a probe or victimization",
	[1]  = "An ECC error or L2 poison was detected on a data cache read by a load",
	[2]  = "An ECC error was detected on a data cache read-modify-write by a store",
	[3]  = "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
	[4]  = "An ECC error or poison bit mismatch was detected on a tag read by a load",
	[5]  = "An ECC error or poison bit mismatch was detected on a tag read by a store",
	[6]  = "An ECC error was detected on an EMEM read by a load",
	[7]  = "An ECC error was detected on an EMEM read-modify-write by a store",
	[8]  = "A parity error was detected in an L1 TLB entry by any access",
	[9]  = "A parity error was detected in an L2 TLB entry by any access",
	[10] = "A parity error was detected in a PWC entry by any access",
	[11] = "A parity error was detected in an STQ entry by any access",
	[12] = "A parity error was detected in an LDQ entry by any access",
	[13] = "A parity error was detected in a MAB entry by any access",
	[14] = "A parity error was detected in an SCB entry state field by any access",
	[15] = "A parity error was detected in an SCB entry address field by any access",
	[16] = "A parity error was detected in an SCB entry data field by any access",
	[17] = "A parity error was detected in a WCB entry by any access",
	[18] = "A poisoned line was detected in an SCB entry by any access",
	[19] = "A SystemReadDataError error was reported on read data returned from L2 for a load",
	[20] = "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
	[21] = "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
	[22] = "A hardware assertion error was reported",
	[23] = "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};
197
/* SMCA_IF descriptions, indexed by extended error code. */
static const char * const smca_if_mce_desc[] = {
	[0]  = "Op Cache Microtag Probe Port Parity Error",
	[1]  = "IC Microtag or Full Tag Multi-hit Error",
	[2]  = "IC Full Tag Parity Error",
	[3]  = "IC Data Array Parity Error",
	[4]  = "Decoupling Queue PhysAddr Parity Error",
	[5]  = "L0 ITLB Parity Error",
	[6]  = "L1 ITLB Parity Error",
	[7]  = "L2 ITLB Parity Error",
	[8]  = "BPQ Thread 0 Snoop Parity Error",
	[9]  = "BPQ Thread 1 Snoop Parity Error",
	[10] = "L1 BTB Multi-Match Error",
	[11] = "L2 BTB Multi-Match Error",
	[12] = "L2 Cache Response Poison Error",
	[13] = "System Read Data Error",
	[14] = "Hardware Assertion Error",
	[15] = "L1-TLB Multi-Hit",
	[16] = "L2-TLB Multi-Hit",
	[17] = "BSR Parity Error",
	[18] = "CT MCE",
};
219
/* SMCA_L2_CACHE descriptions, indexed by extended error code. */
static const char * const smca_l2_mce_desc[] = {
	[0] = "L2M Tag Multiple-Way-Hit error",
	[1] = "L2M Tag or State Array ECC Error",
	[2] = "L2M Data Array ECC Error",
	[3] = "Hardware Assert Error",
};
226
/* SMCA_DE descriptions, indexed by extended error code. */
static const char * const smca_de_mce_desc[] = {
	[0] = "Micro-op cache tag parity error",
	[1] = "Micro-op cache data parity error",
	[2] = "Instruction buffer parity error",
	[3] = "Micro-op queue parity error",
	[4] = "Instruction dispatch queue parity error",
	[5] = "Fetch address FIFO parity error",
	[6] = "Patch RAM data parity error",
	[7] = "Patch RAM sequencer parity error",
	[8] = "Micro-op buffer parity error",
	[9] = "Hardware Assertion MCA Error",
};
239
/* SMCA_EX descriptions, indexed by extended error code. */
static const char * const smca_ex_mce_desc[] = {
	[0]  = "Watchdog Timeout error",
	[1]  = "Physical register file parity error",
	[2]  = "Flag register file parity error",
	[3]  = "Immediate displacement register file parity error",
	[4]  = "Address generator payload parity error",
	[5]  = "EX payload parity error",
	[6]  = "Checkpoint queue parity error",
	[7]  = "Retire dispatch queue parity error",
	[8]  = "Retire status queue parity error",
	[9]  = "Scheduling queue parity error",
	[10] = "Branch buffer queue parity error",
	[11] = "Hardware Assertion error",
	[12] = "Spec Map parity error",
	[13] = "Retire Map parity error",
};
256
/* SMCA_FP descriptions, indexed by extended error code. */
static const char * const smca_fp_mce_desc[] = {
	[0] = "Physical register file (PRF) parity error",
	[1] = "Freelist (FL) parity error",
	[2] = "Schedule queue parity error",
	[3] = "NSQ parity error",
	[4] = "Retire queue (RQ) parity error",
	[5] = "Status register file (SRF) parity error",
	[6] = "Hardware assertion",
};
266
/* SMCA_L3_CACHE descriptions, indexed by extended error code. */
static const char * const smca_l3_mce_desc[] = {
	[0] = "Shadow Tag Macro ECC Error",
	[1] = "Shadow Tag Macro Multi-way-hit Error",
	[2] = "L3M Tag ECC Error",
	[3] = "L3M Tag Multi-way-hit Error",
	[4] = "L3M Data ECC Error",
	[5] = "SDP Parity Error or SystemReadDataError from XI",
	[6] = "L3 Victim Queue Parity Error",
	[7] = "L3 Hardware Assertion",
};
277
/* SMCA_CS descriptions, indexed by extended error code. */
static const char * const smca_cs_mce_desc[] = {
	[0] = "Illegal Request",
	[1] = "Address Violation",
	[2] = "Security Violation",
	[3] = "Illegal Response",
	[4] = "Unexpected Response",
	[5] = "Request or Probe Parity Error",
	[6] = "Read Response Parity Error",
	[7] = "Atomic Request Parity Error",
	[8] = "Probe Filter ECC Error",
};
289
/* SMCA_CS_V2 descriptions, indexed by extended error code. */
static const char * const smca_cs2_mce_desc[] = {
	[0]  = "Illegal Request",
	[1]  = "Address Violation",
	[2]  = "Security Violation",
	[3]  = "Illegal Response",
	[4]  = "Unexpected Response",
	[5]  = "Request or Probe Parity Error",
	[6]  = "Read Response Parity Error",
	[7]  = "Atomic Request Parity Error",
	[8]  = "SDP read response had no match in the CS queue",
	[9]  = "Probe Filter Protocol Error",
	[10] = "Probe Filter ECC Error",
	[11] = "SDP read response had an unexpected RETRY error",
	[12] = "Counter overflow error",
	[13] = "Counter underflow error",
};
306
/* SMCA_PIE descriptions, indexed by extended error code. */
static const char * const smca_pie_mce_desc[] = {
	[0] = "Hardware Assert",
	[1] = "Register security violation",
	[2] = "Link Error",
	[3] = "Poison data consumption",
	[4] = "A deferred error was detected in the DF",
};
314
/*
 * SMCA_UMC descriptions, indexed by extended error code. decode_smca_error()
 * additionally hands xec 0 to the registered DRAM ECC decoder.
 */
static const char * const smca_umc_mce_desc[] = {
	[0] = "DRAM ECC error",
	[1] = "Data poison error",
	[2] = "SDP parity error",
	[3] = "Advanced peripheral bus error",
	[4] = "Address/Command parity error",
	[5] = "Write data CRC error",
	[6] = "DCQ SRAM ECC error",
	[7] = "AES SRAM ECC error",
};
325
/* SMCA_PB descriptions, indexed by extended error code. */
static const char * const smca_pb_mce_desc[] = {
	[0] = "An ECC error in the Parameter Block RAM array",
};
329
/* SMCA_PSP descriptions, indexed by extended error code. */
static const char * const smca_psp_mce_desc[] = {
	[0] = "An ECC or parity error in a PSP RAM instance",
};
333
/* SMCA_PSP_V2 descriptions, indexed by extended error code. */
static const char * const smca_psp2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Instruction Cache Bank 0 ECC or parity error",
	[3]  = "Instruction Cache Bank 1 ECC or parity error",
	[4]  = "Instruction Tag Ram 0 parity error",
	[5]  = "Instruction Tag Ram 1 parity error",
	[6]  = "Data Cache Bank 0 ECC or parity error",
	[7]  = "Data Cache Bank 1 ECC or parity error",
	[8]  = "Data Cache Bank 2 ECC or parity error",
	[9]  = "Data Cache Bank 3 ECC or parity error",
	[10] = "Data Tag Bank 0 parity error",
	[11] = "Data Tag Bank 1 parity error",
	[12] = "Data Tag Bank 2 parity error",
	[13] = "Data Tag Bank 3 parity error",
	[14] = "Dirty Data Ram parity error",
	[15] = "TLB Bank 0 parity error",
	[16] = "TLB Bank 1 parity error",
	[17] = "System Hub Read Buffer ECC or parity error",
};
354
/* SMCA_SMU descriptions, indexed by extended error code. */
static const char * const smca_smu_mce_desc[] = {
	[0] = "An ECC or parity error in an SMU RAM instance",
};
358
/* SMCA_SMU_V2 descriptions, indexed by extended error code. */
static const char * const smca_smu2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Data Cache Bank A ECC or parity error",
	[3]  = "Data Cache Bank B ECC or parity error",
	[4]  = "Data Tag Cache Bank A ECC or parity error",
	[5]  = "Data Tag Cache Bank B ECC or parity error",
	[6]  = "Instruction Cache Bank A ECC or parity error",
	[7]  = "Instruction Cache Bank B ECC or parity error",
	[8]  = "Instruction Tag Cache Bank A ECC or parity error",
	[9]  = "Instruction Tag Cache Bank B ECC or parity error",
	[10] = "System Hub Read Buffer ECC or parity error",
	[11] = "PHY RAM ECC error",
};
373
/* SMCA_MP5 descriptions, indexed by extended error code. */
static const char * const smca_mp5_mce_desc[] = {
	[0] = "High SRAM ECC or parity error",
	[1] = "Low SRAM ECC or parity error",
	[2] = "Data Cache Bank A ECC or parity error",
	[3] = "Data Cache Bank B ECC or parity error",
	[4] = "Data Tag Cache Bank A ECC or parity error",
	[5] = "Data Tag Cache Bank B ECC or parity error",
	[6] = "Instruction Cache Bank A ECC or parity error",
	[7] = "Instruction Cache Bank B ECC or parity error",
	[8] = "Instruction Tag Cache Bank A ECC or parity error",
	[9] = "Instruction Tag Cache Bank B ECC or parity error",
};
386
/* SMCA_NBIO descriptions, indexed by extended error code. */
static const char * const smca_nbio_mce_desc[] = {
	[0] = "ECC or Parity error",
	[1] = "PCIE error",
	[2] = "SDP ErrEvent error",
	[3] = "SDP Egress Poison Error",
	[4] = "IOHC Internal Poison Error",
};
394
/* SMCA_PCIE descriptions, indexed by extended error code. */
static const char * const smca_pcie_mce_desc[] = {
	[0] = "CCIX PER Message logging",
	[1] = "CCIX Read Response with Status: Non-Data Error",
	[2] = "CCIX Write Response with Status: Non-Data Error",
	[3] = "CCIX Read Response with Status: Data Error",
	[4] = "CCIX Non-okay write response with data error",
};
402
/*
 * Description table for one SMCA bank type.
 * @descs:     array of error strings, indexed by extended error code
 * @num_descs: number of entries in @descs; decode_smca_error() only decodes
 *             extended error codes below this value
 */
struct smca_mce_desc {
	const char * const *descs;
	unsigned int num_descs;
};
407
408 static struct smca_mce_desc smca_mce_descs[] = {
409 [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
410 [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) },
411 [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
412 [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
413 [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
414 [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
415 [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
416 [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
417 [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
418 [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
419 [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
420 [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
421 [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
422 [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
423 [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) },
424 [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
425 [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
426 [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
427 [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
428 [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
429 };
430
f12h_mc0_mce(u16 ec,u8 xec)431 static bool f12h_mc0_mce(u16 ec, u8 xec)
432 {
433 bool ret = false;
434
435 if (MEM_ERROR(ec)) {
436 u8 ll = LL(ec);
437 ret = true;
438
439 if (ll == LL_L2)
440 pr_cont("during L1 linefill from L2.\n");
441 else if (ll == LL_L1)
442 pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
443 else
444 ret = false;
445 }
446 return ret;
447 }
448
f10h_mc0_mce(u16 ec,u8 xec)449 static bool f10h_mc0_mce(u16 ec, u8 xec)
450 {
451 if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
452 pr_cont("during data scrub.\n");
453 return true;
454 }
455 return f12h_mc0_mce(ec, xec);
456 }
457
k8_mc0_mce(u16 ec,u8 xec)458 static bool k8_mc0_mce(u16 ec, u8 xec)
459 {
460 if (BUS_ERROR(ec)) {
461 pr_cont("during system linefill.\n");
462 return true;
463 }
464
465 return f10h_mc0_mce(ec, xec);
466 }
467
cat_mc0_mce(u16 ec,u8 xec)468 static bool cat_mc0_mce(u16 ec, u8 xec)
469 {
470 u8 r4 = R4(ec);
471 bool ret = true;
472
473 if (MEM_ERROR(ec)) {
474
475 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
476 return false;
477
478 switch (r4) {
479 case R4_DRD:
480 case R4_DWR:
481 pr_cont("Data/Tag parity error due to %s.\n",
482 (r4 == R4_DRD ? "load/hw prf" : "store"));
483 break;
484 case R4_EVICT:
485 pr_cont("Copyback parity error on a tag miss.\n");
486 break;
487 case R4_SNOOP:
488 pr_cont("Tag parity error during snoop.\n");
489 break;
490 default:
491 ret = false;
492 }
493 } else if (BUS_ERROR(ec)) {
494
495 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
496 return false;
497
498 pr_cont("System read data error on a ");
499
500 switch (r4) {
501 case R4_RD:
502 pr_cont("TLB reload.\n");
503 break;
504 case R4_DWR:
505 pr_cont("store.\n");
506 break;
507 case R4_DRD:
508 pr_cont("load.\n");
509 break;
510 default:
511 ret = false;
512 }
513 } else {
514 ret = false;
515 }
516
517 return ret;
518 }
519
f15h_mc0_mce(u16 ec,u8 xec)520 static bool f15h_mc0_mce(u16 ec, u8 xec)
521 {
522 bool ret = true;
523
524 if (MEM_ERROR(ec)) {
525
526 switch (xec) {
527 case 0x0:
528 pr_cont("Data Array access error.\n");
529 break;
530
531 case 0x1:
532 pr_cont("UC error during a linefill from L2/NB.\n");
533 break;
534
535 case 0x2:
536 case 0x11:
537 pr_cont("STQ access error.\n");
538 break;
539
540 case 0x3:
541 pr_cont("SCB access error.\n");
542 break;
543
544 case 0x10:
545 pr_cont("Tag error.\n");
546 break;
547
548 case 0x12:
549 pr_cont("LDQ access error.\n");
550 break;
551
552 default:
553 ret = false;
554 }
555 } else if (BUS_ERROR(ec)) {
556
557 if (!xec)
558 pr_cont("System Read Data Error.\n");
559 else
560 pr_cont(" Internal error condition type %d.\n", xec);
561 } else if (INT_ERROR(ec)) {
562 if (xec <= 0x1f)
563 pr_cont("Hardware Assert.\n");
564 else
565 ret = false;
566
567 } else
568 ret = false;
569
570 return ret;
571 }
572
decode_mc0_mce(struct mce * m)573 static void decode_mc0_mce(struct mce *m)
574 {
575 u16 ec = EC(m->status);
576 u8 xec = XEC(m->status, xec_mask);
577
578 pr_emerg(HW_ERR "MC0 Error: ");
579
580 /* TLB error signatures are the same across families */
581 if (TLB_ERROR(ec)) {
582 if (TT(ec) == TT_DATA) {
583 pr_cont("%s TLB %s.\n", LL_MSG(ec),
584 ((xec == 2) ? "locked miss"
585 : (xec ? "multimatch" : "parity")));
586 return;
587 }
588 } else if (fam_ops.mc0_mce(ec, xec))
589 ;
590 else
591 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
592 }
593
k8_mc1_mce(u16 ec,u8 xec)594 static bool k8_mc1_mce(u16 ec, u8 xec)
595 {
596 u8 ll = LL(ec);
597 bool ret = true;
598
599 if (!MEM_ERROR(ec))
600 return false;
601
602 if (ll == 0x2)
603 pr_cont("during a linefill from L2.\n");
604 else if (ll == 0x1) {
605 switch (R4(ec)) {
606 case R4_IRD:
607 pr_cont("Parity error during data load.\n");
608 break;
609
610 case R4_EVICT:
611 pr_cont("Copyback Parity/Victim error.\n");
612 break;
613
614 case R4_SNOOP:
615 pr_cont("Tag Snoop error.\n");
616 break;
617
618 default:
619 ret = false;
620 break;
621 }
622 } else
623 ret = false;
624
625 return ret;
626 }
627
cat_mc1_mce(u16 ec,u8 xec)628 static bool cat_mc1_mce(u16 ec, u8 xec)
629 {
630 u8 r4 = R4(ec);
631 bool ret = true;
632
633 if (!MEM_ERROR(ec))
634 return false;
635
636 if (TT(ec) != TT_INSTR)
637 return false;
638
639 if (r4 == R4_IRD)
640 pr_cont("Data/tag array parity error for a tag hit.\n");
641 else if (r4 == R4_SNOOP)
642 pr_cont("Tag error during snoop/victimization.\n");
643 else if (xec == 0x0)
644 pr_cont("Tag parity error from victim castout.\n");
645 else if (xec == 0x2)
646 pr_cont("Microcode patch RAM parity error.\n");
647 else
648 ret = false;
649
650 return ret;
651 }
652
f15h_mc1_mce(u16 ec,u8 xec)653 static bool f15h_mc1_mce(u16 ec, u8 xec)
654 {
655 bool ret = true;
656
657 if (!MEM_ERROR(ec))
658 return false;
659
660 switch (xec) {
661 case 0x0 ... 0xa:
662 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
663 break;
664
665 case 0xd:
666 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
667 break;
668
669 case 0x10:
670 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
671 break;
672
673 case 0x11 ... 0x15:
674 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
675 break;
676
677 default:
678 ret = false;
679 }
680 return ret;
681 }
682
decode_mc1_mce(struct mce * m)683 static void decode_mc1_mce(struct mce *m)
684 {
685 u16 ec = EC(m->status);
686 u8 xec = XEC(m->status, xec_mask);
687
688 pr_emerg(HW_ERR "MC1 Error: ");
689
690 if (TLB_ERROR(ec))
691 pr_cont("%s TLB %s.\n", LL_MSG(ec),
692 (xec ? "multimatch" : "parity error"));
693 else if (BUS_ERROR(ec)) {
694 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
695
696 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
697 } else if (INT_ERROR(ec)) {
698 if (xec <= 0x3f)
699 pr_cont("Hardware Assert.\n");
700 else
701 goto wrong_mc1_mce;
702 } else if (fam_ops.mc1_mce(ec, xec))
703 ;
704 else
705 goto wrong_mc1_mce;
706
707 return;
708
709 wrong_mc1_mce:
710 pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
711 }
712
k8_mc2_mce(u16 ec,u8 xec)713 static bool k8_mc2_mce(u16 ec, u8 xec)
714 {
715 bool ret = true;
716
717 if (xec == 0x1)
718 pr_cont(" in the write data buffers.\n");
719 else if (xec == 0x3)
720 pr_cont(" in the victim data buffers.\n");
721 else if (xec == 0x2 && MEM_ERROR(ec))
722 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
723 else if (xec == 0x0) {
724 if (TLB_ERROR(ec))
725 pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
726 TT_MSG(ec));
727 else if (BUS_ERROR(ec))
728 pr_cont(": %s/ECC error in data read from NB: %s.\n",
729 R4_MSG(ec), PP_MSG(ec));
730 else if (MEM_ERROR(ec)) {
731 u8 r4 = R4(ec);
732
733 if (r4 >= 0x7)
734 pr_cont(": %s error during data copyback.\n",
735 R4_MSG(ec));
736 else if (r4 <= 0x1)
737 pr_cont(": %s parity/ECC error during data "
738 "access from L2.\n", R4_MSG(ec));
739 else
740 ret = false;
741 } else
742 ret = false;
743 } else
744 ret = false;
745
746 return ret;
747 }
748
f15h_mc2_mce(u16 ec,u8 xec)749 static bool f15h_mc2_mce(u16 ec, u8 xec)
750 {
751 bool ret = true;
752
753 if (TLB_ERROR(ec)) {
754 if (xec == 0x0)
755 pr_cont("Data parity TLB read error.\n");
756 else if (xec == 0x1)
757 pr_cont("Poison data provided for TLB fill.\n");
758 else
759 ret = false;
760 } else if (BUS_ERROR(ec)) {
761 if (xec > 2)
762 ret = false;
763
764 pr_cont("Error during attempted NB data read.\n");
765 } else if (MEM_ERROR(ec)) {
766 switch (xec) {
767 case 0x4 ... 0xc:
768 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
769 break;
770
771 case 0x10 ... 0x14:
772 pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
773 break;
774
775 default:
776 ret = false;
777 }
778 } else if (INT_ERROR(ec)) {
779 if (xec <= 0x3f)
780 pr_cont("Hardware Assert.\n");
781 else
782 ret = false;
783 }
784
785 return ret;
786 }
787
f16h_mc2_mce(u16 ec,u8 xec)788 static bool f16h_mc2_mce(u16 ec, u8 xec)
789 {
790 u8 r4 = R4(ec);
791
792 if (!MEM_ERROR(ec))
793 return false;
794
795 switch (xec) {
796 case 0x04 ... 0x05:
797 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
798 break;
799
800 case 0x09 ... 0x0b:
801 case 0x0d ... 0x0f:
802 pr_cont("ECC error in L2 tag (%s).\n",
803 ((r4 == R4_GEN) ? "BankReq" :
804 ((r4 == R4_SNOOP) ? "Prb" : "Fill")));
805 break;
806
807 case 0x10 ... 0x19:
808 case 0x1b:
809 pr_cont("ECC error in L2 data array (%s).\n",
810 (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit" :
811 ((r4 == R4_GEN) ? "Attr" :
812 ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
813 break;
814
815 case 0x1c ... 0x1d:
816 case 0x1f:
817 pr_cont("Parity error in L2 attribute bits (%s).\n",
818 ((r4 == R4_RD) ? "Hit" :
819 ((r4 == R4_GEN) ? "Attr" : "Fill")));
820 break;
821
822 default:
823 return false;
824 }
825
826 return true;
827 }
828
decode_mc2_mce(struct mce * m)829 static void decode_mc2_mce(struct mce *m)
830 {
831 u16 ec = EC(m->status);
832 u8 xec = XEC(m->status, xec_mask);
833
834 pr_emerg(HW_ERR "MC2 Error: ");
835
836 if (!fam_ops.mc2_mce(ec, xec))
837 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
838 }
839
decode_mc3_mce(struct mce * m)840 static void decode_mc3_mce(struct mce *m)
841 {
842 u16 ec = EC(m->status);
843 u8 xec = XEC(m->status, xec_mask);
844
845 if (boot_cpu_data.x86 >= 0x14) {
846 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
847 " please report on LKML.\n");
848 return;
849 }
850
851 pr_emerg(HW_ERR "MC3 Error");
852
853 if (xec == 0x0) {
854 u8 r4 = R4(ec);
855
856 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
857 goto wrong_mc3_mce;
858
859 pr_cont(" during %s.\n", R4_MSG(ec));
860 } else
861 goto wrong_mc3_mce;
862
863 return;
864
865 wrong_mc3_mce:
866 pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
867 }
868
decode_mc4_mce(struct mce * m)869 static void decode_mc4_mce(struct mce *m)
870 {
871 unsigned int fam = x86_family(m->cpuid);
872 int node_id = amd_get_nb_id(m->extcpu);
873 u16 ec = EC(m->status);
874 u8 xec = XEC(m->status, 0x1f);
875 u8 offset = 0;
876
877 pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
878
879 switch (xec) {
880 case 0x0 ... 0xe:
881
882 /* special handling for DRAM ECCs */
883 if (xec == 0x0 || xec == 0x8) {
884 /* no ECCs on F11h */
885 if (fam == 0x11)
886 goto wrong_mc4_mce;
887
888 pr_cont("%s.\n", mc4_mce_desc[xec]);
889
890 if (decode_dram_ecc)
891 decode_dram_ecc(node_id, m);
892 return;
893 }
894 break;
895
896 case 0xf:
897 if (TLB_ERROR(ec))
898 pr_cont("GART Table Walk data error.\n");
899 else if (BUS_ERROR(ec))
900 pr_cont("DMA Exclusion Vector Table Walk error.\n");
901 else
902 goto wrong_mc4_mce;
903 return;
904
905 case 0x19:
906 if (fam == 0x15 || fam == 0x16)
907 pr_cont("Compute Unit Data Error.\n");
908 else
909 goto wrong_mc4_mce;
910 return;
911
912 case 0x1c ... 0x1f:
913 offset = 13;
914 break;
915
916 default:
917 goto wrong_mc4_mce;
918 }
919
920 pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
921 return;
922
923 wrong_mc4_mce:
924 pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
925 }
926
decode_mc5_mce(struct mce * m)927 static void decode_mc5_mce(struct mce *m)
928 {
929 unsigned int fam = x86_family(m->cpuid);
930 u16 ec = EC(m->status);
931 u8 xec = XEC(m->status, xec_mask);
932
933 if (fam == 0xf || fam == 0x11)
934 goto wrong_mc5_mce;
935
936 pr_emerg(HW_ERR "MC5 Error: ");
937
938 if (INT_ERROR(ec)) {
939 if (xec <= 0x1f) {
940 pr_cont("Hardware Assert.\n");
941 return;
942 } else
943 goto wrong_mc5_mce;
944 }
945
946 if (xec == 0x0 || xec == 0xc)
947 pr_cont("%s.\n", mc5_mce_desc[xec]);
948 else if (xec <= 0xd)
949 pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
950 else
951 goto wrong_mc5_mce;
952
953 return;
954
955 wrong_mc5_mce:
956 pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
957 }
958
decode_mc6_mce(struct mce * m)959 static void decode_mc6_mce(struct mce *m)
960 {
961 u8 xec = XEC(m->status, xec_mask);
962
963 pr_emerg(HW_ERR "MC6 Error: ");
964
965 if (xec > 0x5)
966 goto wrong_mc6_mce;
967
968 pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
969 return;
970
971 wrong_mc6_mce:
972 pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
973 }
974
975 /* Decode errors according to Scalable MCA specification */
decode_smca_error(struct mce * m)976 static void decode_smca_error(struct mce *m)
977 {
978 struct smca_hwid *hwid;
979 enum smca_bank_types bank_type;
980 const char *ip_name;
981 u8 xec = XEC(m->status, xec_mask);
982
983 if (m->bank >= ARRAY_SIZE(smca_banks))
984 return;
985
986 hwid = smca_banks[m->bank].hwid;
987 if (!hwid)
988 return;
989
990 bank_type = hwid->bank_type;
991
992 if (bank_type == SMCA_RESERVED) {
993 pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
994 return;
995 }
996
997 ip_name = smca_get_long_name(bank_type);
998
999 pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
1000
1001 /* Only print the decode of valid error codes */
1002 if (xec < smca_mce_descs[bank_type].num_descs)
1003 pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
1004
1005 if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
1006 decode_dram_ecc(topology_die_id(m->extcpu), m);
1007 }
1008
/*
 * Print the generic fields of the 16-bit MCA error code @ec.
 *
 * Internal errors only carry an internal error type. For all other errors
 * the cache level is printed, followed by either the mem/io field (bus
 * errors) or the transaction type; memory and bus errors add the memory
 * transaction type, and bus errors finally add the participating processor
 * and timeout fields.
 */
static inline void amd_decode_err_code(u16 ec)
{
	bool bus;

	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	bus = BUS_ERROR(ec);

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (bus)
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (bus || MEM_ERROR(ec))
		pr_cont(", mem-tx: %s", R4_MSG(ec));

	if (bus)
		pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));

	pr_cont("\n");
}
1032
decode_error_status(struct mce * m)1033 static const char *decode_error_status(struct mce *m)
1034 {
1035 if (m->status & MCI_STATUS_UC) {
1036 if (m->status & MCI_STATUS_PCC)
1037 return "System Fatal error.";
1038 if (m->mcgstatus & MCG_STATUS_RIPV)
1039 return "Uncorrected, software restartable error.";
1040 return "Uncorrected, software containable error.";
1041 }
1042
1043 if (m->status & MCI_STATUS_DEFERRED)
1044 return "Deferred error, no action required.";
1045
1046 return "Corrected error, no action required.";
1047 }
1048
1049 static int
amd_decode_mce(struct notifier_block * nb,unsigned long val,void * data)1050 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1051 {
1052 struct mce *m = (struct mce *)data;
1053 unsigned int fam = x86_family(m->cpuid);
1054 int ecc;
1055
1056 if (m->kflags & MCE_HANDLED_CEC)
1057 return NOTIFY_DONE;
1058
1059 pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1060
1061 pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1062 m->extcpu,
1063 fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
1064 m->bank,
1065 ((m->status & MCI_STATUS_OVER) ? "Over" : "-"),
1066 ((m->status & MCI_STATUS_UC) ? "UE" :
1067 (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"),
1068 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
1069 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
1070 ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"));
1071
1072 if (boot_cpu_has(X86_FEATURE_SMCA)) {
1073 u32 low, high;
1074 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1075
1076 if (!rdmsr_safe(addr, &low, &high) &&
1077 (low & MCI_CONFIG_MCAX))
1078 pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1079
1080 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
1081 }
1082
1083 /* do the two bits[14:13] together */
1084 ecc = (m->status >> 45) & 0x3;
1085 if (ecc)
1086 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1087
1088 if (fam >= 0x15) {
1089 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1090
1091 /* F15h, bank4, bit 43 is part of McaStatSubCache. */
1092 if (fam != 0x15 || m->bank != 4)
1093 pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1094 }
1095
1096 if (fam >= 0x17)
1097 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1098
1099 pr_cont("]: 0x%016llx\n", m->status);
1100
1101 if (m->status & MCI_STATUS_ADDRV)
1102 pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
1103
1104 if (m->ppin)
1105 pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);
1106
1107 if (boot_cpu_has(X86_FEATURE_SMCA)) {
1108 pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
1109
1110 if (m->status & MCI_STATUS_SYNDV)
1111 pr_cont(", Syndrome: 0x%016llx", m->synd);
1112
1113 pr_cont("\n");
1114
1115 decode_smca_error(m);
1116 goto err_code;
1117 }
1118
1119 if (m->tsc)
1120 pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1121
1122 /* Doesn't matter which member to test. */
1123 if (!fam_ops.mc0_mce)
1124 goto err_code;
1125
1126 switch (m->bank) {
1127 case 0:
1128 decode_mc0_mce(m);
1129 break;
1130
1131 case 1:
1132 decode_mc1_mce(m);
1133 break;
1134
1135 case 2:
1136 decode_mc2_mce(m);
1137 break;
1138
1139 case 3:
1140 decode_mc3_mce(m);
1141 break;
1142
1143 case 4:
1144 decode_mc4_mce(m);
1145 break;
1146
1147 case 5:
1148 decode_mc5_mce(m);
1149 break;
1150
1151 case 6:
1152 decode_mc6_mce(m);
1153 break;
1154
1155 default:
1156 break;
1157 }
1158
1159 err_code:
1160 amd_decode_err_code(m->status & 0xffff);
1161
1162 m->kflags |= MCE_HANDLED_EDAC;
1163 return NOTIFY_OK;
1164 }
1165
/*
 * Notifier that hooks amd_decode_mce() into the MCE decode chain; registered
 * in mce_amd_init() and unregistered in mce_amd_exit().
 */
static struct notifier_block amd_mce_dec_nb = {
	.notifier_call = amd_decode_mce,
	.priority = MCE_PRIO_EDAC,
};
1170
mce_amd_init(void)1171 static int __init mce_amd_init(void)
1172 {
1173 struct cpuinfo_x86 *c = &boot_cpu_data;
1174
1175 if (c->x86_vendor != X86_VENDOR_AMD &&
1176 c->x86_vendor != X86_VENDOR_HYGON)
1177 return -ENODEV;
1178
1179 if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
1180 return -ENODEV;
1181
1182 if (boot_cpu_has(X86_FEATURE_SMCA)) {
1183 xec_mask = 0x3f;
1184 goto out;
1185 }
1186
1187 switch (c->x86) {
1188 case 0xf:
1189 fam_ops.mc0_mce = k8_mc0_mce;
1190 fam_ops.mc1_mce = k8_mc1_mce;
1191 fam_ops.mc2_mce = k8_mc2_mce;
1192 break;
1193
1194 case 0x10:
1195 fam_ops.mc0_mce = f10h_mc0_mce;
1196 fam_ops.mc1_mce = k8_mc1_mce;
1197 fam_ops.mc2_mce = k8_mc2_mce;
1198 break;
1199
1200 case 0x11:
1201 fam_ops.mc0_mce = k8_mc0_mce;
1202 fam_ops.mc1_mce = k8_mc1_mce;
1203 fam_ops.mc2_mce = k8_mc2_mce;
1204 break;
1205
1206 case 0x12:
1207 fam_ops.mc0_mce = f12h_mc0_mce;
1208 fam_ops.mc1_mce = k8_mc1_mce;
1209 fam_ops.mc2_mce = k8_mc2_mce;
1210 break;
1211
1212 case 0x14:
1213 fam_ops.mc0_mce = cat_mc0_mce;
1214 fam_ops.mc1_mce = cat_mc1_mce;
1215 fam_ops.mc2_mce = k8_mc2_mce;
1216 break;
1217
1218 case 0x15:
1219 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1220
1221 fam_ops.mc0_mce = f15h_mc0_mce;
1222 fam_ops.mc1_mce = f15h_mc1_mce;
1223 fam_ops.mc2_mce = f15h_mc2_mce;
1224 break;
1225
1226 case 0x16:
1227 xec_mask = 0x1f;
1228 fam_ops.mc0_mce = cat_mc0_mce;
1229 fam_ops.mc1_mce = cat_mc1_mce;
1230 fam_ops.mc2_mce = f16h_mc2_mce;
1231 break;
1232
1233 case 0x17:
1234 case 0x18:
1235 pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
1236 return -EINVAL;
1237
1238 default:
1239 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1240 return -EINVAL;
1241 }
1242
1243 out:
1244 pr_info("MCE: In-kernel MCE decoding enabled.\n");
1245
1246 mce_register_decode_chain(&amd_mce_dec_nb);
1247
1248 return 0;
1249 }
1250 early_initcall(mce_amd_init);
1251
/* Module teardown and metadata; only compiled when built as a module. */
#ifdef MODULE
/* Remove the decoder from the MCE decode chain on module unload. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}

MODULE_DESCRIPTION("AMD MCE decoder");
MODULE_ALIAS("edac-mce-amd");
MODULE_LICENSE("GPL");
module_exit(mce_amd_exit);
#endif
1263