• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/module.h>
3 #include <linux/slab.h>
4 
5 #include <asm/cpu.h>
6 
7 #include "mce_amd.h"
8 
/* Per-family decoder callbacks; the MC0/MC1/MC2 decoders call through it. */
static struct amd_decoder_ops fam_ops;

/* Mask passed to XEC() when extracting the extended error code from a status value. */
static u8 xec_mask	 = 0xf;

/*
 * Optional DRAM ECC decoder callback. Set by amd_register_ecc_decoder(),
 * cleared by amd_unregister_ecc_decoder(); callers test it for NULL before
 * invoking it.
 */
static void (*decode_dram_ecc)(int node_id, struct mce *m);
14 
/*
 * Install @f as the DRAM ECC decoder callback.
 *
 * The previous value of decode_dram_ecc is overwritten without any check.
 */
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
	decode_dram_ecc = f;
}
EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
20 
amd_unregister_ecc_decoder(void (* f)(int,struct mce *))21 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
22 {
23 	if (decode_dram_ecc) {
24 		WARN_ON(decode_dram_ecc != f);
25 
26 		decode_dram_ecc = NULL;
27 	}
28 }
29 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
30 
31 /*
32  * string representation for the different MCA reported error types, see F3x48
33  * or MSR0000_0411.
34  */
35 
/* transaction type */
static const char * const tt_msgs[] = {
	[0] = "INSN",
	[1] = "DATA",
	[2] = "GEN",
	[3] = "RESV",
};
38 
/* cache level */
static const char * const ll_msgs[] = {
	[0] = "RESV",
	[1] = "L1",
	[2] = "L2",
	[3] = "L3/GEN",
};
41 
/* memory transaction type */
static const char * const rrrr_msgs[] = {
	[0] = "GEN",
	[1] = "RD",
	[2] = "WR",
	[3] = "DRD",
	[4] = "DWR",
	[5] = "IRD",
	[6] = "PRF",
	[7] = "EV",
	[8] = "SNP",
};
46 
/* participating processor (exported, unlike the other field tables) */
const char * const pp_msgs[] = {
	[0] = "SRC",
	[1] = "RES",
	[2] = "OBS",
	[3] = "GEN",
};
EXPORT_SYMBOL_GPL(pp_msgs);
50 
/* request timeout */
static const char * const to_msgs[] = {
	[0] = "no timeout",
	[1] = "timed out",
};
53 
/* memory or i/o */
static const char * const ii_msgs[] = {
	[0] = "MEM",
	[1] = "RESV",
	[2] = "IO",
	[3] = "GEN",
};
56 
/* internal error type */
static const char * const uu_msgs[] = {
	[0] = "RESV",
	[1] = "RESV",
	[2] = "HWA",
	[3] = "RESV",
};
59 
/*
 * MC1 descriptions used by f15h_mc1_mce(). The array index is not the
 * extended error code everywhere: xec 0x0-0xa map 1:1, xec 0xd maps to
 * index 11 and xec 0x10-0x15 map to indices 12-17.
 */
static const char * const f15h_mc1_mce_desc[] = {
	[0]  = "UC during a demand linefill from L2",
	[1]  = "Parity error during data load from IC",
	[2]  = "Parity error for IC valid bit",
	[3]  = "Main tag parity error",
	[4]  = "Parity error in prediction queue",
	[5]  = "PFB data/address parity error",
	[6]  = "Parity error in the branch status reg",
	[7]  = "PFB promotion address error",
	[8]  = "Tag error during probe/victimization",
	[9]  = "Parity error for IC probe tag valid bit",
	[10] = "PFB non-cacheable bit parity error",
	[11] = "PFB valid bit parity error",		/* xec = 0xd */
	[12] = "Microcode Patch Buffer",		/* xec = 0x10 */
	[13] = "uop queue",
	[14] = "insn buffer",
	[15] = "predecode buffer",
	[16] = "fetch address FIFO",
	[17] = "dispatch uop queue"
};
80 
/*
 * MC2 descriptions used by f15h_mc2_mce(): xec 0x4-0xc map to indices 0-8,
 * xec 0x10-0x14 map to indices 9-13.
 */
static const char * const f15h_mc2_mce_desc[] = {
	[0]  = "Fill ECC error on data fills",		/* xec = 0x4 */
	[1]  = "Fill parity error on insn fills",
	[2]  = "Prefetcher request FIFO parity error",
	[3]  = "PRQ address parity error",
	[4]  = "PRQ data parity error",
	[5]  = "WCC Tag ECC error",
	[6]  = "WCC Data ECC error",
	[7]  = "WCB Data parity error",
	[8]  = "VB Data ECC or parity error",
	[9]  = "L2 Tag ECC error",			/* xec = 0x10 */
	[10] = "Hard L2 Tag ECC error",
	[11] = "Multiple hits on L2 tag",
	[12] = "XAB parity error",
	[13] = "PRB address parity error"
};
97 
/*
 * MC4 descriptions used by decode_mc4_mce(): xec 0x0-0xe map 1:1, while
 * xec 0x1c-0x1f are looked up at (xec - 13), i.e. indices 15-18.
 */
static const char * const mc4_mce_desc[] = {
	[0]  = "DRAM ECC error detected on the NB",
	[1]  = "CRC error detected on HT link",
	[2]  = "Link-defined sync error packets detected on HT link",
	[3]  = "HT Master abort",
	[4]  = "HT Target abort",
	[5]  = "Invalid GART PTE entry during GART table walk",
	[6]  = "Unsupported atomic RMW received from an IO link",
	[7]  = "Watchdog timeout due to lack of progress",
	[8]  = "DRAM ECC error detected on the NB",
	[9]  = "SVM DMA Exclusion Vector error",
	[10] = "HT data error detected on link",
	[11] = "Protocol error (link, L3, probe filter)",
	[12] = "NB internal arrays parity error",
	[13] = "DRAM addr/ctl signals parity error",
	[14] = "IO link transmission error",
	[15] = "L3 data cache ECC error",		/* xec = 0x1c */
	[16] = "L3 cache tag error",
	[17] = "L3 LRU parity bits error",
	[18] = "ECC Error in the Probe Filter directory"
};
119 
/* MC5 descriptions, indexed directly by xec (0x0-0xd) in decode_mc5_mce(). */
static const char * const mc5_mce_desc[] = {
	[0]  = "CPU Watchdog timer expire",
	[1]  = "Wakeup array dest tag",
	[2]  = "AG payload array",
	[3]  = "EX payload array",
	[4]  = "IDRF array",
	[5]  = "Retire dispatch queue",
	[6]  = "Mapper checkpoint array",
	[7]  = "Physical register file EX0 port",
	[8]  = "Physical register file EX1 port",
	[9]  = "Physical register file AG0 port",
	[10] = "Physical register file AG1 port",
	[11] = "Flag register file",
	[12] = "DE error occurred",
	[13] = "Retire status queue"
};
136 
/* MC6 descriptions, indexed directly by xec (0x0-0x5) in decode_mc6_mce(). */
static const char * const mc6_mce_desc[] = {
	[0] = "Hardware Assertion",
	[1] = "Free List",
	[2] = "Physical Register File",
	[3] = "Retire Queue",
	[4] = "Scheduler table",
	[5] = "Status Register File",
};
145 
146 /* Scalable MCA error strings */
/* SMCA_LS descriptions; index is the extended error code. */
static const char * const smca_ls_mce_desc[] = {
	[0]  = "Load queue parity error",
	[1]  = "Store queue parity error",
	[2]  = "Miss address buffer payload parity error",
	[3]  = "Level 1 TLB parity error",
	[4]  = "DC Tag error type 5",
	[5]  = "DC Tag error type 6",
	[6]  = "DC Tag error type 1",
	[7]  = "Internal error type 1",
	[8]  = "Internal error type 2",
	[9]  = "System Read Data Error Thread 0",
	[10] = "System Read Data Error Thread 1",
	[11] = "DC Tag error type 2",
	[12] = "DC Data error type 1 and poison consumption",
	[13] = "DC Data error type 2",
	[14] = "DC Data error type 3",
	[15] = "DC Tag error type 4",
	[16] = "Level 2 TLB parity error",
	[17] = "PDC parity error",
	[18] = "DC Tag error type 3",
	[19] = "DC Tag error type 5",
	[20] = "L2 Fill Data error",
};
170 
/* SMCA_LS_V2 descriptions; index is the extended error code. */
static const char * const smca_ls2_mce_desc[] = {
	[0]  = "An ECC error was detected on a data cache read by a probe or victimization",
	[1]  = "An ECC error or L2 poison was detected on a data cache read by a load",
	[2]  = "An ECC error was detected on a data cache read-modify-write by a store",
	[3]  = "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
	[4]  = "An ECC error or poison bit mismatch was detected on a tag read by a load",
	[5]  = "An ECC error or poison bit mismatch was detected on a tag read by a store",
	[6]  = "An ECC error was detected on an EMEM read by a load",
	[7]  = "An ECC error was detected on an EMEM read-modify-write by a store",
	[8]  = "A parity error was detected in an L1 TLB entry by any access",
	[9]  = "A parity error was detected in an L2 TLB entry by any access",
	[10] = "A parity error was detected in a PWC entry by any access",
	[11] = "A parity error was detected in an STQ entry by any access",
	[12] = "A parity error was detected in an LDQ entry by any access",
	[13] = "A parity error was detected in a MAB entry by any access",
	[14] = "A parity error was detected in an SCB entry state field by any access",
	[15] = "A parity error was detected in an SCB entry address field by any access",
	[16] = "A parity error was detected in an SCB entry data field by any access",
	[17] = "A parity error was detected in a WCB entry by any access",
	[18] = "A poisoned line was detected in an SCB entry by any access",
	[19] = "A SystemReadDataError error was reported on read data returned from L2 for a load",
	[20] = "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
	[21] = "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
	[22] = "A hardware assertion error was reported",
	[23] = "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
};
197 
/* SMCA_IF descriptions; index is the extended error code. */
static const char * const smca_if_mce_desc[] = {
	[0]  = "Op Cache Microtag Probe Port Parity Error",
	[1]  = "IC Microtag or Full Tag Multi-hit Error",
	[2]  = "IC Full Tag Parity Error",
	[3]  = "IC Data Array Parity Error",
	[4]  = "Decoupling Queue PhysAddr Parity Error",
	[5]  = "L0 ITLB Parity Error",
	[6]  = "L1 ITLB Parity Error",
	[7]  = "L2 ITLB Parity Error",
	[8]  = "BPQ Thread 0 Snoop Parity Error",
	[9]  = "BPQ Thread 1 Snoop Parity Error",
	[10] = "L1 BTB Multi-Match Error",
	[11] = "L2 BTB Multi-Match Error",
	[12] = "L2 Cache Response Poison Error",
	[13] = "System Read Data Error",
	[14] = "Hardware Assertion Error",
	[15] = "L1-TLB Multi-Hit",
	[16] = "L2-TLB Multi-Hit",
	[17] = "BSR Parity Error",
	[18] = "CT MCE",
};
219 
/* SMCA_L2_CACHE descriptions; index is the extended error code. */
static const char * const smca_l2_mce_desc[] = {
	[0] = "L2M Tag Multiple-Way-Hit error",
	[1] = "L2M Tag or State Array ECC Error",
	[2] = "L2M Data Array ECC Error",
	[3] = "Hardware Assert Error",
};
226 
/* SMCA_DE descriptions; index is the extended error code. */
static const char * const smca_de_mce_desc[] = {
	[0] = "Micro-op cache tag parity error",
	[1] = "Micro-op cache data parity error",
	[2] = "Instruction buffer parity error",
	[3] = "Micro-op queue parity error",
	[4] = "Instruction dispatch queue parity error",
	[5] = "Fetch address FIFO parity error",
	[6] = "Patch RAM data parity error",
	[7] = "Patch RAM sequencer parity error",
	[8] = "Micro-op buffer parity error",
	[9] = "Hardware Assertion MCA Error",
};
239 
/* SMCA_EX descriptions; index is the extended error code. */
static const char * const smca_ex_mce_desc[] = {
	[0]  = "Watchdog Timeout error",
	[1]  = "Physical register file parity error",
	[2]  = "Flag register file parity error",
	[3]  = "Immediate displacement register file parity error",
	[4]  = "Address generator payload parity error",
	[5]  = "EX payload parity error",
	[6]  = "Checkpoint queue parity error",
	[7]  = "Retire dispatch queue parity error",
	[8]  = "Retire status queue parity error",
	[9]  = "Scheduling queue parity error",
	[10] = "Branch buffer queue parity error",
	[11] = "Hardware Assertion error",
	[12] = "Spec Map parity error",
	[13] = "Retire Map parity error",
};
256 
/* SMCA_FP descriptions; index is the extended error code. */
static const char * const smca_fp_mce_desc[] = {
	[0] = "Physical register file (PRF) parity error",
	[1] = "Freelist (FL) parity error",
	[2] = "Schedule queue parity error",
	[3] = "NSQ parity error",
	[4] = "Retire queue (RQ) parity error",
	[5] = "Status register file (SRF) parity error",
	[6] = "Hardware assertion",
};
266 
/* SMCA_L3_CACHE descriptions; index is the extended error code. */
static const char * const smca_l3_mce_desc[] = {
	[0] = "Shadow Tag Macro ECC Error",
	[1] = "Shadow Tag Macro Multi-way-hit Error",
	[2] = "L3M Tag ECC Error",
	[3] = "L3M Tag Multi-way-hit Error",
	[4] = "L3M Data ECC Error",
	[5] = "SDP Parity Error or SystemReadDataError from XI",
	[6] = "L3 Victim Queue Parity Error",
	[7] = "L3 Hardware Assertion",
};
277 
/* SMCA_CS descriptions; index is the extended error code. */
static const char * const smca_cs_mce_desc[] = {
	[0] = "Illegal Request",
	[1] = "Address Violation",
	[2] = "Security Violation",
	[3] = "Illegal Response",
	[4] = "Unexpected Response",
	[5] = "Request or Probe Parity Error",
	[6] = "Read Response Parity Error",
	[7] = "Atomic Request Parity Error",
	[8] = "Probe Filter ECC Error",
};
289 
/* SMCA_CS_V2 descriptions; index is the extended error code. */
static const char * const smca_cs2_mce_desc[] = {
	[0]  = "Illegal Request",
	[1]  = "Address Violation",
	[2]  = "Security Violation",
	[3]  = "Illegal Response",
	[4]  = "Unexpected Response",
	[5]  = "Request or Probe Parity Error",
	[6]  = "Read Response Parity Error",
	[7]  = "Atomic Request Parity Error",
	[8]  = "SDP read response had no match in the CS queue",
	[9]  = "Probe Filter Protocol Error",
	[10] = "Probe Filter ECC Error",
	[11] = "SDP read response had an unexpected RETRY error",
	[12] = "Counter overflow error",
	[13] = "Counter underflow error",
};
306 
/* SMCA_PIE descriptions; index is the extended error code. */
static const char * const smca_pie_mce_desc[] = {
	[0] = "Hardware Assert",
	[1] = "Register security violation",
	[2] = "Link Error",
	[3] = "Poison data consumption",
	[4] = "A deferred error was detected in the DF"
};
314 
/*
 * SMCA_UMC descriptions; index is the extended error code. Code 0 is the
 * one decode_smca_error() additionally hands to the DRAM ECC decoder.
 */
static const char * const smca_umc_mce_desc[] = {
	[0] = "DRAM ECC error",
	[1] = "Data poison error",
	[2] = "SDP parity error",
	[3] = "Advanced peripheral bus error",
	[4] = "Address/Command parity error",
	[5] = "Write data CRC error",
	[6] = "DCQ SRAM ECC error",
	[7] = "AES SRAM ECC error",
};
325 
/* SMCA_PB descriptions; a single extended error code is defined. */
static const char * const smca_pb_mce_desc[] = {
	[0] = "An ECC error in the Parameter Block RAM array",
};
329 
/* SMCA_PSP descriptions; a single extended error code is defined. */
static const char * const smca_psp_mce_desc[] = {
	[0] = "An ECC or parity error in a PSP RAM instance",
};
333 
/* SMCA_PSP_V2 descriptions; index is the extended error code. */
static const char * const smca_psp2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Instruction Cache Bank 0 ECC or parity error",
	[3]  = "Instruction Cache Bank 1 ECC or parity error",
	[4]  = "Instruction Tag Ram 0 parity error",
	[5]  = "Instruction Tag Ram 1 parity error",
	[6]  = "Data Cache Bank 0 ECC or parity error",
	[7]  = "Data Cache Bank 1 ECC or parity error",
	[8]  = "Data Cache Bank 2 ECC or parity error",
	[9]  = "Data Cache Bank 3 ECC or parity error",
	[10] = "Data Tag Bank 0 parity error",
	[11] = "Data Tag Bank 1 parity error",
	[12] = "Data Tag Bank 2 parity error",
	[13] = "Data Tag Bank 3 parity error",
	[14] = "Dirty Data Ram parity error",
	[15] = "TLB Bank 0 parity error",
	[16] = "TLB Bank 1 parity error",
	[17] = "System Hub Read Buffer ECC or parity error",
};
354 
/* SMCA_SMU descriptions; a single extended error code is defined. */
static const char * const smca_smu_mce_desc[] = {
	[0] = "An ECC or parity error in an SMU RAM instance",
};
358 
/* SMCA_SMU_V2 descriptions; index is the extended error code. */
static const char * const smca_smu2_mce_desc[] = {
	[0]  = "High SRAM ECC or parity error",
	[1]  = "Low SRAM ECC or parity error",
	[2]  = "Data Cache Bank A ECC or parity error",
	[3]  = "Data Cache Bank B ECC or parity error",
	[4]  = "Data Tag Cache Bank A ECC or parity error",
	[5]  = "Data Tag Cache Bank B ECC or parity error",
	[6]  = "Instruction Cache Bank A ECC or parity error",
	[7]  = "Instruction Cache Bank B ECC or parity error",
	[8]  = "Instruction Tag Cache Bank A ECC or parity error",
	[9]  = "Instruction Tag Cache Bank B ECC or parity error",
	[10] = "System Hub Read Buffer ECC or parity error",
	[11] = "PHY RAM ECC error",
};
373 
/* SMCA_MP5 descriptions; index is the extended error code. */
static const char * const smca_mp5_mce_desc[] = {
	[0] = "High SRAM ECC or parity error",
	[1] = "Low SRAM ECC or parity error",
	[2] = "Data Cache Bank A ECC or parity error",
	[3] = "Data Cache Bank B ECC or parity error",
	[4] = "Data Tag Cache Bank A ECC or parity error",
	[5] = "Data Tag Cache Bank B ECC or parity error",
	[6] = "Instruction Cache Bank A ECC or parity error",
	[7] = "Instruction Cache Bank B ECC or parity error",
	[8] = "Instruction Tag Cache Bank A ECC or parity error",
	[9] = "Instruction Tag Cache Bank B ECC or parity error",
};
386 
/* SMCA_NBIO descriptions; index is the extended error code. */
static const char * const smca_nbio_mce_desc[] = {
	[0] = "ECC or Parity error",
	[1] = "PCIE error",
	[2] = "SDP ErrEvent error",
	[3] = "SDP Egress Poison Error",
	[4] = "IOHC Internal Poison Error",
};
394 
/* SMCA_PCIE descriptions; index is the extended error code. */
static const char * const smca_pcie_mce_desc[] = {
	[0] = "CCIX PER Message logging",
	[1] = "CCIX Read Response with Status: Non-Data Error",
	[2] = "CCIX Write Response with Status: Non-Data Error",
	[3] = "CCIX Read Response with Status: Data Error",
	[4] = "CCIX Non-okay write response with data error",
};
402 
/* One table of SMCA error description strings together with its length. */
struct smca_mce_desc {
	/* Description strings; decode_smca_error() indexes this by extended error code. */
	const char * const *descs;
	/* Number of entries in @descs; codes at or above this are not decoded. */
	unsigned int num_descs;
};
407 
408 static struct smca_mce_desc smca_mce_descs[] = {
409 	[SMCA_LS]	= { smca_ls_mce_desc,	ARRAY_SIZE(smca_ls_mce_desc)	},
410 	[SMCA_LS_V2]	= { smca_ls2_mce_desc,	ARRAY_SIZE(smca_ls2_mce_desc)	},
411 	[SMCA_IF]	= { smca_if_mce_desc,	ARRAY_SIZE(smca_if_mce_desc)	},
412 	[SMCA_L2_CACHE]	= { smca_l2_mce_desc,	ARRAY_SIZE(smca_l2_mce_desc)	},
413 	[SMCA_DE]	= { smca_de_mce_desc,	ARRAY_SIZE(smca_de_mce_desc)	},
414 	[SMCA_EX]	= { smca_ex_mce_desc,	ARRAY_SIZE(smca_ex_mce_desc)	},
415 	[SMCA_FP]	= { smca_fp_mce_desc,	ARRAY_SIZE(smca_fp_mce_desc)	},
416 	[SMCA_L3_CACHE]	= { smca_l3_mce_desc,	ARRAY_SIZE(smca_l3_mce_desc)	},
417 	[SMCA_CS]	= { smca_cs_mce_desc,	ARRAY_SIZE(smca_cs_mce_desc)	},
418 	[SMCA_CS_V2]	= { smca_cs2_mce_desc,	ARRAY_SIZE(smca_cs2_mce_desc)	},
419 	[SMCA_PIE]	= { smca_pie_mce_desc,	ARRAY_SIZE(smca_pie_mce_desc)	},
420 	[SMCA_UMC]	= { smca_umc_mce_desc,	ARRAY_SIZE(smca_umc_mce_desc)	},
421 	[SMCA_PB]	= { smca_pb_mce_desc,	ARRAY_SIZE(smca_pb_mce_desc)	},
422 	[SMCA_PSP]	= { smca_psp_mce_desc,	ARRAY_SIZE(smca_psp_mce_desc)	},
423 	[SMCA_PSP_V2]	= { smca_psp2_mce_desc,	ARRAY_SIZE(smca_psp2_mce_desc)	},
424 	[SMCA_SMU]	= { smca_smu_mce_desc,	ARRAY_SIZE(smca_smu_mce_desc)	},
425 	[SMCA_SMU_V2]	= { smca_smu2_mce_desc,	ARRAY_SIZE(smca_smu2_mce_desc)	},
426 	[SMCA_MP5]	= { smca_mp5_mce_desc,	ARRAY_SIZE(smca_mp5_mce_desc)	},
427 	[SMCA_NBIO]	= { smca_nbio_mce_desc,	ARRAY_SIZE(smca_nbio_mce_desc)	},
428 	[SMCA_PCIE]	= { smca_pcie_mce_desc,	ARRAY_SIZE(smca_pcie_mce_desc)	},
429 };
430 
f12h_mc0_mce(u16 ec,u8 xec)431 static bool f12h_mc0_mce(u16 ec, u8 xec)
432 {
433 	bool ret = false;
434 
435 	if (MEM_ERROR(ec)) {
436 		u8 ll = LL(ec);
437 		ret = true;
438 
439 		if (ll == LL_L2)
440 			pr_cont("during L1 linefill from L2.\n");
441 		else if (ll == LL_L1)
442 			pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
443 		else
444 			ret = false;
445 	}
446 	return ret;
447 }
448 
f10h_mc0_mce(u16 ec,u8 xec)449 static bool f10h_mc0_mce(u16 ec, u8 xec)
450 {
451 	if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
452 		pr_cont("during data scrub.\n");
453 		return true;
454 	}
455 	return f12h_mc0_mce(ec, xec);
456 }
457 
k8_mc0_mce(u16 ec,u8 xec)458 static bool k8_mc0_mce(u16 ec, u8 xec)
459 {
460 	if (BUS_ERROR(ec)) {
461 		pr_cont("during system linefill.\n");
462 		return true;
463 	}
464 
465 	return f10h_mc0_mce(ec, xec);
466 }
467 
cat_mc0_mce(u16 ec,u8 xec)468 static bool cat_mc0_mce(u16 ec, u8 xec)
469 {
470 	u8 r4	 = R4(ec);
471 	bool ret = true;
472 
473 	if (MEM_ERROR(ec)) {
474 
475 		if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
476 			return false;
477 
478 		switch (r4) {
479 		case R4_DRD:
480 		case R4_DWR:
481 			pr_cont("Data/Tag parity error due to %s.\n",
482 				(r4 == R4_DRD ? "load/hw prf" : "store"));
483 			break;
484 		case R4_EVICT:
485 			pr_cont("Copyback parity error on a tag miss.\n");
486 			break;
487 		case R4_SNOOP:
488 			pr_cont("Tag parity error during snoop.\n");
489 			break;
490 		default:
491 			ret = false;
492 		}
493 	} else if (BUS_ERROR(ec)) {
494 
495 		if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
496 			return false;
497 
498 		pr_cont("System read data error on a ");
499 
500 		switch (r4) {
501 		case R4_RD:
502 			pr_cont("TLB reload.\n");
503 			break;
504 		case R4_DWR:
505 			pr_cont("store.\n");
506 			break;
507 		case R4_DRD:
508 			pr_cont("load.\n");
509 			break;
510 		default:
511 			ret = false;
512 		}
513 	} else {
514 		ret = false;
515 	}
516 
517 	return ret;
518 }
519 
f15h_mc0_mce(u16 ec,u8 xec)520 static bool f15h_mc0_mce(u16 ec, u8 xec)
521 {
522 	bool ret = true;
523 
524 	if (MEM_ERROR(ec)) {
525 
526 		switch (xec) {
527 		case 0x0:
528 			pr_cont("Data Array access error.\n");
529 			break;
530 
531 		case 0x1:
532 			pr_cont("UC error during a linefill from L2/NB.\n");
533 			break;
534 
535 		case 0x2:
536 		case 0x11:
537 			pr_cont("STQ access error.\n");
538 			break;
539 
540 		case 0x3:
541 			pr_cont("SCB access error.\n");
542 			break;
543 
544 		case 0x10:
545 			pr_cont("Tag error.\n");
546 			break;
547 
548 		case 0x12:
549 			pr_cont("LDQ access error.\n");
550 			break;
551 
552 		default:
553 			ret = false;
554 		}
555 	} else if (BUS_ERROR(ec)) {
556 
557 		if (!xec)
558 			pr_cont("System Read Data Error.\n");
559 		else
560 			pr_cont(" Internal error condition type %d.\n", xec);
561 	} else if (INT_ERROR(ec)) {
562 		if (xec <= 0x1f)
563 			pr_cont("Hardware Assert.\n");
564 		else
565 			ret = false;
566 
567 	} else
568 		ret = false;
569 
570 	return ret;
571 }
572 
decode_mc0_mce(struct mce * m)573 static void decode_mc0_mce(struct mce *m)
574 {
575 	u16 ec = EC(m->status);
576 	u8 xec = XEC(m->status, xec_mask);
577 
578 	pr_emerg(HW_ERR "MC0 Error: ");
579 
580 	/* TLB error signatures are the same across families */
581 	if (TLB_ERROR(ec)) {
582 		if (TT(ec) == TT_DATA) {
583 			pr_cont("%s TLB %s.\n", LL_MSG(ec),
584 				((xec == 2) ? "locked miss"
585 					    : (xec ? "multimatch" : "parity")));
586 			return;
587 		}
588 	} else if (fam_ops.mc0_mce(ec, xec))
589 		;
590 	else
591 		pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
592 }
593 
k8_mc1_mce(u16 ec,u8 xec)594 static bool k8_mc1_mce(u16 ec, u8 xec)
595 {
596 	u8 ll	 = LL(ec);
597 	bool ret = true;
598 
599 	if (!MEM_ERROR(ec))
600 		return false;
601 
602 	if (ll == 0x2)
603 		pr_cont("during a linefill from L2.\n");
604 	else if (ll == 0x1) {
605 		switch (R4(ec)) {
606 		case R4_IRD:
607 			pr_cont("Parity error during data load.\n");
608 			break;
609 
610 		case R4_EVICT:
611 			pr_cont("Copyback Parity/Victim error.\n");
612 			break;
613 
614 		case R4_SNOOP:
615 			pr_cont("Tag Snoop error.\n");
616 			break;
617 
618 		default:
619 			ret = false;
620 			break;
621 		}
622 	} else
623 		ret = false;
624 
625 	return ret;
626 }
627 
cat_mc1_mce(u16 ec,u8 xec)628 static bool cat_mc1_mce(u16 ec, u8 xec)
629 {
630 	u8 r4    = R4(ec);
631 	bool ret = true;
632 
633 	if (!MEM_ERROR(ec))
634 		return false;
635 
636 	if (TT(ec) != TT_INSTR)
637 		return false;
638 
639 	if (r4 == R4_IRD)
640 		pr_cont("Data/tag array parity error for a tag hit.\n");
641 	else if (r4 == R4_SNOOP)
642 		pr_cont("Tag error during snoop/victimization.\n");
643 	else if (xec == 0x0)
644 		pr_cont("Tag parity error from victim castout.\n");
645 	else if (xec == 0x2)
646 		pr_cont("Microcode patch RAM parity error.\n");
647 	else
648 		ret = false;
649 
650 	return ret;
651 }
652 
f15h_mc1_mce(u16 ec,u8 xec)653 static bool f15h_mc1_mce(u16 ec, u8 xec)
654 {
655 	bool ret = true;
656 
657 	if (!MEM_ERROR(ec))
658 		return false;
659 
660 	switch (xec) {
661 	case 0x0 ... 0xa:
662 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
663 		break;
664 
665 	case 0xd:
666 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
667 		break;
668 
669 	case 0x10:
670 		pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
671 		break;
672 
673 	case 0x11 ... 0x15:
674 		pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
675 		break;
676 
677 	default:
678 		ret = false;
679 	}
680 	return ret;
681 }
682 
decode_mc1_mce(struct mce * m)683 static void decode_mc1_mce(struct mce *m)
684 {
685 	u16 ec = EC(m->status);
686 	u8 xec = XEC(m->status, xec_mask);
687 
688 	pr_emerg(HW_ERR "MC1 Error: ");
689 
690 	if (TLB_ERROR(ec))
691 		pr_cont("%s TLB %s.\n", LL_MSG(ec),
692 			(xec ? "multimatch" : "parity error"));
693 	else if (BUS_ERROR(ec)) {
694 		bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
695 
696 		pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
697 	} else if (INT_ERROR(ec)) {
698 		if (xec <= 0x3f)
699 			pr_cont("Hardware Assert.\n");
700 		else
701 			goto wrong_mc1_mce;
702 	} else if (fam_ops.mc1_mce(ec, xec))
703 		;
704 	else
705 		goto wrong_mc1_mce;
706 
707 	return;
708 
709 wrong_mc1_mce:
710 	pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
711 }
712 
k8_mc2_mce(u16 ec,u8 xec)713 static bool k8_mc2_mce(u16 ec, u8 xec)
714 {
715 	bool ret = true;
716 
717 	if (xec == 0x1)
718 		pr_cont(" in the write data buffers.\n");
719 	else if (xec == 0x3)
720 		pr_cont(" in the victim data buffers.\n");
721 	else if (xec == 0x2 && MEM_ERROR(ec))
722 		pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
723 	else if (xec == 0x0) {
724 		if (TLB_ERROR(ec))
725 			pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
726 				TT_MSG(ec));
727 		else if (BUS_ERROR(ec))
728 			pr_cont(": %s/ECC error in data read from NB: %s.\n",
729 				R4_MSG(ec), PP_MSG(ec));
730 		else if (MEM_ERROR(ec)) {
731 			u8 r4 = R4(ec);
732 
733 			if (r4 >= 0x7)
734 				pr_cont(": %s error during data copyback.\n",
735 					R4_MSG(ec));
736 			else if (r4 <= 0x1)
737 				pr_cont(": %s parity/ECC error during data "
738 					"access from L2.\n", R4_MSG(ec));
739 			else
740 				ret = false;
741 		} else
742 			ret = false;
743 	} else
744 		ret = false;
745 
746 	return ret;
747 }
748 
f15h_mc2_mce(u16 ec,u8 xec)749 static bool f15h_mc2_mce(u16 ec, u8 xec)
750 {
751 	bool ret = true;
752 
753 	if (TLB_ERROR(ec)) {
754 		if (xec == 0x0)
755 			pr_cont("Data parity TLB read error.\n");
756 		else if (xec == 0x1)
757 			pr_cont("Poison data provided for TLB fill.\n");
758 		else
759 			ret = false;
760 	} else if (BUS_ERROR(ec)) {
761 		if (xec > 2)
762 			ret = false;
763 
764 		pr_cont("Error during attempted NB data read.\n");
765 	} else if (MEM_ERROR(ec)) {
766 		switch (xec) {
767 		case 0x4 ... 0xc:
768 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
769 			break;
770 
771 		case 0x10 ... 0x14:
772 			pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
773 			break;
774 
775 		default:
776 			ret = false;
777 		}
778 	} else if (INT_ERROR(ec)) {
779 		if (xec <= 0x3f)
780 			pr_cont("Hardware Assert.\n");
781 		else
782 			ret = false;
783 	}
784 
785 	return ret;
786 }
787 
f16h_mc2_mce(u16 ec,u8 xec)788 static bool f16h_mc2_mce(u16 ec, u8 xec)
789 {
790 	u8 r4 = R4(ec);
791 
792 	if (!MEM_ERROR(ec))
793 		return false;
794 
795 	switch (xec) {
796 	case 0x04 ... 0x05:
797 		pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
798 		break;
799 
800 	case 0x09 ... 0x0b:
801 	case 0x0d ... 0x0f:
802 		pr_cont("ECC error in L2 tag (%s).\n",
803 			((r4 == R4_GEN)   ? "BankReq" :
804 			((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
805 		break;
806 
807 	case 0x10 ... 0x19:
808 	case 0x1b:
809 		pr_cont("ECC error in L2 data array (%s).\n",
810 			(((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
811 			((r4 == R4_GEN)   ? "Attr" :
812 			((r4 == R4_EVICT) ? "Vict" : "Fill"))));
813 		break;
814 
815 	case 0x1c ... 0x1d:
816 	case 0x1f:
817 		pr_cont("Parity error in L2 attribute bits (%s).\n",
818 			((r4 == R4_RD)  ? "Hit"  :
819 			((r4 == R4_GEN) ? "Attr" : "Fill")));
820 		break;
821 
822 	default:
823 		return false;
824 	}
825 
826 	return true;
827 }
828 
decode_mc2_mce(struct mce * m)829 static void decode_mc2_mce(struct mce *m)
830 {
831 	u16 ec = EC(m->status);
832 	u8 xec = XEC(m->status, xec_mask);
833 
834 	pr_emerg(HW_ERR "MC2 Error: ");
835 
836 	if (!fam_ops.mc2_mce(ec, xec))
837 		pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
838 }
839 
decode_mc3_mce(struct mce * m)840 static void decode_mc3_mce(struct mce *m)
841 {
842 	u16 ec = EC(m->status);
843 	u8 xec = XEC(m->status, xec_mask);
844 
845 	if (boot_cpu_data.x86 >= 0x14) {
846 		pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
847 			 " please report on LKML.\n");
848 		return;
849 	}
850 
851 	pr_emerg(HW_ERR "MC3 Error");
852 
853 	if (xec == 0x0) {
854 		u8 r4 = R4(ec);
855 
856 		if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
857 			goto wrong_mc3_mce;
858 
859 		pr_cont(" during %s.\n", R4_MSG(ec));
860 	} else
861 		goto wrong_mc3_mce;
862 
863 	return;
864 
865  wrong_mc3_mce:
866 	pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
867 }
868 
decode_mc4_mce(struct mce * m)869 static void decode_mc4_mce(struct mce *m)
870 {
871 	unsigned int fam = x86_family(m->cpuid);
872 	int node_id = amd_get_nb_id(m->extcpu);
873 	u16 ec = EC(m->status);
874 	u8 xec = XEC(m->status, 0x1f);
875 	u8 offset = 0;
876 
877 	pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
878 
879 	switch (xec) {
880 	case 0x0 ... 0xe:
881 
882 		/* special handling for DRAM ECCs */
883 		if (xec == 0x0 || xec == 0x8) {
884 			/* no ECCs on F11h */
885 			if (fam == 0x11)
886 				goto wrong_mc4_mce;
887 
888 			pr_cont("%s.\n", mc4_mce_desc[xec]);
889 
890 			if (decode_dram_ecc)
891 				decode_dram_ecc(node_id, m);
892 			return;
893 		}
894 		break;
895 
896 	case 0xf:
897 		if (TLB_ERROR(ec))
898 			pr_cont("GART Table Walk data error.\n");
899 		else if (BUS_ERROR(ec))
900 			pr_cont("DMA Exclusion Vector Table Walk error.\n");
901 		else
902 			goto wrong_mc4_mce;
903 		return;
904 
905 	case 0x19:
906 		if (fam == 0x15 || fam == 0x16)
907 			pr_cont("Compute Unit Data Error.\n");
908 		else
909 			goto wrong_mc4_mce;
910 		return;
911 
912 	case 0x1c ... 0x1f:
913 		offset = 13;
914 		break;
915 
916 	default:
917 		goto wrong_mc4_mce;
918 	}
919 
920 	pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
921 	return;
922 
923  wrong_mc4_mce:
924 	pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
925 }
926 
decode_mc5_mce(struct mce * m)927 static void decode_mc5_mce(struct mce *m)
928 {
929 	unsigned int fam = x86_family(m->cpuid);
930 	u16 ec = EC(m->status);
931 	u8 xec = XEC(m->status, xec_mask);
932 
933 	if (fam == 0xf || fam == 0x11)
934 		goto wrong_mc5_mce;
935 
936 	pr_emerg(HW_ERR "MC5 Error: ");
937 
938 	if (INT_ERROR(ec)) {
939 		if (xec <= 0x1f) {
940 			pr_cont("Hardware Assert.\n");
941 			return;
942 		} else
943 			goto wrong_mc5_mce;
944 	}
945 
946 	if (xec == 0x0 || xec == 0xc)
947 		pr_cont("%s.\n", mc5_mce_desc[xec]);
948 	else if (xec <= 0xd)
949 		pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
950 	else
951 		goto wrong_mc5_mce;
952 
953 	return;
954 
955  wrong_mc5_mce:
956 	pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
957 }
958 
decode_mc6_mce(struct mce * m)959 static void decode_mc6_mce(struct mce *m)
960 {
961 	u8 xec = XEC(m->status, xec_mask);
962 
963 	pr_emerg(HW_ERR "MC6 Error: ");
964 
965 	if (xec > 0x5)
966 		goto wrong_mc6_mce;
967 
968 	pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
969 	return;
970 
971  wrong_mc6_mce:
972 	pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
973 }
974 
975 /* Decode errors according to Scalable MCA specification */
decode_smca_error(struct mce * m)976 static void decode_smca_error(struct mce *m)
977 {
978 	struct smca_hwid *hwid;
979 	enum smca_bank_types bank_type;
980 	const char *ip_name;
981 	u8 xec = XEC(m->status, xec_mask);
982 
983 	if (m->bank >= ARRAY_SIZE(smca_banks))
984 		return;
985 
986 	hwid = smca_banks[m->bank].hwid;
987 	if (!hwid)
988 		return;
989 
990 	bank_type = hwid->bank_type;
991 
992 	if (bank_type == SMCA_RESERVED) {
993 		pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
994 		return;
995 	}
996 
997 	ip_name = smca_get_long_name(bank_type);
998 
999 	pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
1000 
1001 	/* Only print the decode of valid error codes */
1002 	if (xec < smca_mce_descs[bank_type].num_descs)
1003 		pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
1004 
1005 	if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
1006 		decode_dram_ecc(topology_die_id(m->extcpu), m);
1007 }
1008 
/*
 * Print the generic fields of the 16-bit error code @ec.
 *
 * Internal errors print only the internal error type. All other codes print
 * the cache level, then either the memory/io field (bus errors) or the
 * transaction type, then the memory transaction type for memory and bus
 * errors, and finally the participating processor and timeout fields for
 * bus errors.
 */
static inline void amd_decode_err_code(u16 ec)
{
	bool bus;

	if (INT_ERROR(ec)) {
		pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
		return;
	}

	bus = BUS_ERROR(ec);

	pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));

	if (bus)
		pr_cont(", mem/io: %s", II_MSG(ec));
	else
		pr_cont(", tx: %s", TT_MSG(ec));

	if (bus || MEM_ERROR(ec))
		pr_cont(", mem-tx: %s", R4_MSG(ec));

	if (bus)
		pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));

	pr_cont("\n");
}
1032 
decode_error_status(struct mce * m)1033 static const char *decode_error_status(struct mce *m)
1034 {
1035 	if (m->status & MCI_STATUS_UC) {
1036 		if (m->status & MCI_STATUS_PCC)
1037 			return "System Fatal error.";
1038 		if (m->mcgstatus & MCG_STATUS_RIPV)
1039 			return "Uncorrected, software restartable error.";
1040 		return "Uncorrected, software containable error.";
1041 	}
1042 
1043 	if (m->status & MCI_STATUS_DEFERRED)
1044 		return "Deferred error, no action required.";
1045 
1046 	return "Corrected error, no action required.";
1047 }
1048 
/*
 * Notifier callback that prints a human-readable decode of one MCE record.
 *
 * @nb:   notifier block (not used here)
 * @val:  notifier event value (not used here)
 * @data: the struct mce to decode
 *
 * Returns NOTIFY_DONE without printing when the record is already flagged
 * MCE_HANDLED_CEC. Otherwise prints the severity summary, the status flags,
 * optional address/PPIN/IPID/syndrome/TSC lines and the bank-specific decode,
 * sets MCE_HANDLED_EDAC in @m->kflags and returns NOTIFY_OK.
 */
static int
amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
	struct mce *m = (struct mce *)data;
	unsigned int fam = x86_family(m->cpuid);
	int ecc;

	if (m->kflags & MCE_HANDLED_CEC)
		return NOTIFY_DONE;

	pr_emerg(HW_ERR "%s\n", decode_error_status(m));

	/* Opens the "MCx_STATUS[...]" line; the flags below are appended with pr_cont(). */
	pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
		m->extcpu,
		fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
		m->bank,
		((m->status & MCI_STATUS_OVER)	? "Over"  : "-"),
		((m->status & MCI_STATUS_UC)	? "UE"	  :
		 (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
		((m->status & MCI_STATUS_MISCV)	? "MiscV" : "-"),
		((m->status & MCI_STATUS_ADDRV)	? "AddrV" : "-"),
		((m->status & MCI_STATUS_PCC)	? "PCC"	  : "-"));

	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		u32 low, high;
		u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);

		/* TCC is only shown when the bank's config MSR reads back with MCAX set. */
		if (!rdmsr_safe(addr, &low, &high) &&
		    (low & MCI_CONFIG_MCAX))
			pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));

		pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
	}

	/*
	 * Status bits 46:45 (bits [14:13] of the upper 32 bits), taken together:
	 * 2 prints "CECC", any other non-zero value prints "UECC".
	 */
	ecc = (m->status >> 45) & 0x3;
	if (ecc)
		pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));

	if (fam >= 0x15) {
		pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));

		/* F15h, bank4, bit 43 is part of McaStatSubCache. */
		if (fam != 0x15 || m->bank != 4)
			pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
	}

	if (fam >= 0x17)
		pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));

	/* Closes the status line with the raw status value. */
	pr_cont("]: 0x%016llx\n", m->status);

	if (m->status & MCI_STATUS_ADDRV)
		pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);

	if (m->ppin)
		pr_emerg(HW_ERR "PPIN: 0x%016llx\n", m->ppin);

	/* SMCA systems use the SMCA decoder and skip the TSC line and per-bank switch below. */
	if (boot_cpu_has(X86_FEATURE_SMCA)) {
		pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);

		if (m->status & MCI_STATUS_SYNDV)
			pr_cont(", Syndrome: 0x%016llx", m->synd);

		pr_cont("\n");

		decode_smca_error(m);
		goto err_code;
	}

	if (m->tsc)
		pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);

	/* Doesn't matter which member to test. */
	if (!fam_ops.mc0_mce)
		goto err_code;

	/* Banks 0-6 have dedicated decoders; other banks only get the generic decode. */
	switch (m->bank) {
	case 0:
		decode_mc0_mce(m);
		break;

	case 1:
		decode_mc1_mce(m);
		break;

	case 2:
		decode_mc2_mce(m);
		break;

	case 3:
		decode_mc3_mce(m);
		break;

	case 4:
		decode_mc4_mce(m);
		break;

	case 5:
		decode_mc5_mce(m);
		break;

	case 6:
		decode_mc6_mce(m);
		break;

	default:
		break;
	}

 err_code:
	/* The low 16 bits of the status are the generic error code. */
	amd_decode_err_code(m->status & 0xffff);

	m->kflags |= MCE_HANDLED_EDAC;
	return NOTIFY_OK;
}
1165 
1166 static struct notifier_block amd_mce_dec_nb = {
1167 	.notifier_call	= amd_decode_mce,
1168 	.priority	= MCE_PRIO_EDAC,
1169 };
1170 
mce_amd_init(void)1171 static int __init mce_amd_init(void)
1172 {
1173 	struct cpuinfo_x86 *c = &boot_cpu_data;
1174 
1175 	if (c->x86_vendor != X86_VENDOR_AMD &&
1176 	    c->x86_vendor != X86_VENDOR_HYGON)
1177 		return -ENODEV;
1178 
1179 	if (cpu_feature_enabled(X86_FEATURE_HYPERVISOR))
1180 		return -ENODEV;
1181 
1182 	if (boot_cpu_has(X86_FEATURE_SMCA)) {
1183 		xec_mask = 0x3f;
1184 		goto out;
1185 	}
1186 
1187 	switch (c->x86) {
1188 	case 0xf:
1189 		fam_ops.mc0_mce = k8_mc0_mce;
1190 		fam_ops.mc1_mce = k8_mc1_mce;
1191 		fam_ops.mc2_mce = k8_mc2_mce;
1192 		break;
1193 
1194 	case 0x10:
1195 		fam_ops.mc0_mce = f10h_mc0_mce;
1196 		fam_ops.mc1_mce = k8_mc1_mce;
1197 		fam_ops.mc2_mce = k8_mc2_mce;
1198 		break;
1199 
1200 	case 0x11:
1201 		fam_ops.mc0_mce = k8_mc0_mce;
1202 		fam_ops.mc1_mce = k8_mc1_mce;
1203 		fam_ops.mc2_mce = k8_mc2_mce;
1204 		break;
1205 
1206 	case 0x12:
1207 		fam_ops.mc0_mce = f12h_mc0_mce;
1208 		fam_ops.mc1_mce = k8_mc1_mce;
1209 		fam_ops.mc2_mce = k8_mc2_mce;
1210 		break;
1211 
1212 	case 0x14:
1213 		fam_ops.mc0_mce = cat_mc0_mce;
1214 		fam_ops.mc1_mce = cat_mc1_mce;
1215 		fam_ops.mc2_mce = k8_mc2_mce;
1216 		break;
1217 
1218 	case 0x15:
1219 		xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1220 
1221 		fam_ops.mc0_mce = f15h_mc0_mce;
1222 		fam_ops.mc1_mce = f15h_mc1_mce;
1223 		fam_ops.mc2_mce = f15h_mc2_mce;
1224 		break;
1225 
1226 	case 0x16:
1227 		xec_mask = 0x1f;
1228 		fam_ops.mc0_mce = cat_mc0_mce;
1229 		fam_ops.mc1_mce = cat_mc1_mce;
1230 		fam_ops.mc2_mce = f16h_mc2_mce;
1231 		break;
1232 
1233 	case 0x17:
1234 	case 0x18:
1235 		pr_warn_once("Decoding supported only on Scalable MCA processors.\n");
1236 		return -EINVAL;
1237 
1238 	default:
1239 		printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1240 		return -EINVAL;
1241 	}
1242 
1243 out:
1244 	pr_info("MCE: In-kernel MCE decoding enabled.\n");
1245 
1246 	mce_register_decode_chain(&amd_mce_dec_nb);
1247 
1248 	return 0;
1249 }
1250 early_initcall(mce_amd_init);
1251 
#ifdef MODULE
/* Module unload: remove the decoder notifier from the MCE decode chain. */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS("edac-mce-amd");
MODULE_DESCRIPTION("AMD MCE decoder");
#endif
1263