1 /*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13 #include <linux/thread_info.h>
14 #include <linux/capability.h>
15 #include <linux/miscdevice.h>
16 #include <linux/ratelimit.h>
17 #include <linux/kallsyms.h>
18 #include <linux/rcupdate.h>
19 #include <linux/kobject.h>
20 #include <linux/uaccess.h>
21 #include <linux/kdebug.h>
22 #include <linux/kernel.h>
23 #include <linux/percpu.h>
24 #include <linux/string.h>
25 #include <linux/device.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/delay.h>
28 #include <linux/ctype.h>
29 #include <linux/sched.h>
30 #include <linux/sysfs.h>
31 #include <linux/types.h>
32 #include <linux/slab.h>
33 #include <linux/init.h>
34 #include <linux/kmod.h>
35 #include <linux/poll.h>
36 #include <linux/nmi.h>
37 #include <linux/cpu.h>
38 #include <linux/smp.h>
39 #include <linux/fs.h>
40 #include <linux/mm.h>
41 #include <linux/debugfs.h>
42 #include <linux/irq_work.h>
43 #include <linux/export.h>
44
45 #include <asm/processor.h>
46 #include <asm/tlbflush.h>
47 #include <asm/mce.h>
48 #include <asm/msr.h>
49
50 #include "mce-internal.h"
51
52 static DEFINE_MUTEX(mce_chrdev_read_mutex);
53
54 #define rcu_dereference_check_mce(p) \
55 rcu_dereference_index_check((p), \
56 rcu_read_lock_sched_held() || \
57 lockdep_is_held(&mce_chrdev_read_mutex))
58
59 #define CREATE_TRACE_POINTS
60 #include <trace/events/mce.h>
61
62 #define SPINUNIT 100 /* 100ns */
63
64 DEFINE_PER_CPU(unsigned, mce_exception_count);
65
66 struct mce_bank *mce_banks __read_mostly;
67
68 struct mca_config mca_cfg __read_mostly = {
69 .bootlog = -1,
70 /*
71 * Tolerant levels:
72 * 0: always panic on uncorrected errors, log corrected errors
73 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
74 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
75 * 3: never panic or SIGBUS, log all errors (for testing only)
76 */
77 .tolerant = 1,
78 .monarch_timeout = -1
79 };
80
81 /* User mode helper program triggered by machine check event */
82 static unsigned long mce_need_notify;
83 static char mce_helper[128];
84 static char *mce_helper_argv[2] = { mce_helper, NULL };
85
86 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
87
88 static DEFINE_PER_CPU(struct mce, mces_seen);
89 static int cpu_missing;
90
91 /* CMCI storm detection filter */
92 static DEFINE_PER_CPU(unsigned long, mce_polled_error);
93
94 /*
95  * MCA banks polled by the periodic polling timer for corrected events.
96 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
97 */
98 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
99 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
100 };
101
102 /*
103 * MCA banks controlled through firmware first for corrected errors.
104 * This is a global list of banks for which we won't enable CMCI and we
105 * won't poll. Firmware controls these banks and is responsible for
106 * reporting corrected errors through GHES. Uncorrected/recoverable
107 * errors are still notified through a machine check.
108 */
109 mce_banks_t mce_banks_ce_disabled;
110
111 static DEFINE_PER_CPU(struct work_struct, mce_work);
112
113 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
114
115 /*
116 * CPU/chipset specific EDAC code can register a notifier call here to print
117 * MCE errors in a human-readable form.
118 */
119 ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
120
121 /* Do initial initialization of a struct mce */
122 void mce_setup(struct mce *m)
123 {
124 memset(m, 0, sizeof(struct mce));
125 m->cpu = m->extcpu = smp_processor_id();
126 rdtscll(m->tsc);
127 /* We hope get_seconds stays lockless */
128 m->time = get_seconds();
129 m->cpuvendor = boot_cpu_data.x86_vendor;
130 m->cpuid = cpuid_eax(1);
131 m->socketid = cpu_data(m->extcpu).phys_proc_id;
132 m->apicid = cpu_data(m->extcpu).initial_apicid;
133 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
134 }
135
136 DEFINE_PER_CPU(struct mce, injectm);
137 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
138
139 /*
140 * Lockless MCE logging infrastructure.
141 * This avoids deadlocks on printk locks without having to break locks. Also
142 * separate MCEs from kernel messages to avoid bogus bug reports.
143 */
144
145 static struct mce_log mcelog = {
146 .signature = MCE_LOG_SIGNATURE,
147 .len = MCE_LOG_LEN,
148 .recordlen = sizeof(struct mce),
149 };
150
151 void mce_log(struct mce *mce)
152 {
153 unsigned next, entry;
154 int ret = 0;
155
156 /* Emit the trace record: */
157 trace_mce_record(mce);
158
159 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
160 if (ret == NOTIFY_STOP)
161 return;
162
163 mce->finished = 0;
164 wmb();
165 for (;;) {
166 entry = rcu_dereference_check_mce(mcelog.next);
167 for (;;) {
168
169 /*
170 * When the buffer fills up discard new entries.
171 * Assume that the earlier errors are the more
172 * interesting ones:
173 */
174 if (entry >= MCE_LOG_LEN) {
175 set_bit(MCE_OVERFLOW,
176 (unsigned long *)&mcelog.flags);
177 return;
178 }
179 /* Old left over entry. Skip: */
180 if (mcelog.entry[entry].finished) {
181 entry++;
182 continue;
183 }
184 break;
185 }
186 smp_rmb();
187 next = entry + 1;
188 if (cmpxchg(&mcelog.next, entry, next) == entry)
189 break;
190 }
191 memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
192 wmb();
193 mcelog.entry[entry].finished = 1;
194 wmb();
195
196 mce->finished = 1;
197 set_bit(0, &mce_need_notify);
198 }
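/*
 * Illustrative note (editorial, not from the original source): the cmpxchg
 * above implements a lockless slot reservation. For example, if two CPUs log
 * concurrently and both read mcelog.next == 5, only one cmpxchg(5 -> 6)
 * succeeds; the loser re-reads next, finds 6, and reserves slot 6 -> 7.
 * The ->finished flag then publishes each slot to readers independently of
 * the order in which the writers complete their memcpy().
 */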
199
200 static void drain_mcelog_buffer(void)
201 {
202 unsigned int next, i, prev = 0;
203
204 next = ACCESS_ONCE(mcelog.next);
205
206 do {
207 struct mce *m;
208
209 /* drain what was logged during boot */
210 for (i = prev; i < next; i++) {
211 unsigned long start = jiffies;
212 unsigned retries = 1;
213
214 m = &mcelog.entry[i];
215
216 while (!m->finished) {
217 if (time_after_eq(jiffies, start + 2*retries))
218 retries++;
219
220 cpu_relax();
221
222 if (!m->finished && retries >= 4) {
223 pr_err("skipping error being logged currently!\n");
224 break;
225 }
226 }
227 smp_rmb();
228 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
229 }
230
231 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
232 prev = next;
233 next = cmpxchg(&mcelog.next, prev, 0);
234 } while (next != prev);
235 }
236
237
238 void mce_register_decode_chain(struct notifier_block *nb)
239 {
240 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
241 drain_mcelog_buffer();
242 }
243 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
244
245 void mce_unregister_decode_chain(struct notifier_block *nb)
246 {
247 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
248 }
249 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
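/*
 * Illustrative sketch (editorial, compiled out on purpose): roughly how an
 * EDAC-style decoder would hook into the chain above. The callback and
 * notifier names here are made up for illustration only.
 */
#if 0
static int example_mce_decode(struct notifier_block *nb, unsigned long val,
			      void *data)
{
	struct mce *m = data;

	pr_info("decoded MCE: bank %d status 0x%llx\n",
		m->bank, (unsigned long long)m->status);

	/* NOTIFY_STOP suppresses the generic "run mcelog --ascii" hint */
	return NOTIFY_STOP;
}

static struct notifier_block example_mce_decoder_nb = {
	.notifier_call	= example_mce_decode,
};

/* in the driver init path: mce_register_decode_chain(&example_mce_decoder_nb); */
#endif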
250
251 static void print_mce(struct mce *m)
252 {
253 int ret = 0;
254
255 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
256 m->extcpu, m->mcgstatus, m->bank, m->status);
257
258 if (m->ip) {
259 pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
260 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
261 m->cs, m->ip);
262
263 if (m->cs == __KERNEL_CS)
264 print_symbol("{%s}", m->ip);
265 pr_cont("\n");
266 }
267
268 pr_emerg(HW_ERR "TSC %llx ", m->tsc);
269 if (m->addr)
270 pr_cont("ADDR %llx ", m->addr);
271 if (m->misc)
272 pr_cont("MISC %llx ", m->misc);
273
274 pr_cont("\n");
275 /*
276 * Note this output is parsed by external tools and old fields
277 * should not be changed.
278 */
279 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
280 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
281 cpu_data(m->extcpu).microcode);
282
283 /*
284 * Print out human-readable details about the MCE error
285 * (if the CPU has an implementation for that)
286 */
287 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
288 if (ret == NOTIFY_STOP)
289 return;
290
291 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
292 }
293
294 #define PANIC_TIMEOUT 5 /* 5 seconds */
295
296 static atomic_t mce_paniced;
297
298 static int fake_panic;
299 static atomic_t mce_fake_paniced;
300
301 /* Panic in progress. Enable interrupts and wait for final IPI */
302 static void wait_for_panic(void)
303 {
304 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
305
306 preempt_disable();
307 local_irq_enable();
308 while (timeout-- > 0)
309 udelay(1);
310 if (panic_timeout == 0)
311 panic_timeout = mca_cfg.panic_timeout;
312 panic("Panicing machine check CPU died");
313 }
314
315 static void mce_panic(char *msg, struct mce *final, char *exp)
316 {
317 int i, apei_err = 0;
318
319 if (!fake_panic) {
320 /*
321 * Make sure only one CPU runs in machine check panic
322 */
323 if (atomic_inc_return(&mce_paniced) > 1)
324 wait_for_panic();
325 barrier();
326
327 bust_spinlocks(1);
328 console_verbose();
329 } else {
330 /* Don't log too much for fake panic */
331 if (atomic_inc_return(&mce_fake_paniced) > 1)
332 return;
333 }
334 /* First print corrected ones that are still unlogged */
335 for (i = 0; i < MCE_LOG_LEN; i++) {
336 struct mce *m = &mcelog.entry[i];
337 if (!(m->status & MCI_STATUS_VAL))
338 continue;
339 if (!(m->status & MCI_STATUS_UC)) {
340 print_mce(m);
341 if (!apei_err)
342 apei_err = apei_write_mce(m);
343 }
344 }
345 /* Now print uncorrected but with the final one last */
346 for (i = 0; i < MCE_LOG_LEN; i++) {
347 struct mce *m = &mcelog.entry[i];
348 if (!(m->status & MCI_STATUS_VAL))
349 continue;
350 if (!(m->status & MCI_STATUS_UC))
351 continue;
352 if (!final || memcmp(m, final, sizeof(struct mce))) {
353 print_mce(m);
354 if (!apei_err)
355 apei_err = apei_write_mce(m);
356 }
357 }
358 if (final) {
359 print_mce(final);
360 if (!apei_err)
361 apei_err = apei_write_mce(final);
362 }
363 if (cpu_missing)
364 pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
365 if (exp)
366 pr_emerg(HW_ERR "Machine check: %s\n", exp);
367 if (!fake_panic) {
368 if (panic_timeout == 0)
369 panic_timeout = mca_cfg.panic_timeout;
370 panic(msg);
371 } else
372 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
373 }
374
375 /* Support code for software error injection */
376
377 static int msr_to_offset(u32 msr)
378 {
379 unsigned bank = __this_cpu_read(injectm.bank);
380
381 if (msr == mca_cfg.rip_msr)
382 return offsetof(struct mce, ip);
383 if (msr == MSR_IA32_MCx_STATUS(bank))
384 return offsetof(struct mce, status);
385 if (msr == MSR_IA32_MCx_ADDR(bank))
386 return offsetof(struct mce, addr);
387 if (msr == MSR_IA32_MCx_MISC(bank))
388 return offsetof(struct mce, misc);
389 if (msr == MSR_IA32_MCG_STATUS)
390 return offsetof(struct mce, mcgstatus);
391 return -1;
392 }
393
394 /* MSR access wrappers used for error injection */
395 static u64 mce_rdmsrl(u32 msr)
396 {
397 u64 v;
398
399 if (__this_cpu_read(injectm.finished)) {
400 int offset = msr_to_offset(msr);
401
402 if (offset < 0)
403 return 0;
404 return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
405 }
406
407 if (rdmsrl_safe(msr, &v)) {
408 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
409 /*
410 * Return zero in case the access faulted. This should
411 * not happen normally but can happen if the CPU does
412 * something weird, or if the code is buggy.
413 */
414 v = 0;
415 }
416
417 return v;
418 }
419
420 static void mce_wrmsrl(u32 msr, u64 v)
421 {
422 if (__this_cpu_read(injectm.finished)) {
423 int offset = msr_to_offset(msr);
424
425 if (offset >= 0)
426 *(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
427 return;
428 }
429 wrmsrl(msr, v);
430 }
431
432 /*
433 * Collect all global (w.r.t. this processor) status about this machine
434 * check into our "mce" struct so that we can use it later to assess
435 * the severity of the problem as we read per-bank specific details.
436 */
437 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
438 {
439 mce_setup(m);
440
441 m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
442 if (regs) {
443 /*
444 * Get the address of the instruction at the time of
445 * the machine check error.
446 */
447 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
448 m->ip = regs->ip;
449 m->cs = regs->cs;
450
451 /*
452 * When in VM86 mode make the cs look like ring 3
453 * always. This is a lie, but it's better than passing
454 * the additional vm86 bit around everywhere.
455 */
456 if (v8086_mode(regs))
457 m->cs |= 3;
458 }
459 /* Use accurate RIP reporting if available. */
460 if (mca_cfg.rip_msr)
461 m->ip = mce_rdmsrl(mca_cfg.rip_msr);
462 }
463 }
464
465 /*
466 * Simple lockless ring to communicate PFNs from the exception handler to the
467 * process context work function. This is vastly simplified because there's
468 * only a single reader and a single writer.
469 */
470 #define MCE_RING_SIZE 16 /* we use one entry less */
471
472 struct mce_ring {
473 unsigned short start;
474 unsigned short end;
475 unsigned long ring[MCE_RING_SIZE];
476 };
477 static DEFINE_PER_CPU(struct mce_ring, mce_ring);
478
479 /* Runs with CPU affinity in workqueue */
480 static int mce_ring_empty(void)
481 {
482 struct mce_ring *r = this_cpu_ptr(&mce_ring);
483
484 return r->start == r->end;
485 }
486
487 static int mce_ring_get(unsigned long *pfn)
488 {
489 struct mce_ring *r;
490 int ret = 0;
491
492 *pfn = 0;
493 get_cpu();
494 r = this_cpu_ptr(&mce_ring);
495 if (r->start == r->end)
496 goto out;
497 *pfn = r->ring[r->start];
498 r->start = (r->start + 1) % MCE_RING_SIZE;
499 ret = 1;
500 out:
501 put_cpu();
502 return ret;
503 }
504
505 /* Always runs in MCE context with preempt off */
506 static int mce_ring_add(unsigned long pfn)
507 {
508 struct mce_ring *r = this_cpu_ptr(&mce_ring);
509 unsigned next;
510
511 next = (r->end + 1) % MCE_RING_SIZE;
512 if (next == r->start)
513 return -1;
514 r->ring[r->end] = pfn;
515 wmb();
516 r->end = next;
517 return 0;
518 }
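/*
 * Illustrative note (editorial): with the conventions above the ring is
 * empty when start == end and full when (end + 1) % MCE_RING_SIZE == start,
 * so at most MCE_RING_SIZE - 1 = 15 PFNs can be pending at once -- hence
 * "we use one entry less". The single writer is mce_ring_add() called from
 * do_machine_check(), the single reader is mce_ring_get() called from
 * mce_process_work().
 */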
519
520 int mce_available(struct cpuinfo_x86 *c)
521 {
522 if (mca_cfg.disabled)
523 return 0;
524 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
525 }
526
527 static void mce_schedule_work(void)
528 {
529 if (!mce_ring_empty())
530 schedule_work(this_cpu_ptr(&mce_work));
531 }
532
533 DEFINE_PER_CPU(struct irq_work, mce_irq_work);
534
535 static void mce_irq_work_cb(struct irq_work *entry)
536 {
537 mce_notify_irq();
538 mce_schedule_work();
539 }
540
541 static void mce_report_event(struct pt_regs *regs)
542 {
543 if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
544 mce_notify_irq();
545 /*
546 * Triggering the work queue here is just an insurance
547 * policy in case the syscall exit notify handler
548 * doesn't run soon enough or ends up running on the
549 * wrong CPU (can happen when audit sleeps)
550 */
551 mce_schedule_work();
552 return;
553 }
554
555 irq_work_queue(this_cpu_ptr(&mce_irq_work));
556 }
557
558 /*
559 * Read ADDR and MISC registers.
560 */
561 static void mce_read_aux(struct mce *m, int i)
562 {
563 if (m->status & MCI_STATUS_MISCV)
564 m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
565 if (m->status & MCI_STATUS_ADDRV) {
566 m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
567
568 /*
569 * Mask the reported address by the reported granularity.
570 */
571 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
572 u8 shift = MCI_MISC_ADDR_LSB(m->misc);
573 m->addr >>= shift;
574 m->addr <<= shift;
575 }
576 }
577 }
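/*
 * Illustrative example (editorial): if MCi_MISC reports an address LSB of 6
 * (i.e. 64-byte, cache-line granularity) and the raw address is 0x12345678,
 * the shift pair above stores 0x12345640 -- the low 6 bits are cleared.
 */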
578
579 DEFINE_PER_CPU(unsigned, mce_poll_count);
580
581 /*
582 * Poll for corrected events or events that happened before reset.
583 * Those are just logged through /dev/mcelog.
584 *
585 * This is executed in standard interrupt context.
586 *
587 * Note: the spec recommends panicking for fatal unsignalled
588 * errors here. However this would be quite problematic --
589 * we would need to reimplement the Monarch handling and
590 * it would mess up the exclusion between the exception handler
591 * and the poll handler -- so we skip this for now.
592 * These cases should not happen anyway, or only when the CPU
593 * is already totally confused. In that case it's likely it will
594 * not fully execute the machine check handler either.
595 */
596 void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
597 {
598 struct mce m;
599 int i;
600
601 this_cpu_inc(mce_poll_count);
602
603 mce_gather_info(&m, NULL);
604
605 for (i = 0; i < mca_cfg.banks; i++) {
606 if (!mce_banks[i].ctl || !test_bit(i, *b))
607 continue;
608
609 m.misc = 0;
610 m.addr = 0;
611 m.bank = i;
612 m.tsc = 0;
613
614 barrier();
615 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
616 if (!(m.status & MCI_STATUS_VAL))
617 continue;
618
619 this_cpu_write(mce_polled_error, 1);
620 /*
621 * Uncorrected or signalled events are handled by the exception
622 * handler when it is enabled, so don't process those here.
623 *
624 * TBD do the same check for MCI_STATUS_EN here?
625 */
626 if (!(flags & MCP_UC) &&
627 (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
628 continue;
629
630 mce_read_aux(&m, i);
631
632 if (!(flags & MCP_TIMESTAMP))
633 m.tsc = 0;
634 /*
635 * Don't get the IP here because it's unlikely to
636 * have anything to do with the actual error location.
637 */
638 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
639 mce_log(&m);
640
641 /*
642 * Clear state for this bank.
643 */
644 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
645 }
646
647 /*
648 * Don't clear MCG_STATUS here because it's only defined for
649 * exceptions.
650 */
651
652 sync_core();
653 }
654 EXPORT_SYMBOL_GPL(machine_check_poll);
655
656 /*
657 * Do a quick check if any of the events requires a panic.
658 * This decides if we keep the events around or clear them.
659 */
660 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
661 struct pt_regs *regs)
662 {
663 int i, ret = 0;
664 char *tmp;
665
666 for (i = 0; i < mca_cfg.banks; i++) {
667 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
668 if (m->status & MCI_STATUS_VAL) {
669 __set_bit(i, validp);
670 if (quirk_no_way_out)
671 quirk_no_way_out(i, m, regs);
672 }
673
674 if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
675 *msg = tmp;
676 ret = 1;
677 }
678 }
679 return ret;
680 }
681
682 /*
683 * Variable to establish order between CPUs while scanning.
684 * Each CPU spins initially until mce_executing equals its number.
685 */
686 static atomic_t mce_executing;
687
688 /*
689 * Defines order of CPUs on entry. First CPU becomes Monarch.
690 */
691 static atomic_t mce_callin;
692
693 /*
694 * Check if a timeout waiting for other CPUs happened.
695 */
696 static int mce_timed_out(u64 *t)
697 {
698 /*
699 * The others already did panic for some reason.
700 * Bail out like in a timeout.
701 * rmb() to tell the compiler that system_state
702 * might have been modified by someone else.
703 */
704 rmb();
705 if (atomic_read(&mce_paniced))
706 wait_for_panic();
707 if (!mca_cfg.monarch_timeout)
708 goto out;
709 if ((s64)*t < SPINUNIT) {
710 if (mca_cfg.tolerant <= 1)
711 mce_panic("Timeout synchronizing machine check over CPUs",
712 NULL, NULL);
713 cpu_missing = 1;
714 return 1;
715 }
716 *t -= SPINUNIT;
717 out:
718 touch_nmi_watchdog();
719 return 0;
720 }
721
722 /*
723 * The Monarch's reign. The Monarch is the CPU that entered
724 * the machine check handler first. It waits for the others to
725 * raise the exception too and then grades them. If any error
726 * is fatal it panics. Only then does it let the others continue.
727 *
728 * The other CPUs entering the MCE handler will be controlled by the
729 * Monarch. They are called Subjects.
730 *
731 * This way we prevent any potential data corruption in an unrecoverable case
732 * and also make sure that all CPUs' errors are always examined.
733 *
734 * This also detects the case of a machine check event coming from outer
735 * space (not detected by any CPU). In this case some external agent wants
736 * us to shut down, so panic too.
737 *
738 * The other CPUs might still decide to panic if the handler happens
739 * in an unrecoverable place, but in this case the system is in a semi-stable
740 * state and won't corrupt anything by itself. It's ok to let the others
741 * continue for a bit first.
742 *
743 * All the spin loops have timeouts; when a timeout happens a CPU
744 * typically elects itself to be Monarch.
745 */
746 static void mce_reign(void)
747 {
748 int cpu;
749 struct mce *m = NULL;
750 int global_worst = 0;
751 char *msg = NULL;
752 char *nmsg = NULL;
753
754 /*
755 * This CPU is the Monarch and the other CPUs have run
756 * through their handlers.
757 * Grade the severity of the errors of all the CPUs.
758 */
759 for_each_possible_cpu(cpu) {
760 int severity = mce_severity(&per_cpu(mces_seen, cpu),
761 mca_cfg.tolerant,
762 &nmsg, true);
763 if (severity > global_worst) {
764 msg = nmsg;
765 global_worst = severity;
766 m = &per_cpu(mces_seen, cpu);
767 }
768 }
769
770 /*
771 * Cannot recover? Panic here then.
772 * This dumps all the mces in the log buffer and stops the
773 * other CPUs.
774 */
775 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
776 mce_panic("Fatal Machine check", m, msg);
777
778 /*
779 * For a UC error somewhere we let the CPU that detected it handle it.
780 * We must also let the others continue, otherwise the handling
781 * CPU could deadlock on a lock.
782 */
783
784 /*
785 * No machine check event found. Must be some external
786 * source or one CPU is hung. Panic.
787 */
788 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
789 mce_panic("Machine check from unknown source", NULL, NULL);
790
791 /*
792 * Now clear all the mces_seen so that they don't reappear on
793 * the next mce.
794 */
795 for_each_possible_cpu(cpu)
796 memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
797 }
798
799 static atomic_t global_nwo;
800
801 /*
802 * Start of Monarch synchronization. This waits until all CPUs have
803 * entered the exception handler and then determines if any of them
804 * saw a fatal event that requires panic. Then it executes them
805 * in the entry order.
806 * TBD double check parallel CPU hotunplug
807 */
808 static int mce_start(int *no_way_out)
809 {
810 int order;
811 int cpus = num_online_cpus();
812 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
813
814 if (!timeout)
815 return -1;
816
817 atomic_add(*no_way_out, &global_nwo);
818 /*
819 * global_nwo should be updated before mce_callin
820 */
821 smp_wmb();
822 order = atomic_inc_return(&mce_callin);
823
824 /*
825 * Wait for everyone.
826 */
827 while (atomic_read(&mce_callin) != cpus) {
828 if (mce_timed_out(&timeout)) {
829 atomic_set(&global_nwo, 0);
830 return -1;
831 }
832 ndelay(SPINUNIT);
833 }
834
835 /*
836 * mce_callin should be read before global_nwo
837 */
838 smp_rmb();
839
840 if (order == 1) {
841 /*
842 * Monarch: Starts executing now, the others wait.
843 */
844 atomic_set(&mce_executing, 1);
845 } else {
846 /*
847 * Subject: Now start the scanning loop one by one in
848 * the original callin order.
849 * This way, when there are any shared banks, an error is
850 * seen by only one CPU before it is cleared, avoiding duplicates.
851 */
852 while (atomic_read(&mce_executing) < order) {
853 if (mce_timed_out(&timeout)) {
854 atomic_set(&global_nwo, 0);
855 return -1;
856 }
857 ndelay(SPINUNIT);
858 }
859 }
860
861 /*
862 * Cache the global no_way_out state.
863 */
864 *no_way_out = atomic_read(&global_nwo);
865
866 return order;
867 }
868
869 /*
870 * Synchronize between CPUs after main scanning loop.
871 * This invokes the bulk of the Monarch processing.
872 */
873 static int mce_end(int order)
874 {
875 int ret = -1;
876 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
877
878 if (!timeout)
879 goto reset;
880 if (order < 0)
881 goto reset;
882
883 /*
884 * Allow others to run.
885 */
886 atomic_inc(&mce_executing);
887
888 if (order == 1) {
889 /* CHECKME: Can this race with a parallel hotplug? */
890 int cpus = num_online_cpus();
891
892 /*
893 * Monarch: Wait for everyone to go through their scanning
894 * loops.
895 */
896 while (atomic_read(&mce_executing) <= cpus) {
897 if (mce_timed_out(&timeout))
898 goto reset;
899 ndelay(SPINUNIT);
900 }
901
902 mce_reign();
903 barrier();
904 ret = 0;
905 } else {
906 /*
907 * Subject: Wait for Monarch to finish.
908 */
909 while (atomic_read(&mce_executing) != 0) {
910 if (mce_timed_out(&timeout))
911 goto reset;
912 ndelay(SPINUNIT);
913 }
914
915 /*
916 * Don't reset anything. That's done by the Monarch.
917 */
918 return 0;
919 }
920
921 /*
922 * Reset all global state.
923 */
924 reset:
925 atomic_set(&global_nwo, 0);
926 atomic_set(&mce_callin, 0);
927 barrier();
928
929 /*
930 * Let others run again.
931 */
932 atomic_set(&mce_executing, 0);
933 return ret;
934 }
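/*
 * Illustrative timeline (editorial) for a broadcast MCE on three CPUs:
 * each CPU calls mce_start(); atomic_inc_return(&mce_callin) hands out the
 * orders 1, 2, 3. Order 1 becomes the Monarch and scans its banks first
 * (mce_executing is set to 1); each Subject scans once mce_executing reaches
 * its own order. In mce_end() everybody increments mce_executing; the
 * Monarch waits until the count passes num_online_cpus(), runs mce_reign()
 * to grade all mces_seen, and then resets the global state so the Subjects
 * can return.
 */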
935
936 /*
937 * Check if the address reported by the CPU is in a format we can parse.
938 * It would be possible to add code for most other cases, but all would
939 * be somewhat complicated (e.g. segment offset would require an instruction
940 * parser). So only support physical addresses up to page granularity for now.
941 */
942 static int mce_usable_address(struct mce *m)
943 {
944 if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
945 return 0;
946 if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
947 return 0;
948 if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
949 return 0;
950 return 1;
951 }
952
953 static void mce_clear_state(unsigned long *toclear)
954 {
955 int i;
956
957 for (i = 0; i < mca_cfg.banks; i++) {
958 if (test_bit(i, toclear))
959 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
960 }
961 }
962
963 /*
964 * Need to save the faulting physical address associated with a process
965 * in the machine check handler some place where we can grab it back
966 * later in mce_notify_process().
967 */
968 #define MCE_INFO_MAX 16
969
970 struct mce_info {
971 atomic_t inuse;
972 struct task_struct *t;
973 __u64 paddr;
974 int restartable;
975 } mce_info[MCE_INFO_MAX];
976
977 static void mce_save_info(__u64 addr, int c)
978 {
979 struct mce_info *mi;
980
981 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
982 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
983 mi->t = current;
984 mi->paddr = addr;
985 mi->restartable = c;
986 return;
987 }
988 }
989
990 mce_panic("Too many concurrent recoverable errors", NULL, NULL);
991 }
992
993 static struct mce_info *mce_find_info(void)
994 {
995 struct mce_info *mi;
996
997 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
998 if (atomic_read(&mi->inuse) && mi->t == current)
999 return mi;
1000 return NULL;
1001 }
1002
1003 static void mce_clear_info(struct mce_info *mi)
1004 {
1005 atomic_set(&mi->inuse, 0);
1006 }
1007
1008 /*
1009 * The actual machine check handler. This only handles real
1010 * exceptions when something got corrupted coming in through int 18.
1011 *
1012 * This is executed in NMI context not subject to normal locking rules. This
1013 * implies that most kernel services cannot be safely used. Don't even
1014 * think about putting a printk in there!
1015 *
1016 * On Intel systems this is entered on all CPUs in parallel through
1017 * MCE broadcast. However some CPUs might be broken beyond repair,
1018 * so always be careful when synchronizing with others.
1019 */
1020 void do_machine_check(struct pt_regs *regs, long error_code)
1021 {
1022 struct mca_config *cfg = &mca_cfg;
1023 struct mce m, *final;
1024 int i;
1025 int worst = 0;
1026 int severity;
1027 /*
1028 * Establish sequential order between the CPUs entering the machine
1029 * check handler.
1030 */
1031 int order;
1032 /*
1033 * If no_way_out gets set, there is no safe way to recover from this
1034 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway.
1035 */
1036 int no_way_out = 0;
1037 /*
1038 * If kill_it gets set, there might be a way to recover from this
1039 * error.
1040 */
1041 int kill_it = 0;
1042 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1043 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1044 char *msg = "Unknown";
1045
1046 this_cpu_inc(mce_exception_count);
1047
1048 if (!cfg->banks)
1049 goto out;
1050
1051 mce_gather_info(&m, regs);
1052
1053 final = this_cpu_ptr(&mces_seen);
1054 *final = m;
1055
1056 memset(valid_banks, 0, sizeof(valid_banks));
1057 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1058
1059 barrier();
1060
1061 /*
1062 * When there is no restart IP we might need to kill or panic.
1063 * Assume the worst for now, but if we find the
1064 * severity is MCE_AR_SEVERITY we have other options.
1065 */
1066 if (!(m.mcgstatus & MCG_STATUS_RIPV))
1067 kill_it = 1;
1068
1069 /*
1070 * Go through all the banks in exclusion of the other CPUs.
1071 * This way we don't report duplicated events on shared banks
1072 * because the first one to see it will clear it.
1073 */
1074 order = mce_start(&no_way_out);
1075 for (i = 0; i < cfg->banks; i++) {
1076 __clear_bit(i, toclear);
1077 if (!test_bit(i, valid_banks))
1078 continue;
1079 if (!mce_banks[i].ctl)
1080 continue;
1081
1082 m.misc = 0;
1083 m.addr = 0;
1084 m.bank = i;
1085
1086 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1087 if ((m.status & MCI_STATUS_VAL) == 0)
1088 continue;
1089
1090 /*
1091 * Non-uncorrected or non-signaled errors are handled by
1092 * machine_check_poll(). Leave them alone, unless this panics.
1093 */
1094 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1095 !no_way_out)
1096 continue;
1097
1098 /*
1099 * Set taint even when machine check was not enabled.
1100 */
1101 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1102
1103 severity = mce_severity(&m, cfg->tolerant, NULL, true);
1104
1105 /*
1106 * When the machine check was for a corrected/deferred error, leave it
1107 * to the corrected/deferred handler and don't touch it, unless we're panicking.
1108 */
1109 if ((severity == MCE_KEEP_SEVERITY ||
1110 severity == MCE_UCNA_SEVERITY) && !no_way_out)
1111 continue;
1112 __set_bit(i, toclear);
1113 if (severity == MCE_NO_SEVERITY) {
1114 /*
1115 * Machine check event was not enabled. Clear, but
1116 * ignore.
1117 */
1118 continue;
1119 }
1120
1121 mce_read_aux(&m, i);
1122
1123 /*
1124 * Action optional error. Queue address for later processing.
1125 * When the ring overflows we just ignore the AO error.
1126 * RED-PEN add some logging mechanism when
1127 * mce_usable_address or mce_ring_add fails.
1128 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
1129 */
1130 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1131 mce_ring_add(m.addr >> PAGE_SHIFT);
1132
1133 mce_log(&m);
1134
1135 if (severity > worst) {
1136 *final = m;
1137 worst = severity;
1138 }
1139 }
1140
1141 /* mce_clear_state will clear *final, save locally for use later */
1142 m = *final;
1143
1144 if (!no_way_out)
1145 mce_clear_state(toclear);
1146
1147 /*
1148 * Do most of the synchronization with other CPUs.
1149 * When there's any problem use only local no_way_out state.
1150 */
1151 if (mce_end(order) < 0)
1152 no_way_out = worst >= MCE_PANIC_SEVERITY;
1153
1154 /*
1155 * At insane "tolerant" levels we take no action. Otherwise
1156 * we only die if we have no other choice. For less serious
1157 * issues we try to recover, or limit damage to the current
1158 * process.
1159 */
1160 if (cfg->tolerant < 3) {
1161 if (no_way_out)
1162 mce_panic("Fatal machine check on current CPU", &m, msg);
1163 if (worst == MCE_AR_SEVERITY) {
1164 /* schedule action before return to userland */
1165 mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV);
1166 set_thread_flag(TIF_MCE_NOTIFY);
1167 } else if (kill_it) {
1168 force_sig(SIGBUS, current);
1169 }
1170 }
1171
1172 if (worst > 0)
1173 mce_report_event(regs);
1174 mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1175 out:
1176 sync_core();
1177 }
1178 EXPORT_SYMBOL_GPL(do_machine_check);
1179
1180 #ifndef CONFIG_MEMORY_FAILURE
1181 int memory_failure(unsigned long pfn, int vector, int flags)
1182 {
1183 /* mce_severity() should not hand us an ACTION_REQUIRED error */
1184 BUG_ON(flags & MF_ACTION_REQUIRED);
1185 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1186 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1187 pfn);
1188
1189 return 0;
1190 }
1191 #endif
1192
1193 /*
1194 * Called in the process context that was interrupted by the MCE and marked
1195 * with TIF_MCE_NOTIFY, just before returning to the erroneous userland.
1196 * This code is allowed to sleep.
1197 * Attempt possible recovery such as calling the high level VM handler to
1198 * process any corrupted pages, and kill/signal current process if required.
1199 * Action required errors are handled here.
1200 */
1201 void mce_notify_process(void)
1202 {
1203 unsigned long pfn;
1204 struct mce_info *mi = mce_find_info();
1205 int flags = MF_ACTION_REQUIRED;
1206
1207 if (!mi)
1208 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1209 pfn = mi->paddr >> PAGE_SHIFT;
1210
1211 clear_thread_flag(TIF_MCE_NOTIFY);
1212
1213 pr_err("Uncorrected hardware memory error in user-access at %llx",
1214 mi->paddr);
1215 /*
1216 * We must call memory_failure() here even if the current process is
1217 * doomed. We still need to mark the page as poisoned and alert any
1218 * other users of the page.
1219 */
1220 if (!mi->restartable)
1221 flags |= MF_MUST_KILL;
1222 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1223 pr_err("Memory error not recovered");
1224 force_sig(SIGBUS, current);
1225 }
1226 mce_clear_info(mi);
1227 }
1228
1229 /*
1230 * Action optional processing happens here (picking up
1231 * from the list of faulting pages that do_machine_check()
1232 * placed into the "ring").
1233 */
1234 static void mce_process_work(struct work_struct *dummy)
1235 {
1236 unsigned long pfn;
1237
1238 while (mce_ring_get(&pfn))
1239 memory_failure(pfn, MCE_VECTOR, 0);
1240 }
1241
1242 #ifdef CONFIG_X86_MCE_INTEL
1243 /***
1244 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1246 * @status: Event status information
1247 *
1248 * This function should be called by the thermal interrupt after the
1249 * event has been processed and the decision was made to log the event
1250 * further.
1251 *
1252 * The status parameter will be saved to the 'status' field of 'struct mce'
1253 * and historically has been the register value of the
1254 * MSR_IA32_THERMAL_STATUS (Intel) msr.
1255 */
1256 void mce_log_therm_throt_event(__u64 status)
1257 {
1258 struct mce m;
1259
1260 mce_setup(&m);
1261 m.bank = MCE_THERMAL_BANK;
1262 m.status = status;
1263 mce_log(&m);
1264 }
1265 #endif /* CONFIG_X86_MCE_INTEL */
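/*
 * Illustrative sketch (editorial): the thermal interrupt code is expected to
 * call the helper above roughly like this once it decides the event should
 * be logged; msr_val is a hypothetical local variable and the condition is
 * pseudo-code.
 *
 *	u64 msr_val;
 *
 *	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 *	if (event is worth logging)
 *		mce_log_therm_throt_event(msr_val);
 */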
1266
1267 /*
1268 * Periodic polling timer for "silent" machine check errors. If the
1269 * poller finds an MCE, poll 2x faster. When the poller finds no more
1270 * errors, poll 2x slower (up to check_interval seconds).
1271 */
1272 static unsigned long check_interval = 5 * 60; /* 5 minutes */
1273
1274 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1275 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1276
1277 static unsigned long mce_adjust_timer_default(unsigned long interval)
1278 {
1279 return interval;
1280 }
1281
1282 static unsigned long (*mce_adjust_timer)(unsigned long interval) =
1283 mce_adjust_timer_default;
1284
1285 static int cmc_error_seen(void)
1286 {
1287 unsigned long *v = this_cpu_ptr(&mce_polled_error);
1288
1289 return test_and_clear_bit(0, v);
1290 }
1291
1292 static void mce_timer_fn(unsigned long data)
1293 {
1294 struct timer_list *t = this_cpu_ptr(&mce_timer);
1295 unsigned long iv;
1296 int notify;
1297
1298 WARN_ON(smp_processor_id() != data);
1299
1300 if (mce_available(this_cpu_ptr(&cpu_info))) {
1301 machine_check_poll(MCP_TIMESTAMP,
1302 this_cpu_ptr(&mce_poll_banks));
1303 mce_intel_cmci_poll();
1304 }
1305
1306 /*
1307 * Alert userspace if needed. If we logged an MCE, reduce the
1308 * polling interval, otherwise increase the polling interval.
1309 */
1310 iv = __this_cpu_read(mce_next_interval);
1311 notify = mce_notify_irq();
1312 notify |= cmc_error_seen();
1313 if (notify) {
1314 iv = max(iv / 2, (unsigned long) HZ/100);
1315 } else {
1316 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1317 iv = mce_adjust_timer(iv);
1318 }
1319 __this_cpu_write(mce_next_interval, iv);
1320 /* Might have become 0 after CMCI storm subsided */
1321 if (iv) {
1322 t->expires = jiffies + iv;
1323 add_timer_on(t, smp_processor_id());
1324 }
1325 }
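/*
 * Illustrative example (editorial): with the default check_interval of
 * 5 minutes, a CPU that keeps seeing events halves the interval on each run
 * (300s -> 150s -> 75s -> ...), bounded below by HZ/100 jiffies (10 ms);
 * once the errors stop, the interval doubles again, capped at
 * check_interval * HZ (back to 5 minutes).
 */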
1326
1327 /*
1328 * Ensure that the timer is firing in @interval from now.
1329 */
1330 void mce_timer_kick(unsigned long interval)
1331 {
1332 struct timer_list *t = this_cpu_ptr(&mce_timer);
1333 unsigned long when = jiffies + interval;
1334 unsigned long iv = __this_cpu_read(mce_next_interval);
1335
1336 if (timer_pending(t)) {
1337 if (time_before(when, t->expires))
1338 mod_timer_pinned(t, when);
1339 } else {
1340 t->expires = round_jiffies(when);
1341 add_timer_on(t, smp_processor_id());
1342 }
1343 if (interval < iv)
1344 __this_cpu_write(mce_next_interval, interval);
1345 }
1346
1347 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1348 static void mce_timer_delete_all(void)
1349 {
1350 int cpu;
1351
1352 for_each_online_cpu(cpu)
1353 del_timer_sync(&per_cpu(mce_timer, cpu));
1354 }
1355
1356 static void mce_do_trigger(struct work_struct *work)
1357 {
1358 call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1359 }
1360
1361 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1362
1363 /*
1364 * Notify the user(s) about new machine check events.
1365 * Can be called from interrupt context, but not from machine check/NMI
1366 * context.
1367 */
1368 int mce_notify_irq(void)
1369 {
1370 /* Not more than two messages every minute */
1371 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1372
1373 if (test_and_clear_bit(0, &mce_need_notify)) {
1374 /* wake processes polling /dev/mcelog */
1375 wake_up_interruptible(&mce_chrdev_wait);
1376
1377 if (mce_helper[0])
1378 schedule_work(&mce_trigger_work);
1379
1380 if (__ratelimit(&ratelimit))
1381 pr_info(HW_ERR "Machine check events logged\n");
1382
1383 return 1;
1384 }
1385 return 0;
1386 }
1387 EXPORT_SYMBOL_GPL(mce_notify_irq);
1388
1389 static int __mcheck_cpu_mce_banks_init(void)
1390 {
1391 int i;
1392 u8 num_banks = mca_cfg.banks;
1393
1394 mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1395 if (!mce_banks)
1396 return -ENOMEM;
1397
1398 for (i = 0; i < num_banks; i++) {
1399 struct mce_bank *b = &mce_banks[i];
1400
1401 b->ctl = -1ULL;
1402 b->init = 1;
1403 }
1404 return 0;
1405 }
1406
1407 /*
1408 * Initialize Machine Checks for a CPU.
1409 */
1410 static int __mcheck_cpu_cap_init(void)
1411 {
1412 unsigned b;
1413 u64 cap;
1414
1415 rdmsrl(MSR_IA32_MCG_CAP, cap);
1416
1417 b = cap & MCG_BANKCNT_MASK;
1418 if (!mca_cfg.banks)
1419 pr_info("CPU supports %d MCE banks\n", b);
1420
1421 if (b > MAX_NR_BANKS) {
1422 pr_warn("Using only %u machine check banks out of %u\n",
1423 MAX_NR_BANKS, b);
1424 b = MAX_NR_BANKS;
1425 }
1426
1427 /* Don't support asymmetric configurations today */
1428 WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1429 mca_cfg.banks = b;
1430
1431 if (!mce_banks) {
1432 int err = __mcheck_cpu_mce_banks_init();
1433
1434 if (err)
1435 return err;
1436 }
1437
1438 /* Use accurate RIP reporting if available. */
1439 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1440 mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1441
1442 if (cap & MCG_SER_P)
1443 mca_cfg.ser = true;
1444
1445 return 0;
1446 }
1447
1448 static void __mcheck_cpu_init_generic(void)
1449 {
1450 enum mcp_flags m_fl = 0;
1451 mce_banks_t all_banks;
1452 u64 cap;
1453 int i;
1454
1455 if (!mca_cfg.bootlog)
1456 m_fl = MCP_DONTLOG;
1457
1458 /*
1459 * Log the machine checks left over from the previous reset.
1460 */
1461 bitmap_fill(all_banks, MAX_NR_BANKS);
1462 machine_check_poll(MCP_UC | m_fl, &all_banks);
1463
1464 cr4_set_bits(X86_CR4_MCE);
1465
1466 rdmsrl(MSR_IA32_MCG_CAP, cap);
1467 if (cap & MCG_CTL_P)
1468 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1469
1470 for (i = 0; i < mca_cfg.banks; i++) {
1471 struct mce_bank *b = &mce_banks[i];
1472
1473 if (!b->init)
1474 continue;
1475 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1476 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1477 }
1478 }
1479
1480 /*
1481 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1482 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1483 * Vol 3B Table 15-20). But this confuses both the code that determines
1484 * whether the machine check occurred in kernel or user mode, and also
1485 * the severity assessment code. Pretend that EIPV was set, and take the
1486 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1487 */
1488 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1489 {
1490 if (bank != 0)
1491 return;
1492 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1493 return;
1494 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1495 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1496 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1497 MCACOD)) !=
1498 (MCI_STATUS_UC|MCI_STATUS_EN|
1499 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1500 MCI_STATUS_AR|MCACOD_INSTR))
1501 return;
1502
1503 m->mcgstatus |= MCG_STATUS_EIPV;
1504 m->ip = regs->ip;
1505 m->cs = regs->cs;
1506 }
1507
1508 /* Add per CPU specific workarounds here */
1509 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1510 {
1511 struct mca_config *cfg = &mca_cfg;
1512
1513 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1514 pr_info("unknown CPU type - not enabling MCE support\n");
1515 return -EOPNOTSUPP;
1516 }
1517
1518 /* This should be disabled by the BIOS, but isn't always */
1519 if (c->x86_vendor == X86_VENDOR_AMD) {
1520 if (c->x86 == 15 && cfg->banks > 4) {
1521 /*
1522 * disable GART TBL walk error reporting, which
1523 * trips off incorrectly with the IOMMU & 3ware
1524 * & Cerberus:
1525 */
1526 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1527 }
1528 if (c->x86 <= 17 && cfg->bootlog < 0) {
1529 /*
1530 * Lots of broken BIOSes around that don't clear them
1531 * by default and leave crap in there. Don't log:
1532 */
1533 cfg->bootlog = 0;
1534 }
1535 /*
1536 * Various K7s with broken bank 0 around. Always disable
1537 * by default.
1538 */
1539 if (c->x86 == 6 && cfg->banks > 0)
1540 mce_banks[0].ctl = 0;
1541
1542 /*
1543 * Turn off MC4_MISC thresholding banks on those models since
1544 * they're not supported there.
1545 */
1546 if (c->x86 == 0x15 &&
1547 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1548 int i;
1549 u64 val, hwcr;
1550 bool need_toggle;
1551 u32 msrs[] = {
1552 0x00000413, /* MC4_MISC0 */
1553 0xc0000408, /* MC4_MISC1 */
1554 };
1555
1556 rdmsrl(MSR_K7_HWCR, hwcr);
1557
1558 /* McStatusWrEn has to be set */
1559 need_toggle = !(hwcr & BIT(18));
1560
1561 if (need_toggle)
1562 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1563
1564 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
1565 rdmsrl(msrs[i], val);
1566
1567 /* CntP bit set? */
1568 if (val & BIT_64(62)) {
1569 val &= ~BIT_64(62);
1570 wrmsrl(msrs[i], val);
1571 }
1572 }
1573
1574 /* restore old settings */
1575 if (need_toggle)
1576 wrmsrl(MSR_K7_HWCR, hwcr);
1577 }
1578 }
1579
1580 if (c->x86_vendor == X86_VENDOR_INTEL) {
1581 /*
1582 * SDM documents that on family 6 bank 0 should not be written
1583 * because it aliases to another special BIOS controlled
1584 * register.
1585 * But it's not aliased anymore on model 0x1a+.
1586 * Don't ignore bank 0 completely because there could be a
1587 * valid event later, merely don't write CTL0.
1588 */
1589
1590 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1591 mce_banks[0].init = 0;
1592
1593 /*
1594 * All newer Intel systems support MCE broadcasting. Enable
1595 * synchronization with a one second timeout.
1596 */
1597 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1598 cfg->monarch_timeout < 0)
1599 cfg->monarch_timeout = USEC_PER_SEC;
1600
1601 /*
1602 * There are also broken BIOSes on some Pentium M and
1603 * earlier systems:
1604 */
1605 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1606 cfg->bootlog = 0;
1607
1608 if (c->x86 == 6 && c->x86_model == 45)
1609 quirk_no_way_out = quirk_sandybridge_ifu;
1610 }
1611 if (cfg->monarch_timeout < 0)
1612 cfg->monarch_timeout = 0;
1613 if (cfg->bootlog != 0)
1614 cfg->panic_timeout = 30;
1615
1616 return 0;
1617 }
1618
1619 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1620 {
1621 if (c->x86 != 5)
1622 return 0;
1623
1624 switch (c->x86_vendor) {
1625 case X86_VENDOR_INTEL:
1626 intel_p5_mcheck_init(c);
1627 return 1;
1628 break;
1629 case X86_VENDOR_CENTAUR:
1630 winchip_mcheck_init(c);
1631 return 1;
1632 break;
1633 }
1634
1635 return 0;
1636 }
1637
1638 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1639 {
1640 switch (c->x86_vendor) {
1641 case X86_VENDOR_INTEL:
1642 mce_intel_feature_init(c);
1643 mce_adjust_timer = mce_intel_adjust_timer;
1644 break;
1645 case X86_VENDOR_AMD:
1646 mce_amd_feature_init(c);
1647 break;
1648 default:
1649 break;
1650 }
1651 }
1652
1653 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1654 {
1655 unsigned long iv = check_interval * HZ;
1656
1657 if (mca_cfg.ignore_ce || !iv)
1658 return;
1659
1660 per_cpu(mce_next_interval, cpu) = iv;
1661
1662 t->expires = round_jiffies(jiffies + iv);
1663 add_timer_on(t, cpu);
1664 }
1665
1666 static void __mcheck_cpu_init_timer(void)
1667 {
1668 struct timer_list *t = this_cpu_ptr(&mce_timer);
1669 unsigned int cpu = smp_processor_id();
1670
1671 setup_timer(t, mce_timer_fn, cpu);
1672 mce_start_timer(cpu, t);
1673 }
1674
1675 /* Handle unconfigured int18 (should never happen) */
1676 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1677 {
1678 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1679 smp_processor_id());
1680 }
1681
1682 /* Call the installed machine check handler for this CPU setup. */
1683 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1684 unexpected_machine_check;
1685
1686 /*
1687 * Called for each booted CPU to set up machine checks.
1688 * Must be called with preempt off:
1689 */
1690 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1691 {
1692 if (mca_cfg.disabled)
1693 return;
1694
1695 if (__mcheck_cpu_ancient_init(c))
1696 return;
1697
1698 if (!mce_available(c))
1699 return;
1700
1701 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1702 mca_cfg.disabled = true;
1703 return;
1704 }
1705
1706 machine_check_vector = do_machine_check;
1707
1708 __mcheck_cpu_init_generic();
1709 __mcheck_cpu_init_vendor(c);
1710 __mcheck_cpu_init_timer();
1711 INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
1712 init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
1713 }
1714
1715 /*
1716 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1717 */
1718
1719 static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1720 static int mce_chrdev_open_count; /* #times opened */
1721 static int mce_chrdev_open_exclu; /* already open exclusive? */
1722
1723 static int mce_chrdev_open(struct inode *inode, struct file *file)
1724 {
1725 spin_lock(&mce_chrdev_state_lock);
1726
1727 if (mce_chrdev_open_exclu ||
1728 (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1729 spin_unlock(&mce_chrdev_state_lock);
1730
1731 return -EBUSY;
1732 }
1733
1734 if (file->f_flags & O_EXCL)
1735 mce_chrdev_open_exclu = 1;
1736 mce_chrdev_open_count++;
1737
1738 spin_unlock(&mce_chrdev_state_lock);
1739
1740 return nonseekable_open(inode, file);
1741 }
1742
1743 static int mce_chrdev_release(struct inode *inode, struct file *file)
1744 {
1745 spin_lock(&mce_chrdev_state_lock);
1746
1747 mce_chrdev_open_count--;
1748 mce_chrdev_open_exclu = 0;
1749
1750 spin_unlock(&mce_chrdev_state_lock);
1751
1752 return 0;
1753 }
1754
1755 static void collect_tscs(void *data)
1756 {
1757 unsigned long *cpu_tsc = (unsigned long *)data;
1758
1759 rdtscll(cpu_tsc[smp_processor_id()]);
1760 }
1761
1762 static int mce_apei_read_done;
1763
1764 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1765 static int __mce_read_apei(char __user **ubuf, size_t usize)
1766 {
1767 int rc;
1768 u64 record_id;
1769 struct mce m;
1770
1771 if (usize < sizeof(struct mce))
1772 return -EINVAL;
1773
1774 rc = apei_read_mce(&m, &record_id);
1775 /* Error or no more MCE record */
1776 if (rc <= 0) {
1777 mce_apei_read_done = 1;
1778 /*
1779 * When ERST is disabled, mce_chrdev_read() should return
1780 * "no record" instead of "no device."
1781 */
1782 if (rc == -ENODEV)
1783 return 0;
1784 return rc;
1785 }
1786 rc = -EFAULT;
1787 if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1788 return rc;
1789 /*
1790 * In fact, we should have cleared the record after it has
1791 * been flushed to disk or sent over the network by
1792 * /sbin/mcelog, but we have no interface to support that now,
1793 * so just clear it to avoid duplication.
1794 */
1795 rc = apei_clear_mce(record_id);
1796 if (rc) {
1797 mce_apei_read_done = 1;
1798 return rc;
1799 }
1800 *ubuf += sizeof(struct mce);
1801
1802 return 0;
1803 }
1804
1805 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1806 size_t usize, loff_t *off)
1807 {
1808 char __user *buf = ubuf;
1809 unsigned long *cpu_tsc;
1810 unsigned prev, next;
1811 int i, err;
1812
1813 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1814 if (!cpu_tsc)
1815 return -ENOMEM;
1816
1817 mutex_lock(&mce_chrdev_read_mutex);
1818
1819 if (!mce_apei_read_done) {
1820 err = __mce_read_apei(&buf, usize);
1821 if (err || buf != ubuf)
1822 goto out;
1823 }
1824
1825 next = rcu_dereference_check_mce(mcelog.next);
1826
1827 /* Only supports full reads right now */
1828 err = -EINVAL;
1829 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1830 goto out;
1831
1832 err = 0;
1833 prev = 0;
1834 do {
1835 for (i = prev; i < next; i++) {
1836 unsigned long start = jiffies;
1837 struct mce *m = &mcelog.entry[i];
1838
1839 while (!m->finished) {
1840 if (time_after_eq(jiffies, start + 2)) {
1841 memset(m, 0, sizeof(*m));
1842 goto timeout;
1843 }
1844 cpu_relax();
1845 }
1846 smp_rmb();
1847 err |= copy_to_user(buf, m, sizeof(*m));
1848 buf += sizeof(*m);
1849 timeout:
1850 ;
1851 }
1852
1853 memset(mcelog.entry + prev, 0,
1854 (next - prev) * sizeof(struct mce));
1855 prev = next;
1856 next = cmpxchg(&mcelog.next, prev, 0);
1857 } while (next != prev);
1858
1859 synchronize_sched();
1860
1861 /*
1862 * Collect entries that were still getting written before the
1863 * synchronize.
1864 */
1865 on_each_cpu(collect_tscs, cpu_tsc, 1);
1866
1867 for (i = next; i < MCE_LOG_LEN; i++) {
1868 struct mce *m = &mcelog.entry[i];
1869
1870 if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1871 err |= copy_to_user(buf, m, sizeof(*m));
1872 smp_rmb();
1873 buf += sizeof(*m);
1874 memset(m, 0, sizeof(*m));
1875 }
1876 }
1877
1878 if (err)
1879 err = -EFAULT;
1880
1881 out:
1882 mutex_unlock(&mce_chrdev_read_mutex);
1883 kfree(cpu_tsc);
1884
1885 return err ? err : buf - ubuf;
1886 }
1887
1888 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1889 {
1890 poll_wait(file, &mce_chrdev_wait, wait);
1891 if (rcu_access_index(mcelog.next))
1892 return POLLIN | POLLRDNORM;
1893 if (!mce_apei_read_done && apei_check_mce())
1894 return POLLIN | POLLRDNORM;
1895 return 0;
1896 }
1897
1898 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1899 unsigned long arg)
1900 {
1901 int __user *p = (int __user *)arg;
1902
1903 if (!capable(CAP_SYS_ADMIN))
1904 return -EPERM;
1905
1906 switch (cmd) {
1907 case MCE_GET_RECORD_LEN:
1908 return put_user(sizeof(struct mce), p);
1909 case MCE_GET_LOG_LEN:
1910 return put_user(MCE_LOG_LEN, p);
1911 case MCE_GETCLEAR_FLAGS: {
1912 unsigned flags;
1913
1914 do {
1915 flags = mcelog.flags;
1916 } while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1917
1918 return put_user(flags, p);
1919 }
1920 default:
1921 return -ENOTTY;
1922 }
1923 }
1924
1925 static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1926 size_t usize, loff_t *off);
1927
1928 void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1929 const char __user *ubuf,
1930 size_t usize, loff_t *off))
1931 {
1932 mce_write = fn;
1933 }
1934 EXPORT_SYMBOL_GPL(register_mce_write_callback);
1935
1936 ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1937 size_t usize, loff_t *off)
1938 {
1939 if (mce_write)
1940 return mce_write(filp, ubuf, usize, off);
1941 else
1942 return -EINVAL;
1943 }
1944
1945 static const struct file_operations mce_chrdev_ops = {
1946 .open = mce_chrdev_open,
1947 .release = mce_chrdev_release,
1948 .read = mce_chrdev_read,
1949 .write = mce_chrdev_write,
1950 .poll = mce_chrdev_poll,
1951 .unlocked_ioctl = mce_chrdev_ioctl,
1952 .llseek = no_llseek,
1953 };
1954
1955 static struct miscdevice mce_chrdev_device = {
1956 MISC_MCELOG_MINOR,
1957 "mcelog",
1958 &mce_chrdev_ops,
1959 };
1960
1961 static void __mce_disable_bank(void *arg)
1962 {
1963 int bank = *((int *)arg);
1964 __clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1965 cmci_disable_bank(bank);
1966 }
1967
1968 void mce_disable_bank(int bank)
1969 {
1970 if (bank >= mca_cfg.banks) {
1971 pr_warn(FW_BUG
1972 "Ignoring request to disable invalid MCA bank %d.\n",
1973 bank);
1974 return;
1975 }
1976 set_bit(bank, mce_banks_ce_disabled);
1977 on_each_cpu(__mce_disable_bank, &bank, 1);
1978 }
1979
1980 /*
1981 * mce=off Disables machine check
1982 * mce=no_cmci Disables CMCI
1983 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1984 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1985 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1986 * monarchtimeout is how long to wait for other CPUs on machine
1987 * check, or 0 to not wait
1988 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1989 * mce=nobootlog Don't log MCEs from before booting.
1990 * mce=bios_cmci_threshold Don't program the CMCI threshold
1991 */
1992 static int __init mcheck_enable(char *str)
1993 {
1994 struct mca_config *cfg = &mca_cfg;
1995
1996 if (*str == 0) {
1997 enable_p5_mce();
1998 return 1;
1999 }
2000 if (*str == '=')
2001 str++;
2002 if (!strcmp(str, "off"))
2003 cfg->disabled = true;
2004 else if (!strcmp(str, "no_cmci"))
2005 cfg->cmci_disabled = true;
2006 else if (!strcmp(str, "dont_log_ce"))
2007 cfg->dont_log_ce = true;
2008 else if (!strcmp(str, "ignore_ce"))
2009 cfg->ignore_ce = true;
2010 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2011 cfg->bootlog = (str[0] == 'b');
2012 else if (!strcmp(str, "bios_cmci_threshold"))
2013 cfg->bios_cmci_threshold = true;
2014 else if (isdigit(str[0])) {
2015 get_option(&str, &(cfg->tolerant));
2016 if (*str == ',') {
2017 ++str;
2018 get_option(&str, &(cfg->monarch_timeout));
2019 }
2020 } else {
2021 pr_info("mce argument %s ignored. Please use /sys\n", str);
2022 return 0;
2023 }
2024 return 1;
2025 }
2026 __setup("mce", mcheck_enable);
2027
mcheck_init(void)2028 int __init mcheck_init(void)
2029 {
2030 mcheck_intel_therm_init();
2031
2032 return 0;
2033 }
2034
2035 /*
2036 * mce_syscore: PM support
2037 */
2038
2039 /*
2040 * Disable machine checks on suspend and shutdown. We can't really handle
2041 * them later.
2042 */
mce_disable_error_reporting(void)2043 static int mce_disable_error_reporting(void)
2044 {
2045 int i;
2046
2047 for (i = 0; i < mca_cfg.banks; i++) {
2048 struct mce_bank *b = &mce_banks[i];
2049
2050 if (b->init)
2051 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2052 }
2053 return 0;
2054 }
2055
mce_syscore_suspend(void)2056 static int mce_syscore_suspend(void)
2057 {
2058 return mce_disable_error_reporting();
2059 }
2060
mce_syscore_shutdown(void)2061 static void mce_syscore_shutdown(void)
2062 {
2063 mce_disable_error_reporting();
2064 }
2065
2066 /*
2067 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2068 * Only one CPU is active at this time, the others get re-added later using
2069 * CPU hotplug:
2070 */
mce_syscore_resume(void)2071 static void mce_syscore_resume(void)
2072 {
2073 __mcheck_cpu_init_generic();
2074 __mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2075 }
2076
2077 static struct syscore_ops mce_syscore_ops = {
2078 .suspend = mce_syscore_suspend,
2079 .shutdown = mce_syscore_shutdown,
2080 .resume = mce_syscore_resume,
2081 };
2082
2083 /*
2084 * mce_device: Sysfs support
2085 */
2086
mce_cpu_restart(void * data)2087 static void mce_cpu_restart(void *data)
2088 {
2089 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2090 return;
2091 __mcheck_cpu_init_generic();
2092 __mcheck_cpu_init_timer();
2093 }
2094
2095 /* Reinit MCEs after user configuration changes */
mce_restart(void)2096 static void mce_restart(void)
2097 {
2098 mce_timer_delete_all();
2099 on_each_cpu(mce_cpu_restart, NULL, 1);
2100 }
2101
2102 /* Toggle features for corrected errors */
mce_disable_cmci(void * data)2103 static void mce_disable_cmci(void *data)
2104 {
2105 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2106 return;
2107 cmci_clear();
2108 }
2109
mce_enable_ce(void * all)2110 static void mce_enable_ce(void *all)
2111 {
2112 if (!mce_available(raw_cpu_ptr(&cpu_info)))
2113 return;
2114 cmci_reenable();
2115 cmci_recheck();
2116 if (all)
2117 __mcheck_cpu_init_timer();
2118 }
2119
2120 static struct bus_type mce_subsys = {
2121 .name = "machinecheck",
2122 .dev_name = "machinecheck",
2123 };
2124
2125 DEFINE_PER_CPU(struct device *, mce_device);
2126
2127 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2128
attr_to_bank(struct device_attribute * attr)2129 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2130 {
2131 return container_of(attr, struct mce_bank, attr);
2132 }
2133
show_bank(struct device * s,struct device_attribute * attr,char * buf)2134 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2135 char *buf)
2136 {
2137 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2138 }
2139
set_bank(struct device * s,struct device_attribute * attr,const char * buf,size_t size)2140 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2141 const char *buf, size_t size)
2142 {
2143 u64 new;
2144
2145 if (kstrtou64(buf, 0, &new) < 0)
2146 return -EINVAL;
2147
2148 attr_to_bank(attr)->ctl = new;
2149 mce_restart();
2150
2151 return size;
2152 }
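
/*
 * Illustrative only: with the per-bank attributes created by
 * mce_init_banks() below, writing a new control mask from userspace might
 * look like
 *
 *      echo 0xfffffffffffffffe > /sys/devices/system/machinecheck/machinecheck0/bank0
 *
 * which updates mce_banks[0].ctl and re-initializes machine checks on all
 * CPUs via mce_restart(). The exact path depends on the sysfs layout of
 * the running kernel.
 */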

static ssize_t
show_trigger(struct device *s, struct device_attribute *attr, char *buf)
{
        strcpy(buf, mce_helper);
        strcat(buf, "\n");
        return strlen(mce_helper) + 1;
}

static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
                           const char *buf, size_t siz)
{
        char *p;

        strncpy(mce_helper, buf, sizeof(mce_helper));
        mce_helper[sizeof(mce_helper)-1] = 0;
        p = strchr(mce_helper, '\n');

        if (p)
                *p = 0;

        return strlen(mce_helper) + !!p;
}

static ssize_t set_ignore_ce(struct device *s,
                             struct device_attribute *attr,
                             const char *buf, size_t size)
{
        u64 new;

        if (kstrtou64(buf, 0, &new) < 0)
                return -EINVAL;

        if (mca_cfg.ignore_ce ^ !!new) {
                if (new) {
                        /* disable ce features */
                        mce_timer_delete_all();
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mca_cfg.ignore_ce = true;
                } else {
                        /* enable ce features */
                        mca_cfg.ignore_ce = false;
                        on_each_cpu(mce_enable_ce, (void *)1, 1);
                }
        }
        return size;
}

static ssize_t set_cmci_disabled(struct device *s,
                                 struct device_attribute *attr,
                                 const char *buf, size_t size)
{
        u64 new;

        if (kstrtou64(buf, 0, &new) < 0)
                return -EINVAL;

        if (mca_cfg.cmci_disabled ^ !!new) {
                if (new) {
                        /* disable cmci */
                        on_each_cpu(mce_disable_cmci, NULL, 1);
                        mca_cfg.cmci_disabled = true;
                } else {
                        /* enable cmci */
                        mca_cfg.cmci_disabled = false;
                        on_each_cpu(mce_enable_ce, NULL, 1);
                }
        }
        return size;
}

static ssize_t store_int_with_restart(struct device *s,
                                      struct device_attribute *attr,
                                      const char *buf, size_t size)
{
        ssize_t ret = device_store_int(s, attr, buf, size);
        mce_restart();
        return ret;
}

static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);

static struct dev_ext_attribute dev_attr_check_interval = {
        __ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
        &check_interval
};

static struct dev_ext_attribute dev_attr_ignore_ce = {
        __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
        &mca_cfg.ignore_ce
};

static struct dev_ext_attribute dev_attr_cmci_disabled = {
        __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
        &mca_cfg.cmci_disabled
};

static struct device_attribute *mce_device_attrs[] = {
        &dev_attr_tolerant.attr,
        &dev_attr_check_interval.attr,
        &dev_attr_trigger,
        &dev_attr_monarch_timeout.attr,
        &dev_attr_dont_log_ce.attr,
        &dev_attr_ignore_ce.attr,
        &dev_attr_cmci_disabled.attr,
        NULL
};
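
/*
 * Illustrative only: the attributes above are created per logical CPU by
 * mce_device_create(), so adjusting the check_interval value from
 * userspace could look like
 *
 *      echo 60 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 *
 * which stores the value through store_int_with_restart() and therefore
 * also re-arms the per-CPU timers via mce_restart().
 */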

static cpumask_var_t mce_device_initialized;

static void mce_device_release(struct device *dev)
{
        kfree(dev);
}

/* Per cpu device init. All of the cpus still share the same ctrl bank: */
static int mce_device_create(unsigned int cpu)
{
        struct device *dev;
        int err;
        int i, j;

        if (!mce_available(&boot_cpu_data))
                return -EIO;

        dev = kzalloc(sizeof *dev, GFP_KERNEL);
        if (!dev)
                return -ENOMEM;
        dev->id = cpu;
        dev->bus = &mce_subsys;
        dev->release = &mce_device_release;

        err = device_register(dev);
        if (err) {
                put_device(dev);
                return err;
        }

        for (i = 0; mce_device_attrs[i]; i++) {
                err = device_create_file(dev, mce_device_attrs[i]);
                if (err)
                        goto error;
        }
        for (j = 0; j < mca_cfg.banks; j++) {
                err = device_create_file(dev, &mce_banks[j].attr);
                if (err)
                        goto error2;
        }
        cpumask_set_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = dev;

        return 0;
error2:
        while (--j >= 0)
                device_remove_file(dev, &mce_banks[j].attr);
error:
        while (--i >= 0)
                device_remove_file(dev, mce_device_attrs[i]);

        device_unregister(dev);

        return err;
}

static void mce_device_remove(unsigned int cpu)
{
        struct device *dev = per_cpu(mce_device, cpu);
        int i;

        if (!cpumask_test_cpu(cpu, mce_device_initialized))
                return;

        for (i = 0; mce_device_attrs[i]; i++)
                device_remove_file(dev, mce_device_attrs[i]);

        for (i = 0; i < mca_cfg.banks; i++)
                device_remove_file(dev, &mce_banks[i].attr);

        device_unregister(dev);
        cpumask_clear_cpu(cpu, mce_device_initialized);
        per_cpu(mce_device, cpu) = NULL;
}

/* Make sure there are no machine checks on offlined CPUs. */
static void mce_disable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_clear();
        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), 0);
        }
}

static void mce_reenable_cpu(void *h)
{
        unsigned long action = *(unsigned long *)h;
        int i;

        if (!mce_available(raw_cpu_ptr(&cpu_info)))
                return;

        if (!(action & CPU_TASKS_FROZEN))
                cmci_reenable();
        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];

                if (b->init)
                        wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
        }
}

/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int
mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        unsigned int cpu = (unsigned long)hcpu;
        struct timer_list *t = &per_cpu(mce_timer, cpu);

        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                mce_device_create(cpu);
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                break;
        case CPU_DEAD:
                if (threshold_cpu_callback)
                        threshold_cpu_callback(action, cpu);
                mce_device_remove(cpu);
                mce_intel_hcpu_update(cpu);

                /* intentionally ignoring frozen here */
                if (!(action & CPU_TASKS_FROZEN))
                        cmci_rediscover();
                break;
        case CPU_DOWN_PREPARE:
                smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
                del_timer_sync(t);
                break;
        case CPU_DOWN_FAILED:
                smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
                mce_start_timer(cpu, t);
                break;
        }

        return NOTIFY_OK;
}

static struct notifier_block mce_cpu_notifier = {
        .notifier_call = mce_cpu_callback,
};

static __init void mce_init_banks(void)
{
        int i;

        for (i = 0; i < mca_cfg.banks; i++) {
                struct mce_bank *b = &mce_banks[i];
                struct device_attribute *a = &b->attr;

                sysfs_attr_init(&a->attr);
                a->attr.name = b->attrname;
                snprintf(b->attrname, ATTR_LEN, "bank%d", i);

                a->attr.mode = 0644;
                a->show = show_bank;
                a->store = set_bank;
        }
}

static __init int mcheck_init_device(void)
{
        int err;
        int i = 0;

        if (!mce_available(&boot_cpu_data)) {
                err = -EIO;
                goto err_out;
        }

        if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
                err = -ENOMEM;
                goto err_out;
        }

        mce_init_banks();

        err = subsys_system_register(&mce_subsys, NULL);
        if (err)
                goto err_out_mem;

        cpu_notifier_register_begin();
        for_each_online_cpu(i) {
                err = mce_device_create(i);
                if (err) {
                        /*
                         * Register notifier anyway (and do not unreg it) so
                         * that we don't leave undeleted timers, see notifier
                         * callback above.
                         */
                        __register_hotcpu_notifier(&mce_cpu_notifier);
                        cpu_notifier_register_done();
                        goto err_device_create;
                }
        }

        __register_hotcpu_notifier(&mce_cpu_notifier);
        cpu_notifier_register_done();

        register_syscore_ops(&mce_syscore_ops);

        /* register character device /dev/mcelog */
        err = misc_register(&mce_chrdev_device);
        if (err)
                goto err_register;

        return 0;

err_register:
        unregister_syscore_ops(&mce_syscore_ops);

err_device_create:
        /*
         * We didn't keep track of which devices were created above, but
         * even if we had, the set of online cpus might have changed.
         * Play safe and remove for every possible cpu, since
         * mce_device_remove() will do the right thing.
         */
        for_each_possible_cpu(i)
                mce_device_remove(i);

err_out_mem:
        free_cpumask_var(mce_device_initialized);

err_out:
        pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);

        return err;
}
device_initcall_sync(mcheck_init_device);

/*
 * Old style boot options parsing. Only for compatibility.
 */
static int __init mcheck_disable(char *str)
{
        mca_cfg.disabled = true;
        return 1;
}
__setup("nomce", mcheck_disable);
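
/*
 * Note: booting with "nomce" has the same effect on mca_cfg.disabled as
 * "mce=off" handled in mcheck_enable() above.
 */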

#ifdef CONFIG_DEBUG_FS
struct dentry *mce_get_debugfs_dir(void)
{
        static struct dentry *dmce;

        if (!dmce)
                dmce = debugfs_create_dir("mce", NULL);

        return dmce;
}

static void mce_reset(void)
{
        cpu_missing = 0;
        atomic_set(&mce_fake_paniced, 0);
        atomic_set(&mce_executing, 0);
        atomic_set(&mce_callin, 0);
        atomic_set(&global_nwo, 0);
}

static int fake_panic_get(void *data, u64 *val)
{
        *val = fake_panic;
        return 0;
}

static int fake_panic_set(void *data, u64 val)
{
        mce_reset();
        fake_panic = val;
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
                        fake_panic_set, "%llu\n");

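/*
 * Illustrative note: with debugfs mounted at its usual /sys/kernel/debug
 * location, the file created below appears as
 * /sys/kernel/debug/mce/fake_panic and reads back the current fake_panic
 * value through fake_panic_fops.
 */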
static int __init mcheck_debugfs_init(void)
{
        struct dentry *dmce, *ffake_panic;

        dmce = mce_get_debugfs_dir();
        if (!dmce)
                return -ENOMEM;
        ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
                                          &fake_panic_fops);
        if (!ffake_panic)
                return -ENOMEM;

        return 0;
}
late_initcall(mcheck_debugfs_init);
#endif
